In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
from datetime import datetime
from scipy.stats import skew
from scipy.stats.stats import pearsonr
# Reload imported modules automatically before each cell executes (autoreload mode 2).
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook, using the high-resolution 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# NOTE(review): this suppresses every warning for the rest of the session, including
# deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings('ignore')
# Use seaborn's 'whitegrid' style for all plots.
sns.set(style='whitegrid')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Obteniendo los datos

In [125]:
# Read the train and test CSV files (train first, then test) and keep both frames
# in a list so the same transformation can be applied to each of them.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'files_nuevoReto/googleplay_train.csv',
        'files_nuevoReto/googleplay_test.csv',
    )
)
combine = [train_df, test_df]

In [126]:
# Turn the non-numeric values of a column (e.g. Size) into NaN so they can be
# filled with the column mean later on.
def converting(combine,col):
    """Cast column ``col`` to numeric in every DataFrame of ``combine``, in place.

    Args:
        combine: iterable of DataFrames that all contain ``col``.
        col: label of the column to convert.

    Values that cannot be parsed as numbers become NaN (``errors='coerce'``).
    Returns None; each frame is modified directly.
    """
    for frame in combine:
        numeric_values = pd.to_numeric(frame[col], errors='coerce')
        frame[col] = numeric_values
# Coerce 'Size' to numeric in every frame held in `combine`; the frames are modified in place.
converting(combine,'Size')

In [127]:
def splittingDate(data):
    """Replace the 'Last Updated' column with numeric month and year columns, in place.

    Each value of ``data['Last Updated']`` must be a string in the form
    ``"%B %d, %Y"`` (for example ``"July 4, 2018"``). The month goes to
    'Month_updated', the year to 'Year_updated', and 'Last Updated' is dropped.

    Args:
        data: DataFrame containing a 'Last Updated' column; it is mutated.

    Returns:
        None.

    Raises:
        KeyError: if 'Last Updated' is missing (e.g. the function already ran on ``data``).
        ValueError: if a value does not match the expected date format.
        TypeError: if a value is not a string (e.g. NaN).
    """
    # Parse every date once and reuse the result for both derived columns.
    parsed_dates = [datetime.strptime(date, "%B %d, %Y") for date in data['Last Updated']]
    data['Month_updated'] = [parsed.month for parsed in parsed_dates]
    data['Year_updated'] = [parsed.year for parsed in parsed_dates]
    # The original text column is no longer needed once it has been split.
    data.drop('Last Updated',axis=1,inplace=True)

In [128]:
# Split 'Last Updated' into month/year columns on both frames. Each call mutates its frame and
# drops 'Last Updated', so this cell cannot be re-run without reloading the data first.
splittingDate(train_df)
splittingDate(test_df)

In [129]:
# Stack the train and test features into one frame so both go through the same preprocessing;
# it is split again before training. 'App' is removed from both frames and the target
# ('Rating') from the training frame.
all_data = pd.concat([
    train_df.drop(columns=['App', 'Rating']),
    test_df.drop(columns=['App']),
])

Transformación logarítmica de las columnas numéricas con distribución muy sesgada

In [130]:
# Log-transforming the target (Rating) with numpy is currently disabled:
#train_df["Rating"] = np.log1p(train_df["Rating"])
columnas_numericas = all_data.dtypes[all_data.dtypes != "object"].index
# Measure the skewness of every numeric column on the training frame (NaN ignored) to decide
# which ones to transform. Normally distributed data has skewness close to 0, so a value
# above 0.75 is treated as heavily skewed.
columnas_sesgadas = (
    train_df[columnas_numericas]
    .apply(lambda serie: skew(serie.dropna()))
    .loc[lambda sesgo: sesgo > 0.75]
    .index
)

# Apply log(1 + x) to the skewed columns of the combined frame.
all_data[columnas_sesgadas] = np.log1p(all_data[columnas_sesgadas])

In [131]:
# Display the labels of the columns whose dtype is not object.
columnas_numericas

Index(['Installs', 'Price', 'Reviews', 'Size', 'Month_updated',
       'Year_updated'],
      dtype='object')

In [132]:
# Every column of all_data that is not listed in columnas_numericas is treated as categorical.
columnas_categoricas = all_data.columns[~all_data.columns.isin(columnas_numericas)].tolist()
columnas_categoricas

['Android Ver', 'Category', 'Content Rating', 'Genres', 'Type']

In [133]:
# Preview the first rows of the combined frame after the log transform.
all_data.head()

Unnamed: 0,Android Ver,Category,Content Rating,Genres,Installs,Price,Reviews,Size,Type,Month_updated,Year_updated
0,Varies with device,NEWS_AND_MAGAZINES,Everyone,News & Magazines,13.815512,0.0,9.247925,,Free,7,2018
1,2.2,FAMILY,Everyone,Education,8.517393,0.0,3.7612,6.318968,Free,5,2017
2,4.0.3,TOOLS,Everyone,Tools,11.512935,0.0,8.020928,8.972819,Free,6,2017
3,3.0,FAMILY,Teen,Role Playing,10.819798,2.396986,8.602086,9.704122,Paid,6,2018
4,Varies with device,TOOLS,Everyone,Tools,13.815512,0.0,10.6846,,Free,8,2017


In [134]:
# Casting the year/month/day columns to strings so they are treated as categorical (currently disabled)
#all_data['Year_updated']=all_data['Year_updated'].astype(str)
#all_data['Month_updated']=all_data['Month_updated'].astype(str)
#all_data['Day_updated']=all_data['Day_updated'].astype(str)

In [135]:
# Summary statistics for every column (numeric and categorical), transposed to one row per column.
all_data.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Android Ver,9057,26.0,4.1,1987.0,,,,,,,
Category,9344,33.0,FAMILY,1743.0,,,,,,,
Content Rating,8976,6.0,Everyone,7100.0,,,,,,,
Genres,9344,115.0,Tools,732.0,,,,,,,
Installs,9344,,,,12.2993,3.8375,0.693147,9.21044,13.1224,15.4249,20.7233
Price,9094,,,,0.111635,0.478088,0.0,0.0,0.0,0.0,5.99396
Reviews,9344,,,,8.37954,3.84623,0.693147,5.22977,8.68313,11.3091,18.1742
Size,7711,,,,9.43868,1.30386,2.25129,8.59936,9.5706,10.428,11.5367
Type,9344,2.0,Free,8701.0,,,,,,,
Month_updated,9344,,,,6.45163,2.50967,1.0,5.0,7.0,8.0,12.0


In [136]:
# Show dtypes and non-null counts to see which columns still contain missing values.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9344 entries, 0 to 1855
Data columns (total 11 columns):
Android Ver       9057 non-null object
Category          9344 non-null object
Content Rating    8976 non-null object
Genres            9344 non-null object
Installs          9344 non-null float64
Price             9094 non-null float64
Reviews           9344 non-null float64
Size              7711 non-null float64
Type              9344 non-null object
Month_updated     9344 non-null int64
Year_updated      9344 non-null int64
dtypes: float64(4), int64(2), object(5)
memory usage: 876.0+ KB


Rellenando las columnas numéricas que contienen valores vacíos

In [137]:
# The missing values of Size could be filled using its relationship with the other features,
# but for now every numeric column is simply filled with its mean.
# Layout of `means`: {column: {'mean': value, 'is_null': True}}  ('is_null' is optional)
def refill_colnum(data,columns,means=None,null_threshold=15):
    """Fill the NaN values of numeric columns with a per-column mean, in place.

    The function works in two modes:

    * ``means is None`` ("fit" mode): the mean of each column is computed from
      ``data``, stored in a new dict and used to fill that column. A column with
      more than ``null_threshold`` NaN values is also flagged with
      ``'is_null': True`` in its entry.
    * ``means`` given ("apply" mode): the stored means are used to fill ``data``;
      nothing is recomputed and ``means`` is returned unchanged.

    Args:
        data: DataFrame to fill; it is mutated.
        columns: iterable of column labels to process.
        means: optional dict produced by a previous "fit" call.
        null_threshold: NaN count above which a column is flagged in "fit" mode.

    Returns:
        The ``means`` dict (newly built in "fit" mode, the one passed in otherwise).

    Raises:
        KeyError: if a column is missing from ``data``, or from ``means`` in "apply" mode.
    """
    train = means is None
    if train:
        means = {}
    for col in columns:
        if train:
            means[col] = {'mean': data[col].mean()}
            # Count the NaN values before filling. The flag is only recorded in the
            # returned dict; this function does not add a missing-indicator column.
            if data[col].isna().sum() > null_threshold:
                means[col]['is_null'] = True
        data[col] = data[col].fillna(means[col]['mean'])
    return means

In [138]:
# Compute the column means on all_data and fill its numeric NaN values in place. The returned
# dict of means is only displayed, not stored.
# NOTE(review): all_data contains the train and test rows together, so the means include
# test rows — confirm this is intended.
refill_colnum(all_data,columnas_numericas)

{'Installs': {'mean': 12.299329759863443},
 'Price': {'mean': 0.11163483877012781, 'is_null': True},
 'Reviews': {'mean': 8.379542556369637},
 'Size': {'mean': 9.438679247226464, 'is_null': True},
 'Month_updated': {'mean': 6.451626712328767},
 'Year_updated': {'mean': 2017.4197345890411}}

Rellenando las columnas categóricas que contienen valores vacíos

In [139]:
def refillUnknown(data,columns):
    """Replace the NaN values of the given columns with a placeholder label, in place.

    Args:
        data: DataFrame to fill; it is mutated.
        columns: iterable of column labels to process.

    Columns without any NaN are left untouched. Returns None.
    """
    for name in columns:
        if not data[name].isna().any():
            continue
        # NOTE(review): 'Unkown' looks like a typo for 'Unknown'; it is kept unchanged
        # because the string becomes part of the data.
        data[name] = data[name].fillna('Unkown')

In [140]:
# Fill the missing values of the categorical columns of all_data with the placeholder label (in place).
refillUnknown(all_data,columnas_categoricas)

In [141]:
# One-hot encode the categorical columns. This rebinds all_data, so re-running the earlier
# cells that expect the original categorical columns requires rebuilding all_data first.
all_data=pd.get_dummies(all_data)

In [142]:
# Preview the first rows of the encoded frame.
all_data.head()

Unnamed: 0,Installs,Price,Reviews,Size,Month_updated,Year_updated,Android Ver_1.0,Android Ver_1.5,Android Ver_1.6,Android Ver_2.0,...,Genres_Travel & Local,Genres_Travel & Local;Action & Adventure,Genres_Trivia,Genres_Video Players & Editors,Genres_Video Players & Editors;Creativity,Genres_Video Players & Editors;Music & Video,Genres_Weather,Genres_Word,Type_Free,Type_Paid
0,13.815512,0.0,9.247925,9.438679,7,2018,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,8.517393,0.0,3.7612,6.318968,5,2017,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,11.512935,0.0,8.020928,8.972819,6,2017,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,10.819798,2.396986,8.602086,9.704122,6,2018,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,13.815512,0.0,10.6846,9.438679,8,2017,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


Separando los datos en entrenamiento y prueba

In [143]:
#creating matrices for sklearn:
X_to_train = all_data[:train_df.shape[0]]
X_test = all_data[train_df.shape[0]:]
y=train_df.Rating
y_transform = np.log1p(train_df.Rating)

In [144]:
# Preview the first values of the (untransformed) target.
y.head()

0    3.7
1    4.3
2    3.9
3    4.3
4    4.2
Name: Rating, dtype: float64

In [189]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the training rows for validation; the fixed random_state makes the split repeatable.
X_train,X_validation,y_train,y_validation=train_test_split(X_to_train,y,test_size=0.1,random_state=2019)

In [190]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on the training part. fit() is the last expression of the
# cell, so the fitted estimator is also displayed.
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [191]:
from sklearn.metrics import mean_squared_error, r2_score
# Predict on the rows the model was fitted on and on the held-out validation rows.
y_train_pred = lr.predict(X_train)

y_validation_pred=lr.predict(X_validation)

# Display (train MSE, validation MSE).
# NOTE(review): the recorded output shows a validation MSE of about 4.9e14 against about 0.215
# on train; presumably the unregularised fit is unstable on the one-hot columns — confirm.
mean_squared_error(y_train, y_train_pred),mean_squared_error(y_validation, y_validation_pred)
# 0.2196 — presumably a score noted from an earlier run; TODO confirm what it refers to

(0.21528523975419167, 493720217240650.3)

In [192]:
# Compare the shape of the encoded test features with the shape of the raw test frame.
X_test.shape,test_df.shape

((1856, 190), (1856, 12))

In [193]:
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV,Lasso
# Fit an L2-regularised linear model (alpha=20) on the training part.
# Only Ridge is used in this cell; the other imported estimators are not.
model_ridge = Ridge(alpha=20,random_state=2019).fit(X_train,y_train)

In [194]:
# Predict with the ridge model; this overwrites the predictions stored by the linear-regression cell.
y_train_pred = model_ridge.predict(X_train)

y_validation_pred=model_ridge.predict(X_validation)

# Display (train MSE, validation MSE) for the ridge model.
mean_squared_error(y_train, y_train_pred),mean_squared_error(y_validation, y_validation_pred)
# 0.2194 — presumably a score noted from an earlier run; TODO confirm what it refers to

(0.21684886717071492, 0.20937797866055866)

## **Creando el output**

In [183]:
# Predict the rating of every test row with the ridge model.
# NOTE(review): this cell's execution count (183) is lower than that of the cell that fits
# model_ridge (193), so the saved predictions may come from an earlier fit — re-run in order to confirm.
y_test_pred=model_ridge.predict(X_test)

In [184]:
# Load the sample submission file, used as the template for the output.
submission=pd.read_csv('files_nuevoReto/rating_sammple_submission.csv')

In [185]:
# Preview the template: an Id column and a Rating column.
submission.head()

Unnamed: 0,Id,Rating
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0


In [186]:
# Write the predictions into the template by position.
# NOTE(review): assumes the template rows are in the same order as the rows of the test file — confirm.
submission['Rating']=y_test_pred

In [187]:
# Save the submission to the working directory, without the DataFrame index.
submission.to_csv('nuevo_v2_ridgeModel_sample_submission.csv', index=False)

In [188]:
# Final check of the submission size (rows, columns).
submission.shape

(1856, 2)