# Submission based on guided lesson

In [4]:
import pandas as pd

In [5]:
diamonds = pd.read_csv('../../data/raw/diamonds_train.csv')
diamonds_predict = pd.read_csv('../../data/raw/diamonds_predict.csv')

In [6]:
#empaquetador de funcionalidades donde podremos tener todo. Es como una plantilla que vale para cualquier modelo de Machine Learning con optimización de hyperparámetros

In [7]:
diamonds.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [8]:
NUM_FEATS = ['carat', 'depth', 'table', 'x', 'y', 'z']
CAT_FEATS = ['cut', 'color', 'clarity']
TARGET = 'price'
FEATS = NUM_FEATS + CAT_FEATS

In [9]:
FEATS

['carat', 'depth', 'table', 'x', 'y', 'z', 'cut', 'color', 'clarity']

In [10]:
#PIPELINE para encadenar cosas automáticamente

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [12]:
#preprocesamiento para valores numéricos

In [13]:
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), 
                                      ('scaler', StandardScaler())])

In [14]:
numeric_transformer

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())])

In [15]:
numeric_transformer.fit_transform(diamonds[NUM_FEATS]) #aquí aplica, entrena y transforma usando lo anterior

array([[ 0.8670056 ,  0.45201864,  0.24798091,  0.97880679,  0.92198533,
         1.02265738],
       [-1.00455749,  0.8710986 , -0.19974534, -1.22673789, -1.17981558,
        -1.1292594 ],
       [-0.18443434,  2.61726508, -1.09519783, -0.09728557, -0.17688154,
         0.16189067],
       ...,
       [ 0.44642962,  0.66155862, -0.64747158,  0.56971383,  0.5993022 ,
         0.6783507 ],
       [-0.98352869,  0.10278535, -1.4086062 , -1.13780463, -1.10132509,
        -1.11491329],
       [ 0.93009199,  0.172632  ,  0.24798091,  0.97880679,  1.00047582,
         1.02265738]])

In [16]:
#preprocesamiento para valores categóricas

In [17]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder #onehotencoding es igual que get_dummies

In [18]:
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', 
                                                                    fill_value='missing')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [19]:
categorical_transformer.fit_transform(diamonds[CAT_FEATS]).todense() #esto es lo que nos genera lo de arriba, sin necesidad de standarizar estas variables de 0 y 1

matrix([[0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.]])

In [20]:
# coger lo que hay hasta ahora y hacer un pre-proceso

In [21]:
from sklearn.compose import ColumnTransformer

In [22]:
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, NUM_FEATS),
                                ('cat', categorical_transformer, CAT_FEATS)])

In [23]:
preprocessor

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['carat', 'depth', 'table', 'x', 'y', 'z']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['cut', 'color', 'clarity'])])

In [24]:
preprocessor.fit_transform(diamonds)

array([[ 0.8670056 ,  0.45201864,  0.24798091, ...,  1.        ,
         0.        ,  0.        ],
       [-1.00455749,  0.8710986 , -0.19974534, ...,  1.        ,
         0.        ,  0.        ],
       [-0.18443434,  2.61726508, -1.09519783, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.44642962,  0.66155862, -0.64747158, ...,  0.        ,
         0.        ,  0.        ],
       [-0.98352869,  0.10278535, -1.4086062 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.93009199,  0.172632  ,  0.24798091, ...,  0.        ,
         0.        ,  0.        ]])

In [25]:
pd.DataFrame(preprocessor.fit_transform(diamonds)) #se pierde el valor de las columnas con lo que es muy dificil saber a que corresponde

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.867006,0.452019,0.247981,0.978807,0.921985,1.022657,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.004557,0.871099,-0.199745,-1.226738,-1.179816,-1.129259,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.184434,2.617265,-1.095198,-0.097286,-0.176882,0.161891,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.815298,1.429872,-0.647472,-0.933258,-0.883296,-0.770607,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.467458,-0.875068,0.695707,0.729794,0.677793,0.592274,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,1.140380,0.661559,-0.199745,1.218927,1.140014,1.280887,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
40451,2.570338,-3.249854,1.143433,2.295019,2.195276,1.711271,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
40452,0.446430,0.661559,-0.647472,0.569714,0.599302,0.678351,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
40453,-0.983529,0.102785,-1.408606,-1.137805,-1.101325,-1.114913,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Simple model

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
diamonds_train, diamonds_test = train_test_split(diamonds) #dividimos nuestro modelo

In [28]:
diamonds_train.shape

(30341, 10)

In [29]:
diamonds_test.shape

(10114, 10)

In [30]:
from sklearn.ensemble import ExtraTreesRegressor

## nuevo pipeline

In [31]:
model = Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', ExtraTreesRegressor(n_jobs=-1))])

In [32]:
diamonds_train[FEATS]

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
24718,1.01,65.3,58.0,6.29,6.24,4.09,Fair,D,VS2
5627,0.47,58.4,60.0,5.16,5.12,3.00,Fair,G,SI1
1086,0.35,60.1,62.0,4.57,4.52,2.73,Premium,H,VVS1
7800,0.37,59.7,62.0,4.70,4.68,2.80,Premium,F,VS2
28019,1.22,63.0,55.0,6.83,6.78,4.29,Ideal,E,VVS2
...,...,...,...,...,...,...,...,...,...
3263,0.43,63.8,57.0,4.80,4.82,3.07,Good,F,SI1
12424,1.01,58.3,62.0,6.49,6.43,3.77,Premium,J,SI2
12985,2.00,62.0,62.0,8.02,7.91,4.94,Premium,J,VS1
28049,0.41,61.4,57.0,4.79,4.76,2.93,Ideal,E,VVS2


In [33]:
diamonds_train[TARGET]

24718     6366
5627      1086
1086       956
7800       894
28019    12036
         ...  
3263       739
12424     2683
12985    11793
28049     1243
14062     1807
Name: price, Length: 30341, dtype: int64

In [34]:
model.fit(diamonds_train[FEATS], diamonds_train[TARGET])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['carat', 'depth', 'table',
                                                   'x', 'y', 'z']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                 

In [35]:
#ya entrenado el modelo cargamos

In [36]:
from sklearn.metrics import mean_squared_error

In [37]:
y_train_pred = model.predict(diamonds_train[FEATS])

In [38]:
#y_train = model.predict(diamonds_train[FEATS])

In [39]:
y_train_real = diamonds_train[TARGET]

In [40]:
mean_squared_error(y_pred=y_train_pred, y_true=y_train_real, squared=False)

6.199859211581581

In [41]:
y_test_real = diamonds_test[TARGET]

In [42]:
y_test_pred = model.predict(diamonds_test[FEATS])

In [43]:
mean_squared_error(y_pred=y_test_pred, y_true=y_test_real, squared=False) #hay mucho sobreajuste

541.7364412503775

In [44]:
# Validación cruzada

In [45]:
from sklearn.model_selection import cross_val_score

In [46]:
scores = cross_val_score(model, 
                         diamonds[FEATS], 
                         diamonds[TARGET], 
                         scoring='neg_root_mean_squared_error', #pasar el nombre de una métrica que se puede ver en una lista
                         cv=5, n_jobs=-1) #cv son el nº de trozos para entrenar, cuanto mayor más lento. n_jobs será cuantas cpu utiliza a la vez

In [47]:
scores #error obtenido en cada uno de los trozos. los coeficientes son referencia donde cuando error sea muy bajo entonces ya podremos hacer submission

array([-534.60463669, -551.1587177 , -543.9130345 , -580.90695811,
       -551.23278973])

In [48]:
import numpy as np
np.mean(-scores) #media de error. Lo ponemos negativo porque la métrica puesta en scoring es negative root mean squared error

552.3632273463961

In [49]:
#Los hiperparámetros que están por encima de los parámetros del modelo, 
#definiendo antes los hyperparámetros que son los que definen la tipología del modelo, afectando al rendimiento

In [50]:
#optimización del modelo utilizando una búsqueda de malla o combinaciones

In [51]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [120]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'], #se utiliza el preprocesor, el num y el imputer con guion bajo para ir pasando de nivel
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8, 16],
} #diccionario con los valores que queremos optimizar

grid_search = RandomizedSearchCV(model, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=10, #para cuando vaya entrenando saque output de información
                                 scoring='neg_root_mean_squared_error', #que me optimice
                                 n_jobs=-1,
                                 n_iter=32)

grid_search.fit(diamonds[FEATS], diamonds[TARGET])

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.3min


KeyboardInterrupt: 

In [121]:
#lo paramos porque consume mucho ahora mismos

In [None]:
grid_search.best_params_

In [123]:
grid_search.best_score_

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_score_'

In [124]:
# guardar el modelo

In [None]:
from joblib import dump, load
dump(grid_search, 'model.joblib')

In [None]:
grid_search.best_score_

In [None]:
submission_df = pd.DataFrame({'id': diamonds_predict.id, 'price': grid_search.predict(diamonds_predict[FEATS]).clip(300, 18000)})

In [None]:
submission_df.to_csv('modelo_cv_pepino.csv', index=False)

In [None]:
#ver www.pedro-munoz.tech