In [325]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error as mse 
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

# Armando un SK Learn Pipeline. 

### Primero las funciones básicas de preprocesamiento. 

In [326]:
def basic_preprocessing(path):
    """
    Una función básica para preprocesar los datos de entrenamiento. 
    Parameters:
    -------
    path: str
          path en tu compu donde está el dataset
    
    Returns:
    --------
    data: pandas dataframe
          Un dataframe listo para el pipeline de sklearn. 
    """
    data = pd.read_csv(path)
    data['MS SubClass'] = data['MS SubClass'].astype('category')
    data = data[data['Sale Condition'] == "Normal"]
    data = data.drop(columns = ['MS Zoning', 'Lot Frontage', 'Alley', 'Mas Vnr Type', 'Mas Vnr Area',
     'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Electrical',
     'Fireplace Qu', 'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Qual', 'Garage Cond',
     'Pool QC', 'Fence', 'Misc Feature', 'Condition 1', 'Condition 2', 'Exterior 1st', 'Exterior 2nd', 
    'Heating', 'Heating QC'])
    data = data[data['Gr Liv Area'] < 4_000]
    return data 

Es mejor no hacer transformaciones de columnas aquí porque eso lo haremos en el pipeline. 

### Después iniciamos el pipeline. 

No está tan sencillo crearlo pero tampoco es el infierno. Primero necesitamos crear una clase con algunas transformaciones básicas. 

No voy a tomar todas las columnas, es solo un ejemplo. 

In [327]:
pd.set_option('display.max_columns', 40)
data = basic_preprocessing('casas_entrena.csv')
data.head()

Unnamed: 0,MS SubClass,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exter Qual,Exter Cond,Foundation,...,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Garage Cars,Garage Area,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,120,3072,Pave,Reg,Lvl,AllPub,Inside,Gtl,Blmngtn,TwnhsE,1Story,7,5,2004,2004,Hip,CompShg,Gd,TA,PConc,...,1,Gd,7,Typ,1,2,388,Y,143,20,0,0,0,0,0,9,2006,WD,Normal,225000
2,120,3013,Pave,Reg,Lvl,AllPub,Inside,Gtl,Blmngtn,TwnhsE,1Story,7,5,2005,2005,Gable,CompShg,Gd,TA,PConc,...,1,Gd,6,Typ,1,2,440,Y,142,20,0,0,0,0,0,4,2006,WD,Normal,213490
4,120,3196,Pave,Reg,Lvl,AllPub,Inside,Gtl,Blmngtn,TwnhsE,1Story,8,5,2003,2003,Gable,CompShg,Gd,TA,PConc,...,1,Gd,7,Typ,1,2,400,Y,143,20,0,0,0,0,0,5,2006,WD,Normal,215000
6,20,3182,Pave,Reg,Lvl,AllPub,Inside,Gtl,Blmngtn,1Fam,1Story,7,5,2007,2007,Gable,CompShg,Gd,TA,PConc,...,1,Gd,6,Typ,1,2,388,Y,100,16,0,0,0,0,0,3,2008,WD,Normal,159895
7,120,3203,Pave,Reg,Lvl,AllPub,Inside,Gtl,Blmngtn,TwnhsE,1Story,7,5,2006,2006,Gable,CompShg,Gd,TA,PConc,...,1,Gd,6,Typ,0,2,437,Y,100,116,0,0,0,0,0,1,2010,WD,Normal,160000


In [328]:
data.shape

(1208, 54)

# Separando en features y labels. 

In [329]:
X = data.iloc[:,:-1].copy()
Y = data.SalePrice

In [330]:
idx = {col: X.columns.get_loc(col) for col in X.columns}

In [331]:
idx["Overall Qual"]

11

In [332]:
class CombinedAttributesAdder2(BaseEstimator, TransformerMixin): #dejar esto así
    def __init__(self, house_condition = True): #te permite hacer pruebas 
        self.house_condition = house_condition
    def fit(self, X, y=None):
        return self #no hay que hacer nada más 
    def transform(self, X, y=None):
        #supongamos que queremos calidad de la casa al cuadrado
        qual_squared = X[:, idx["Overall Qual"]] ** 2
        #y otra que sea calidad * metros cuadrados
        qual_m2 = X[:, 1] * X[:,0]
        if self.house_condition: #pudo haber sido el año o alguna otra variable 
            cond = X[:,2]
            return np.c_[X, qual_squared, qual_m2, cond]
        else:
            return np.c_[X, qual_squared, qual_m2]

### Ahora se entenderá más claro qué es esto. 

Básicamente, esta clase nos ayuda a definir las nuevas variables que queremos agregar y es algo que usaremos en nuestro pipeline. El siguiente paso es crear un pipeline para las variables numéricas. 

In [333]:
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder2()),
    ('std_scaler', StandardScaler()),
])

## ¿Cómo se usa esto? 

Seleccionemos primero las variables numéricas: 

In [334]:
num_attribs = X.select_dtypes(include = "number").columns.values
num_attribs

array(['Lot Area', 'Overall Qual', 'Overall Cond', 'Year Built',
       'Year Remod/Add', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF',
       'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
       'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath',
       'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd',
       'Fireplaces', 'Garage Cars', 'Garage Area', 'Wood Deck SF',
       'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch',
       'Pool Area', 'Misc Val', 'Mo Sold', 'Yr Sold'], dtype=object)

In [335]:
len(num_attribs)

32

In [336]:
data_numeric = X[num_attribs]
numeric_transformed = numeric_pipeline.fit_transform(data_numeric)

In [337]:
numeric_transformed.shape

(1208, 35)

Son las 32 variables numéricas más otras 3 que se agregaron en la clase de AttributesAdder2. 

### Pipeline completo (numérico más categórico).

Una vez que tenemos esto, en realidad completar el pipeline no es tan difícil. 

In [338]:
cat_attribs = X.select_dtypes(include = 'object').columns.values
len(cat_attribs)

20

In [339]:
full_pipeline = ColumnTransformer([
    ("numeric", numeric_pipeline, num_attribs),
    ("categorical", OneHotEncoder(handle_unknown='ignore'), cat_attribs),
])

In [340]:
data_prepared = full_pipeline.fit_transform(X)

In [341]:
data_prepared.shape #es lo que usaremos para predecir

(1208, 148)

Terminamos teniendo 196 variables porque son demasiadas categóricas transformamdas en OneHotEncoding() (recordemos que por cada n clases, se crean n variables adicionales). 

Después podemos quitar más variables. Lo importante ahora es estandarizar todo. 

## Entrenando el modelo con validación cruzada 

(Primero quiero poner esta parte del pipeline y después hacer lo de grid searching, o la búsqueda de los mejores hiperparámetros). 

In [342]:
ridge_reg = Ridge()

In [343]:
scores = cross_val_score(ridge_reg, data_prepared, Y, scoring = "neg_mean_squared_error", cv = 5)
ridge_reg_scores = np.sqrt(-scores)

In [344]:
ridge_reg_scores

array([21530.57569061, 22571.76338959, 22714.28575127, 27371.64881815,
       23330.6046707 ])

In [314]:
#import joblib
#joblib.dump(modelo_uno, "modelo_uno.pkl")
#y luego lo cargas: 
#modelo_cargado = joblib.load("modelo_uno.pkl")

# Grid Search. 

In [345]:
param_grid = [
    {'alpha': [.001, .01, .1, 1, 10, 15, 20], 
     'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}
]

In [346]:
ridge_reg = Ridge()

In [347]:
grid_search = GridSearchCV(ridge_reg, param_grid, cv = 5, scoring = "neg_mean_squared_error",
                          return_train_score = True)

In [348]:
grid_search.fit(data_prepared, Y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'alpha': [0.001, 0.01, 0.1, 1, 10, 15, 20], 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [349]:
grid_search.best_params_

{'alpha': 10, 'solver': 'saga'}

In [350]:
def clean_test_dataset(path):
    data = pd.read_csv(path)
    #Arreglando tipos de datos
    data['MS SubClass'] = data['MS SubClass'].astype('category')
    data = data.drop(columns = ['MS Zoning', 'Lot Frontage', 'Alley', 'Mas Vnr Type', 'Mas Vnr Area',
     'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Electrical',
     'Fireplace Qu', 'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Qual', 'Garage Cond',
     'Pool QC', 'Fence', 'Misc Feature', 'Condition 1', 'Condition 2', 'Exterior 1st', 'Exterior 2nd',
    'Heating', 'Heating QC'])
    return data

In [351]:
datos_prueba = clean_test_dataset('casas_prueba.csv')

In [352]:
prueba_prepared = full_pipeline.transform(datos_prueba.iloc[:,:-1])

In [353]:
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(prueba_prepared)

In [354]:
len(final_predictions)

1203

In [355]:
submissions = pd.DataFrame({'id': [e for e in range(1,1204)], 'SalePrice': final_predictions})

In [356]:
submissions.head()

Unnamed: 0,id,SalePrice
0,1,239311.140338
1,2,218319.31854
2,3,142500.0035
3,4,287138.63324
4,5,185401.37187


In [358]:
submissions.to_csv('submissions_21_octubre.csv')

### Placeholder. 

Vamos a seguir explorando las etiquetas que no estén. Quizá me tome unos 30 minutos pero está bien. después de eso ya podemos empezar a predecir con nuevos modelos :) Y le puedo avanzar en tiempos libres. Me quedé a punto de tirar "Heating" tanto hasta arriba como abajo. 

In [293]:
datos_prueba.columns

Index(['MS SubClass', 'Lot Area', 'Street', 'Lot Shape', 'Land Contour',
       'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Year Built',
       'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exter Qual', 'Exter Cond',
       'Foundation', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF',
       'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Garage Cars', 'Garage Area', 'Paved Drive',
       'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch',
       'Screen Porch', 'Pool Area', 'Misc Val', 'Mo Sold', 'Yr Sold',
       'Sale Type', 'Sale Condition', 'id'],
      dtype='object')

In [324]:
datos_prueba['Heating QC'].value_counts()

Ex    592
TA    367
Gd    202
Fa     41
Po      1
Name: Heating QC, dtype: int64

# Resumen (hasta el momento).  

In [None]:
#primero usamos una función 