# Feature Engineering and Model selection

## Setup

In [145]:
import pandas as pd
import numpy as np

In [146]:
df = pd.read_csv("../../data/housing_data.csv", index_col="id")

# Missing neighborhood set to NaN
df.neighborhood = df.neighborhood.apply(lambda x: np.nan if x == "BA" else x)

# Drop null prices and areas over 2000m^2
df = df[~(df.prices.isna())]
df = df[~(df.areas > 2000)]

### Our data

In [147]:
df.head()

Unnamed: 0_level_0,areas,bedrooms,bathrooms,parkingSpots,prices,type,neighborhood
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2669329904,324.0,5.0,3.0,4.0,340000.0,house,Sao Tome
2667338201,51.0,2.0,1.5,1.0,295000.0,,Santa Teresa
2643720430,94.0,3.0,4.0,2.0,629000.0,apartment,Cidade Jardim
2681509397,66.0,2.0,2.0,2.0,460000.0,apartment,Imbuí
2671493238,82.0,3.0,3.0,2.0,649990.0,apartment,Piatã


In [148]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1022 entries, 2669329904 to 2665920996
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   areas         1022 non-null   float64
 1   bedrooms      1021 non-null   float64
 2   bathrooms     1021 non-null   float64
 3   parkingSpots  853 non-null    float64
 4   prices        1022 non-null   float64
 5   type          1017 non-null   object 
 6   neighborhood  999 non-null    object 
dtypes: float64(5), object(2)
memory usage: 63.9+ KB


## Feature Engineering

### Neighborhood Price/Area

Since this relationship behaves differently for apartments/houses. We'll have different values for each. 

In [149]:
neighborhood_house_price_sum =  df[df.type == "house"].groupby("neighborhood").prices.sum()
neighborhood_house_area_sum =  df[df.type == "house"].groupby("neighborhood").areas.sum()

neighborhood_apartment_price_sum =  df[df.type == "apartment"].groupby("neighborhood").prices.sum()
neighborhood_apartment_area_sum =  df[df.type == "apartment"].groupby("neighborhood").areas.sum()

def calculate_neighborhood_area_price(x):
    if x.type == "house" and x.neighborhood in neighborhood_house_price_sum:
        return neighborhood_house_price_sum[x.neighborhood] / neighborhood_house_area_sum[x.neighborhood]
    if x.type == "apartment" and x.neighborhood in neighborhood_house_price_sum:
        return neighborhood_apartment_price_sum[x.neighborhood] / neighborhood_apartment_area_sum[x.neighborhood]
    else: return np.nan

df["neighborhood_area_price"] = df.apply(calculate_neighborhood_area_price, axis=1)

In [150]:
df.head()

Unnamed: 0_level_0,areas,bedrooms,bathrooms,parkingSpots,prices,type,neighborhood,neighborhood_area_price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2669329904,324.0,5.0,3.0,4.0,340000.0,house,Sao Tome,747.782003
2667338201,51.0,2.0,1.5,1.0,295000.0,,Santa Teresa,
2643720430,94.0,3.0,4.0,2.0,629000.0,apartment,Cidade Jardim,
2681509397,66.0,2.0,2.0,2.0,460000.0,apartment,Imbuí,5906.22598
2671493238,82.0,3.0,3.0,2.0,649990.0,apartment,Piatã,7359.345794


## Model Evaluation

### Prepare training data

In [151]:
df.type = df.type.replace({"house": 0, "apartment": 1})

X = df.drop("prices", axis=1)
y = df.prices

In [152]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import GridSearchCV

### Set Pipeline function

In [153]:
def create_pipeline(model, training_data):
    numerical_cols = training_data.select_dtypes(include='number').columns
    categorical_cols = training_data.select_dtypes(include='object').columns

    # Preprocessing for numerical data
    numerical_transformer = SimpleImputer(strategy='median')

    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    #Bundling
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    return Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

### Random Forest Regressor

In [154]:
from sklearn.ensemble import RandomForestRegressor

pipe = create_pipeline(RandomForestRegressor(random_state=1, n_jobs=6), X)

#### Grid Search

In [155]:
params = {
    "model__n_estimators" : [100, 250, 300, 500],
    'model__bootstrap': [True, False],
    "model__max_depth" : [2, 5, 10, None]
}

cv = GridSearchCV(pipe, params, cv=5, scoring=["r2", "neg_mean_absolute_error"], refit="r2", verbose=2)

In [156]:
cv.fit(X, y)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=100; total time=   0.2s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=100; total time=   0.1s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=100; total time=   0.1s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=100; total time=   0.1s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=100; total time=   0.1s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=250; total time=   0.3s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=250; total time=   0.3s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=250; total time=   0.3s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=250; total time=   0.2s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=250; total time=   0.3s
[CV] END model__bootstrap=True, model__m

In [157]:
random_forest_results = pd.DataFrame(cv.cv_results_)
random_forest_results[["mean_fit_time", "mean_score_time", "param_model__bootstrap", "param_model__n_estimators", "param_model__max_depth",
          "mean_test_r2", "rank_test_r2", "mean_test_neg_mean_absolute_error", "std_test_neg_mean_absolute_error"]].sort_values("rank_test_r2").head()

Unnamed: 0,mean_fit_time,mean_score_time,param_model__bootstrap,param_model__n_estimators,param_model__max_depth,mean_test_r2,rank_test_r2,mean_test_neg_mean_absolute_error,std_test_neg_mean_absolute_error
12,0.420008,0.03402,True,100,,0.793749,1,-48173.139775,6391.149572
8,0.23566,0.030639,True,100,10.0,0.793496,2,-48209.548673,6529.880278
10,0.727216,0.056292,True,300,10.0,0.792518,3,-48074.29786,6085.555381
14,1.076205,0.058763,True,300,,0.792499,4,-47972.302853,5938.228594
15,1.830624,0.091984,True,500,,0.792011,5,-48069.722494,5875.808953


Best: bootstrap - True; estimators - 100; max_depth - None; r2 - 0.793749; neg_mean_absolute_error - -48173.139775; std_test_neg_mean_absolute_error - 6391.149572

### XGBoost

In [158]:
from xgboost import XGBRegressor

pipe = create_pipeline(XGBRegressor(random_state=1, n_jobs=6), X)

#### GridSearchCV

In [159]:
params = {
    "model__n_estimators" : [200, 300, 500],
    'model__learning_rate': [0.01, 0.05, 0.1],
    "model__max_depth" : [2, 5, 10, None]
}

cv = GridSearchCV(pipe, params, cv=5, scoring=["r2", "neg_mean_absolute_error"], refit="r2", verbose=2)

In [160]:
cv.fit(X, y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=200; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=200; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=200; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=200; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=200; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=300; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=300; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=300; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=300; total time=   0.0s
[CV] END model__learning_rate=0.01, mo

In [161]:
xgboost_results = pd.DataFrame(cv.cv_results_)
xgboost_results[["mean_fit_time", "mean_score_time", "param_model__n_estimators", "param_model__learning_rate", "param_model__max_depth",
          "mean_test_r2", "rank_test_r2","mean_test_neg_mean_absolute_error", "std_test_neg_mean_absolute_error"]].sort_values("rank_test_r2").head()

Unnamed: 0,mean_fit_time,mean_score_time,param_model__n_estimators,param_model__learning_rate,param_model__max_depth,mean_test_r2,rank_test_r2,mean_test_neg_mean_absolute_error,std_test_neg_mean_absolute_error
27,0.083838,0.007507,200,0.1,5,0.798123,1,-47258.508938,8326.93306
17,0.163581,0.007617,500,0.05,5,0.79806,2,-47376.391054,8533.855535
28,0.113547,0.007008,300,0.1,5,0.797736,3,-47539.986904,8616.497098
16,0.106386,0.00721,300,0.05,5,0.795974,4,-47492.204659,7971.479286
29,0.180249,0.008916,500,0.1,5,0.795664,5,-48055.879109,8789.64589


Best: estimators - 200; learning_rate - 0.1; max_depth - 5 ; r2 - 0.798123; neg_mean_absolute_error - -47258.508938; std_test_neg_mean_absolute_error - 8326.933060