# Feature Engineering and Model selection

## Setup

In [158]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline 
sns.set(style="dark",palette="deep")
pd.set_option("display.max.rows",None)
pd.set_option("display.max.columns",None)

In [159]:
df = pd.read_csv("../../data/housing_data.csv", index_col="id")

# Missing neighborhood set to NaN
df.neighborhood = df.neighborhood.apply(lambda x: np.nan if x == "BA" else x)

# Drop null prices and areas over 2000m^2
df = df[~(df.prices.isna())]
df = df[~(df.areas > 2000)]

### Our data

In [160]:
df.head()

Unnamed: 0_level_0,areas,bedrooms,bathrooms,parkingSpots,prices,type,neighborhood
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2669329904,324.0,5.0,3.0,4.0,340000.0,house,Sao Tome
2667338201,51.0,2.0,1.5,1.0,295000.0,,Santa Teresa
2643720430,94.0,3.0,4.0,2.0,629000.0,apartment,Cidade Jardim
2681509397,66.0,2.0,2.0,2.0,460000.0,apartment,Imbuí
2671493238,82.0,3.0,3.0,2.0,649990.0,apartment,Piatã


In [161]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 993 entries, 2669329904 to 2674156135
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   areas         993 non-null    float64
 1   bedrooms      992 non-null    float64
 2   bathrooms     992 non-null    float64
 3   parkingSpots  829 non-null    float64
 4   prices        993 non-null    float64
 5   type          988 non-null    object 
 6   neighborhood  970 non-null    object 
dtypes: float64(5), object(2)
memory usage: 62.1+ KB


## Feature Engineering

### Neighborhood Price/Area

Since this relationship behaves differently for apartments/houses. We'll have different values for each. 

In [162]:
neighborhood_house_price_sum =  df[df.type == "house"].groupby("neighborhood").prices.sum()
neighborhood_house_area_sum =  df[df.type == "house"].groupby("neighborhood").areas.sum()

neighborhood_apartment_price_sum =  df[df.type == "apartment"].groupby("neighborhood").prices.sum()
neighborhood_apartment_area_sum =  df[df.type == "apartment"].groupby("neighborhood").areas.sum()

def calculate_neighborhood_area_price(x):
    if x.type == "house" and x.neighborhood in neighborhood_house_price_sum:
        return neighborhood_house_price_sum[x.neighborhood] / neighborhood_house_area_sum[x.neighborhood]
    if x.type == "apartment" and x.neighborhood in neighborhood_house_price_sum:
        return neighborhood_apartment_price_sum[x.neighborhood] / neighborhood_apartment_area_sum[x.neighborhood]
    else: return np.nan

df["neighborhood_area_price"] = df.apply(calculate_neighborhood_area_price, axis=1)

In [163]:
df.head()

Unnamed: 0_level_0,areas,bedrooms,bathrooms,parkingSpots,prices,type,neighborhood,neighborhood_area_price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2669329904,324.0,5.0,3.0,4.0,340000.0,house,Sao Tome,736.074271
2667338201,51.0,2.0,1.5,1.0,295000.0,,Santa Teresa,
2643720430,94.0,3.0,4.0,2.0,629000.0,apartment,Cidade Jardim,
2681509397,66.0,2.0,2.0,2.0,460000.0,apartment,Imbuí,5906.22598
2671493238,82.0,3.0,3.0,2.0,649990.0,apartment,Piatã,7359.345794


## Model Evaluation

### Prepare training data

In [164]:
df.type = df.type.replace({"house": 0, "apartment": 1})

X = df.drop("prices", axis=1)
y = df.prices

### Set Pipeline function

In [165]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [166]:
def create_pipeline(model, training_data):
    numerical_cols = training_data.select_dtypes(include='number').columns
    categorical_cols = training_data.select_dtypes(include='object').columns

    # Preprocessing for numerical data
    numerical_transformer = SimpleImputer(strategy='median')

    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    #Bundling
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    return Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

### Random Forest Regressor

In [167]:
from sklearn.ensemble import RandomForestRegressor

pipe = create_pipeline(RandomForestRegressor(random_state=1, n_jobs=6), X)

#### Grid Search

In [168]:
from sklearn.model_selection import GridSearchCV

params = {
    "model__n_estimators" : [100, 250, 300, 500],
    'model__bootstrap': [True, False]
}

cv = GridSearchCV(pipe, params, cv=5, scoring=["r2", "neg_mean_absolute_error"], refit="r2", verbose=2)

In [169]:
cv.fit(X, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END .....model__bootstrap=True, model__n_estimators=100; total time=   0.4s


[CV] END .....model__bootstrap=True, model__n_estimators=100; total time=   0.3s
[CV] END .....model__bootstrap=True, model__n_estimators=100; total time=   0.3s
[CV] END .....model__bootstrap=True, model__n_estimators=100; total time=   0.4s
[CV] END .....model__bootstrap=True, model__n_estimators=100; total time=   0.3s
[CV] END .....model__bootstrap=True, model__n_estimators=250; total time=   0.8s
[CV] END .....model__bootstrap=True, model__n_estimators=250; total time=   1.0s
[CV] END .....model__bootstrap=True, model__n_estimators=250; total time=   0.9s
[CV] END .....model__bootstrap=True, model__n_estimators=250; total time=   0.8s
[CV] END .....model__bootstrap=True, model__n_estimators=250; total time=   0.8s
[CV] END .....model__bootstrap=True, model__n_estimators=300; total time=   1.1s
[CV] END .....model__bootstrap=True, model__n_estimators=300; total time=   1.1s
[CV] END .....model__bootstrap=True, model__n_estimators=300; total time=   1.0s
[CV] END .....model__bootstr

In [170]:
results = pd.DataFrame(cv.cv_results_)
results[["mean_fit_time", "mean_score_time", "param_model__bootstrap", "param_model__n_estimators",
          "mean_test_r2", "rank_test_r2", "mean_test_neg_mean_absolute_error", "std_test_neg_mean_absolute_error"]].sort_values("rank_test_r2")

Unnamed: 0,mean_fit_time,mean_score_time,param_model__bootstrap,param_model__n_estimators,mean_test_r2,rank_test_r2,mean_test_neg_mean_absolute_error
3,1.73933,0.101391,True,500,0.8009,1,-47839.231809
2,1.101853,0.076376,True,300,0.799271,2,-47877.450103
1,0.89561,0.068744,True,250,0.798738,3,-47860.716504
0,0.412256,0.036388,True,100,0.798477,4,-47767.219051
4,0.519509,0.040444,False,100,0.760447,5,-54075.75634
5,1.067888,0.086539,False,250,0.760059,6,-54092.237823
6,1.25843,0.081525,False,300,0.759713,7,-54120.396093
7,1.999715,0.111641,False,500,0.759347,8,-54138.556017


Best: bootstrap - True; estimators - 500; r2 - 0.800900; neg_mean_absolute_error - -47839.231809; std_test_neg_mean_absolute_error - 8441.349634

### XGBoost

In [171]:
from xgboost import XGBRegressor

pipe = create_pipeline(XGBRegressor(random_state=1, n_jobs=4), X)

#### GridSearchCV

In [172]:
params = {
    "model__n_estimators" : [200, 300, 500],
    'model__learning_rate': [0.01, 0.05, 0.1],
    "model__max_depth" : [5, 6, 7, None]
}

cv = GridSearchCV(pipe, params, cv=5, scoring=["r2", "neg_mean_absolute_error"], refit="r2", verbose=2)

In [173]:
cv.fit(X, y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END model__learning_rate=0.01, model__max_depth=5, model__n_estimators=200; total time=   0.1s
[CV] END model__learning_rate=0.01, model__max_depth=5, model__n_estimators=200; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=5, model__n_estimators=200; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=5, model__n_estimators=200; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=5, model__n_estimators=200; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=5, model__n_estimators=300; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=5, model__n_estimators=300; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=5, model__n_estimators=300; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=5, model__n_estimators=300; total time=   0.0s
[CV] END model__learning_rate=0.01, mo

In [178]:
results = pd.DataFrame(cv.cv_results_)
results[["mean_fit_time", "mean_score_time", "param_model__n_estimators", "param_model__learning_rate", "param_model__max_depth",
          "mean_test_r2", "rank_test_r2","mean_test_neg_mean_absolute_error", "std_test_neg_mean_absolute_error"]].sort_values("rank_test_r2").head()

Unnamed: 0,mean_fit_time,mean_score_time,param_model__n_estimators,param_model__learning_rate,param_model__max_depth,mean_test_r2,rank_test_r2,mean_test_neg_mean_absolute_error,std_test_neg_mean_absolute_error
13,0.118728,0.008026,300,0.05,5,0.805891,1,-47368.169143,8441.349634
12,0.085954,0.007482,200,0.05,5,0.805889,2,-47446.045519,8342.869925
24,0.083836,0.007017,200,0.1,5,0.805832,3,-47585.028317,8719.872604
14,0.173063,0.008012,500,0.05,5,0.804156,4,-47712.958329,8619.839876
25,0.11326,0.007511,300,0.1,5,0.802653,5,-48053.704977,8837.754335


Best: estimators - 300; learning_rate - 0.05; max_depth - 5 ; r2 - 0.805891; neg_mean_absolute_error - -47368.169143; std_test_neg_mean_absolute_error - 8441.349634