# Feature Engineering and Model Selection

## Setup

In [88]:
import pandas as pd
import numpy as np

In [89]:
# Load the listings, using the "id" column as the row index
df = pd.read_csv("../data/housing_data.csv", index_col="id")

# A neighborhood recorded as "BA" is treated as missing: store NaN instead
df["neighborhood"] = df["neighborhood"].mask(df["neighborhood"] == "BA")

# Keep only rows that have a price
df = df[df["prices"].notna()]

# Discard areas over 2000m^2 (written as a negation so rows whose area
# is NaN are kept, not dropped)
df = df[~df["areas"].gt(2000)]

### Our data

In [90]:
# Preview the first rows of the cleaned listings
df.head()

Unnamed: 0_level_0,areas,bedrooms,bathrooms,parkingSpots,prices,type,neighborhood
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2669329904,324.0,5.0,3.0,4.0,340000.0,house,Sao Tome
2667338201,51.0,2.0,1.5,1.0,295000.0,,Santa Teresa
2643720430,94.0,3.0,4.0,2.0,629000.0,apartment,Cidade Jardim
2681509397,66.0,2.0,2.0,2.0,460000.0,apartment,Imbuí
2671493238,82.0,3.0,3.0,2.0,649990.0,apartment,Piatã


In [91]:
# Column dtypes and non-null counts (shows which columns still contain missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1093 entries, 2669329904 to 2682107804
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   areas         1093 non-null   float64
 1   bedrooms      1092 non-null   float64
 2   bathrooms     1092 non-null   float64
 3   parkingSpots  913 non-null    float64
 4   prices        1093 non-null   float64
 5   type          1088 non-null   object 
 6   neighborhood  1069 non-null   object 
dtypes: float64(5), object(2)
memory usage: 68.3+ KB


## Feature Engineering

### Neighborhood Price/Area

Since the price-per-area relationship behaves differently for apartments and houses, we compute a separate value for each property type.

In [92]:
# NOTE(review): these ratios are built from `prices` (the model target) using
# every row of `df`, and the resulting feature is later cross-validated on the
# same rows, so validation folds are not fully unseen -- confirm this is
# acceptable or compute the ratios inside each training fold.

# Split the listings by property type once; each type gets its own ratio
house_rows = df[df.type == "house"]
apartment_rows = df[df.type == "apartment"]

# Per-neighborhood totals of price and area, for houses ...
neighborhood_house_price_sum = house_rows.groupby("neighborhood")["prices"].sum()
neighborhood_house_area_sum = house_rows.groupby("neighborhood")["areas"].sum()

# ... and for apartments
neighborhood_apartment_price_sum = apartment_rows.groupby("neighborhood")["prices"].sum()
neighborhood_apartment_area_sum = apartment_rows.groupby("neighborhood")["areas"].sum()

# Total price divided by total area for each neighborhood
neighborhood_house_area_price = neighborhood_house_price_sum / neighborhood_house_area_sum
neighborhood_apartment_area_price = neighborhood_apartment_price_sum / neighborhood_apartment_area_sum

# One row per neighborhood, one column per property type; a neighborhood that
# appears for only one type gets NaN in the other column
neighborhoods_area_price = pd.DataFrame(
    {
        "neighborhood_apartment_area_price": neighborhood_apartment_area_price,
        "neighborhood_house_area_price": neighborhood_house_area_price,
    }
)

In [93]:
def set_neighborhood_area_price(x):
    """Look up the price/area ratio matching a listing's type and neighborhood.

    Reads the notebook-level `neighborhood_house_area_price`,
    `neighborhood_apartment_area_price` and `neighborhoods_area_price`.

    Parameters
    ----------
    x : row of `df`
        Must expose `type` and `neighborhood`.

    Returns
    -------
    The ratio stored for the row's neighborhood in the column of its property
    type, or NaN when the type is neither "house" nor "apartment" or the
    neighborhood has no entry for that type.
    """
    if x.type == "house":
        known_neighborhoods = neighborhood_house_area_price
        ratio_column = "neighborhood_house_area_price"
    elif x.type == "apartment":
        known_neighborhoods = neighborhood_apartment_area_price
        ratio_column = "neighborhood_apartment_area_price"
    else:
        return np.nan

    # `in` on a Series tests its index, i.e. the neighborhood labels
    if x.neighborhood not in known_neighborhoods:
        return np.nan

    return neighborhoods_area_price.loc[x.neighborhood, ratio_column]


# Evaluate the lookup once per row and store it as a new feature column
df["neighborhood_area_price"] = df.apply(set_neighborhood_area_price, axis="columns")
df.head()

Unnamed: 0_level_0,areas,bedrooms,bathrooms,parkingSpots,prices,type,neighborhood,neighborhood_area_price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2669329904,324.0,5.0,3.0,4.0,340000.0,house,Sao Tome,791.738382
2667338201,51.0,2.0,1.5,1.0,295000.0,,Santa Teresa,
2643720430,94.0,3.0,4.0,2.0,629000.0,apartment,Cidade Jardim,6811.20944
2681509397,66.0,2.0,2.0,2.0,460000.0,apartment,Imbuí,6091.301222
2671493238,82.0,3.0,3.0,2.0,649990.0,apartment,Piatã,6921.176471


## Model Evaluation

### Prepare training data

In [94]:
# Encode the property type as a number: house -> 0, apartment -> 1.
# `replace` leaves every other value (including NaN) untouched, so its result
# starts out as an object column. `infer_objects()` converts it to a numeric
# dtype explicitly rather than relying on the silent downcasting inside
# `replace`, which pandas deprecated in 2.2. This matters because
# `create_pipeline` assigns columns to the numeric or categorical branch purely
# by dtype.
df.type = df.type.replace({"house": 0, "apartment": 1}).infer_objects()

# Features are every column except the target; the target is the price
X = df.drop(columns="prices")
y = df.prices

In [95]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import GridSearchCV

### Set Pipeline function

In [96]:
def create_pipeline(model, training_data):
    """Wrap `model` in a pipeline that imputes and encodes its input columns.

    Columns are assigned to a preprocessing branch by their dtype in
    `training_data`:

    * numeric columns: missing values filled with the column median;
    * object columns: missing values filled with the most frequent value,
      then one-hot encoded with ``handle_unknown='ignore'``.

    Columns of any other dtype are not listed in either branch.

    Parameters
    ----------
    model : estimator
        Final step of the pipeline, registered under the name ``"model"``.
    training_data : pandas.DataFrame
        Frame whose dtypes determine the numeric / object column split.

    Returns
    -------
    Pipeline
        Two steps: ``"preprocessor"`` (the ColumnTransformer) and ``"model"``.
    """
    number_columns = training_data.select_dtypes(include='number').columns
    object_columns = training_data.select_dtypes(include='object').columns

    # Categorical branch: fill gaps first, then expand into indicator columns
    fill_then_encode = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # (name, transformer, columns) triples consumed by the ColumnTransformer
    branches = [
        ('num', SimpleImputer(strategy='median'), number_columns),
        ('cat', fill_then_encode, object_columns),
    ]
    preprocessor = ColumnTransformer(transformers=branches)

    return Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

### Random Forest Regressor

In [97]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with a fixed random_state; X is passed only so create_pipeline
# can split its columns by dtype. Note that `pipe` is rebound in the XGBoost section.
pipe = create_pipeline(RandomForestRegressor(random_state=1, n_jobs=6), X)

#### Grid Search

In [98]:
# Candidate hyper-parameters; the "model__" prefix addresses the pipeline
# step named "model"
params = {
    "model__n_estimators": [100, 250, 300, 500],
    "model__bootstrap": [True, False],
    "model__max_depth": [2, 5, 10, None],
}

# 5-fold search scored on both R^2 and negative MAE; the candidate with the
# best R^2 is the one refit at the end
cv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=5,
    scoring=["r2", "neg_mean_absolute_error"],
    refit="r2",
    verbose=2,
)

In [99]:
# Run the random-forest grid search (32 candidates x 5 folds).
# NOTE(review): X includes neighborhood_area_price, which was derived from the
# target `prices` over all rows, so the cross-validated scores may be optimistic.
cv.fit(X, y)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=100; total time=   0.1s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=100; total time=   0.1s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=100; total time=   0.1s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=100; total time=   0.1s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=100; total time=   0.1s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=250; total time=   0.3s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=250; total time=   0.3s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=250; total time=   0.3s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=250; total time=   0.2s
[CV] END model__bootstrap=True, model__max_depth=2, model__n_estimators=25

In [100]:
# Snapshot the search results now: `cv` is rebound by the XGBoost section below
random_forest_results = pd.DataFrame(cv.cv_results_)

# Timing, searched parameters and both metrics for the five best candidates by R^2
random_forest_results.loc[:, [
    "mean_fit_time",
    "mean_score_time",
    "param_model__bootstrap",
    "param_model__n_estimators",
    "param_model__max_depth",
    "mean_test_r2",
    "rank_test_r2",
    "mean_test_neg_mean_absolute_error",
    "std_test_neg_mean_absolute_error",
]].sort_values("rank_test_r2").head()

Unnamed: 0,mean_fit_time,mean_score_time,param_model__bootstrap,param_model__n_estimators,param_model__max_depth,mean_test_r2,rank_test_r2,mean_test_neg_mean_absolute_error,std_test_neg_mean_absolute_error
12,0.41367,0.030209,True,100,,0.847245,1,-41570.670935,6111.525103
14,1.131139,0.057286,True,300,,0.846965,2,-41782.79539,6353.022848
13,0.912024,0.05154,True,250,,0.846522,3,-41794.885865,6402.512134
15,1.833111,0.083958,True,500,,0.844567,4,-41901.289611,6333.126408
10,0.725104,0.05653,True,300,10.0,0.843565,5,-41665.203435,6308.913996


### XGBoost

In [101]:
from xgboost import XGBRegressor

# Same preprocessing as the random forest, with an XGBoost regressor as the
# final step. This rebinds `pipe`, replacing the random-forest pipeline.
pipe = create_pipeline(XGBRegressor(random_state=1, n_jobs=6), X)

#### Grid Search

In [102]:
# Candidate hyper-parameters; the "model__" prefix addresses the pipeline
# step named "model"
params = {
    "model__n_estimators": [200, 300, 500],
    "model__learning_rate": [0.01, 0.05, 0.1],
    "model__max_depth": [2, 5, 10, None],
}

# 5-fold search scored on both R^2 and negative MAE; the candidate with the
# best R^2 is the one refit at the end
cv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=5,
    scoring=["r2", "neg_mean_absolute_error"],
    refit="r2",
    verbose=2,
)

In [103]:
# Run the XGBoost grid search (36 candidates x 5 folds).
# NOTE(review): X includes neighborhood_area_price, which was derived from the
# target `prices` over all rows, so the cross-validated scores may be optimistic.
cv.fit(X, y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=200; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=200; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=200; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=200; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=200; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=300; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=300; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=300; total time=   0.0s
[CV] END model__learning_rate=0.01, model__max_depth=2, model__n_estimators=300; total time=   0.0s
[CV] END model__learning_rate=0.01, mo

In [104]:
# Tabulate the search results held by the current `cv` (the XGBoost search)
xgboost_results = pd.DataFrame(cv.cv_results_)

# Timing, searched parameters and both metrics for the five best candidates by R^2
xgboost_results.loc[:, [
    "mean_fit_time",
    "mean_score_time",
    "param_model__n_estimators",
    "param_model__learning_rate",
    "param_model__max_depth",
    "mean_test_r2",
    "rank_test_r2",
    "mean_test_neg_mean_absolute_error",
    "std_test_neg_mean_absolute_error",
]].sort_values("rank_test_r2").head()

Unnamed: 0,mean_fit_time,mean_score_time,param_model__n_estimators,param_model__learning_rate,param_model__max_depth,mean_test_r2,rank_test_r2,mean_test_neg_mean_absolute_error,std_test_neg_mean_absolute_error
26,0.088754,0.00581,500,0.1,2,0.831745,1,-45289.274162,4080.812463
25,0.05738,0.006207,300,0.1,2,0.828644,2,-45545.06818,3821.981125
14,0.098352,0.006009,500,0.05,2,0.826579,3,-45998.690357,4246.079137
12,0.042657,0.005914,200,0.05,2,0.825768,4,-45898.799486,4490.330377
13,0.058174,0.005307,300,0.05,2,0.825052,5,-45917.507206,4377.245132
