# Feature Engineering and Model Selection

## Setup

In [186]:
import pandas as pd
import numpy as np

In [187]:
# Load the listings, indexed by their listing id.
df = pd.read_csv("../data/housing_data.csv", index_col="id")

# "BA" stands in for a missing neighborhood; turn it into NaN.
df["neighborhood"] = df["neighborhood"].mask(df["neighborhood"].eq("BA"))

# Upper bounds per column; a row above any bound is treated as an outlier.
outlier_caps = {
    "areas": 200,
    "bedrooms": 5,
    "bathrooms": 5,
    "parkingSpots": 5,
    "prices": 1700000,
}

# Start from "price is present", then rule out each kind of outlier.
keep = df["prices"].notna()
for column, cap in outlier_caps.items():
    # `~(value > cap)` rather than `value <= cap`: rows with a missing value are kept.
    keep &= ~(df[column] > cap)

df = df[keep]

### Our data

In [188]:
# Preview the first rows of the filtered frame.
df.head()

Unnamed: 0_level_0,areas,bedrooms,bathrooms,parkingSpots,prices,type,neighborhood
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2667338201,51.0,2.0,1.5,1.0,295000.0,,Santa Teresa
2643720430,94.0,3.0,4.0,2.0,629000.0,apartment,Cidade Jardim
2681509397,66.0,2.0,2.0,2.0,460000.0,apartment,Imbuí
2671493238,82.0,3.0,3.0,2.0,649990.0,apartment,Piatã
2592933397,50.0,2.0,1.0,1.0,184900.0,apartment,Luís Anselmo


In [189]:
# Dtypes and non-null counts per column, to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1935 entries, 2667338201 to 2639465330
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   areas         1935 non-null   float64
 1   bedrooms      1931 non-null   float64
 2   bathrooms     1933 non-null   float64
 3   parkingSpots  1627 non-null   float64
 4   prices        1935 non-null   float64
 5   type          1925 non-null   object 
 6   neighborhood  1906 non-null   object 
dtypes: float64(5), object(2)
memory usage: 120.9+ KB


## Feature Engineering

### Neighborhood Price/Area

Since the price-per-area relationship behaves differently for apartments and houses, we compute a separate value for each type.

In [190]:
# Group each listing type by neighborhood once and reuse the grouping for both sums.
house_groups = df.loc[df["type"] == "house"].groupby("neighborhood")
apartment_groups = df.loc[df["type"] == "apartment"].groupby("neighborhood")

neighborhood_house_price_sum = house_groups["prices"].sum()
neighborhood_house_area_sum = house_groups["areas"].sum()

neighborhood_apartment_price_sum = apartment_groups["prices"].sum()
neighborhood_apartment_area_sum = apartment_groups["areas"].sum()

# Sum of prices over sum of areas = area-weighted average price per unit of area.
# Building the frame from the two Series aligns them on the union of neighborhoods,
# leaving NaN where a neighborhood has no listings of that type.
# NOTE(review): these ratios are derived from `prices` over every row, and the result
# is later used as a model feature under cross-validation, so held-out folds see
# information computed from their own target values.
price_per_area_by_type = {
    "neighborhood_apartment_area_price": neighborhood_apartment_price_sum / neighborhood_apartment_area_sum,
    "neighborhood_house_area_price": neighborhood_house_price_sum / neighborhood_house_area_sum,
}
neighborhoods_area_price = pd.DataFrame(price_per_area_by_type)

In [191]:
def set_neighborhood_area_price(x, lookup=None):
    """Return the neighborhood price-per-area that matches the listing type of row ``x``.

    Parameters
    ----------
    x : pandas.Series
        One listing row; must provide ``"type"`` and ``"neighborhood"``.
    lookup : pandas.DataFrame, optional
        Table indexed by neighborhood with the columns
        ``"neighborhood_house_area_price"`` and
        ``"neighborhood_apartment_area_price"``. Defaults to the notebook-level
        ``neighborhoods_area_price``, so ``df.apply(set_neighborhood_area_price, axis=1)``
        keeps working unchanged.

    Returns
    -------
    float
        The matching table entry, or NaN when the type is neither ``"house"``
        nor ``"apartment"`` or the neighborhood is not in the table's index.
    """
    table = neighborhoods_area_price if lookup is None else lookup
    column_by_type = {
        "house": "neighborhood_house_area_price",
        "apartment": "neighborhood_apartment_area_price",
    }

    # Item access instead of attribute access, so a Series attribute can never shadow the field.
    listing_type = x["type"]
    neighborhood = x["neighborhood"]

    # Only string types can match a known listing type; missing types fall through to NaN.
    column = column_by_type.get(listing_type) if isinstance(listing_type, str) else None
    if column is None or neighborhood not in table.index:
        return np.nan

    # Single scalar lookup rather than chained `.loc[row][column]`.
    return table.at[neighborhood, column]

# Look up the price-per-area for each row's neighborhood and type, then store it as a feature.
area_price_feature = df.apply(set_neighborhood_area_price, axis="columns")
df["neighborhood_area_price"] = area_price_feature
df.head()

Unnamed: 0_level_0,areas,bedrooms,bathrooms,parkingSpots,prices,type,neighborhood,neighborhood_area_price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2667338201,51.0,2.0,1.5,1.0,295000.0,,Santa Teresa,
2643720430,94.0,3.0,4.0,2.0,629000.0,apartment,Cidade Jardim,6641.723356
2681509397,66.0,2.0,2.0,2.0,460000.0,apartment,Imbuí,6118.134938
2671493238,82.0,3.0,3.0,2.0,649990.0,apartment,Piatã,6270.982143
2592933397,50.0,2.0,1.0,1.0,184900.0,apartment,Luís Anselmo,3789.516129


## Model Evaluation

### Prepare training data

In [192]:
# Encode the listing type numerically (house -> 0, apartment -> 1); missing types stay NaN.
# Values that are not keys of the mapping pass through unchanged, so re-running this cell
# on already-encoded data is harmless.
type_codes = {"house": 0, "apartment": 1}
encoded_type = df["type"].map(lambda value: type_codes.get(value, value))

# Cast explicitly: `Series.replace` on an object column only produced a numeric dtype through
# silent downcasting (deprecated since pandas 2.2). Without a numeric dtype the column would
# be picked up by `select_dtypes(include='object')` and treated as categorical downstream.
df["type"] = encoded_type.astype("float64")

# Features are every column except the target.
X = df.drop("prices", axis=1)
y = df["prices"]

In [193]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import GridSearchCV

### Set Pipeline function

In [194]:
def create_pipeline(model, training_data):
    """Wrap ``model`` in a pipeline that imputes and encodes the columns of ``training_data``.

    Numeric columns get median imputation. Object-dtype columns get
    most-frequent imputation followed by one-hot encoding, with categories
    not seen during fitting ignored at transform time. The two column lists
    are read from ``training_data``'s dtypes when this function is called.

    Returns an unfitted ``Pipeline`` with the steps ``'preprocessor'`` and ``'model'``.
    """
    number_columns = training_data.select_dtypes(include='number').columns
    object_columns = training_data.select_dtypes(include='object').columns

    # Categorical branch: fill gaps first so the encoder never sees missing values.
    fill_then_encode = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Route each group of columns to its own preprocessing.
    column_steps = [
        ('num', SimpleImputer(strategy='median'), number_columns),
        ('cat', fill_then_encode, object_columns),
    ]

    pipeline_steps = [
        ('preprocessor', ColumnTransformer(transformers=column_steps)),
        ('model', model),
    ]
    return Pipeline(steps=pipeline_steps)

### Random Forest Regressor

In [195]:
from sklearn.ensemble import RandomForestRegressor

# Seeded forest for repeatable results, allowed up to six parallel jobs.
random_forest = RandomForestRegressor(random_state=1, n_jobs=6)
pipe = create_pipeline(random_forest, X)

#### Grid Search

In [196]:
# Search space for the forest; the `model__` prefix routes each key to the pipeline's 'model' step.
params = dict(
    model__n_estimators=[100, 250, 300, 500],
    model__bootstrap=[True, False],
    model__max_depth=[2, 5, 10, None],
)

# 5-fold search scored on both metrics; R² decides which candidate is refit at the end.
scoring_metrics = ["r2", "neg_mean_absolute_error"]
cv = GridSearchCV(pipe, param_grid=params, cv=5, scoring=scoring_metrics, refit="r2", verbose=2)

In [197]:
# Disabled: fitting the random-forest grid search. Uncomment to run it.
# cv.fit(X, y)

In [198]:
# Disabled: table of the random-forest search results, best R² rank first.
# It needs a fitted `cv`, i.e. the random-forest `cv.fit(X, y)` call must have been run.
# random_forest_results = pd.DataFrame(cv.cv_results_)
# random_forest_results[["mean_fit_time", "mean_score_time", "param_model__bootstrap", "param_model__n_estimators", "param_model__max_depth",
#           "mean_test_r2", "rank_test_r2", "mean_test_neg_mean_absolute_error", "std_test_neg_mean_absolute_error"]].sort_values("rank_test_r2").head()

### XGBoost

In [199]:
from xgboost import XGBRegressor

# Seeded booster for repeatable results, allowed up to six parallel jobs.
# Note that this rebinds `pipe`, replacing the random-forest pipeline.
xgb_model = XGBRegressor(random_state=1, n_jobs=6)
pipe = create_pipeline(xgb_model, X)

#### Grid Search

In [200]:
# XGBoost search space; the `model__` prefix routes each key to the pipeline's 'model' step.
# NOTE(review): this grid yields 3 * 2 * 3 = 18 candidates, but the saved output of the fit
# cell reports 36 candidates and includes min_child_weight values 7 and 9. The saved output
# appears to come from an earlier version of this grid; re-run the cells to refresh it.
# NOTE(review): in the saved results, candidates that differ only in gamma score identically.
params = dict(
    model__n_estimators=[200, 300, 500],
    model__learning_rate=[0.05],
    model__max_depth=[2],
    model__min_child_weight=[10, 12],
    model__gamma=[2, 4, 6],
)

# 5-fold search scored on both metrics; R² decides which candidate is refit at the end.
scoring_metrics = ["r2", "neg_mean_absolute_error"]
cv = GridSearchCV(pipe, param_grid=params, cv=5, scoring=scoring_metrics, refit="r2", verbose=2)

In [201]:
# Run the search: each candidate is fit once per fold, and verbose=2 logs one line per fit.
cv.fit(X, y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END model__gamma=2, model__learning_rate=0.05, model__max_depth=2, model__min_child_weight=7, model__n_estimators=200; total time=   0.1s
[CV] END model__gamma=2, model__learning_rate=0.05, model__max_depth=2, model__min_child_weight=7, model__n_estimators=200; total time=   0.0s
[CV] END model__gamma=2, model__learning_rate=0.05, model__max_depth=2, model__min_child_weight=7, model__n_estimators=200; total time=   0.0s
[CV] END model__gamma=2, model__learning_rate=0.05, model__max_depth=2, model__min_child_weight=7, model__n_estimators=200; total time=   0.0s


[CV] END model__gamma=2, model__learning_rate=0.05, model__max_depth=2, model__min_child_weight=7, model__n_estimators=200; total time=   0.0s
[CV] END model__gamma=2, model__learning_rate=0.05, model__max_depth=2, model__min_child_weight=7, model__n_estimators=300; total time=   0.1s
[CV] END model__gamma=2, model__learning_rate=0.05, model__max_depth=2, model__min_child_weight=7, model__n_estimators=300; total time=   0.0s
[CV] END model__gamma=2, model__learning_rate=0.05, model__max_depth=2, model__min_child_weight=7, model__n_estimators=300; total time=   0.0s
[CV] END model__gamma=2, model__learning_rate=0.05, model__max_depth=2, model__min_child_weight=7, model__n_estimators=300; total time=   0.0s
[CV] END model__gamma=2, model__learning_rate=0.05, model__max_depth=2, model__min_child_weight=7, model__n_estimators=300; total time=   0.0s
[CV] END model__gamma=2, model__learning_rate=0.05, model__max_depth=2, model__min_child_weight=7, model__n_estimators=500; total time=   0.0s

In [204]:
# One row per candidate; show the ten best by R² rank.
xgboost_results = pd.DataFrame(cv.cv_results_)
xgboost_results.sort_values(by="rank_test_r2").iloc[:10]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__gamma,param_model__learning_rate,param_model__max_depth,param_model__min_child_weight,param_model__n_estimators,params,...,std_test_r2,rank_test_r2,split0_test_neg_mean_absolute_error,split1_test_neg_mean_absolute_error,split2_test_neg_mean_absolute_error,split3_test_neg_mean_absolute_error,split4_test_neg_mean_absolute_error,mean_test_neg_mean_absolute_error,std_test_neg_mean_absolute_error,rank_test_neg_mean_absolute_error
35,0.091711,0.00147,0.0064,0.00049,6,0.05,2,12,500,"{'model__gamma': 6, 'model__learning_rate': 0....",...,0.045804,1,-38744.835342,-40348.19286,-35620.056217,-39515.037008,-46937.841812,-40233.192648,3714.230154,1
23,0.092855,0.001518,0.0064,0.00049,4,0.05,2,12,500,"{'model__gamma': 4, 'model__learning_rate': 0....",...,0.045804,1,-38744.835342,-40348.19286,-35620.056217,-39515.037008,-46937.841812,-40233.192648,3714.230154,1
11,0.102348,0.017956,0.006401,0.001021,2,0.05,2,12,500,"{'model__gamma': 2, 'model__learning_rate': 0....",...,0.045804,1,-38744.835342,-40348.19286,-35620.056217,-39515.037008,-46937.841812,-40233.192648,3714.230154,1
31,0.061009,0.001676,0.006001,2e-06,6,0.05,2,10,300,"{'model__gamma': 6, 'model__learning_rate': 0....",...,0.049065,4,-38602.062702,-40166.07868,-35695.152404,-40595.971203,-47352.917323,-40482.436462,3841.103325,10
7,0.061907,0.001319,0.006102,0.000203,2,0.05,2,10,300,"{'model__gamma': 2, 'model__learning_rate': 0....",...,0.049065,4,-38602.062702,-40166.07868,-35695.152404,-40595.971203,-47352.917323,-40482.436462,3841.103325,10
19,0.061307,0.001629,0.0056,0.00049,4,0.05,2,10,300,"{'model__gamma': 4, 'model__learning_rate': 0....",...,0.049065,4,-38602.062702,-40166.07868,-35695.152404,-40595.971203,-47352.917323,-40482.436462,3841.103325,10
4,0.063344,0.002042,0.006201,0.000401,2,0.05,2,9,300,"{'model__gamma': 2, 'model__learning_rate': 0....",...,0.048855,7,-38588.979459,-40747.788916,-35728.791379,-40511.661489,-47374.185007,-40590.28125,3839.052814,16
28,0.061109,0.000372,0.005902,0.000196,6,0.05,2,9,300,"{'model__gamma': 6, 'model__learning_rate': 0....",...,0.048855,7,-38588.979459,-40747.788916,-35728.791379,-40511.661489,-47374.185007,-40590.28125,3839.052814,16
16,0.060408,0.001797,0.0064,0.000491,4,0.05,2,9,300,"{'model__gamma': 4, 'model__learning_rate': 0....",...,0.048855,7,-38588.979459,-40747.788916,-35728.791379,-40511.661489,-47374.185007,-40590.28125,3839.052814,16
22,0.063608,0.003119,0.0058,0.0004,4,0.05,2,12,300,"{'model__gamma': 4, 'model__learning_rate': 0....",...,0.047569,10,-38696.871205,-40163.475048,-36050.304889,-40539.067199,-47244.276748,-40538.799018,3705.228127,13
