In [1]:
#Data Cleaning
import pandas as pd
import os
import numpy as np

#Data is saved on diffrent CSVs for each city
#To make things easier, we can combine the data into one dataframe
dfs = []
for file in os.listdir("data"):
    df = pd.read_csv(os.path.join("data", file))
    df["City"] = file.replace(".csv", "")
    dfs.append(df)
df = pd.concat(dfs)

#As documented on kaggle, 9 implies that this information was not found for a home.
#Therefore we replaced all 9s with np.nan as is standard for empty values

temp = df["No. of Bedrooms"].copy()
df = df.applymap(lambda x: (np.nan if x == 9  else  x))
df["No. of Bedrooms"] = temp
df

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator,City
0,2235000,1016,Barasat,3,0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kolkata
1,3665999,1111,Keshtopur,2,0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kolkata
2,3774000,1020,Rajarhat,2,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kolkata
3,2524000,935,Narendrapur,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kolkata
4,8300000,1956,New Town,3,1,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kolkata
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7714,14500000,1180,Mira Road East,2,0,,,,,,...,,,,,,,,,,Mumbai
7715,14500000,530,Naigaon East,1,1,,,,,,...,,,,,,,,,,Mumbai
7716,4100000,700,Shirgaon,1,0,,,,,,...,,,,,,,,,,Mumbai
7717,2750000,995,Mira Road East,2,0,,,,,,...,,,,,,,,,,Mumbai


In [2]:
#Given this implies that a given row with a nan values likely contains nan values, we can simply drop all rows with nan
cleaned_df = df[~df.isnull().any(axis=1)]
cleaned_df

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator,City
0,2235000,1016,Barasat,3,0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kolkata
1,3665999,1111,Keshtopur,2,0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kolkata
2,3774000,1020,Rajarhat,2,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kolkata
3,2524000,935,Narendrapur,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kolkata
4,8300000,1956,New Town,3,1,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Kolkata
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1393,62000000,1450,Worli,3,0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Mumbai
1394,2500000,540,Virar East,1,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Mumbai
1395,19000000,1267,Belapur,3,1,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Mumbai
1396,14900000,1245,Airoli,2,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Mumbai


In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import numpy as np
#import plotly.figure_factory as ff
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

df = cleaned_df

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

## Baseline


In [5]:
import sklearn

## Given a possible pipeline, get the score metrics for it
def score_suite(pipe, X_test, y_test):
    y_pred = pipe.predict(X_test)
    return (
        pipe.score(X_test, y_test),
        sklearn.metrics.mean_squared_error(y_test, y_pred),
    )

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_df.drop("Price", axis=1), 
    cleaned_df["Price"], 
    test_size=0.33, 
    random_state=42
)

In [7]:
ct = ColumnTransformer(
   [
       ("Location_One_Hot",  OneHotEncoder(handle_unknown="ignore"), ["Location", "City"]),
   ],
   remainder="passthrough"
)

base_pipe = Pipeline([('transformers', ct), ('LR', LinearRegression())])
base_pipe.fit(X_train, y_train)
score_suite(base_pipe, X_test, y_test)

(0.5420585030258087, 71141662601592.81)

## Model Building

In [8]:
from tqdm import tqdm

def nested_cv_sklearnModels(model_param_dict, X_train, y_train):
    inner_cv_results = {}
    for model in tqdm(model_param_dict):
        model_data = model_param_dict[model]
        inner_cv = sklearn.model_selection.GridSearchCV(model_data["estimator"],model_data["params"])
        inner_cv.fit(X_train, y_train)
        #inner_cv_results[model] = inner_cv.cv_results_
        inner_cv_results[model] = inner_cv.best_params_
    return inner_cv_results

In [9]:
from sklearn.model_selection import RandomizedSearchCV
def random_cv_sklearnModels(model_param_dict, X_train, y_train):
    inner_cv_results = {}
    for model in tqdm(model_param_dict):
        model_data = model_param_dict[model]
        inner_cv = RandomizedSearchCV(model_data["estimator"],model_data["params"])
        inner_cv.fit(X_train, y_train)
        #inner_cv_results[model] = inner_cv.cv_results_
        inner_cv_results[model] = inner_cv.best_params_
    return inner_cv_results

In [10]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.linear_model import BayesianRidge

ct = ColumnTransformer(
    [
        ("Location_One_Hot",  OneHotEncoder(handle_unknown="ignore"), ["Location", "City"]),
    ],
    remainder="passthrough"
)

model_param_dict = {}

model_param_dict["base"] = {
    "estimator": base_pipe,
    "params": {
    }  
}    


#https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
#See above for why __
model_param_dict["SVR"] = {
    "estimator": Pipeline([('transformers', ct), ('SVR', SVR())]),
    "params": {
        #"SVR__kernel": ["linear", "poly", "rbf", "sigmoid"],
        "SVR__degree": [1,2,3,5,8,10,15,20,30],
        "SVR__epsilon": [.2,.4,.6,.8,],
        #"SVR__gamma": ["auto", "scale"],
        "SVR__coef0": [0, 1, 10, 100, 1000],
        "SVR__C": [1,10,100,1000],
    }  
}    

model_param_dict["DT"] = {
    "estimator": Pipeline([('transformers', ct), ('DT', DecisionTreeRegressor())]),
    "params": {
        "DT__splitter": ['best', 'random'],
        "DT__max_features": [1,5,10,20,50,100],
        "DT__criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
        "DT__max_depth": [None, 5, 10, 25, 50, 100, 1000],
        "DT__min_samples_split": [2, 5, 10, 20, 50, 100],
        "DT__min_samples_leaf": [1, 5, 10, 20, 50, 100],
        "DT__max_leaf_nodes": [None, 10, 50, 100, 200, 500],
        #add more hyperparameters...
        #https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
    }  
}    

model_param_dict["RandomForestRegressor"] = {
    "estimator": Pipeline([('transformers', ct), ('RFR', RandomForestRegressor())]),
    "params": {
        "RFR__criterion": ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
        "RFR__n_estimators": [1,5,10,20,50,100],
        "RFR__max_features": [1,5,10,20,50,100],
        "RFR__min_samples_split": [2, 5, 10, 20, 50, 100],
        "RFR__min_samples_leaf": [1, 5, 10, 20, 50, 100],
        "RFR__max_leaf_nodes": [None, 10, 50, 100, 200, 500],
        #add more hyperparameters...
        #https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor
    }  
}    

model_param_dict["GradientBoostingRegressor"] = {
    "estimator":Pipeline([('transformers', ct), ('GBR', GradientBoostingRegressor())]),
    "params": {
        "GBR__loss":['squared_error', 'absolute_error', 'huber', 'quantile'],
        "GBR__n_estimators": [1,5,10,20,50,100],
        "GBR__alpha": [.05,.1,.15,.2,.25,.30,.35,.40,.45,.50,.55,.60,.65,.70,.75,.80,.85,.90,.95],
        "GBR__max_features": [1,5,10,20,50,100],
        "GBR__max_depth": [3, 5, 7],
        "GBR__subsample": [0.5, 0.75, 1.0],
        "GBR__min_samples_split": [2, 5, 10],
        "GBR__learning_rate": [0.001, 0.01, 0.1],
    }  
}    

model_param_dict["LassoRegression"] = {
    "estimator": Pipeline([('transformers', ct), ('LAS', Lasso())]),
    "params": {
        "LAS__alpha": [.05, .1, .15, .2, .25, .30, .35, .40, .45, .50, .55, .60, .65, .70, .75, .80, .85, .90, .95],
        "LAS__selection": ['cyclic', 'random'],
        "LAS__tol": [1e-4, 1e-5, 1e-6],
    }  
}

model_param_dict["ElasticNet"] = {
    "estimator": Pipeline([('transformers', ct), ('EL', ElasticNet())]),
    "params": {
        "EL__alpha": [.05, .1, .15, .2, .25, .30, .35, .40, .45, .50, .55, .60, .65, .70, .75, .80, .85, .90, .95],
        "EL__l1_ratio": [.1, .30, .50, .70, .90],
        "EL__selection": ['cyclic', 'random'],
        "EL__tol": [1e-3, 1e-4, 1e-5],
    }  
}
 
model_param_dict["BayesianRidgeRegression"] = {
    "estimator": Pipeline([('transformers', ct), ('BRR', BayesianRidge())]),
    "params": {
        "BRR__alpha_1": [1e-6, 1e-5, 1e-4, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
        "BRR__alpha_2": [1e-6, 1e-5, 1e-4, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
        "BRR__lambda_1": [1e-6, 1e-5, 1e-4, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
        "BRR__lambda_2": [1e-6, 1e-5, 1e-4, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
        "BRR__tol": [1e-4, 1e-5, 1e-6],
    }  
}

model_param_dict

{'base': {'estimator': Pipeline(steps=[('transformers',
                   ColumnTransformer(remainder='passthrough',
                                     transformers=[('Location_One_Hot',
                                                    OneHotEncoder(handle_unknown='ignore'),
                                                    ['Location', 'City'])])),
                  ('LR', LinearRegression())]),
  'params': {}},
 'SVR': {'estimator': Pipeline(steps=[('transformers',
                   ColumnTransformer(remainder='passthrough',
                                     transformers=[('Location_One_Hot',
                                                    OneHotEncoder(handle_unknown='ignore'),
                                                    ['Location', 'City'])])),
                  ('SVR', SVR())]),
  'params': {'SVR__degree': [1, 2, 3, 5, 8, 10, 15, 20, 30],
   'SVR__epsilon': [0.2, 0.4, 0.6, 0.8],
   'SVR__coef0': [0, 1, 10, 100, 1000],
   'SVR__C': [1, 10, 100, 1000]}},
 'D

In [383]:
#nested_cv_sklearnModels(model_param_dict, X_train, y_train)
# model = model_param_dict['ElasticNet']
# clf = sklearn.model_selection.GridSearchCV(model["estimator"],model["params"])
# clf.fit(X_train, y_train)
# clf.best_params_
#nested_cv_sklearnModels(model_param_dict, X_train, y_train)

## Best Parameters

After running the cross validation we document that the best optimial parameters are 

<ol>
  <li>{'SVR__C': 1000, 'SVR__coef0': 0, 'SVR__degree': 1, 'SVR__epsilon': 0.8}</li>
  <li>{'DT__criterion': 'poisson','DT__max_depth': 1000,'DT__max_features': 100,'DT__splitter': 'random'}</li>
  <li>'RandomForestRegressor': {'RFR__max_features': 100, 'RFR__n_estimators': 50}</li>
  <li>'GradientBoostingRegressor': {'GBR__alpha': 0.55,'GBR__loss': 'squared_error','GBR__max_features': 100 'GBR__n_estimators': 100}</li>
  <li>'LasRegressor': {'LAS__alpha': 0.35} </li>
  <li>ElasticNet: {'EL__alpha': 0.05, 'EL__l1_ratio': 0.9}</li>
</ol>
   
Below, I will instantiate the models with their best parameters and run an ensemble

In [384]:

#MLP Regressor
# mlp_reg = Pipeline([('transformers', ct), ('SVR', MLPRegressor(random_state=1, max_iter=50))])
# mlp_reg.fit(X_train, y_train)
# score_suite(mlp_reg,X_test, y_test)

#Support Vector Regressor
#svr_reg = Pipeline([('transformers', ct), ('SVR', SVR(kernel='linear',C=1000,degree=1,epsilon=.8))])
# svr_reg.fit(X_train, y_train)
# score_suite(svr_reg,X_test, y_test)

#Decision Tree Regressor
#dtr_reg = Pipeline([('transformers', ct), ('DT', DecisionTreeRegressor(criterion='poisson', max_depth=1000,splitter='random',max_features=100))])
# dtr_reg.fit(X_train, y_train)
# score_suite(dtr_reg,X_test, y_test)

#Linear Regression
lr_reg = Pipeline([('transformers', ct), ('LR', LinearRegression())])
# lr_reg.fit(X_train, y_train)
# score_suite(lr_reg,X_test, y_test)

#Random Forest Regressor
rtr_reg = Pipeline([('transformers', ct), ('RFR', RandomForestRegressor(max_features=100, n_estimators=50))])
# rtr_reg.fit(X_train, y_train)
# score_suite(rtr_reg,X_test, y_test)

#Gradient Boosting Regressor
gbr_reg = Pipeline([('transformers', ct), ('GBR', GradientBoostingRegressor(alpha=.55,loss='squared_error',max_features=100, n_estimators=100))])
# gbr_reg.fit(X_train, y_train)
# score_suite(gbr_reg,X_test, y_test)

#LassoRegession
#ISSUE: may not converge
# las_reg = Pipeline([('transformers', ct), ('SVR', linear_model.Lasso(alpha=0.35,tol=.01))])
# las_reg.fit(X_train, y_train)
# score_suite(las_reg,X_test, y_test)

#ELASTIC NET
el_reg = Pipeline([('transformers', ct), ('SVR', ElasticNet(alpha=.05,l1_ratio=.9))])
# el_reg.fit(X_train, y_train)
# score_suite(el_reg,X_test, y_test)


#Construct ensemble
#ensemble = VotingRegressor(estimators=[('svr',svr_reg),('dt',dtr_reg),('rfr',rtr_reg),('gbr',gbr_reg)])
ensemble = VotingRegressor(estimators=[('rtr',rtr_reg),('gbr',gbr_reg),('lr',lr_reg),('el',el_reg)])
ensemble = ensemble.fit(X_train, y_train)
score_suite(ensemble,X_test, y_test)


(0.7804663262047938, 29890898782495.06)

It seems that RandomForestRegressor, GradientBoostingRegressor, and Linear Regression produce the best ensemble