In [1]:
#Data Cleaning
import pandas as pd
import os
import numpy as np

#Data is saved on diffrent CSVs for each city
#To make things easier, we can combine the data into one dataframe
dfs = []
for file in os.listdir("data"):
    df = pd.read_csv(os.path.join("data", file))
    df["City"] = file.replace(".csv", "")
    dfs.append(df)
df = pd.concat(dfs)

#As documented on kaggle, 9 implies that this information was not found for a home.
#Therefore we replaced all 9s with np.nan as is standard for empty values

temp = df["No. of Bedrooms"].copy()
df = df.applymap(lambda x: (np.nan if x == 9  else  x))
df["No. of Bedrooms"] = temp
df

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator,City
0,6968000,1340,Nizampet,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
1,29000000,3498,Hitech City,4,0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
2,6590000,1318,Manikonda,2,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
3,5739000,1295,Alwal,3,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
4,5679000,1145,Kukatpally,2,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6202,5364000,590,Chandapura,1,0,,,,,,...,,,,,,,,,,Bangalore
6203,8716000,1179,Kasavanahalli,2,0,,,,,,...,,,,,,,,,,Bangalore
6204,7373000,1143,Kasavanahalli,2,0,,,,,,...,,,,,,,,,,Bangalore
6205,4985000,1680,Kasavanahalli,3,0,,,,,,...,,,,,,,,,,Bangalore


In [2]:
#Given this implies that a given row with a nan values likely contains nan values, we can simply drop all rows with nan
cleaned_df = df[~df.isnull().any(axis=1)]
cleaned_df

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator,City
0,6968000,1340,Nizampet,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
1,29000000,3498,Hitech City,4,0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
2,6590000,1318,Manikonda,2,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
3,5739000,1295,Alwal,3,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
4,5679000,1145,Kukatpally,2,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1946,8306999,1184,Hosa Road,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Bangalore
1947,4883000,655,Hosa Road,1,0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Bangalore
1948,11500000,1680,Hosa Road,3,0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Bangalore
1949,8378000,1195,Hosa Road,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Bangalore


In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import numpy as np
#import plotly.figure_factory as ff
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

df = cleaned_df

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

## Baseline


In [5]:
import sklearn

## Given a possible pipeline, get the score metrics for it
def score_suite(pipe, X_test, y_test):
    y_pred = pipe.predict(X_test)
    return (
        pipe.score(X_test, y_test),
        sklearn.metrics.mean_squared_error(y_test, y_pred),
    )

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_df.drop("Price", axis=1), 
    cleaned_df["Price"], 
    test_size=0.33, 
    random_state=42
)

In [7]:
ct = ColumnTransformer(
    [
        ("Location_One_Hot",  OneHotEncoder(handle_unknown="ignore"), ["Location", "City"]),
    ],
    remainder="passthrough"
)
base_pipe = Pipeline([('transformers', ct), ('LR', LinearRegression())])
base_pipe.fit(X_train, y_train)
score_suite(base_pipe, X_test, y_test)

(0.7011333963063829, 40692579165816.54)

## Model Building

In [8]:
from tqdm import tqdm
def nested_cv_sklearnModels(model_param_dict, X_train, y_train):
    inner_cv_results = {}
    for model in tqdm(model_param_dict):
        model_data = model_param_dict[model]
        inner_cv = sklearn.model_selection.GridSearchCV(model_data["estimator"],model_data["params"])
        inner_cv.fit(X_train, y_train)
        inner_cv_results[model] = inner_cv.cv_results_
    return inner_cv_results

In [9]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

ct = ColumnTransformer(
    [
        ("Location_One_Hot",  OneHotEncoder(handle_unknown="ignore"), ["Location", "City"]),
    ],
    remainder="passthrough"
)

model_param_dict = {}

model_param_dict["base"] = {
    "estimator": base_pipe,
    "params": {
    }  
}    


#https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
#See above for why __
model_param_dict["SVR"] = {
    "estimator": Pipeline([('transformers', ct), ('SVR', SVR())]),
    "params": {
        "SVR__kernel": ["linear", "poly", "rbf", "sigmoid"]
        #"SVR__degree": [1,2,3,5,8,10,15,20,30],
        #"SVR__gamma": ["auto", "scale"],
        #"SVR__coef0": [0, 1, 10, 100, 1000],
        #"SVR__C": [1,10,100,1000],
    }  
}    

model_param_dict["DT"] = {
    "estimator": Pipeline([('transformers', ct), ('DT', DecisionTreeRegressor())]),
    "params": {
        "DT__criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
        "DT__max_depth": [5, 10, 25, 50, 100, 1000],
        #add more hyperparameters...
        #https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
    }  
}    

model_param_dict["RandomForestRegressor"] = {
    "estimator": Pipeline([('transformers', ct), ('RFR', RandomForestRegressor())]),
    "params": {
        "RFR__n_estimators": [1,5,10,20,50,100],
        "RFR__max_features": [1,5,10,20,50,100],
        #add more hyperparameters...
        #https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor
    }  
}    






model_param_dict

{'base': {'estimator': Pipeline(steps=[('transformers',
                   ColumnTransformer(remainder='passthrough',
                                     transformers=[('Location_One_Hot',
                                                    OneHotEncoder(handle_unknown='ignore'),
                                                    ['Location', 'City'])])),
                  ('LR', LinearRegression())]),
  'params': {}},
 'SVR': {'estimator': Pipeline(steps=[('transformers',
                   ColumnTransformer(remainder='passthrough',
                                     transformers=[('Location_One_Hot',
                                                    OneHotEncoder(handle_unknown='ignore'),
                                                    ['Location', 'City'])])),
                  ('SVR', SVR())]),
  'params': {'SVR__kernel': ['linear', 'poly', 'rbf', 'sigmoid']}},
 'DT': {'estimator': Pipeline(steps=[('transformers',
                   ColumnTransformer(remainder='passthroug

In [10]:
nested_cv_sklearnModels(model_param_dict, X_train, y_train)

100%|██████████| 4/4 [15:54<00:00, 238.55s/it]


{'base': {'mean_fit_time': array([0.11552243]),
  'std_fit_time': array([0.00678842]),
  'mean_score_time': array([0.00643229]),
  'std_score_time': array([0.00044357]),
  'params': [{}],
  'split0_test_score': array([0.62803123]),
  'split1_test_score': array([0.52175842]),
  'split2_test_score': array([0.69695306]),
  'split3_test_score': array([0.79319009]),
  'split4_test_score': array([0.62742105]),
  'mean_test_score': array([0.65347077]),
  'std_test_score': array([0.08953893]),
  'rank_test_score': array([1], dtype=int32)},
 'SVR': {'mean_fit_time': array([2.50961599, 2.3633986 , 2.476369  , 2.3982234 ]),
  'std_fit_time': array([0.09378322, 0.05606864, 0.07475677, 0.07083015]),
  'mean_score_time': array([0.43403997, 0.45948687, 0.49666882, 0.48919072]),
  'std_score_time': array([0.02356594, 0.00949669, 0.00929311, 0.00544826]),
  'param_SVR__kernel': masked_array(data=['linear', 'poly', 'rbf', 'sigmoid'],
               mask=[False, False, False, False],
         fill_value=