In [None]:
#Data Cleaning
import pandas as pd
import os
import numpy as np

#Data is saved in a different CSV for each city.
#To make things easier, we combine them into one dataframe and record the city in a "City" column.
dfs = []
#sorted() makes the row order (and therefore any seeded split later on) independent of the
#arbitrary order in which os.listdir returns entries
for file in sorted(os.listdir("data")):
    city, extension = os.path.splitext(file)
    #Skip anything that is not a CSV (e.g. a .ipynb_checkpoints folder) instead of crashing in read_csv
    if extension.lower() != ".csv":
        continue
    df = pd.read_csv(os.path.join("data", file))
    df["City"] = city
    dfs.append(df)
df = pd.concat(dfs)

#As documented on kaggle, 9 implies that this information was not found for a home.
#Therefore we replace all 9s with np.nan, as is standard for empty values.
#"No. of Bedrooms" is left untouched because 9 can be a genuine bedroom count there.
sentinel_columns = df.columns.drop("No. of Bedrooms")
#A per-column replace returns a new frame and avoids the deprecated element-wise applymap
df = df.replace(dict.fromkeys(sentinel_columns, 9), np.nan)
df

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator,City
0,30000000,3340,JP Nagar Phase 1,4,0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
1,7888000,1045,Dasarahalli on Tumkur Road,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
2,4866000,1179,Kannur on Thanisandra Main Road,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
3,8358000,1675,Doddanekundi,3,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
4,6845000,1670,Kengeri,3,0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7714,14500000,1180,Mira Road East,2,0,,,,,,...,,,,,,,,,,Mumbai
7715,14500000,530,Naigaon East,1,1,,,,,,...,,,,,,,,,,Mumbai
7716,4100000,700,Shirgaon,1,0,,,,,,...,,,,,,,,,,Mumbai
7717,2750000,995,Mira Road East,2,0,,,,,,...,,,,,,,,,,Mumbai


In [None]:
#A row that has one NaN value is likely missing most of its other values too,
#so we simply drop every row that contains any NaN
cleaned_df = df.dropna(axis=0, how="any")
cleaned_df

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator,City
0,30000000,3340,JP Nagar Phase 1,4,0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
1,7888000,1045,Dasarahalli on Tumkur Road,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
2,4866000,1179,Kannur on Thanisandra Main Road,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
3,8358000,1675,Doddanekundi,3,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
4,6845000,1670,Kengeri,3,0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1393,62000000,1450,Worli,3,0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Mumbai
1394,2500000,540,Virar East,1,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Mumbai
1395,19000000,1267,Belapur,3,1,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Mumbai
1396,14900000,1245,Airoli,2,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Mumbai


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import numpy as np
#import plotly.figure_factory as ff
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

#From here on df refers to the cleaned frame (the same object as cleaned_df, not a copy)
df = cleaned_df

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

## Baseline


In [None]:
import sklearn

def score_suite(pipe, X_test, y_test):
    """Compute the evaluation metrics for a fitted pipeline on held-out data.

    Parameters
    ----------
    pipe
        Fitted estimator or pipeline exposing ``predict`` and ``score``.
    X_test
        Held-out features passed to ``pipe.predict`` and ``pipe.score``.
    y_test
        True target values matching ``X_test``.

    Returns
    -------
    tuple
        ``(pipe.score(X_test, y_test), mean_squared_error(y_test, y_pred))``.
        For scikit-learn regressors the first entry is the R^2 score.
    """
    # Import the submodule explicitly: a bare `import sklearn` does not load
    # `sklearn.metrics`, so `sklearn.metrics.mean_squared_error` only resolved
    # when some other import had already pulled the submodule in.
    from sklearn.metrics import mean_squared_error

    y_pred = pipe.predict(X_test)
    return (
        pipe.score(X_test, y_test),
        mean_squared_error(y_test, y_pred),
    )

In [None]:
#Hold out 33% of the cleaned listings for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_df.drop(columns="Price"),
    cleaned_df["Price"],
    test_size=0.33,
    random_state=42,
)

In [None]:
#One-hot encode the "Location" and "City" columns; every other column is passed through unchanged
ct = ColumnTransformer(
    transformers=[
        ("Location_One_Hot", OneHotEncoder(handle_unknown="ignore"), ["Location", "City"]),
    ],
    remainder="passthrough",
)
#Baseline model: plain linear regression on top of the encoded features
base_pipe = Pipeline(steps=[("transformers", ct), ("LR", LinearRegression())])
base_pipe.fit(X_train, y_train)
score_suite(base_pipe, X_test, y_test)

(0.5691999439296691, 71954979863747.73)

## Model Building

In [None]:
from tqdm import tqdm
def nested_cv_sklearnModels(model_param_dict, X_train, y_train, n_iter=10, random_state=42):
    """Tune every candidate model with a randomized search and score it on a held-out slice.

    The training data is split once into an inner part, used by each
    ``RandomizedSearchCV``, and an outer part, used only to score the refitted
    best estimator of each search.

    Parameters
    ----------
    model_param_dict : dict
        Maps a model name to ``{"estimator": ..., "params": ...}`` where
        ``params`` is the search space handed to ``RandomizedSearchCV``.
    X_train, y_train
        Training features and targets; they are split further inside.
    n_iter : int, default 10
        Number of parameter settings sampled per model.
    random_state : int, default 42
        Seed for the inner/outer split and for the hyperparameter sampling,
        so repeated runs draw the same candidates. It does not seed the
        estimators themselves.

    Returns
    -------
    tuple
        ``(results, searches, outer_scores)``: the concatenated ``cv_results_``
        of all searches as one DataFrame, the fitted search objects, and their
        scores on the outer slice. The last two are in ``model_param_dict``
        order.

    Notes
    -----
    The outer slice is ``1 / len(model_param_dict)`` of the data, so an empty
    dict raises ``ZeroDivisionError``. A single-entry dict asks for a test
    fraction of 1.0, which ``train_test_split`` is expected to reject
    (TODO confirm against the installed scikit-learn version).
    """
    # Explicit import: do not depend on a global `sklearn` name whose
    # `model_selection` submodule happens to be loaded by another cell.
    from sklearn.model_selection import RandomizedSearchCV

    X_inner, X_outer, y_inner, y_outer = train_test_split(
        X_train, y_train, test_size=1 / len(model_param_dict), random_state=random_state
    )

    #inner cv: one randomized hyperparameter search per candidate model
    inner_cv_results = {}
    searches = []
    for model_name, model_data in tqdm(model_param_dict.items()):
        search = RandomizedSearchCV(
            model_data["estimator"],
            model_data["params"],
            n_iter=n_iter,
            random_state=random_state,
        )
        search.fit(X_inner, y_inner)
        inner_cv_results[model_name] = search.cv_results_
        searches.append(search)

    #outer scoring: each search scores with its refitted best estimator
    outer_scores = [search.score(X_outer, y_outer) for search in searches]

    #Metadata processing: concatenate once instead of growing a frame inside a loop
    results = pd.concat([pd.DataFrame(cv_result) for cv_result in inner_cv_results.values()])

    return results, searches, outer_scores

In [None]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint, uniform

#Same preprocessing as the baseline: one-hot encode "Location" and "City", pass the rest through
ct = ColumnTransformer(
    [
        ("Location_One_Hot",  OneHotEncoder(handle_unknown="ignore"), ["Location", "City"]),
    ],
    remainder="passthrough"
)

#Maps a model name to the estimator to tune and the search space sampled by RandomizedSearchCV
model_param_dict = {}

#The baseline has nothing to tune, so its search space is empty
model_param_dict["base"] = {
    "estimator": base_pipe,
    "params": {
    }
}


#https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
#Parameters of a pipeline step are addressed as <step name>__<parameter name>, hence the "__"
model_param_dict["SVR"] = {
    "estimator": Pipeline([('transformers', ct), ('SVR', SVR())]),
    "params": {
        "SVR__kernel": ["linear", "rbf", "sigmoid"],
        #"SVR__degree": [1,2,3,5,8,10,15,20,30],
        #"SVR__gamma": ["auto", "scale"],
        "SVR__coef0": uniform(loc=0, scale=100),
        "SVR__C": uniform(loc=1, scale=100)
    }
}

model_param_dict["SVR - poly"] = {
    "estimator": Pipeline([('transformers', ct), ('SVR', SVR())]),
    "params": {
        "SVR__kernel": ["poly"],
        #degree must be an integer, so sample integers 1..10 instead of floats
        "SVR__degree": randint(1, 11),
        #"SVR__gamma": ["auto", "scale"],
        "SVR__coef0": uniform(loc=0, scale=10),
        "SVR__C": uniform(loc=1, scale=10),
    }
}

model_param_dict["DT"] = {
    "estimator": Pipeline([('transformers', ct), ('DT', DecisionTreeRegressor())]),
    "params": {
        "DT__criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
        #max_depth must be a positive integer, so sample integers 1..999 instead of floats from 0
        "DT__max_depth": randint(1, 1000),
        #add more hyperparameters...
        #https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
    }
}

model_param_dict["RandomForestRegressor"] = {
    "estimator": Pipeline([('transformers', ct), ('RFR', RandomForestRegressor())]),
    "params": {
        #n_estimators must be a positive integer, so sample integers 1..999 instead of floats from 0
        "RFR__n_estimators": randint(1, 1000),
        #A float max_features is a fraction of the features and must lie in (0, 1];
        #sample from [0.05, 1.0] (NOTE(review): the original range was 0-1000, confirm the intent)
        "RFR__max_features": uniform(loc=0.05, scale=0.95),
        #add more hyperparameters...
        #https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor
    }
}






model_param_dict

{'base': {'estimator': Pipeline(steps=[('transformers',
                   ColumnTransformer(remainder='passthrough',
                                     transformers=[('Location_One_Hot',
                                                    OneHotEncoder(handle_unknown='ignore'),
                                                    ['Location', 'City'])])),
                  ('LR', LinearRegression())]),
  'params': {}},
 'SVR': {'estimator': Pipeline(steps=[('transformers',
                   ColumnTransformer(remainder='passthrough',
                                     transformers=[('Location_One_Hot',
                                                    OneHotEncoder(handle_unknown='ignore'),
                                                    ['Location', 'City'])])),
                  ('SVR', SVR())]),
  'params': {'SVR__kernel': ['linear', 'rbf', 'sigmoid'],
   'SVR__coef0': <scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x15b0566c190>,
   'SVR__C': <scipy.stats._dis

In [None]:
#Tune every candidate on the training split and keep the search table,
#the fitted searches and their held-out scores
(
    inner_cv_results,
    gridserach_estimatators,
    outer_scores,
) = nested_cv_sklearnModels(model_param_dict, X_train, y_train)
inner_cv_results

 40%|████      | 2/5 [16:11<28:33, 571.14s/it]

In [None]:
#Held-out score of each fitted search, in the order the models were added to model_param_dict
outer_scores


[0.6134172399310431, 0.340536087644589, 0.6655633428018871, 0.7931644880584522]

65