In [1]:
#Importing necessary libraries
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression , Ridge , Lasso , ElasticNet
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error ,r2_score
#from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
%%time
df = pd.read_csv("data_cleaned.csv",nrows=10000) # reading the file and printing the first 5 rows 
df.head()

Wall time: 33.2 ms


Unnamed: 0,region,price,type,sqfeet,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,...,has_pool,has_grill,has_fireplace,gym_nearby,school/clg_nearby,wifi_facilities,valet_service,shopping_nearby,sports_playground,dining_nearby
0,3.0,7.021084,3.0,7.184629,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,3.0,6.715383,3.0,7.032624,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
2,3.0,6.684612,3.0,6.831954,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
3,3.0,6.665684,3.0,6.953684,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
4,3.0,6.802395,3.0,7.16858,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0


In [3]:
df.shape

(10000, 26)

In [4]:
df.info()  # Getting Basic information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   region                   10000 non-null  float64
 1   price                    10000 non-null  float64
 2   type                     10000 non-null  float64
 3   sqfeet                   10000 non-null  float64
 4   smoking_allowed          10000 non-null  float64
 5   wheelchair_access        10000 non-null  float64
 6   electric_vehicle_charge  10000 non-null  float64
 7   comes_furnished          10000 non-null  float64
 8   laundry_options          10000 non-null  float64
 9   parking_options          10000 non-null  float64
 10  lat                      10000 non-null  float64
 11  long                     10000 non-null  float64
 12  premium_house            10000 non-null  float64
 13  pets_allowed             10000 non-null  float64
 14  beds_per_sqfeet        

#### Attempting Multiple Models

In [5]:
df.shape

(10000, 26)

In [6]:
%%time
X = df.drop('price',axis=1)  #independent variables
y = df['price']   # dependent variable-Price

Wall time: 4.86 ms


In [7]:
print(X.shape)
print(y.shape)

(10000, 25)
(10000,)


In [8]:
#calculating multicollinearity for each variable
from statsmodels.stats.outliers_influence import variance_inflation_factor 

# Compute the VIF for each predictor variable
vif = [variance_inflation_factor(X, i) for i in range(X.shape[1])]

# Print the VIF for each predictor variable
for i, v in enumerate(vif):
    print('VIF for predictor {}: {}'.format(i, v))

VIF for predictor 0: 537.8456015652796
VIF for predictor 1: 25.144207467741925
VIF for predictor 2: 924.0650792932337
VIF for predictor 3: 4.011735342294804
VIF for predictor 4: 1.343775436516012
VIF for predictor 5: 1.0340543616775209
VIF for predictor 6: 1.15915679363737
VIF for predictor 7: 10.264434811195676
VIF for predictor 8: 1.7777884465184686
VIF for predictor 9: 103.32906900757233
VIF for predictor 10: 157.2157026824252
VIF for predictor 11: 3.7316963913704675
VIF for predictor 12: 5.742851886458786
VIF for predictor 13: 943.5268460504867
VIF for predictor 14: 4.768309470592075
VIF for predictor 15: 4.1156034615918
VIF for predictor 16: 1.5646932955433113
VIF for predictor 17: 1.389312576765718
VIF for predictor 18: 2.9997215188969233
VIF for predictor 19: 1.5785728154274614
VIF for predictor 20: 1.34057322409328
VIF for predictor 21: 1.2213527011962133
VIF for predictor 22: 2.3914548781972873
VIF for predictor 23: 2.2432230968502456
VIF for predictor 24: 2.036887229878906


In [9]:
%%time
#creating objects of all required models and storing them in the list

models=[]
models.append(('LR',LinearRegression()))
models.append(('Ridge',Ridge(alpha=1.0,positive=True)))
models.append(('Lasso',Lasso(alpha=0.7,positive=True)))
models.append(('ElasticNet',ElasticNet(positive=True)))
models.append(('Gauss',GaussianProcessRegressor()))
models.append(('DT',DecisionTreeRegressor(min_samples_split=5,max_features='auto')))
models.append(('RF',RandomForestRegressor(n_estimators=200,max_features='auto')))
models.append(('KNN',KNeighborsRegressor(n_neighbors=8)))
models.append(('SVR',SVR(C=3.0)))


models

Wall time: 1.04 ms


[('LR', LinearRegression()),
 ('Ridge', Ridge(positive=True)),
 ('Lasso', Lasso(alpha=0.7, positive=True)),
 ('ElasticNet', ElasticNet(positive=True)),
 ('Gauss', GaussianProcessRegressor()),
 ('DT', DecisionTreeRegressor(max_features='auto', min_samples_split=5)),
 ('RF', RandomForestRegressor(n_estimators=200)),
 ('KNN', KNeighborsRegressor(n_neighbors=8)),
 ('SVR', SVR(C=3.0))]

#### Using K-Fold Cross Validation

In [10]:
%%time

from sklearn.model_selection import KFold
kfold = KFold(n_splits=10)   #Splitting the data into K fold and then testing it using k-1 folds

Wall time: 0 ns


In [None]:
%%time
results = []
names = []
scoring_method=['neg_mean_squared_error','r2'] #using negative mean squared error and r square to check residuals 

m_name=[]
m_method=[]   # creating 3 empty lists to store name of the model , method name and its mean
m_mean=[]

for name,model in models:
    for method in scoring_method: 
        cv_results = cross_val_score(model, X, y,  cv=kfold,scoring=method,n_jobs=-1)  
        m_name.append(name)                                          # computing cross validation score for every model
        m_method.append(method)
        m_mean.append(cv_results.mean())
        
        #print(name,':',method,':',cv_results.mean())
        
        
final_df=pd.DataFrame.from_dict({'Model-name':m_name,'Method':m_method,'Mean Value':m_mean})
final_df


In [None]:
# Create the pivot table
piv = pd.pivot_table(data=final_df, index=final_df['Model-name'], columns=['Method'])

# Reorder the levels of the index
result = piv.reorder_levels([1, 0], axis='columns')
result_final = result.sort_index(axis='columns', level='Method').reindex(['LR','Ridge','Lasso','ElasticNet','Gauss','DT','RF','KNN','SVR'])


result_final.to_excel('K-Fold_all_scores.xlsx')

result_final