In [99]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [100]:
df = pd.read_csv("../Dataset/gurgaon_properties_post_feature_selectionv2.csv")

In [101]:
df.drop(columns=['pooja room', 'study room', 'others'],inplace=True)

In [102]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 52,1.55,4,4,3,Moderately Old,2100.0,0,0,1,Low,Low Floor
1,flat,sector 86,0.5,2,2,1,Relatively New,577.78,0,0,1,Low,Mid Floor
2,flat,sector 112,3.65,3,5,2,Relatively New,2800.0,1,0,1,Medium,Mid Floor
3,flat,sector 53,4.8,3,4,3+,Relatively New,2500.0,1,0,2,High,Mid Floor
4,flat,sector 83,1.76,3,3,3,Relatively New,1500.0,1,0,2,High,Mid Floor


Furnishing_type is also a categorical column so convert into category

In [103]:
df['furnishing_type'].value_counts()

furnishing_type
1    2374
2     995
0     185
Name: count, dtype: int64

In [104]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [105]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 52,1.55,4,4,3,Moderately Old,2100.0,0,0,semifurnished,Low,Low Floor
1,flat,sector 86,0.5,2,2,1,Relatively New,577.78,0,0,semifurnished,Low,Mid Floor
2,flat,sector 112,3.65,3,5,2,Relatively New,2800.0,1,0,semifurnished,Medium,Mid Floor
3,flat,sector 53,4.8,3,4,3+,Relatively New,2500.0,1,0,furnished,High,Mid Floor
4,flat,sector 83,1.76,3,3,3,Relatively New,1500.0,1,0,furnished,High,Mid Floor


In [106]:
X = df.drop(columns=['price'])
y = df['price']

In [107]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

## Ordinal Encoding

In [108]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [109]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [110]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [111]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [112]:
scores.mean(),scores.std()

(0.6057289897365525, 0.04139470931805974)

In [113]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [114]:
pipeline.fit(X_train,y_train)


In [115]:
y_pred = pipeline.predict(X_test)

In [116]:
y_pred = np.expm1(y_pred)

In [117]:
mean_absolute_error(np.expm1(y_test),y_pred)

1.0701827690997028

In [118]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [119]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [120]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [121]:
model_output

[['linear_reg', 0.6057289897365525, 1.0701827690997028],
 ['svr', 0.6622233660964244, 1.044480073438913],
 ['ridge', 0.6057356339388817, 1.070143678915813],
 ['LASSO', 0.053034381482521896, 1.6385311183215419],
 ['decision tree', 0.7692323840035928, 0.7434347194933699],
 ['random forest', 0.8795539343976555, 0.5426686466817402],
 ['extra trees', 0.8635607117875, 0.5834068557157673],
 ['gradient boosting', 0.8727489417887796, 0.5847306368362608],
 ['adaboost', 0.7365504636441671, 0.8596307468498687],
 ['mlp', 0.8053909780277101, 0.7756064299010282],
 ['xgboost', 0.8878688454312703, 0.5234577734091446]]

In [122]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [123]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.887869,0.523458
5,random forest,0.879554,0.542669
6,extra trees,0.863561,0.583407
7,gradient boosting,0.872749,0.584731
4,decision tree,0.769232,0.743435
9,mlp,0.805391,0.775606
8,adaboost,0.73655,0.859631
1,svr,0.662223,1.04448
2,ridge,0.605736,1.070144
0,linear_reg,0.605729,1.070183


Observation
* Only doing ordinal encoding tree based model are doing good but liner model are not doing that good.
* Xgboost is giving best result, followd by all other tree based model

## OneHotEncoding

In [124]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(handle_unknown="ignore", drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [125]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [126]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [127]:
scores.mean()

0.7844143085244542

In [128]:
scores.std()

0.028043497255924393

In [129]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [130]:
pipeline.fit(X_train,y_train)

In [131]:
y_pred = pipeline.predict(X_test)



In [132]:
y_pred = np.expm1(y_pred)

In [133]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.7955021858436361

In [134]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [135]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [136]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [137]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [138]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.88703,0.503278
6,extra trees,0.885314,0.515703
5,random forest,0.886054,0.522086
7,gradient boosting,0.872248,0.588203
9,mlp,0.872881,0.608456
4,decision tree,0.808563,0.707581
0,linear_reg,0.784414,0.795502
2,ridge,0.785104,0.798745
8,adaboost,0.745702,0.873189
1,svr,0.665601,1.041232


## OneHotEncoding With PCA

In [139]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(handle_unknown="ignore", drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [140]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [141]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [142]:
scores.mean()

0.05544839259909882

In [143]:
scores.std()

0.034936405844989735

In [144]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [145]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [146]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [147]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [148]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.697766,0.854071
6,extra trees,0.660546,0.90517
4,decision tree,0.594493,0.990592
10,xgboost,0.595704,1.100491
7,gradient boosting,0.598624,1.138427
1,svr,0.228894,1.464969
8,adaboost,0.296599,1.518851
9,mlp,0.218237,1.520414
3,LASSO,0.053179,1.63848
2,ridge,0.055448,1.646895


Observation
For dimentionality reduction this is used. but this is not giving good result nned to check why

## Target Encoder

In [149]:
import category_encoders as ce

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(handle_unknown="ignore", drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [150]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [151]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [152]:
scores.mean(),scores.std()

(-3643688903371.8975, 10902730222898.287)

In [153]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [154]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [155]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [156]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [157]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.9026735,0.471982
6,extra trees,0.899311,0.493197
10,xgboost,0.9001273,0.494525
7,gradient boosting,0.8871066,0.552973
9,mlp,0.8699654,0.59507
4,decision tree,0.8257263,0.636432
8,adaboost,0.8106079,0.753279
2,ridge,0.7853703,0.794397
0,linear_reg,-3643689000000.0,0.798358
1,svr,0.6865518,1.016389


Observation
* This work bwat with tree based model.
* After all check random forest is giving best mae, so we will use this.

## Hyperparameter Tuning

In [158]:
from sklearn.model_selection import GridSearchCV

In [159]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [160]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(handle_unknown="ignore", drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [161]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [162]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [163]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [164]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


640 fits failed out of a total of 1280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
332 fits failed with the following error:
Traceback (most recent call last):
  File "D:\Software\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Software\anaconda\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\Software\anaconda\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "D:\Software\anaconda\Lib\site-packages\sklearn\base.py", line 1466, in 

In [165]:
final_pipe = search.best_estimator_

In [166]:
search.best_params_

{'regressor__max_depth': 30,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 300}

In [167]:
search.best_score_

0.8959705538888825

In [168]:
final_pipe.fit(X,y_transformed)

In [169]:
y_pred = final_pipe.predict(X_test)
y_pred = np.expm1(y_pred)

In [170]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.21808495770891861

This can be done better using Xgboost hyperparmeter.

## Exporting the model

In [171]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(handle_unknown="ignore", drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [172]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=300))
])

In [173]:
pipeline.fit(X,y_transformed)

In [174]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [175]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [176]:
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 52,4,4,3,Moderately Old,2100.00,0,0,semifurnished,Low,Low Floor
1,flat,sector 86,2,2,1,Relatively New,577.78,0,0,semifurnished,Low,Mid Floor
2,flat,sector 112,3,5,2,Relatively New,2800.00,1,0,semifurnished,Medium,Mid Floor
3,flat,sector 53,3,4,3+,Relatively New,2500.00,1,0,furnished,High,Mid Floor
4,flat,sector 83,3,3,3,Relatively New,1500.00,1,0,furnished,High,Mid Floor
...,...,...,...,...,...,...,...,...,...,...,...,...
3549,house,sector 70a,4,4,3+,New Property,2610.00,0,0,semifurnished,Low,Mid Floor
3550,flat,sector 113,2,2,3,Relatively New,1417.12,0,0,semifurnished,Medium,Mid Floor
3551,house,sector 4,3,2,1,New Property,684.00,0,0,semifurnished,Low,Low Floor
3552,flat,sector 85,2,2,2,Relatively New,1075.00,0,0,semifurnished,Low,Mid Floor
