In [60]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

from sklearn.linear_model import Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.neural_network import MLPRegressor

import xgboost as xgb

In [61]:
df = pd.read_csv('cleaned_data_v3.csv')

In [62]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [63]:
df['furnishing_type'].value_counts()

furnishing_type
0.0    2349
1.0    1018
2.0     187
Name: count, dtype: int64

In [64]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [65]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3554 entries, 0 to 3553
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   property_type    3554 non-null   object 
 1   sector           3554 non-null   object 
 2   price            3554 non-null   float64
 3   bedRoom          3554 non-null   float64
 4   bathroom         3554 non-null   float64
 5   balcony          3554 non-null   object 
 6   agePossession    3554 non-null   object 
 7   built_up_area    3554 non-null   float64
 8   servant room     3554 non-null   float64
 9   store room       3554 non-null   float64
 10  furnishing_type  3554 non-null   object 
 11  luxury_category  3554 non-null   object 
 12  floor_category   3554 non-null   object 
dtypes: float64(6), object(7)
memory usage: 361.1+ KB


In [67]:
X = df.drop(columns=['price'])
y = df['price']

In [68]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

### Ordinal Encoding

In [69]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [70]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [71]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor',LinearRegression())
])

In [72]:
print(X.shape)  # Should be (n_samples, n_features)
print(y_transformed.shape)

(3554, 12)
(3554,)


In [73]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
print(scores)

[0.7112841  0.75780656 0.75338985 0.75435482 0.7495126  0.68068362
 0.79641346 0.72883971 0.69340419 0.73740773]


In [74]:
scores.mean(),scores.std()

(0.7363096633436828, 0.03238005754429938)

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [76]:
pipeline.fit(X_train,y_train)

In [77]:
y_pred = pipeline.predict(X_test)

In [78]:
y_pred = np.expm1(y_pred)

In [79]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.9463822160089355

In [80]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [81]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    
}

In [82]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))
    print(scorer(model_name, model))

['linear_reg', 0.7363096633436828, 0.9463822160089355]
['svr', 0.7642012011196353, 0.8472636473483927]
['ridge', 0.7363125343993552, 0.946338774185337]
['LASSO', 0.05943378064493572, 1.528905986892753]
['decision tree', 0.7695055787998407, 0.7283857218379266]
['random forest', 0.880297365293288, 0.5327157589615203]
['extra trees', 0.8693829138586782, 0.5560928982087826]
['gradient boosting', 0.8725599951577208, 0.575598106818494]
['adaboost', 0.7531050063148379, 0.8066408876444128]
['mlp', 0.8142860064901326, 0.7176730062038887]


In [83]:
model_output

[['linear_reg', 0.7363096633436828, 0.9463822160089355],
 ['svr', 0.7642012011196353, 0.8472636473483927],
 ['ridge', 0.7363125343993552, 0.946338774185337],
 ['LASSO', 0.05943378064493572, 1.528905986892753],
 ['decision tree', 0.7763646294590743, 0.7401069096729465],
 ['random forest', 0.8810083465236558, 0.5283945490510291],
 ['extra trees', 0.8683250314556735, 0.5509317255147204],
 ['gradient boosting', 0.8725601336017659, 0.5762530746478087],
 ['adaboost', 0.7528627989416511, 0.8165519947345712],
 ['mlp', 0.8105452645923463, 0.7160137898341699]]

In [84]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [85]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.881008,0.528395
6,extra trees,0.868325,0.550932
7,gradient boosting,0.87256,0.576253
9,mlp,0.810545,0.716014
4,decision tree,0.776365,0.740107
8,adaboost,0.752863,0.816552
1,svr,0.764201,0.847264
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382
3,LASSO,0.059434,1.528906


### OneHotEncoding

In [86]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [87]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [88]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [89]:
scores.mean()

0.8546054073648314

In [90]:
scores.std()

0.01599847663314007

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [92]:
pipeline.fit(X_train,y_train)

In [93]:
y_pred = pipeline.predict(X_test)

In [94]:
y_pred = np.expm1(y_pred)

In [95]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.6497382874070646

In [96]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [97]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    
}

In [98]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))
    print(scorer(model_name, model))

['linear_reg', 0.8546054073648314, 0.6497382874070646]
['svr', 0.7697413260547326, 0.8341243500492146]
['ridge', 0.8546783310939278, 0.65291434945168]
['LASSO', 0.05943378064493578, 1.528905986892753]
['decision tree', 0.8032285972969241, 0.7020790335465339]
['random forest', 0.890222891402028, 0.49439353440312694]
['extra trees', 0.8926738809507532, 0.4746598142621768]
['gradient boosting', 0.8766262812212116, 0.5689732350714807]
['adaboost', 0.7513589272244545, 0.8407838540856883]
['mlp', 0.8753440933374914, 0.5635605837971508]


In [99]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [100]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.894146,0.465517
5,random forest,0.890939,0.499591
9,mlp,0.868538,0.541855
7,gradient boosting,0.876792,0.568635
0,linear_reg,0.854605,0.649738
2,ridge,0.854678,0.652914
4,decision tree,0.80159,0.687876
1,svr,0.769741,0.834124
8,adaboost,0.74551,0.844601
3,LASSO,0.059434,1.528906


### OneHotEncoding With PCA

In [101]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [102]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [103]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [104]:
scores.mean()

0.06225201431451136

In [105]:
scores.std()

0.01986059407164015

In [106]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [107]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    
}

In [108]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))
    print(scorer(model_name, model))

['linear_reg', 0.06225201431451136, 1.5267074088549337]
['svr', 0.21807348496172244, 1.3611626793047398]
['ridge', 0.06225201516179146, 1.5267074078044667]
['LASSO', 0.059675784467370055, 1.5287392557835464]
['decision tree', 0.6964420082698518, 0.761508966234373]
['random forest', 0.7628341047277558, 0.6570599077383127]
['extra trees', 0.7400972392897442, 0.6994812596086065]
['gradient boosting', 0.6106227078866426, 0.9879063301936338]
['adaboost', 0.29864705501525074, 1.3532031419766162]
['mlp', 0.21153092145094812, 1.4150661791937216]


In [109]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [110]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.763037,0.657672
6,extra trees,0.7403,0.696538
4,decision tree,0.696442,0.761509
7,gradient boosting,0.610623,0.987906
8,adaboost,0.299821,1.357068
1,svr,0.218073,1.361163
9,mlp,0.20541,1.397306
2,ridge,0.062252,1.526707
0,linear_reg,0.062252,1.526707
3,LASSO,0.059676,1.528739


### Target Encoder

In [111]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [112]:
!pip install category_encoders



In [113]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [114]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [115]:
scores.mean(),scores.std()

(0.829521918225536, 0.01838446337912282)

In [116]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [117]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    
}

In [118]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))
    print(scorer(model_name, model))

['linear_reg', 0.829521918225536, 0.7130109838896388]
['svr', 0.7829174051174264, 0.818850747431723]
['ridge', 0.8295359700269425, 0.7135228301064971]
['LASSO', 0.05943378064493572, 1.528905986892753]
['decision tree', 0.825705493513772, 0.5548314997000179]
['random forest', 0.9005351303197605, 0.4552396597846095]
['extra trees', 0.9020985995787836, 0.45687886453592963]
['gradient boosting', 0.8892331208576119, 0.508473981508696]
['adaboost', 0.8213539768755626, 0.6957213875325432]
['mlp', 0.8499824250403938, 0.5847602053929977]


In [119]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [120]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.900547,0.454515
6,extra trees,0.903092,0.460327
7,gradient boosting,0.889105,0.508592
4,decision tree,0.830623,0.551221
9,mlp,0.847263,0.633203
8,adaboost,0.817559,0.684075
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
1,svr,0.782917,0.818851
3,LASSO,0.059434,1.528906


### Hyperparameter Tuning

In [121]:
from sklearn.model_selection import GridSearchCV

In [122]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [123]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [124]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [125]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [126]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [None]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


In [None]:
final_pipe = search.best_estimator_

In [None]:
search.best_params_

In [None]:
search.best_score_

In [None]:
final_pipe.fit(X,y_transformed)

### Exporting the model

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [None]:
pipeline.fit(X,y_transformed)

In [None]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [None]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [None]:
X

### Trying out the predictions

In [None]:
X.columns

In [None]:
X.iloc[0].values

In [None]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df


In [None]:
np.expm1(pipeline.predict(one_df))

In [None]:
X.dtypes

In [None]:
sorted(X['sector'].unique().tolist())