In [1]:
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
import seaborn as sns 
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler 
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.svm import SVR 
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

In [2]:
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')

In [3]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [4]:
df['furnishing_type'].value_counts()

furnishing_type
0.0    2349
1.0    1018
2.0     187
Name: count, dtype: int64

In [5]:
df['furnishing_type'] = df['furnishing_type'].replace({0.0 : 'unfurnished', 1.0 : 'semifernished', 2.0: 'furnished'})

In [6]:
df['furnishing_type'].value_counts()

furnishing_type
unfurnished      2349
semifernished    1018
furnished         187
Name: count, dtype: int64

In [7]:
y = df['price']
X = df.drop(columns='price')

In [8]:
y_transform = np.log1p(y)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3554 entries, 0 to 3553
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   property_type    3554 non-null   object 
 1   sector           3554 non-null   object 
 2   price            3554 non-null   float64
 3   bedRoom          3554 non-null   float64
 4   bathroom         3554 non-null   float64
 5   balcony          3554 non-null   object 
 6   agePossession    3554 non-null   object 
 7   built_up_area    3554 non-null   float64
 8   servant room     3554 non-null   float64
 9   store room       3554 non-null   float64
 10  furnishing_type  3554 non-null   object 
 11  luxury_category  3554 non-null   object 
 12  floor_category   3554 non-null   object 
dtypes: float64(6), object(7)
memory usage: 361.1+ KB


In [11]:
cat_col = X.select_dtypes(include = ['object']).columns.to_list()
num_col = X.select_dtypes(include = ['float64']).columns.to_list()

In [12]:
sc = StandardScaler() 
oe = OrdinalEncoder()

In [12]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', oe, cat_col), 
        ('num', sc, num_col)
], remainder='passthrough')

In [13]:
preprocessor

In [14]:
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('regressor' , LinearRegression())
    ]
)


In [15]:
pipeline

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transform,test_size=0.2,random_state=42)


In [17]:
pipeline.fit(X_train, y_train)

In [18]:
y_pred = pipeline.predict(X_test)

In [19]:
mean_absolute_error(np.expm1(y_test), np.expm1(y_pred))

0.9463822160089357

In [20]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transform, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transform,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [21]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'xgboost':XGBRegressor(),
    'mlp':MLPRegressor()
}

In [22]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [23]:
model_df = pd.DataFrame(model_output).sort_values(by=2).rename(columns={0:'model name', 1:'r2_score',2:'mae'})

In [24]:
model_df

Unnamed: 0,model name,r2_score,mae
8,xgboost,0.890446,0.504281
5,random forest,0.881321,0.535057
6,gradient boosting,0.872593,0.576001
9,mlp,0.810946,0.701238
4,decision tree,0.770794,0.732807
1,svr,0.764201,0.847264
7,adaboost,0.752888,0.860459
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382
3,LASSO,0.059434,1.528906


# one hot encoder

In [151]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), cat_col),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [152]:
cat_col

['property_type',
 'sector',
 'balcony',
 'agePossession',
 'furnishing_type',
 'luxury_category',
 'floor_category']

In [153]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [154]:
model_df = pd.DataFrame(model_output).sort_values(by=2).rename(columns={0:'model name', 1:'r2_score',2:'mae'})

In [155]:
model_df

Unnamed: 0,model name,r2_score,mae
8,xgboost,0.89585,0.493456
5,random forest,0.890383,0.503325
9,mlp,0.873972,0.534779
6,gradient boosting,0.87658,0.569324
0,linear_reg,0.854609,0.649751
2,ridge,0.854739,0.652915
4,decision tree,0.80509,0.699437
7,adaboost,0.75556,0.796472
1,svr,0.769741,0.834124
3,LASSO,0.059434,1.528906


# target encoding

In [10]:
import category_encoders as ce


In [11]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category'] 
transformer = ColumnTransformer(
    [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['agePossession']), 
        ('cat2', ce.TargetEncoder(), ['sector'])
    ]
)

In [12]:
pipeline = Pipeline(
    [
        ('preprocessor', transformer), 
        ('regressor', LinearRegression())
    ]
)

In [13]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transform, cv = kfold, scoring='r2')

In [14]:
scores.mean(), scores.std()

(0.8295219182255362, 0.018384463379122782)

In [15]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', transformer),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transform, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transform,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [16]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [17]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [18]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [19]:
model_df.sort_values(by = 'mae')

Unnamed: 0,name,r2,mae
10,xgboost,0.904798,0.447518
5,random forest,0.90074,0.455594
6,extra trees,0.901722,0.455737
7,gradient boosting,0.888794,0.508656
4,decision tree,0.823697,0.575173
9,mlp,0.851802,0.614804
8,adaboost,0.818078,0.671658
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
1,svr,0.782917,0.818851


In [20]:
from sklearn.model_selection import GridSearchCV 

In [223]:
parameters_grid = {
    'n_estimators': [100, 300, 500, 1000]
}

In [224]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category'] 
transformer = ColumnTransformer(
    [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['agePossession']), 
        ('cat2', ce.TargetEncoder(), ['sector'])
    ]
)

In [225]:
pipeline = Pipeline(
    [
        ('preprocessor', transformer), 
        ('regressor', XGBRegressor()
    ]
)

In [226]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [227]:
search = GridSearchCV(pipeline, parameters_grid, cv=kfold, scoring='r2',n_jobs = -1)

In [228]:
search.fit(X, y_transform)

ValueError: Invalid parameter 'n_estimators' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['bedRoom', 'bathroom',
                                                   'built_up_area',
                                                   'servant room',
                                                   'store room']),
                                                 ('cat', OrdinalEncoder(),
                                                  ['property_type', 'sector',
                                                   'balcony', 'agePossession',
                                                   'furnishing_type',
                                                   'luxury_category',
                                                   'floor_category']),
                                                 ('cat1',
                                                  OneHotEncoder(drop='first',
                                                                sparse_output=False),
                                                  ['agePoss...
                              feature_types=None, gamma=None, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None, learning_rate=None,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=None, max_leaves=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, multi_strategy=None,
                              n_estimators=None, n_jobs=None,
                              num_parallel_tree=None, random_state=None, ...))]). Valid parameters are: ['memory', 'steps', 'transform_input', 'verbose'].

In [None]:
search.best_params_

In [None]:
search.best_score_

In [None]:
search.best_estimator_

In [None]:
final_pipe = search.best_estimator_

In [232]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transform,test_size=0.2,random_state=42)

In [233]:
pipeline = Pipeline(
    [
        ('preprocessor', transformer), 
        ('regressor', XGBRegressor(n_estimators=500))
    ]
)

In [234]:
pipeline.fit(X_train, y_train)

In [235]:
y_pred = pipeline.predict(X_test)

In [236]:
mean_absolute_error(np.expm1(y_test), np.expm1(y_pred))

0.44342288991989137

# exporting the model

In [237]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [238]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [239]:
pipeline.fit(X,y_transform)

In [257]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [258]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [254]:
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifernished,High,Mid Floor
4,flat,sector 102,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor
...,...,...,...,...,...,...,...,...,...,...,...,...
3549,flat,sector 84,2.0,2.0,1,Relatively New,532.0,0.0,0.0,unfurnished,Medium,Mid Floor
3550,house,sector 109,5.0,5.0,3+,Relatively New,6228.0,1.0,1.0,unfurnished,High,Low Floor
3551,flat,sector 2,1.0,1.0,1,Moderately Old,665.0,0.0,0.0,semifernished,Medium,Mid Floor
3552,house,sector 43,5.0,6.0,3,Moderately Old,5490.0,1.0,1.0,unfurnished,Medium,Mid Floor


In [250]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,4,3,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [251]:
np.expm1(pipeline.predict(one_df))

array([3.16560682])

In [245]:
df.columns

Index(['property_type', 'sector', 'price', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [264]:
df['sector'].unique().tolist()

['sector 36',
 'sector 89',
 'sohna road',
 'sector 92',
 'sector 102',
 'gwal pahari',
 'sector 108',
 'sector 105',
 'sector 26',
 'sector 109',
 'sector 28',
 'sector 65',
 'sector 12',
 'sector 85',
 'sector 70a',
 'sector 30',
 'sector 107',
 'sector 3',
 'sector 2',
 'sector 41',
 'sector 4',
 'sector 62',
 'sector 49',
 'sector 81',
 'sector 66',
 'sector 86',
 'sector 48',
 'sector 51',
 'sector 37',
 'sector 111',
 'sector 67',
 'sector 113',
 'sector 13',
 'sector 61',
 'sector 69',
 'sector 67a',
 'sector 37d',
 'sector 82',
 'sector 53',
 'sector 74',
 'sector 52',
 'sector 43',
 'sector 14',
 'sector 25',
 'sector 95',
 'sector 56',
 'sector 83',
 'sector 104',
 'sector 88a',
 'sector 55',
 'sector 50',
 'sector 84',
 'sector 91',
 'sector 76',
 'sector 82a',
 'sector 78',
 'manesar',
 'sector 93',
 'sector 7',
 'sector 71',
 'sector 110',
 'sector 33',
 'sector 70',
 'sector 103',
 'sector 90',
 'sector 38',
 'sector 79',
 'sector 112',
 'sector 22',
 'sector 59',
 'secto

In [266]:
df.columns

Index(['property_type', 'sector', 'price', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [270]:
df['store room'].replace([1,0],['Yes','No'])

0        No
1        No
2        No
3        No
4       Yes
       ... 
3549     No
3550    Yes
3551     No
3552    Yes
3553     No
Name: store room, Length: 3554, dtype: object