In [40]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [41]:
df = pd.read_csv('../Dataset/Processed/gurgaon_properties_post_feature_selection_v2.csv')

In [42]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [43]:
df['furnishing_type'].value_counts()

furnishing_type
0.0    2349
1.0    1018
2.0     187
Name: count, dtype: int64

In [44]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [45]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


In [46]:
X = df.drop(columns=['price'])
y = df['price']

In [47]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

### Ordinal Encoding

In [10]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [11]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [12]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [13]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [14]:
scores.mean(),scores.std()

(np.float64(0.7363096633436828), np.float64(0.03238005754429936))

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [16]:
pipeline.fit(X_train,y_train)

In [17]:
y_pred = pipeline.predict(X_test)

In [18]:
y_pred = np.expm1(y_pred)

In [19]:
mean_absolute_error(np.expm1(y_test),y_pred)

np.float64(0.9463822160089359)

In [20]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [21]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [22]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [23]:
model_output

[['linear_reg',
  np.float64(0.7363096633436828),
  np.float64(0.9463822160089359)],
 ['svr', np.float64(0.7642079133544131), np.float64(0.8472636473483939)],
 ['ridge', np.float64(0.7363125343993555), np.float64(0.9463387741853386)],
 ['LASSO', np.float64(0.05943378064493573), np.float64(1.528905986892753)],
 ['decision tree',
  np.float64(0.775217252251678),
  np.float64(0.7469113957667736)],
 ['random forest',
  np.float64(0.8813455991845821),
  np.float64(0.5301907443828737)],
 ['extra trees',
  np.float64(0.8678662889822295),
  np.float64(0.5544991797153718)],
 ['gradient boosting',
  np.float64(0.8725149176086591),
  np.float64(0.5756213881409791)],
 ['adaboost', np.float64(0.7543209334994885), np.float64(0.8131068025470256)],
 ['mlp', np.float64(0.8035478428639736), np.float64(0.6751334323606777)],
 ['xgboost', np.float64(0.8894876835260124), np.float64(0.5040475180464287)]]

In [24]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [25]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.889488,0.504048
5,random forest,0.881346,0.530191
6,extra trees,0.867866,0.554499
7,gradient boosting,0.872515,0.575621
9,mlp,0.803548,0.675133
4,decision tree,0.775217,0.746911
8,adaboost,0.754321,0.813107
1,svr,0.764208,0.847264
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382


### OneHotEncoding

In [26]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [27]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [28]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [29]:
scores.mean()

np.float64(0.8546073013476256)

In [30]:
scores.std()

np.float64(0.015999826710187303)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [32]:
pipeline.fit(X_train,y_train)

In [33]:
y_pred = pipeline.predict(X_test)

In [34]:
y_pred = np.expm1(y_pred)

In [35]:
mean_absolute_error(np.expm1(y_test),y_pred)

np.float64(0.6497593061775198)

In [39]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [40]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [41]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [42]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [43]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.89447,0.472923
10,xgboost,0.89585,0.493456
5,random forest,0.890796,0.506349
9,mlp,0.874502,0.512863
7,gradient boosting,0.876648,0.569919
0,linear_reg,0.854607,0.649759
2,ridge,0.854685,0.652823
4,decision tree,0.808499,0.697527
1,svr,0.769741,0.834124
8,adaboost,0.750923,0.840331


### OneHotEncoding With PCA

In [44]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [45]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=9)),
    ('regressor', LinearRegression())
])

In [46]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [47]:
scores.mean()

np.float64(0.7059216559695731)

In [48]:
scores.std()

np.float64(0.03832201287802145)

In [49]:
def scorer(model_name, model):
    output = []    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=9)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [50]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [51]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [52]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [53]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.836568,0.63658
6,extra trees,0.83431,0.637039
5,random forest,0.830371,0.644691
7,gradient boosting,0.818767,0.726624
9,mlp,0.792546,0.792437
1,svr,0.780267,0.83091
4,decision tree,0.664965,0.853639
8,adaboost,0.681708,0.945829
2,ridge,0.705923,1.004096
0,linear_reg,0.705922,1.004225


### Target Encoder

In [54]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [55]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [56]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [57]:
scores.mean(),scores.std()

(np.float64(0.829521918225536), np.float64(0.018384463379122837))

In [58]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [59]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [60]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [61]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [62]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.904798,0.447518
5,random forest,0.900919,0.456739
6,extra trees,0.902216,0.461574
7,gradient boosting,0.888855,0.509855
4,decision tree,0.82836,0.554244
9,mlp,0.852885,0.614347
8,adaboost,0.816378,0.686327
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
1,svr,0.782917,0.818851


### Hyperparameter Tuning

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [15]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [16]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [17]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [18]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [19]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/daksh/anaconda3/lib/python3.11/site-packages/joblib/externals/loky/backend/popen_loky_posix.py", line 180, in <module>
    exitcode = process_obj._bootstrap()
  File "/Users/daksh/anaconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/daksh/anaconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwarg

[CV 1/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=50;, score=nan total time=   0.0s[CV 4/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=50;, score=nan total time=   0.0s
[CV 3/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=50;, score=nan total time=   0.0s

[CV 2/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=50;, score=nan total time=   0.0s
[CV 5/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=50;, score=nan total time=   0.0s
[CV 6/10] END regressor__max_depth=None, regressor__max_features=auto, regressor__max_samples=0.1, regressor__n_estimators=50;, score=nan total time=   0.0s
[CV 8/10] END regressor__max_depth=None, regressor__max_fe



[CV 1/10] END regressor__max_depth=20, regressor__max_features=auto, regressor__max_samples=0.5, regressor__n_estimators=200;, score=nan total time=   0.0s
[CV 5/10] END regressor__max_depth=20, regressor__max_features=auto, regressor__max_samples=0.5, regressor__n_estimators=200;, score=nan total time=   0.0s
[CV 9/10] END regressor__max_depth=20, regressor__max_features=auto, regressor__max_samples=0.5, regressor__n_estimators=200;, score=nan total time=   0.0s
[CV 2/10] END regressor__max_depth=20, regressor__max_features=auto, regressor__max_samples=0.5, regressor__n_estimators=200;, score=nan total time=   0.0s
[CV 3/10] END regressor__max_depth=20, regressor__max_features=auto, regressor__max_samples=0.5, regressor__n_estimators=300;, score=nan total time=   0.0s
[CV 6/10] END regressor__max_depth=20, regressor__max_features=auto, regressor__max_samples=0.5, regressor__n_estimators=200;, score=nan total time=   0.0s
[CV 10/10] END regressor__max_depth=20, regressor__max_features=


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/daksh/anaconda3/lib/python3.11/site-packages/joblib/externals/loky/backend/popen_loky_posix.py", line 180, in <module>
    exitcode = process_obj._bootstrap()
  File "/Users/daksh/anaconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/daksh/anaconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwarg

[CV 6/10] END regressor__max_depth=20, regressor__max_features=sqrt, regressor__max_samples=0.1, regressor__n_estimators=300;, score=0.864 total time=   0.4s
[CV 4/10] END regressor__max_depth=20, regressor__max_features=sqrt, regressor__max_samples=0.25, regressor__n_estimators=100;, score=0.877 total time=   0.2s
[CV 3/10] END regressor__max_depth=20, regressor__max_features=sqrt, regressor__max_samples=0.5, regressor__n_estimators=50;, score=0.905 total time=   0.2s
[CV 7/10] END regressor__max_depth=20, regressor__max_features=sqrt, regressor__max_samples=0.5, regressor__n_estimators=100;, score=0.897 total time=   0.3s
[CV 1/10] END regressor__max_depth=20, regressor__max_features=sqrt, regressor__max_samples=0.5, regressor__n_estimators=100;, score=0.905 total time=   0.3s
[CV 4/10] END regressor__max_depth=20, regressor__max_features=sqrt, regressor__max_samples=0.5, regressor__n_estimators=50;, score=0.882 total time=   0.2s
[CV 5/10] END regressor__max_depth=20, regressor__max


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/daksh/anaconda3/lib/python3.11/site-packages/joblib/externals/loky/backend/popen_loky_posix.py", line 180, in <module>
    exitcode = process_obj._bootstrap()
  File "/Users/daksh/anaconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/daksh/anaconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwarg

[CV 4/10] END regressor__max_depth=30, regressor__max_features=sqrt, regressor__max_samples=1.0, regressor__n_estimators=50;, score=0.890 total time=   0.2s
[CV 7/10] END regressor__max_depth=30, regressor__max_features=sqrt, regressor__max_samples=1.0, regressor__n_estimators=50;, score=0.903 total time=   0.3s
[CV 8/10] END regressor__max_depth=30, regressor__max_features=sqrt, regressor__max_samples=0.5, regressor__n_estimators=200;, score=0.892 total time=   0.6s
[CV 2/10] END regressor__max_depth=20, regressor__max_features=sqrt, regressor__max_samples=1.0, regressor__n_estimators=300;, score=0.930 total time=   1.3s
[CV 5/10] END regressor__max_depth=30, regressor__max_features=sqrt, regressor__max_samples=1.0, regressor__n_estimators=50;, score=0.902 total time=   0.2s
[CV 5/10] END regressor__max_depth=30, regressor__max_features=sqrt, regressor__max_samples=0.5, regressor__n_estimators=300;, score=0.900 total time=   0.9s
[CV 8/10] END regressor__max_depth=30, regressor__max_f


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/daksh/anaconda3/lib/python3.11/site-packages/joblib/externals/loky/backend/popen_loky_posix.py", line 180, in <module>
    exitcode = process_obj._bootstrap()
  File "/Users/daksh/anaconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/daksh/anaconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwarg

[CV 7/10] END regressor__max_depth=30, regressor__max_features=sqrt, regressor__max_samples=0.5, regressor__n_estimators=300;, score=0.897 total time=   0.9s
[CV 2/10] END regressor__max_depth=30, regressor__max_features=sqrt, regressor__max_samples=0.5, regressor__n_estimators=300;, score=0.923 total time=   0.9s
[CV 7/10] END regressor__max_depth=30, regressor__max_features=sqrt, regressor__max_samples=1.0, regressor__n_estimators=200;, score=0.902 total time=   0.9s
[CV 4/10] END regressor__max_depth=30, regressor__max_features=sqrt, regressor__max_samples=1.0, regressor__n_estimators=100;, score=0.894 total time=   0.4s
[CV 8/10] END regressor__max_depth=30, regressor__max_features=sqrt, regressor__max_samples=1.0, regressor__n_estimators=100;, score=0.895 total time=   0.4s
[CV 1/10] END regressor__max_depth=30, regressor__max_features=sqrt, regressor__max_samples=1.0, regressor__n_estimators=200;, score=0.912 total time=   0.9s
[CV 4/10] END regressor__max_depth=30, regressor__ma

640 fits failed out of a total of 1280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
359 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/daksh/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/daksh/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/daksh/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/Users/daksh/anaconda3/lib/pytho

In [20]:
final_pipe = search.best_estimator_

In [21]:
search.best_params_

{'regressor__max_depth': 30,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 300}

In [22]:
search.best_score_

np.float64(0.9025025308911081)

In [23]:
final_pipe.fit(X,y_transformed)

In [54]:
# Define your columns to encode
columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [65]:
# Create a pipeline with XGBRegressor
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(tree_method="hist", device="cuda"))
])

# Define the parameter grid for XGBRegressor
param_grid = {
    'regressor__n_estimators': [300, 500],
    'regressor__max_depth': [5, 10, 13],
    'regressor__learning_rate': [0.01, 0.1, 0.3],
    'regressor__gamma': [0, 0.1, 0.3],
    'regressor__subsample': [0.7, 1.0],  # Adjusted subsample values
    'regressor__colsample_bytree': [0.7, 1.0],
    'regressor__lambda': [0.1, 1],  # Regularization parameter
    'regressor__alpha': [0.1, 1]  # L1 regularization parameter
}

# Set up KFold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform grid search using GridSearchCV
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

# Assuming X and y_transformed are defined and compatible
search.fit(X, y_transformed)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits


[CV 1/5] END regressor__alpha=0.1, regressor__colsample_bytree=0.7, regressor__gamma=0, regressor__lambda=0.1, regressor__learning_rate=0.01, regressor__max_depth=5, regressor__n_estimators=300, regressor__subsample=0.7;, score=0.903 total time=   0.3s
[CV 2/5] END regressor__alpha=0.1, regressor__colsample_bytree=0.7, regressor__gamma=0, regressor__lambda=0.1, regressor__learning_rate=0.01, regressor__max_depth=5, regressor__n_estimators=300, regressor__subsample=0.7;, score=0.886 total time=   0.2s
[CV 3/5] END regressor__alpha=0.1, regressor__colsample_bytree=0.7, regressor__gamma=0, regressor__lambda=0.1, regressor__learning_rate=0.01, regressor__max_depth=5, regressor__n_estimators=300, regressor__subsample=0.7;, score=0.882 total time=   0.2s
[CV 4/5] END regressor__alpha=0.1, regressor__colsample_bytree=0.7, regressor__gamma=0, regressor__lambda=0.1, regressor__learning_rate=0.01, regressor__max_depth=5, regressor__n_estimators=300, regressor__subsample=0.7;, score=0.889 total t


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/daksh/anaconda3/lib/python3.11/site-packages/joblib/externals/loky/backend/popen_loky_posix.py", line 180, in <module>
    exitcode = process_obj._bootstrap()
  File "/Users/daksh/anaconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/daksh/anaconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwarg

[CV 3/5] END regressor__alpha=0.1, regressor__colsample_bytree=0.7, regressor__gamma=0, regressor__lambda=0.1, regressor__learning_rate=0.01, regressor__max_depth=5, regressor__n_estimators=500, regressor__subsample=1.0;, score=0.893 total time=   0.3s
[CV 4/5] END regressor__alpha=0.1, regressor__colsample_bytree=0.7, regressor__gamma=0, regressor__lambda=0.1, regressor__learning_rate=0.01, regressor__max_depth=5, regressor__n_estimators=500, regressor__subsample=1.0;, score=0.901 total time=   0.3s
[CV 3/5] END regressor__alpha=0.1, regressor__colsample_bytree=0.7, regressor__gamma=0, regressor__lambda=0.1, regressor__learning_rate=0.01, regressor__max_depth=5, regressor__n_estimators=300, regressor__subsample=1.0;, score=0.881 total time=   0.1s
[CV 5/5] END regressor__alpha=0.1, regressor__colsample_bytree=0.7, regressor__gamma=0, regressor__lambda=0.1, regressor__learning_rate=0.01, regressor__max_depth=5, regressor__n_estimators=300, regressor__subsample=1.0;, score=0.876 total t

In [66]:
search.best_params_

{'regressor__alpha': 0.1,
 'regressor__colsample_bytree': 0.7,
 'regressor__gamma': 0,
 'regressor__lambda': 0.1,
 'regressor__learning_rate': 0.1,
 'regressor__max_depth': 5,
 'regressor__n_estimators': 500,
 'regressor__subsample': 1.0}

In [67]:
search.best_score_

np.float64(0.9076690291035117)

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [70]:
final_pipe_xg = search.best_estimator_

In [71]:
final_pipe_xg.fit(X_train,y_train)

In [72]:
y_pred = final_pipe_xg.predict(X_test)
y_pred = np.expm1(y_pred)

In [73]:
mean_absolute_error(np.expm1(y_test),y_pred)

np.float64(0.4594920825555858)

### Exporting the model

In [74]:
final_pipe_xg.fit(X, y_transformed)

In [78]:
import pickle

with open('../pipeline.pkl', 'wb') as file:
    pickle.dump(final_pipe_xg, file)

In [80]:
with open('../df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [81]:
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor
...,...,...,...,...,...,...,...,...,...,...,...,...
3549,flat,sector 84,2.0,2.0,1,Relatively New,532.0,0.0,0.0,unfurnished,Medium,Mid Floor
3550,house,sector 109,5.0,5.0,3+,Relatively New,6228.0,1.0,1.0,unfurnished,High,Low Floor
3551,flat,sector 2,1.0,1.0,1,Moderately Old,665.0,0.0,0.0,semifurnished,Medium,Mid Floor
3552,house,sector 43,5.0,6.0,3,Moderately Old,5490.0,1.0,1.0,unfurnished,Medium,Mid Floor


### Trying out the predictions

In [82]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [83]:
X.iloc[0].values

array(['flat', 'sector 36', np.float64(3.0), np.float64(2.0), '2',
       'New Property', np.float64(850.0), np.float64(0.0),
       np.float64(0.0), 'unfurnished', 'Low', 'Low Floor'], dtype=object)

In [84]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,4,3,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [86]:
np.expm1(final_pipe_xg.predict(one_df))

array([3.0049446], dtype=float32)