In [21]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [23]:
df = pd.read_csv('../processed_data/gurgaon_properties_post_feature_selection_v2.csv')

In [25]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 108,3.95,3.0,5.0,2,New Property,2343.101198,1.0,0.0,1.0,Medium,High Floor
1,house,sector 33,11.5,5.0,6.0,3+,Relatively New,4680.0,1.0,0.0,1.0,Medium,Mid Floor
2,house,sector 13,4.5,4.0,5.0,2,Old Property,4950.0,0.0,0.0,1.0,Low,Low Floor
3,flat,sector 106,1.35,4.0,3.0,3,Old Property,1678.0,1.0,0.0,1.0,Medium,High Floor
4,flat,sector 37c,1.22,3.0,4.0,3,Relatively New,1665.0,0.0,0.0,1.0,Medium,Mid Floor


In [27]:
df['furnishing_type'].value_counts()

furnishing_type
1.0    2316
2.0    1013
0.0     189
Name: count, dtype: int64

In [29]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [31]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 108,3.95,3.0,5.0,2,New Property,2343.101198,1.0,0.0,semifurnished,Medium,High Floor
1,house,sector 33,11.5,5.0,6.0,3+,Relatively New,4680.0,1.0,0.0,semifurnished,Medium,Mid Floor
2,house,sector 13,4.5,4.0,5.0,2,Old Property,4950.0,0.0,0.0,semifurnished,Low,Low Floor
3,flat,sector 106,1.35,4.0,3.0,3,Old Property,1678.0,1.0,0.0,semifurnished,Medium,High Floor
4,flat,sector 37c,1.22,3.0,4.0,3,Relatively New,1665.0,0.0,0.0,semifurnished,Medium,Mid Floor


In [33]:
X = df.drop(columns=['price'])
y = df['price']

In [35]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

# **Ordinal Encoding**

In [18]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [20]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [22]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [24]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [25]:
scores.mean(),scores.std()

(0.7169418029987584, 0.058677286404436334)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [30]:
pipeline.fit(X_train,y_train)

In [32]:
y_pred = pipeline.predict(X_test)

In [34]:
y_pred = np.expm1(y_pred)

In [36]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.8511539862878427

In [38]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [40]:
!pip install xgboost



In [41]:
# Basic regressors from sklearn
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor
)
from sklearn.neural_network import MLPRegressor

# XGBoost regressor
from xgboost import XGBRegressor

model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [42]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [43]:
model_output

[['linear_reg', 0.7169418029987584, 0.8511539862878427],
 ['svr', 0.7503439494797759, 0.8601210242198963],
 ['ridge', 0.7169491896651573, 0.8513697318000162],
 ['LASSO', 0.05151220006292094, 1.5200372107524673],
 ['decision tree', 0.7813624284032722, 0.6676069234595329],
 ['random forest', 0.8817250734334751, 0.5816937819469922],
 ['extra trees', 0.8707631012828708, 0.6077885846964126],
 ['gradient boosting', 0.8758744841451357, 0.6074078345278575],
 ['adaboost', 0.7542459566393761, 0.8453971468499394],
 ['mlp', 0.8121907621181219, 0.7476513030809732],
 ['xgboost', 0.8907359239061708, 0.5446795256190341]]

In [44]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [45]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.890736,0.54468
5,random forest,0.881725,0.581694
7,gradient boosting,0.875874,0.607408
6,extra trees,0.870763,0.607789
4,decision tree,0.781362,0.667607
9,mlp,0.812191,0.747651
8,adaboost,0.754246,0.845397
0,linear_reg,0.716942,0.851154
2,ridge,0.716949,0.85137
1,svr,0.750344,0.860121


# **OneHotEncoding**

In [47]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',handle_unknown='ignore'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [48]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [49]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [50]:
scores.mean()

0.8448655997781598

In [51]:
scores.std()

0.03340492588919986

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [53]:
pipeline.fit(X_train,y_train)

In [54]:
y_pred = pipeline.predict(X_test)

In [55]:
y_pred = np.expm1(y_pred)

In [56]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.6222598635109029

In [57]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [58]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [59]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [60]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [61]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.896416,0.506411
5,random forest,0.894133,0.53398
10,xgboost,0.897572,0.538346
9,mlp,0.87072,0.58108
7,gradient boosting,0.878839,0.598516
0,linear_reg,0.844866,0.62226
2,ridge,0.845329,0.623852
4,decision tree,0.824296,0.627691
8,adaboost,0.759547,0.829667
1,svr,0.754422,0.858786


# **OneHotEncoding With PCA**

In [63]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',handle_unknown='ignore',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [64]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [65]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [66]:
scores.mean()

0.05424165605129007

In [67]:
scores.std()

0.010737632836479025

In [69]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [71]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [72]:

model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [73]:

model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [74]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.751859,0.702313
6,extra trees,0.725391,0.743975
4,decision tree,0.678491,0.786738
10,xgboost,0.614989,0.900044
7,gradient boosting,0.612768,0.982992
1,svr,0.226657,1.342662
8,adaboost,0.309238,1.348752
9,mlp,0.21863,1.433409
3,LASSO,0.051697,1.519945
2,ridge,0.054242,1.521206


# **Target Encoder**

In [76]:
!pip install category_encoders



In [110]:
# Install category_encoders (run only once in your notebook or terminal)
# !pip install category_encoders

import category_encoders as ce
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score

# Define all categorical columns
ordinal_cols = ['property_type', 'balcony', 'furnishing_type', 'luxury_category', 'floor_category']
onehot_cols = ['agePossession']
targetenc_cols = ['sector']
numeric_cols = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']

# Wrap TargetEncoder in a pipeline to make it work with ColumnTransformer
target_enc_pipeline = Pipeline([
    ('target', ce.TargetEncoder())
])

# Column Transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('ord', OrdinalEncoder(), ordinal_cols),
        ('ohe', OneHotEncoder(drop='first', sparse_output=False), onehot_cols),
        ('target', target_enc_pipeline, targetenc_cols)
    ],
    remainder='passthrough'
)

# Full Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a cop

In [114]:
scores.mean(),scores.std()

(0.8148380597236423, 0.03786582209387001)

In [116]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [118]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [120]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a cop

In [121]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [122]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.894345,0.510744
6,extra trees,0.891471,0.524572
10,xgboost,0.892486,0.529164
7,gradient boosting,0.886336,0.580152
1,svr,0.864195,0.66039
9,mlp,0.855142,0.663467
4,decision tree,0.803793,0.666563
0,linear_reg,0.814838,0.683182
2,ridge,0.814862,0.683307
8,adaboost,0.819675,0.769714


# **Hyperparameter Tuning**

In [160]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

# Define categorical columns
ordinal_cols = ['property_type', 'balcony', 'furnishing_type', 'luxury_category', 'floor_category']
onehot_cols = ['agePossession']
numeric_cols = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']

# All categorical columns MUST be handled — do NOT leave any raw string column for passthrough
# 'sector' is now included only once and encoded using OrdinalEncoder
# You can also OneHotEncode it if it's non-ordinal
categorical_cols = ordinal_cols + ['sector']

# Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat_ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_cols),
        ('cat_ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), onehot_cols)
    ],
    remainder='drop'  # Do not passthrough anything unprocessed
)

# Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# GridSearch parameter grid
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__max_samples': [0.25, 0.5, 1.0],
    'regressor__max_features': ['sqrt', 'log2']
}

# Cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Grid search (keep n_jobs=1 to avoid pickling issues in Jupyter)
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=1, verbose=4)

# Fit
search.fit(X, y_transformed)


Fitting 10 folds for each of 54 candidates, totalling 540 fits
[CV 1/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.25, regressor__n_estimators=50;, score=0.825 total time=   0.3s
[CV 2/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.25, regressor__n_estimators=50;, score=0.824 total time=   0.2s
[CV 3/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.25, regressor__n_estimators=50;, score=0.845 total time=   0.2s
[CV 4/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.25, regressor__n_estimators=50;, score=0.860 total time=   0.2s
[CV 5/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.25, regressor__n_estimators=50;, score=0.837 total time=   0.3s
[CV 6/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.25, regressor__n_estimators=50;, score=0.

In [161]:
final_pipe = search.best_estimator_

In [164]:
search.best_params_

{'regressor__max_depth': 20,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 200}

In [166]:
search.best_score_

0.8705765021812211

In [168]:
final_pipe.fit(X,y_transformed)

# **Exporting the model**

In [41]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Define your categorical columns correctly
ordinal_cols = ['property_type', 'balcony', 'furnishing_type', 'luxury_category', 'floor_category']
onehot_cols = ['sector', 'agePossession']
numeric_cols = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']

# Build the ColumnTransformer safely
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat_ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_cols),
        ('cat_ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), onehot_cols)
    ],
    remainder='drop'  # ensures only encoded/scaled data is passed
)

# Final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=200, random_state=42))
])

# Fit the model
pipeline.fit(X, y_transformed)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat_ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [43]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [175]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [177]:
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 108,3.0,5.0,2,New Property,2343.101198,1.0,0.0,semifurnished,Medium,High Floor
1,house,sector 33,5.0,6.0,3+,Relatively New,4680.000000,1.0,0.0,semifurnished,Medium,Mid Floor
2,house,sector 13,4.0,5.0,2,Old Property,4950.000000,0.0,0.0,semifurnished,Low,Low Floor
3,flat,sector 106,4.0,3.0,3,Old Property,1678.000000,1.0,0.0,semifurnished,Medium,High Floor
4,flat,sector 37c,3.0,4.0,3,Relatively New,1665.000000,0.0,0.0,semifurnished,Medium,Mid Floor
...,...,...,...,...,...,...,...,...,...,...,...,...
3513,flat,sector 39,5.0,4.0,3,Old Property,3556.000000,1.0,0.0,unfurnished,Low,Mid Floor
3514,flat,sector 50,3.0,4.0,3+,Moderately Old,1870.000000,1.0,0.0,furnished,High,Mid Floor
3515,house,sector 38,4.0,9.0,3+,Relatively New,4800.000000,0.0,1.0,furnished,High,Low Floor
3516,flat,sector 108,3.0,3.0,3,Relatively New,1711.000000,0.0,1.0,furnished,High,High Floor


# **Trying out the predictions**

In [180]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [182]:
X.iloc[0].values

array(['flat', 'sector 108', 3.0, 5.0, '2', 'New Property',
       2343.101198244157, 1.0, 0.0, 'semifurnished', 'Medium',
       'High Floor'], dtype=object)

In [184]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,4,3,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [186]:
np.expm1(pipeline.predict(one_df))

array([4.47149288])