In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression,ridge_regression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [3]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [5]:
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')

In [7]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 61,2.6,3,3,2,New Property,1766.0,0,0,0,Low,Low Floor
1,flat,sector 61,2.1,3,3,3,Relatively New,1592.0,0,0,2,Low,High Floor
2,flat,sector 85,2.6,3,3,3,Under Construction,2129.0,1,0,0,Low,High Floor
3,flat,sector 70,1.23,2,2,3,Relatively New,1222.0,0,0,0,Low,Low Floor
4,flat,sector 33,1.35,3,2,2,New Property,1444.0,0,0,0,Medium,Mid Floor


In [9]:
df['furnishing_type'].value_counts()

furnishing_type
0    2374
2     995
1     185
Name: count, dtype: int64

In [11]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0:'unfurnished',2:'semifurnished',1:'furnished'})

In [13]:
df['furnishing_type'].value_counts()

furnishing_type
unfurnished      2374
semifurnished     995
furnished         185
Name: count, dtype: int64

In [15]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 61,2.6,3,3,2,New Property,1766.0,0,0,unfurnished,Low,Low Floor
1,flat,sector 61,2.1,3,3,3,Relatively New,1592.0,0,0,semifurnished,Low,High Floor
2,flat,sector 85,2.6,3,3,3,Under Construction,2129.0,1,0,unfurnished,Low,High Floor
3,flat,sector 70,1.23,2,2,3,Relatively New,1222.0,0,0,unfurnished,Low,Low Floor
4,flat,sector 33,1.35,3,2,2,New Property,1444.0,0,0,unfurnished,Medium,Mid Floor


In [17]:
X = df.drop(columns=['price'])
y = df['price']

In [19]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

### Strategy for encoding of categorical columns
- We will apply 3 types of encoding techniques i.e. Ordinal encoding, OHE, Target encoding.
- will build model corresponding to each technique and select technique which will give highest R2 score

## 1. Ordinal Encoding

In [25]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [27]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [29]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [31]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [33]:
scores.mean(),scores.std()

(0.7361481554263762, 0.029225215562589363)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [37]:
pipeline.fit(X_train,y_train)

In [39]:
y_pred = pipeline.predict(X_test)

In [41]:
mean_absolute_error(np.expm1(y_test),np.expm1(y_pred))

0.964213824926411

In [43]:
## Function that Try out all possible regression model.
def scorer(model_name, model):
    output = []
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [45]:
## All regression model
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [47]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [48]:
model_output

[['linear_reg', 0.7361481554263762, 0.964213824926411],
 ['svr', 0.7638652050503751, 0.8642669832493138],
 ['ridge', 0.7361513949584398, 0.9640759642614981],
 ['LASSO', 0.053450587522733704, 1.4984045642804833],
 ['decision tree', 0.7791830748451789, 0.6825610576155168],
 ['random forest', 0.8826119689654355, 0.48693172650571],
 ['extra trees', 0.8682227386142113, 0.5130697688260331],
 ['gradient boosting', 0.8756822615607825, 0.5461579554619105],
 ['adaboost', 0.7577528368796365, 0.8298843904128608],
 ['mlp', 0.8039510832948041, 0.7345844434287288],
 ['xgboost', 0.8918882645737216, 0.4806767000327131]]

In [53]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [55]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.891888,0.480677
5,random forest,0.882612,0.486932
6,extra trees,0.868223,0.51307
7,gradient boosting,0.875682,0.546158
4,decision tree,0.779183,0.682561
9,mlp,0.803951,0.734584
8,adaboost,0.757753,0.829884
1,svr,0.763865,0.864267
2,ridge,0.736151,0.964076
0,linear_reg,0.736148,0.964214


## 2.OneHotEncoding

- We will apply OHE to the specific columns where the Order of categories doesnt matter.

In [59]:
# Creating a column transformer for preprocessing
# we will apply OHE where order doesnt matter.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [61]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [63]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [65]:
scores.mean()

0.8531183475030699

In [67]:
scores.std()

0.023081207838091616

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [71]:
pipeline.fit(X_train,y_train)

In [73]:
y_pred = pipeline.predict(X_test)

In [75]:
y_pred = np.expm1(y_pred)

In [77]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.6687055380575635

In [79]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [81]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [83]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [84]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [85]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.890442,0.469519
6,extra trees,0.8938,0.470935
10,xgboost,0.896893,0.473859
7,gradient boosting,0.877068,0.553267
9,mlp,0.866019,0.617824
4,decision tree,0.800214,0.650347
0,linear_reg,0.853118,0.668706
2,ridge,0.853475,0.674042
8,adaboost,0.756535,0.768776
1,svr,0.770158,0.850277


## OneHotEncoding With PCA

- As we have apply OHE to the columns where ordering of categories matters.
- So number of featues are increasing and it might create problem of curse of dimensionality.
- To reduce number of features we will use Pricipal component analysis as a Dimensionality reduction method

In [89]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [90]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [91]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [92]:
scores.mean(),scores.std()

(0.05677037375734234, 0.012253784353247898)

In [93]:

def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [94]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}


In [95]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [96]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [97]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.752734,0.761124
6,extra trees,0.726146,0.790496
4,decision tree,0.675942,0.881725
10,xgboost,0.619107,1.00226
7,gradient boosting,0.619666,1.035128
1,svr,0.223043,1.35053
8,adaboost,0.311019,1.401735
9,mlp,0.217371,1.422669
3,LASSO,0.053691,1.498293
2,ridge,0.05677,1.503005


- Overall performance of all Linear as well as tree based model would be degrade after PCA.
- So It is not recommended to use

## 3.Target Encoding

In [100]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession','furnishing_type']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [101]:
!pip install category_encoders



In [102]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [103]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [104]:
scores.mean(),scores.std()

(0.8280931257060452, 0.024950353240248403)

In [105]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [106]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [107]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [108]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])


In [134]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.900908,0.461513
6,extra trees,0.899549,0.475951
10,xgboost,0.905255,0.478917
7,gradient boosting,0.888946,0.514572
4,decision tree,0.812461,0.600284
9,mlp,0.85001,0.677578
8,adaboost,0.82226,0.699247
0,linear_reg,0.828093,0.734016
2,ridge,0.828109,0.734469
1,svr,0.782665,0.837766


### Observations:
- Generally Tree based models like Random Forest, XGBoost Perform well compare to Linear model
- Target Encoding beat other encoding techniques.
- So for final model selection we will Perform HyperParameter tuning on Random Forest as well as XGBoost with One Hot Encoded Features.

## Hyper Parameter Tuning

### Random Forest

In [143]:
## Using Random search CV

In [145]:
from sklearn.model_selection import RandomizedSearchCV

In [147]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [149]:
# Create the random grid
random_grid = {'regressor__n_estimators': n_estimators,
               'regressor__max_features': max_features,
               'regressor__max_depth': max_depth,
               'regressor__min_samples_split': min_samples_split,
               'regressor__min_samples_leaf': min_samples_leaf,
               'regressor__bootstrap': bootstrap}
print(random_grid)

{'regressor__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'regressor__max_features': ['auto', 'sqrt'], 'regressor__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'regressor__min_samples_split': [2, 5, 10], 'regressor__min_samples_leaf': [1, 2, 4], 'regressor__bootstrap': [True, False]}


In [151]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing 
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession','furnishing_type']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [153]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [155]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [157]:
rf_random_search = RandomizedSearchCV(estimator = pipeline, param_distributions = random_grid, n_iter = 100, cv = kfold, verbose=2, random_state=42, n_jobs = -1)

In [159]:
rf_random_search.fit(X, y_transformed)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


410 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
276 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\daksh\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\daksh\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\daksh\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\daksh\anaconda3\Lib\site-packages\sklearn\base.py"

In [160]:
rf_random_search.best_score_

0.9003966095823289

In [161]:
rf_random_search.best_params_

{'regressor__n_estimators': 400,
 'regressor__min_samples_split': 2,
 'regressor__min_samples_leaf': 1,
 'regressor__max_features': 'sqrt',
 'regressor__max_depth': None,
 'regressor__bootstrap': False}

In [173]:
Best_pipe_rf = rf_random_search.best_estimator_

In [175]:
Best_pipe_rf.fit(X,y_transformed)

In [176]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [177]:
Best_pipe_rf.fit(X_train,y_train)

In [178]:
y_pred = Best_pipe_rf.predict(X_test)

In [179]:
mean_absolute_error(np.expm1(y_test),np.expm1(y_pred))

0.46868522357708287

## XGBoost

In [None]:
## Using Randomize search CV

In [185]:
from scipy.stats import uniform, randint

# Hyperparameter distributions for RandomizedSearchCV
param_distributions = {
    'regressor__n_estimators': randint(50, 300),           # Number of boosting rounds
    'regressor__learning_rate': uniform(0.01, 0.3),       # Learning rate
    'regressor__max_depth': randint(3, 10),               # Maximum tree depth
    'regressor__subsample': uniform(0.6, 0.4),            # Subsample ratio
    'regressor__colsample_bytree': uniform(0.6, 0.4),     # Fraction of features for each tree
    'regressor__gamma': uniform(0, 5),                    # Minimum loss reduction
    'regressor__min_child_weight': randint(1, 10)         # Minimum sum of weights
}

In [187]:
# Pipeline combining preprocessing and XGBoost
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(random_state=42))  # XGBoost model
])

In [189]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [191]:
XGB_random_search = RandomizedSearchCV(estimator = pipeline, param_distributions = param_distributions, n_iter = 100, cv = kfold, verbose=2, random_state=42, n_jobs = -1)

In [193]:
XGB_random_search.fit(X, y_transformed)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [194]:
XGB_random_search.best_score_

0.9033470551676033

In [195]:
XGB_random_search.best_params_

{'regressor__colsample_bytree': 0.6535408475202532,
 'regressor__gamma': 0.06835982413498642,
 'regressor__learning_rate': 0.032607718105738696,
 'regressor__max_depth': 6,
 'regressor__min_child_weight': 1,
 'regressor__n_estimators': 246,
 'regressor__subsample': 0.9998870693144523}

In [196]:
final_pipe_xgb = XGB_random_search.best_estimator_

In [197]:
final_pipe_xgb.fit(X_train,y_train)

In [198]:
y_pred = final_pipe_xgb.predict(X_test)

In [199]:
mean_absolute_error(np.expm1(y_test),np.expm1(y_pred))

0.4709825404552133

## Conclusion: 
- Random Forest Regressor: R2 score: 0.90, MAE: 0.46
- XGBoost Regressor: R2 score: 0.90, MAE: 0.47
- So the performance of the both model is similar, so that we can select anyone of them as a final model with appropriate Hyperparameters.

In [None]:
{'regressor__n_estimators': 400,
 'regressor__min_samples_split': 2,
 'regressor__min_samples_leaf': 1,
 'regressor__max_features': 'sqrt',
 'regressor__max_depth': None,
 'regressor__bootstrap': False}

## Exporting the model

In [210]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [212]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=400,min_samples_split=2,min_samples_leaf=1,max_features='sqrt',max_depth=None,bootstrap=False))
])

In [214]:
pipeline.fit(X,y_transformed)


In [216]:
## save pipeline
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [218]:
## save data
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

## Trying out the prediction

In [220]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [222]:
X.iloc[0].values

array(['flat', 'sector 61', 3, 3, '2', 'New Property', 1766.0, 0, 0,
       'unfurnished', 'Low', 'Low Floor'], dtype=object)

In [224]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,4,3,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [226]:
np.expm1(pipeline.predict(one_df))

array([3.23884773])

In [228]:
X.dtypes

property_type       object
sector              object
bedRoom              int64
bathroom             int64
balcony             object
agePossession       object
built_up_area      float64
servant room         int64
store room           int64
furnishing_type     object
luxury_category     object
floor_category      object
dtype: object