In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from sklearn.decomposition import PCA
import category_encoders as ce

from sklearn.model_selection import GridSearchCV

import warnings
warnings.simplefilter("ignore") 

In [2]:
# Read the rental listings workbook; the path is relative, so the notebook
# must be run from the directory that contains the file.
df = pd.read_excel('Cleaned_and_Feature_Section_Bangalore_Rental_House_data.xlsx')

In [3]:
# Discard listings whose Total_Floors is 0, then renumber the rows from 0.
df = df.loc[df['Total_Floors'].ne(0)].reset_index(drop=True)

In [4]:
def categorize_floors(floor):
    """Map a numeric floor count to a building-height band.

    Bands (upper bounds inclusive):
        floor <= 3   -> 'Low-rise'
        floor <= 10  -> 'Mid-rise'
        floor <= 20  -> 'High-rise'
        otherwise    -> 'Skyscraper'

    The bounds cascade, so non-integer counts (e.g. 3.5 or 10.5) fall into
    the next band up instead of slipping through a gap between integer
    ranges and being labelled 'Skyscraper'.

    Note: NaN compares False against every bound and therefore also ends up
    as 'Skyscraper'; callers should drop or impute missing counts first.
    """
    if floor <= 3:
        return 'Low-rise'
    if floor <= 10:
        return 'Mid-rise'
    if floor <= 20:
        return 'High-rise'
    return 'Skyscraper'

# Replace the numeric floor count with its height band. Not idempotent:
# re-running this cell would compare the band strings against numbers.
df['Total_Floors'] = df['Total_Floors'].map(categorize_floors)

In [5]:
# '10+' is the only non-numeric age label handled here; it is recoded as 11
# so the whole column can be cast to float.
df['Age'] = df['Age'].mask(df['Age'] == '10+', 11).astype(float)

In [6]:
# Keep only rows with a known Age and renumber the index.
df = df[df['Age'].notna()].reset_index(drop=True)

In [7]:
def categorize_age(value):
    """Map a property age (in years, possibly fractional) to a label.

    Returns
    -------
    str
        "Undefined"          for missing or negative values
        "Under Construction" for exactly 0
        "New Property"       for 0 < value <= 5
        "Moderately Old"     for 5 < value <= 10
        "Old Property"       for value > 10

    The bounds cascade so fractional ages (the Age column is cast to float
    upstream) such as 0.5 or 5.5 get a real label instead of falling between
    the integer ranges and being reported as "Undefined".
    """
    # Missing check first: comparisons against NaN/NA are not meaningful.
    if pd.isna(value) or value < 0:
        return "Undefined"
    if value == 0:
        return "Under Construction"
    if value <= 5:
        return "New Property"
    if value <= 10:
        return "Moderately Old"
    return "Old Property"

# Swap the numeric age for its category label.
df['Age'] = df['Age'].map(categorize_age)

In [8]:
df.head()

Unnamed: 0,Region,Bedroom,Bathroom,Balcony,Additional_rooms,Area (sq.ft),Furnishing,Age,Covered_Parking,Brokerage,Deposit,Maintenance,Type,Total_Floors,Rent
0,Bangalore East,4,5,3,2,2100.0,Semifurnished,Moderately Old,1,120000,840000,0,House/Villa,Low-rise,120000
1,Bangalore East,3,3,2,0,1777.26,Semifurnished,New Property,1,43000,300000,5000,Apartment,High-rise,43000
2,Bangalore East,1,2,1,0,600.0,Semifurnished,New Property,1,0,70000,0,Builder Floor,Low-rise,12000
3,Bangalore East,2,2,1,0,1160.0,Furnished,New Property,1,40000,200000,0,Apartment,Mid-rise,40000
4,Bangalore East,3,5,3+,1,3300.0,Semifurnished,New Property,2,140000,840000,15000,Apartment,Mid-rise,140000


In [9]:
# Target is the monthly rent; every other column is a feature.
# NOTE(review): in the sample rows displayed above, Brokerage equals Rent for
# most listings -- confirm Brokerage/Deposit are known at prediction time,
# otherwise they leak the target.
y = df['Rent']
X = df.drop(columns='Rent')

In [10]:
# Models are trained on log(1 + rent); predictions are mapped back with
# np.expm1 before the MAE is computed.
y_transformed = np.log1p(y)

#### Ordinal Encoding

In [11]:
# Baseline encoding: every categorical column, Region included, is turned
# into an integer code; numeric columns are standardised.
columns_to_encode = ['Region', 'Balcony', 'Furnishing', 'Type', 'Total_Floors', 'Age']
columns_to_scale = [
    'Bedroom', 'Bathroom', 'Additional_rooms', 'Area (sq.ft)',
    'Covered_Parking', 'Brokerage', 'Deposit', 'Maintenance',
]

# Cast to str so columns holding mixed values (Balcony contains both plain
# counts and '3+') are encoded uniformly.
X = X.astype({name: str for name in columns_to_encode})

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), columns_to_scale),
        ('cat', OrdinalEncoder(), columns_to_encode),
    ],
    remainder='passthrough',
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [12]:
# 10-fold shuffled cross-validation of the baseline pipeline, scored by R²
# on the log-transformed target.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    pipeline, X, y_transformed,
    scoring='r2',
    cv=kfold,
)

print("R² Scores for each fold:", scores)
print("Average R² Score:", np.mean(scores))

R² Scores for each fold: [0.8251095  0.8357346  0.80483575 0.8215927  0.8095418  0.79397774
 0.80872397 0.78156485 0.80695881 0.81384039]
Average R² Score: 0.8101880102407207


In [13]:
scores.mean(),scores.std()

(0.8101880102407207, 0.014636868830337227)

In [14]:
# Fixed 80/20 hold-out split (seeded) used for the single-split MAE check.
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [15]:
# Fit preprocessing and regressor on the training split only.
pipeline.fit(X_train,y_train)

In [16]:
# Predictions are still on the log1p scale at this point.
y_pred = pipeline.predict(X_test)

In [17]:
# Undo the log1p transform. The name is rebound to itself, so running this
# cell twice would apply expm1 twice.
y_pred = np.expm1(y_pred)

In [18]:
mean_absolute_error(np.expm1(y_test),y_pred)

12162.46657859621

In [19]:
def scorer(model_name, model):
    """Evaluate one regressor behind the notebook's current ``preprocessor``.

    Reads the module-level ``preprocessor``, ``X`` and ``y_transformed``.

    Parameters
    ----------
    model_name : label placed first in the returned row.
    model : estimator used as the pipeline's final 'regressor' step; it is
        fitted in place by the hold-out fit below.

    Returns
    -------
    list
        ``[model_name, mean 10-fold CV R² on the transformed target,
        hold-out MAE after np.expm1 is applied to predictions and targets]``.
    """
    candidate = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])

    # Cross-validated R² on the transformed target.
    folds = KFold(n_splits=10, shuffle=True, random_state=42)
    mean_r2 = cross_val_score(candidate, X, y_transformed, cv=folds, scoring='r2').mean()

    # Single seeded 80/20 split for an error figure on the original scale.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y_transformed, test_size=0.2, random_state=42)
    candidate.fit(X_tr, y_tr)
    predicted = np.expm1(candidate.predict(X_te))
    holdout_mae = mean_absolute_error(np.expm1(y_te), predicted)

    return [model_name, mean_r2, holdout_mae]

In [20]:
# Candidate regressors, all with library-default hyperparameters.
# random_state is pinned on every estimator that involves randomness so the
# comparison table is reproducible across kernel restarts.
# NOTE(review): the recorded R² for 'LASSO' is ~0, which suggests the default
# alpha is too strong for the log-scale target -- confirm before drawing
# conclusions about Lasso.
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(random_state=42),
    'random forest': RandomForestRegressor(random_state=42),
    'extra trees': ExtraTreesRegressor(random_state=42),
    'gradient boosting': GradientBoostingRegressor(random_state=42),
    'adaboost': AdaBoostRegressor(random_state=42),
    'mlp': MLPRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42),
}

In [21]:
# One [name, r2, mae] row per candidate model.
model_output = [
    scorer(model_name, model)
    for model_name, model in model_dict.items()
]

In [22]:
model_output

[['linear_reg', 0.8101880102407207, 12162.46657859621],
 ['svr', 0.8841467934088421, 7788.9746864797435],
 ['ridge', 0.8101886576594701, 12161.419414021917],
 ['LASSO', -0.0026299827215488627, 25611.107147231705],
 ['decision tree', 0.8152643965280667, 7355.747867986737],
 ['random forest', 0.8984496094777323, 5569.978625472502],
 ['extra trees', 0.8952579102130812, 5608.50488257394],
 ['gradient boosting', 0.8999340680614342, 6703.246389506834],
 ['adaboost', 0.8267361856412488, 10834.1795666387],
 ['mlp', 0.8279937004302435, 8286.691894957423],
 ['xgboost', 0.9029479875868015, 6134.321210450323]]

In [23]:
# Tabulate the scorer rows: model label, mean CV R², hold-out MAE.
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [24]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.89845,5569.978625
6,extra trees,0.895258,5608.504883
10,xgboost,0.902948,6134.32121
7,gradient boosting,0.899934,6703.24639
4,decision tree,0.815264,7355.747868
1,svr,0.884147,7788.974686
9,mlp,0.827994,8286.691895
8,adaboost,0.826736,10834.179567
2,ridge,0.810189,12161.419414
0,linear_reg,0.810188,12162.466579


#### OneHotEncoding

In [25]:
# Variant: Region is one-hot encoded (first level dropped); the remaining
# categoricals keep their integer codes.
columns_to_scale = [
    'Bedroom', 'Bathroom', 'Additional_rooms', 'Area (sq.ft)',
    'Covered_Parking', 'Brokerage', 'Deposit', 'Maintenance',
]
columns_to_ordinal_encode = ['Furnishing', 'Balcony', 'Type', 'Total_Floors', 'Age']
columns_to_onehot_encode = ['Region']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), columns_to_scale),
        ('cat', OrdinalEncoder(), columns_to_ordinal_encode),
        ('cat1', OneHotEncoder(drop='first'), columns_to_onehot_encode),
    ],
    remainder='passthrough',
)

In [26]:
# Linear baseline on top of the one-hot preprocessor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [27]:
# Same seeded 10-fold protocol as the ordinal baseline, for comparability.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    pipeline, X, y_transformed,
    scoring='r2',
    cv=kfold,
)

In [28]:
scores.mean()

0.8129775047978841

In [29]:
scores.std()

0.014480161396192253

In [30]:
# Same seeded 80/20 split as before, so MAE figures are comparable across encodings.
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [31]:
# Fit the one-hot pipeline on the training split only.
pipeline.fit(X_train,y_train)

In [32]:
# Hold-out predictions, still on the log1p scale.
y_pred = pipeline.predict(X_test)

In [33]:
# Back to the original rent scale (do not run twice: y_pred is rebound to itself).
y_pred = np.expm1(y_pred)

In [34]:
mean_absolute_error(np.expm1(y_test),y_pred)

12173.965721597227

In [35]:
def scorer(model_name, model):
    """Score ``model`` with whichever ``preprocessor`` is current at call time.

    This redefines ``scorer`` with the same logic as the earlier definition;
    it reads the module-level ``preprocessor``, ``X`` and ``y_transformed``
    when called.

    Parameters
    ----------
    model_name : label placed first in the returned row.
    model : estimator used as the final 'regressor' step (fitted in place).

    Returns
    -------
    list
        ``[model_name, mean 10-fold CV R², hold-out MAE]`` where the MAE is
        computed after np.expm1 is applied to predictions and targets.
    """
    estimator = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])

    cv_scores = cross_val_score(
        estimator, X, y_transformed,
        cv=KFold(n_splits=10, shuffle=True, random_state=42),
        scoring='r2',
    )

    # Seeded 80/20 split; fit on the larger part, evaluate on the rest.
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        X, y_transformed, test_size=0.2, random_state=42
    )
    estimator.fit(X_fit, y_fit)
    rent_pred = np.expm1(estimator.predict(X_eval))
    rent_true = np.expm1(y_eval)

    return [model_name, cv_scores.mean(), mean_absolute_error(rent_true, rent_pred)]

In [36]:
# Fresh estimator instances for the one-hot comparison (scorer fits them in
# place, so the earlier dict's objects are not reused). random_state is
# pinned on the stochastic estimators so the table is reproducible.
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(random_state=42),
    'random forest': RandomForestRegressor(random_state=42),
    'extra trees': ExtraTreesRegressor(random_state=42),
    'gradient boosting': GradientBoostingRegressor(random_state=42),
    'adaboost': AdaBoostRegressor(random_state=42),
    'mlp': MLPRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42),
}

In [37]:
# Score every candidate with the one-hot preprocessor.
model_output = [
    scorer(model_name, model)
    for model_name, model in model_dict.items()
]

In [38]:
# Results table for the one-hot run (overwrites the previous model_df).
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [39]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.898368,5485.822713
6,extra trees,0.894273,5640.448843
10,xgboost,0.901871,6201.7182
7,gradient boosting,0.900441,6669.441063
4,decision tree,0.812531,7506.871039
1,svr,0.888773,7688.22529
9,mlp,0.820812,8130.861189
8,adaboost,0.826999,10889.941967
2,ridge,0.812976,12171.70224
0,linear_reg,0.812978,12173.965722


#### OneHotEncoding With PCA

In [40]:
# Variant: same encoders as the one-hot run, but the one-hot block is forced
# dense (sparse_output=False), presumably because the PCA step that follows
# expects a dense array.
columns_to_scale = [
    'Bedroom', 'Bathroom', 'Additional_rooms', 'Area (sq.ft)',
    'Covered_Parking', 'Brokerage', 'Deposit', 'Maintenance',
]
columns_to_ordinal_encode = ['Balcony', 'Type', 'Total_Floors', 'Furnishing', 'Age']
columns_to_onehot_encode = ['Region']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), columns_to_scale),
        ('cat', OrdinalEncoder(), columns_to_ordinal_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), columns_to_onehot_encode),
    ],
    remainder='passthrough',
)

In [41]:
# PCA keeps as many components as needed to explain 95% of the variance.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression()),
])

In [42]:
# Seeded 10-fold CV of the PCA pipeline.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    pipeline, X, y_transformed,
    scoring='r2',
    cv=kfold,
)

In [43]:
scores.mean()

0.8026595949559029

In [44]:
scores.std()

0.014338988206149338

In [45]:
def scorer(model_name, model):
    """Score ``model`` behind the current ``preprocessor`` plus a PCA step.

    Redefines ``scorer``: unlike the earlier versions, the pipeline inserts
    ``PCA(n_components=0.95)`` between preprocessing and the regressor.
    Reads the module-level ``preprocessor``, ``X`` and ``y_transformed``.

    Parameters
    ----------
    model_name : label placed first in the returned row.
    model : estimator used as the final 'regressor' step (fitted in place).

    Returns
    -------
    list
        ``[model_name, mean 10-fold CV R², hold-out MAE]`` with the MAE taken
        after np.expm1 is applied to predictions and targets.
    """
    reduced = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model),
    ])

    # K-fold cross-validation
    splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    fold_r2 = cross_val_score(reduced, X, y_transformed, cv=splitter, scoring='r2')
    row = [model_name, fold_r2.mean()]

    # Hold-out error on the original scale.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y_transformed, test_size=0.2, random_state=42)
    reduced.fit(X_tr, y_tr)
    restored = np.expm1(reduced.predict(X_te))
    row.append(mean_absolute_error(np.expm1(y_te), restored))

    return row

In [46]:
# Fresh estimator instances for the PCA comparison. random_state is pinned
# on the stochastic estimators so the table is reproducible.
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(random_state=42),
    'random forest': RandomForestRegressor(random_state=42),
    'extra trees': ExtraTreesRegressor(random_state=42),
    'gradient boosting': GradientBoostingRegressor(random_state=42),
    'adaboost': AdaBoostRegressor(random_state=42),
    'mlp': MLPRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42),
}

In [47]:
# Score every candidate with the PCA pipeline.
model_output = [
    scorer(model_name, model)
    for model_name, model in model_dict.items()
]

In [48]:
# Results table for the PCA run (overwrites the previous model_df).
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [49]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
1,svr,0.877541,8138.274478
6,extra trees,0.857963,8224.628678
5,random forest,0.863042,8258.844417
7,gradient boosting,0.859584,8570.505277
10,xgboost,0.86469,8713.094764
9,mlp,0.818399,9794.973893
8,adaboost,0.804673,11016.7469
4,decision tree,0.740103,11721.981083
2,ridge,0.80266,12478.527022
0,linear_reg,0.80266,12479.573062


#### Target Encoder

In [50]:
# Variant: Region is target-encoded; the other categoricals keep integer codes.
columns_to_scale = [
    'Bedroom', 'Bathroom', 'Additional_rooms', 'Area (sq.ft)',
    'Covered_Parking', 'Brokerage', 'Deposit', 'Maintenance',
]
columns_to_ordinal_encode = ['Balcony', 'Type', 'Total_Floors', 'Furnishing', 'Age']
columns_to_target_encode = ['Region']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), columns_to_scale),
        ('cat', OrdinalEncoder(), columns_to_ordinal_encode),
        ('target_enc', ce.TargetEncoder(), columns_to_target_encode),
    ],
    remainder='passthrough',
)

In [51]:
# Linear baseline on top of the target-encoding preprocessor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [52]:
# Seeded 10-fold CV; the encoder sits inside the pipeline, so it is refit on
# each training fold.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    pipeline, X, y_transformed,
    scoring='r2',
    cv=kfold,
)

In [53]:
scores.mean(),scores.std()

(0.8045928793903494, 0.014715229075846315)

In [54]:
def scorer(model_name, model):
    """Score ``model`` with whichever ``preprocessor`` is current at call time.

    Redefines ``scorer`` without the PCA step. Reads the module-level
    ``preprocessor``, ``X`` and ``y_transformed`` when called.

    Parameters
    ----------
    model_name : label placed first in the returned row.
    model : estimator used as the final 'regressor' step (fitted in place).

    Returns
    -------
    list
        ``[model_name, mean 10-fold CV R², hold-out MAE]`` with the MAE taken
        after np.expm1 is applied to predictions and targets.
    """
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])

    # Mean R² over the seeded 10 folds.
    cv_plan = KFold(n_splits=10, shuffle=True, random_state=42)
    r2_mean = cross_val_score(pipe, X, y_transformed, cv=cv_plan, scoring='r2').mean()

    # Hold-out MAE on the original scale from one seeded 80/20 split.
    X_a, X_b, y_a, y_b = train_test_split(X, y_transformed, test_size=0.2, random_state=42)
    pipe.fit(X_a, y_a)
    mae = mean_absolute_error(np.expm1(y_b), np.expm1(pipe.predict(X_b)))

    return [model_name, r2_mean, mae]

In [55]:
# Fresh estimator instances for the target-encoding comparison. random_state
# is pinned on the stochastic estimators so the table is reproducible.
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(random_state=42),
    'random forest': RandomForestRegressor(random_state=42),
    'extra trees': ExtraTreesRegressor(random_state=42),
    'gradient boosting': GradientBoostingRegressor(random_state=42),
    'adaboost': AdaBoostRegressor(random_state=42),
    'mlp': MLPRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42),
}

In [56]:
# Score every candidate with the target-encoding preprocessor.
model_output = [
    scorer(model_name, model)
    for model_name, model in model_dict.items()
]

In [57]:
# Results table for the target-encoding run (overwrites the previous model_df).
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [58]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.896999,5510.242402
6,extra trees,0.893056,5548.220548
10,xgboost,0.902166,6208.474479
7,gradient boosting,0.899755,6731.518942
4,decision tree,0.81366,7482.255445
1,svr,0.864197,7994.309328
9,mlp,0.859833,10010.657157
8,adaboost,0.827803,10972.224632
2,ridge,0.804594,12443.735233
0,linear_reg,0.804593,12444.497083


#### Hyperparameter Tuning

In [79]:
# Random-forest search space: 4 * 4 * 4 * 2 = 128 candidates.
# Keys are prefixed with 'regressor__' to address the pipeline's final step.
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples': [0.1, 0.25, 0.5, 1.0],
    # 1.0 (= all features) replaces the string 'auto', which meant the same
    # thing for regressors but is deprecated since scikit-learn 1.1 and
    # rejected from 1.3 on, which would make half of the grid fail to fit.
    'regressor__max_features': [1.0, 'sqrt'],
}

In [80]:
# Preprocessing used for tuning: scaled numerics, integer-coded categoricals,
# one-hot Region with the first level dropped.
columns_to_scale = [
    'Bedroom', 'Bathroom', 'Additional_rooms', 'Area (sq.ft)',
    'Covered_Parking', 'Brokerage', 'Deposit', 'Maintenance',
]
columns_to_ordinal_encode = ['Furnishing', 'Balcony', 'Type', 'Total_Floors', 'Age']
columns_to_onehot_encode = ['Region']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), columns_to_scale),
        ('cat', OrdinalEncoder(), columns_to_ordinal_encode),
        ('cat1', OneHotEncoder(drop='first'), columns_to_onehot_encode),
    ],
    remainder='passthrough',
)

In [81]:
# Pipeline handed to the grid search. The forest is seeded so that score
# differences between candidates come from the hyperparameters rather than
# from run-to-run randomness, and the search is repeatable.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [82]:
# Same seeded 10-fold splitter as in the model comparison above.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [83]:
# Exhaustive search over param_grid, scored by R² on the log target;
# n_jobs=-1 uses all available cores.
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [84]:
# Expensive cell: 128 candidates x 10 folds = 1280 pipeline fits (see the recorded output).
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


In [85]:
# Pipeline configured with the best hyperparameter combination found by the search.
final_pipe = search.best_estimator_

In [86]:
search.best_params_

{'regressor__max_depth': 20,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 300}

In [87]:
search.best_score_

0.9034941910692771

In [88]:
# NOTE(review): GridSearchCV refits best_estimator_ on the full data by
# default, so this second fit looks redundant -- confirm whether it is needed.
# final_pipe is also not the object that gets exported below.
final_pipe.fit(X,y_transformed)

In [89]:
# Rebuild an unfitted preprocessor for the pipeline that will be exported
# (same configuration as the one used during tuning).
columns_to_scale = [
    'Bedroom', 'Bathroom', 'Additional_rooms', 'Area (sq.ft)',
    'Covered_Parking', 'Brokerage', 'Deposit', 'Maintenance',
]
columns_to_ordinal_encode = ['Furnishing', 'Balcony', 'Type', 'Total_Floors', 'Age']
columns_to_onehot_encode = ['Region']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), columns_to_scale),
        ('cat', OrdinalEncoder(), columns_to_ordinal_encode),
        ('cat1', OneHotEncoder(drop='first'), columns_to_onehot_encode),
    ],
    remainder='passthrough',
)

In [90]:
# Final model: hyperparameters copied by hand from search.best_params_ so
# this cell can run without repeating the grid search. random_state is pinned
# so the exported model and its reported hold-out metrics are reproducible.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        max_depth=20,
        max_features='sqrt',
        max_samples=1.0,
        n_estimators=300,
        random_state=42,
    ))
])

In [91]:
# Seeded 80/20 split used to evaluate the final model on held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [92]:
# Fit on the training split only, so the test rows stay unseen for the metrics below.
pipeline.fit(X_train,y_train)

#### Exporting the model

In [93]:
import pickle

# Persist the fitted pipeline (preprocessing + forest) for reuse elsewhere.
# NOTE(review): the object saved here was fit on X_train only (80% of the
# data), not on all of X -- confirm that is the intended production model.
with open('House_Rent_Prediction_Pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [94]:
# Save the feature frame (categoricals already cast to str, Rent excluded)
# alongside the model.
with open('House_Rent_Data.pkl', 'wb') as file:
    pickle.dump(X, file)

#### Trying out the predictions

In [95]:
# Hold-out predictions from the exported pipeline, on the log1p scale.
y_pred = pipeline.predict(X_test)

In [96]:
# Convert back to the original rent scale (y_pred is rebound to itself, so run this cell once).
y_pred = np.expm1(y_pred)

In [97]:
mean_absolute_error(np.expm1(y_test),y_pred)

6235.438757702566

In [98]:
r2_score(np.expm1(y_test),y_pred)

0.930052889049178