# **ML Model Building**

In [1]:
# importing the necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pickle

In [2]:
# Location of the cleaned training set (CSV).
TRAIN_CSV = "Cleaned_House_Rent_Train.csv"

# Read the training data into a DataFrame.
df = pd.read_csv(TRAIN_CSV)

# Show the first five rows as a quick sanity check.
df.head(n=5)

Unnamed: 0,type,locality,latitude,longitude,lease_type,negotiable,furnishing,parking,property_size,property_age,...,SC,GP,PARK,RWH,STP,HK,PB,VP,type_encode,locality_encode
0,BHK2,Bellandur,12.929557,77.67228,2,1,2,4,1400,4.0,...,1,0,1,1,1,0,1,1,3,376
1,BHK3,Thiruvanmiyur,12.98287,80.262012,3,0,2,4,1350,6.0,...,1,0,1,0,0,0,1,1,4,1687
2,BHK1,Attiguppe,12.955991,77.531634,3,1,2,2,600,3.0,...,0,0,0,0,0,0,0,0,2,228
3,BHK3,Kodihalli,12.963903,77.649446,3,1,2,4,1500,15.0,...,0,0,1,0,0,0,0,1,4,1058
4,BHK1,"Seetharampalya,Hoodi",12.986196,77.718314,3,1,2,4,1080,0.0,...,0,0,0,0,0,0,1,0,2,1554


Using a random forest to estimate the feature importance of every input variable.

In [3]:
# Every candidate predictor column, listed in the order it is fed to the model.
candidate_features = [
    'type_encode', 'latitude', 'longitude', 'lease_type', 'negotiable',
    'furnishing', 'parking', 'property_size', 'property_age', 'bathroom',
    'facing', 'cup_board', 'floor', 'total_floor', 'water_supply',
    'building_type', 'balconies', 'LIFT', 'GYM', 'INTERNET', 'AC', 'CLUB',
    'INTERCOM', 'POOL', 'CPA', 'FS', 'SERVANT', 'SECURITY', 'SC', 'GP',
    'PARK', 'RWH', 'STP', 'HK', 'PB', 'VP',
]

# input(X): the candidate predictors
X = df[candidate_features]

# output(Y): the regression target
Y = df["rent"]

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fit a random forest on all candidate features. A fixed random_state makes
# the fitted trees (and therefore the scores and feature importances derived
# from `rf`) reproducible across kernel restarts; 42 matches the seed used for
# the train/test split elsewhere in this notebook.
rf = RandomForestRegressor(random_state=42)
rf.fit(X, Y)

# Predictions are made on the same rows the forest was fitted on, so the
# metrics below describe training fit only, not generalisation.
pred_y_train = rf.predict(X)

print("Train MSE:", mean_squared_error(Y, pred_y_train))
print("Train r2_score:", r2_score(Y, pred_y_train))

Train MSE: 0.00452190221697675
Train r2_score: 0.9751219577728011


In [5]:
# Pair each column of X with the importance score the fitted forest gave it.
importance_pairs = list(zip(X.columns, rf.feature_importances_))
importance_table = pd.DataFrame(importance_pairs, columns=["features", "importance"])

# Display the table with the most important features first.
importance_table.sort_values(by="importance", ascending=False)

Unnamed: 0,features,importance
7,property_size,0.599792
2,longitude,0.134662
1,latitude,0.04935
0,type_encode,0.043692
13,total_floor,0.019037
9,bathroom,0.017863
17,LIFT,0.016809
8,property_age,0.016729
5,furnishing,0.010564
11,cup_board,0.010039


Selecting the top 9 features by importance to use as inputs for the ML models.

In [6]:
# The reduced set of predictor columns used by every model below.
selected_features = [
    'type_encode',
    'latitude',
    'longitude',
    "property_size",
    "total_floor",
    "property_age",
    "bathroom",
    "LIFT",
    "furnishing",
]

# input(X): reduced predictor matrix
X = df[selected_features]

# output(Y): the regression target
Y = df["rent"]

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation and train on the remaining 70%.
# The fixed random_state makes the partition repeatable.
split_parts = train_test_split(X, Y, test_size=0.3, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts

# Linear regression

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Standardise the features: the scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
ss = StandardScaler()
S_X_train = ss.fit_transform(X_train)
S_X_test = ss.transform(X_test)

# Fit an ordinary least-squares baseline on the scaled training data.
lr = LinearRegression()
lr.fit(S_X_train, Y_train)

pred_y_train = lr.predict(S_X_train)
pred_y_test = lr.predict(S_X_test)

# Mean squared error for each split.
for label, actual, predicted in (
    ("Train MSE:", Y_train, pred_y_train),
    ("Test MSE:", Y_test, pred_y_test),
):
    print(label, mean_squared_error(actual, predicted))
print("\n")

# Coefficient of determination for each split.
for label, actual, predicted in (
    ("Train r2_score:", Y_train, pred_y_train),
    ("Test r2_score:", Y_test, pred_y_test),
):
    print(label, r2_score(actual, predicted))

Train MSE: 0.05778182626371728
Test MSE: 0.08648056777050316


Train r2_score: 0.6812637982085156
Test r2_score: 0.5271121692954497


Comparing five tree-based ensemble regressors: XGBRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor

In [9]:
from sklearn.ensemble import (
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# train test split(train - 70% and test - 30%)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Estimator classes to compare; each is instantiated with library defaults.
algorithms = [
    XGBRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
]

for algorithm in algorithms:
    # Seed every estimator so the comparison gives the same numbers on each
    # run; all five classes accept a `random_state` keyword.
    model = algorithm(random_state=42)
    model.fit(X_train, Y_train)

    pred_y_train = model.predict(X_train)
    pred_y_test = model.predict(X_test)

    print("Algorithm:", algorithm.__name__)

    # Train vs. test error: a large gap between the two indicates overfitting.
    print("Train MSE:", mean_squared_error(Y_train, pred_y_train), "----->",
          "Test MSE:", mean_squared_error(Y_test, pred_y_test))
    print("Train r2_score:", r2_score(Y_train, pred_y_train), "-->",
          "Test r2_score:", r2_score(Y_test, pred_y_test))
    print("\n")

Algorithm: XGBRegressor
Train MSE: 0.01657427783980261 -----> Test MSE: 0.0329062976333595
Train r2_score: 0.9085729422606935 --> Test r2_score: 0.8200637657045401


Algorithm: RandomForestRegressor
Train MSE: 0.004878951474763865 -----> Test MSE: 0.034326567066227054
Train r2_score: 0.9730867201272992 --> Test r2_score: 0.8122975339551483


Algorithm: ExtraTreesRegressor
Train MSE: 6.583539342362844e-05 -----> Test MSE: 0.03730885122884002
Train r2_score: 0.9996368386982523 --> Test r2_score: 0.7959899873633507


Algorithm: GradientBoostingRegressor
Train MSE: 0.03436234343655714 -----> Test MSE: 0.03658607044958818
Train r2_score: 0.8104503865690353 --> Test r2_score: 0.7999422536768928


Algorithm: HistGradientBoostingRegressor
Train MSE: 0.026368610445159753 -----> Test MSE: 0.031610042181754595
Train r2_score: 0.8545454291899567 --> Test r2_score: 0.8271518716727513




Selecting HistGradientBoostingRegressor as the best model because it gives the highest test R² score (≈ 0.827) and the lowest test MSE of all the models compared.

In [10]:
# hyperparameter tuning for histgradientboosting model
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Seeded so that repeated runs of the search select the same configuration.
model = HistGradientBoostingRegressor(random_state=42)

# Define the hyperparameter grid to search (5 x 3 x 3 = 45 combinations).
# Plain Python lists keep the reported best_params_ as ordinary ints/floats.
param_grid = {
    'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3],
    'max_depth': [20, 30, 40],
    'min_samples_leaf': [20, 30, 40],
}

# Use 5-fold cross-validated grid search on the training split only;
# candidates are ranked by negative MSE (higher is better).
grid_search = GridSearchCV(model, param_grid, cv = 5, scoring = 'neg_mean_squared_error', n_jobs = -1)
grid_search.fit(X_train, Y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refitted on the whole of X_train / Y_train with the best hyperparameters.
# No additional .fit() call is needed.
best_model = grid_search.best_estimator_

# Evaluate the model on the test set
y_pred = best_model.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

print(f"Best Hyperparameters: {best_params}")
print("\n")
print(f"Mean Squared Error on Test Set: {mse}")
print("\n")
print(f"r2 score on Test Set: {r2}")

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 40, 'min_samples_leaf': 40}


Mean Squared Error on Test Set: 0.0320195327439838


r2 score on Test Set: 0.8249127200499216


# Saving the best model

In [11]:
# Serialise the tuned estimator to a pickle file in the working directory.
with open("house_rent_regression_model.pkl", mode="wb") as model_file:
    pickle.dump(best_model, model_file)

# Making predictions on the test data

In [12]:
# Location of the cleaned test set (CSV).
TEST_CSV = "Cleaned_House_Rent_Test.csv"

# Read the test data into a DataFrame.
test_df = pd.read_csv(TEST_CSV)

In [13]:
# Reload the pickled regressor from disk and bind it to `model`.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open("house_rent_regression_model.pkl", mode="rb") as model_file:
    model = pickle.load(model_file)

In [14]:
# The nine predictor columns, in the same order used when X was built for training.
prediction_features = [
    "type_encode",
    "latitude",
    "longitude",
    "property_size",
    "total_floor",
    "property_age",
    "bathroom",
    "LIFT",
    "furnishing",
]

# Despite its name, `columns` is the feature DataFrame passed to the model.
columns = test_df[prediction_features]

In [15]:
# Raw model output for every test row.
predicted_raw = model.predict(columns)

# NOTE(review): exponentiating implies the model was trained on log(rent) —
# confirm against the data-cleaning step (np.expm1 would be needed for log1p).
predictions = np.exp(predicted_raw)

In [16]:
# predicted house rent price: one value per line, in test-row order
for predicted_rent in predictions:
    print(predicted_rent)

26933.67045442235
10953.88651040747
10983.324792761063
9834.801932073327
24206.47705288754
16655.614548236616
27281.84329154312
14314.134669270034
27029.007706560988
14974.43331060909
14307.416280351517
24551.388501635225
15566.87305111792
23649.47955388192
10090.55660843606
13242.758106795392
11379.830072029641
11693.557853032024
24196.532431090676
21562.205211500703
27112.4346276517
13721.818784539664
12549.224952353732
15976.10742466966
10240.950479992549
14137.605528721826
16857.55140436154
13904.612395392754
10610.14859744263
25144.12683523022
22552.71303436117
12776.44919688021
24962.045595698444
10952.16811459684
16636.178943496732
44013.75569137888
12035.112511125506
10983.386903253686
22164.369704867775
9934.16919465103
16610.682434711063
22849.34807801134
20752.141841389508
11409.712651131045
35667.92742241332
17115.587830112443
33803.869737141365
25266.37556329235
17952.217588031297
16677.928599244864
14246.162466033824
15723.5279465989
10705.591819908434
23146.66970531336
1

In [2]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# NOTE(review): both objects are pickled without being fitted in this cell, and
# the cell rebinds `lr` and `model`, which earlier cells use for the house-rent
# regressors. It looks unrelated to the rent model — confirm it belongs here.
lr = LogisticRegression()
tfidf = TfidfVectorizer()

# Every file is opened in a `with` block so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.

# Model Object
with open('model.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)

# Vectorizer
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

# Read both objects back from disk.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)