Import Libraries

In [137]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder

Load the Dataset

In [138]:
# Location of the raw rent listings CSV (Colab-style absolute path)
DATASET_PATH = '/content/House_Rent_Dataset.csv'

# Read the listings into a DataFrame; every later cell works from `df`
df = pd.read_csv(DATASET_PATH)

Data Exploration

In [139]:
# Preview the top five records (head() defaults to n=5)
print(df.head())

    Posted On  BHK   Rent  Size            Floor    Area Type  \
0  2022-05-18    2  10000  1100  Ground out of 2   Super Area   
1  2022-05-13    2  20000   800       1 out of 3   Super Area   
2  2022-05-16    2  17000  1000       1 out of 3   Super Area   
3  2022-07-04    2  10000   800       1 out of 2   Super Area   
4  2022-05-09    2   7500   850       1 out of 2  Carpet Area   

              Area Locality     City Furnishing Status  Tenant Preferred  \
0                    Bandel  Kolkata       Unfurnished  Bachelors/Family   
1  Phool Bagan, Kankurgachi  Kolkata    Semi-Furnished  Bachelors/Family   
2   Salt Lake City Sector 2  Kolkata    Semi-Furnished  Bachelors/Family   
3               Dumdum Park  Kolkata       Unfurnished  Bachelors/Family   
4             South Dum Dum  Kolkata       Unfurnished         Bachelors   

   Bathroom Point of Contact  
0         2    Contact Owner  
1         1    Contact Owner  
2         1    Contact Owner  
3         1    Contact Owner

In [140]:
# Count the null entries in each column (isna is the alias of isnull)
print(df.isna().sum())

Posted On            0
BHK                  0
Rent                 0
Size                 0
Floor                0
Area Type            0
Area Locality        0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
Point of Contact     0
dtype: int64


In [141]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column
summary_stats = df.describe()
print(summary_stats)

               BHK          Rent         Size     Bathroom
count  4746.000000  4.746000e+03  4746.000000  4746.000000
mean      2.083860  3.499345e+04   967.490729     1.965866
std       0.832256  7.810641e+04   634.202328     0.884532
min       1.000000  1.200000e+03    10.000000     1.000000
25%       2.000000  1.000000e+04   550.000000     1.000000
50%       2.000000  1.600000e+04   850.000000     2.000000
75%       3.000000  3.300000e+04  1200.000000     2.000000
max       6.000000  3.500000e+06  8000.000000    10.000000


In [142]:
# Show the dtype pandas assigned to every column
column_dtypes = df.dtypes
print(column_dtypes)

Posted On            object
BHK                   int64
Rent                  int64
Size                  int64
Floor                object
Area Type            object
Area Locality        object
City                 object
Furnishing Status    object
Tenant Preferred     object
Bathroom              int64
Point of Contact     object
dtype: object


Data Preprocessing

In [143]:
# Encode categorical variables
# Each column gets its own LabelEncoder, stored in `label_encoders`. Calling
# fit/fit_transform on an encoder replaces its learned classes, so sharing a
# single instance across columns would keep only the mapping of the last
# column and make inverse_transform / encoding of new rows impossible for
# the others.
categorical_columns = [
    'Area Type',
    'Area Locality',
    'City',
    'Furnishing Status',
    'Tenant Preferred',
    'Point of Contact',
]

label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    # Replaces the string labels in `df` with integer codes (in place).
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# After the loop `label_encoder` is the encoder fitted on 'Point of Contact',
# which is what the name referred to before this change.
# NOTE(review): integer codes impose an arbitrary order on nominal categories;
# consider one-hot encoding for the linear model — confirm against project goals.

Feature Selection

In [144]:
# Columns used as model inputs; 'Rent' is the value to be predicted
selected_features = [
    'BHK',
    'Size',
    'Area Type',
    'Area Locality',
    'City',
    'Furnishing Status',
    'Tenant Preferred',
    'Bathroom',
]
y = df['Rent']
X = df[selected_features]

Model Selection

In [145]:
# Initialize models
lr_model = LinearRegression()
# Seed the forest so its bootstrap samples and feature draws are repeatable;
# without random_state the reported metrics change on every run. 42 matches
# the seed used for the train/test split.
rf_model = RandomForestRegressor(random_state=42)

Model Training

In [146]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit both regressors on the training portion
lr_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

Model Evaluation

In [147]:
# Make predictions on the held-out test rows
lr_predictions = lr_model.predict(X_test)
rf_predictions = rf_model.predict(X_test)

# Evaluate the models
# RMSE is taken as sqrt(MSE) instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises a
# TypeError. The square root form gives the same value on every version.
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_predictions))
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_predictions))

lr_mae = mean_absolute_error(y_test, lr_predictions)
rf_mae = mean_absolute_error(y_test, rf_predictions)

lr_r2 = r2_score(y_test, lr_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

print("Linear Regression Model:")
print("RMSE:", lr_rmse)
print("MAE:", lr_mae)
print("R^2 Score:", lr_r2)

print("\nRandom Forest Regression Model:")
print("RMSE:", rf_rmse)
print("MAE:", rf_mae)
print("R^2 Score:", rf_r2)

Linear Regression Model:
RMSE: 47213.32681019883
MAE: 24410.64757814495
R^2 Score: 0.44068131727286575

Random Forest Regression Model:
RMSE: 47417.68191934265
MAE: 13776.565428601807
R^2 Score: 0.4358290011258549


Prediction

In [149]:
# Produce rent estimates for the test rows with each fitted model
lr_rent_predictions = lr_model.predict(X_test)
rf_rent_predictions = rf_model.predict(X_test)

# Print the linear-regression array first, then the random-forest array,
# one per line group (same output as two consecutive print calls)
print(lr_rent_predictions, rf_rent_predictions, sep='\n')

[ 8.93814718e+03  2.06685352e+04  5.26843688e+04  7.08912025e+04
  1.35783284e+05  5.15918956e+04  6.41801090e+04  2.96612474e+03
  1.53818345e+05  9.49855691e+03 -9.12655133e+03  2.63697392e+04
  3.21926302e+04 -2.78095380e+03  5.24450886e+04  1.61424393e+04
  1.18995786e+05  2.33157201e+03  3.28058883e+04  7.88575885e+04
  3.63789811e+04  8.42316002e+03  3.76875588e+04  4.38370005e+04
  1.41615295e+05  2.26361995e+04 -2.06698338e+04  5.94116797e+04
 -2.61320484e+04 -1.41400001e+04 -3.14937484e+04 -1.32923296e+04
  3.82814737e+04  3.08715950e+04  5.29440286e+04  4.46860522e+04
  2.23296995e+04 -5.12800359e+03  1.20419088e+04  9.39225021e+03
  1.98931686e+04  1.73026583e+05  2.31715483e+04 -2.69577978e+04
  5.06430199e+04  2.89495115e+04  5.18311084e+04  8.99343508e+04
  1.80417742e+05  2.47466346e+04  1.98433141e+04  9.27111231e+04
  2.45499978e+04  4.17441466e+04  3.72560656e+04  1.23594375e+05
  6.23283021e+04  5.03418899e+04 -9.34484488e+03  9.16703220e+03
 -5.49729104e+03 -2.16650