In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [2]:
# Load the rental-listings workbook into the working DataFrame.
df = pd.read_excel(
    io='Cleaned_and_Feature_Section_Bangalore_Rental_House_data.xlsx',
)

In [3]:
# Keep only listings whose Total_Floors is not 0, then renumber the rows.
has_floor_count = df['Total_Floors'] != 0
df = df[has_floor_count].reset_index(drop=True)

In [4]:
def categorize_floors(floor):
    """Bucket a building's total floor count into a height category.

    Parameters
    ----------
    floor : int or float
        Total number of floors in the building.

    Returns
    -------
    str
        'Low-rise'   for floor <= 3,
        'Mid-rise'   for 3 < floor <= 10,
        'High-rise'  for 10 < floor <= 20,
        'Skyscraper' for floor > 20,
        'Undefined'  when the value is missing (NaN/None).
    """
    # Every comparison with NaN is False, so without this guard a missing
    # value would fall through to the last branch and be labelled 'Skyscraper'.
    if pd.isna(floor):
        return 'Undefined'
    # Contiguous upper bounds: non-integer counts such as 3.5 or 10.5 land in
    # the next bucket instead of slipping through a gap between ranges.
    if floor <= 3:
        return 'Low-rise'
    if floor <= 10:
        return 'Mid-rise'
    if floor <= 20:
        return 'High-rise'
    return 'Skyscraper'

# Replace the numeric floor counts with their height-category labels.
# Note: after this runs the column holds strings, so this cell cannot be
# re-run without reloading df (categorize_floors compares against numbers).
floor_labels = df['Total_Floors'].map(categorize_floors)
df['Total_Floors'] = floor_labels

In [5]:
# Rows labelled '10+' are stored as 11 so the whole Age column can be cast
# to float.
ten_plus_rows = df['Age'] == '10+'
df.loc[ten_plus_rows, 'Age'] = 11
df['Age'] = df['Age'].astype(float)

In [6]:
# Discard listings with a missing Age and renumber the remaining rows.
age_present = df['Age'].notna()
df = df[age_present].reset_index(drop=True)

In [7]:
def categorize_age(value):
    """Bucket a property's age into a descriptive category.

    Parameters
    ----------
    value : int or float
        Property age as stored in the 'Age' column ('10+' is stored as 11
        by an earlier cell).

    Returns
    -------
    str
        'Undefined'          for missing (NaN/None) or negative values,
        'Under Construction' for exactly 0,
        'New Property'       for 0 < value <= 5,
        'Moderately Old'     for 5 < value <= 10,
        'Old Property'       for value > 10.
    """
    if pd.isna(value) or value < 0:
        return "Undefined"
    if value == 0:
        return "Under Construction"
    # Contiguous upper bounds: fractional ages such as 0.5 or 5.5 get a real
    # bucket instead of falling through to "Undefined".
    if value <= 5:
        return "New Property"
    if value <= 10:
        return "Moderately Old"
    return "Old Property"

# Replace the numeric ages with their category labels.
age_labels = df['Age'].map(categorize_age)
df['Age'] = age_labels

In [8]:
# Store a numeric 0 balcony count as the string '0'.
# NOTE(review): the df.head() preview shows a '3+' label in this column, so
# it presumably holds string categories -- confirm the other values are
# strings too.
zero_balcony_rows = df['Balcony'] == 0
df.loc[zero_balcony_rows, 'Balcony'] = '0'

In [9]:
import pickle

# Serialize the prepared DataFrame to disk.
with open('Model_Prediction_Dataset.pkl', 'wb') as pkl_out:
    pickle.dump(df, pkl_out)

In [10]:
# Preview the first five rows of the prepared data.
df.head(n=5)

Unnamed: 0,Region,Bedroom,Bathroom,Balcony,Additional_rooms,Area (sq.ft),Furnishing,Age,Covered_Parking,Brokerage,Deposit,Maintenance,Type,Total_Floors,Rent
0,Bangalore East,4,5,3,2,2100.0,Semifurnished,Moderately Old,1,120000,840000,0,House/Villa,Low-rise,120000
1,Bangalore East,3,3,2,0,1777.26,Semifurnished,New Property,1,43000,300000,5000,Apartment,High-rise,43000
2,Bangalore East,1,2,1,0,600.0,Semifurnished,New Property,1,0,70000,0,Builder Floor,Low-rise,12000
3,Bangalore East,2,2,1,0,1160.0,Furnished,New Property,1,40000,200000,0,Apartment,Mid-rise,40000
4,Bangalore East,3,5,3+,1,3300.0,Semifurnished,New Property,2,140000,840000,15000,Apartment,Mid-rise,140000


In [11]:
# Target is the monthly rent; every other column is used as a feature.
# NOTE(review): in the df.head() preview, Brokerage equals Rent in four of
# the five rows, so keeping Brokerage (and possibly Deposit) as features may
# leak the target -- confirm before trusting the reported scores.
y = df['Rent']
X = df.drop('Rent', axis=1)

In [12]:
# Train on log(1 + Rent); later cells invert this with np.expm1 before scoring.
y_transformed = np.log1p(y)

In [13]:
# Numeric features: standardized with StandardScaler.
columns_to_scale = ['Bedroom', 'Bathroom', 'Additional_rooms', 'Area (sq.ft)',
                    'Covered_Parking', 'Brokerage', 'Deposit', 'Maintenance']

# Categorical features mapped to integer codes.
columns_to_ordinal_encode = ['Furnishing', 'Balcony', 'Type', 'Total_Floors', 'Age']

# Categorical feature expanded into indicator columns.
columns_to_onehot_encode = ['Region']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), columns_to_scale),
        # A category that did not appear in the fitted data is encoded as -1
        # instead of raising at transform/predict time.
        ('cat',
         OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
         columns_to_ordinal_encode),
        # drop='first' removes one indicator column per feature; an unseen
        # Region is encoded as all zeros instead of raising.
        ('cat1',
         OneHotEncoder(drop='first', handle_unknown='ignore'),
         columns_to_onehot_encode),
    ],
    # Any column not listed above is forwarded to the model unchanged.
    remainder='passthrough',
)

In [14]:
# Preprocessing followed by a random-forest regressor.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature subsets -- and therefore the reported metrics -- are
# reproducible across kernel restarts.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=300,
        max_depth=20,
        max_features='sqrt',
        max_samples=1.0,
        random_state=42,
    )),
])

In [15]:
# Hold out 20% of the listings for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_transformed,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Fit the preprocessing steps and the regressor on the training split
# (the target is the log1p-transformed rent).
pipeline.fit(X=X_train, y=y_train)

In [17]:
# Predictions for the held-out split, still on the log1p scale.
y_pred = pipeline.predict(X=X_test)

In [18]:
# Convert predictions from the log1p scale back to rent.
# Recomputed from the pipeline rather than from y_pred itself so that
# re-running this cell cannot apply expm1 twice.
y_pred = np.expm1(pipeline.predict(X_test))

In [19]:
# Mean absolute error on the original rent scale (the log1p transform is
# inverted on the held-out targets).
mean_absolute_error(y_true=np.expm1(y_test), y_pred=y_pred)

6164.11833503913

In [20]:
# R^2 on the original rent scale.
r2_score(y_true=np.expm1(y_test), y_pred=y_pred)

0.9329858121432554