In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [2]:
# Location of the raw listings CSV. Set the HOUSE_RENT_CSV environment variable
# to point at your own copy; the original author's local path is kept as the
# fallback so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "HOUSE_RENT_CSV",
    "/Users/birhangborgoyary/Documents/ml project/House_Rent_Dataset.csv",
)

# Load the listings and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner


In [3]:
# Upper bound on rent kept for modelling (presumably to drop extreme outliers --
# TODO confirm the choice of threshold).
MAX_RENT = 100000

# Keep only listings at or below the cap. .copy() yields an independent frame so
# later column assignments are not treated as writes to a slice of the raw data.
df = df[df['Rent'] <= MAX_RENT].copy()



In [4]:
# Check data types
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare: wrapping it in print() emitted an extra "None" line.
df.info()

# Check maximum value of 'Rent' column
print("Maximum Rent value:", df['Rent'].max())

# Check a sample of the filtered DataFrame
print(df.head())


<class 'pandas.core.frame.DataFrame'>
Index: 4466 entries, 0 to 4745
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Posted On          4466 non-null   object
 1   BHK                4466 non-null   int64 
 2   Rent               4466 non-null   int64 
 3   Size               4466 non-null   int64 
 4   Floor              4466 non-null   object
 5   Area Type          4466 non-null   object
 6   Area Locality      4466 non-null   object
 7   City               4466 non-null   object
 8   Furnishing Status  4466 non-null   object
 9   Tenant Preferred   4466 non-null   object
 10  Bathroom           4466 non-null   int64 
 11  Point of Contact   4466 non-null   object
dtypes: int64(4), object(8)
memory usage: 453.6+ KB
None
Maximum Rent value: 100000
    Posted On  BHK   Rent  Size            Floor    Area Type  \
0  2022-05-18    2  10000  1100  Ground out of 2   Super Area   
1  2022-05-13    2  20000 

In [5]:
# Report the size of the frame after the rent filter.
num_rows = df.shape[0]
num_cols = df.shape[1]
print("Total number of rows:", num_rows)
print("Total number of columns:", num_cols)


Total number of rows: 4466
Total number of columns: 12


In [6]:
# Check for missing values
missing_values = df.isnull().sum()
# sep="\n" puts the counts on their own lines; passing "...\n" as a positional
# argument made print() insert a stray space before the first row.
print("Missing values:", missing_values, sep="\n")

# Remove leading and trailing spaces from column names
df.columns = df.columns.str.strip()


def _strip_if_str(value):
    """Return `value` without surrounding whitespace if it is a str, else unchanged."""
    return value.strip() if isinstance(value, str) else value


# Remove leading and trailing spaces from the data
# DataFrame.applymap is deprecated in recent pandas releases; mapping every
# column with Series.map applies the same element-wise function without
# relying on the deprecated name.
df = df.apply(lambda column: column.map(_strip_if_str))


Missing values:
 Posted On            0
BHK                  0
Rent                 0
Size                 0
Floor                0
Area Type            0
Area Locality        0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
Point of Contact     0
dtype: int64


In [7]:
# Re-check the dimensions after whitespace clean-up.
num_rows = len(df.index)
num_cols = len(df.columns)
print("Total number of rows:", num_rows)
print("Total number of columns:", num_cols)


Total number of rows: 4466
Total number of columns: 12


In [8]:
# Drop irrelevant columns
# A single non-mutating drop replaces the mix of pop() and inplace drop.
# errors="ignore" lets the cell be re-run after the columns are already gone
# (the previous version raised KeyError on a second run).
df = df.drop(columns=["Posted On", "Area Locality", "Floor"], errors="ignore")

In [9]:
# Show the column index that remains after the drop; sep="\n" keeps the header
# and the index on separate lines.
print("Columns in the dataset after dropping irrelevant columns:", df.columns, sep="\n")


Columns in the dataset after dropping irrelevant columns:
Index(['BHK', 'Rent', 'Size', 'Area Type', 'City', 'Furnishing Status',
       'Tenant Preferred', 'Bathroom', 'Point of Contact'],
      dtype='object')


In [10]:
# Preview the first five rows of the reduced frame.
df.head(n=5)

Unnamed: 0,BHK,Rent,Size,Area Type,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2,10000,1100,Super Area,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2,20000,800,Super Area,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2,17000,1000,Super Area,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2,10000,800,Super Area,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2,7500,850,Carpet Area,Kolkata,Unfurnished,Bachelors,1,Contact Owner


In [11]:
# List the distinct values found in the 'BHK' column.
unique_bhk = pd.unique(df['BHK'])
print("Unique values in 'BHK' column:", unique_bhk, sep="\n")

Unique values in 'BHK' column:
[2 1 3 6 4 5]


In [12]:
# List the distinct values found in the 'Area Type' column.
unique_area_type = pd.unique(df['Area Type'])
print("\nUnique values in 'Area Type' column:", unique_area_type, sep="\n")


Unique values in 'Area Type' column:
['Super Area' 'Carpet Area' 'Built Area']


In [13]:
# List the distinct values found in the 'City' column.
unique_city = pd.unique(df['City'])
print("\nUnique values in 'City' column:", unique_city, sep="\n")


Unique values in 'City' column:
['Kolkata' 'Mumbai' 'Bangalore' 'Delhi' 'Chennai' 'Hyderabad']


In [14]:
# List the distinct values found in the 'Furnishing Status' column.
unique_furnishing_status = pd.unique(df['Furnishing Status'])
print("\nUnique values in 'Furnishing Status' column:", unique_furnishing_status, sep="\n")


Unique values in 'Furnishing Status' column:
['Unfurnished' 'Semi-Furnished' 'Furnished']


In [15]:
# List the distinct values found in the 'Tenant Preferred' column.
unique_tenant_preferred = pd.unique(df['Tenant Preferred'])
print("\nUnique values in 'Tenant Preferred' column:", unique_tenant_preferred, sep="\n")


Unique values in 'Tenant Preferred' column:
['Bachelors/Family' 'Bachelors' 'Family']


In [16]:
# List the distinct values found in the 'Point of Contact' column.
unique_point_of_contact = pd.unique(df['Point of Contact'])
print("\nUnique values in 'Point of Contact' column:", unique_point_of_contact, sep="\n")


Unique values in 'Point of Contact' column:
['Contact Owner' 'Contact Agent' 'Contact Builder']


In [17]:
# Replace categorical values with numerical values in 'Area Type' column
area_type_mapping = {'Super Area': 0, 'Carpet Area': 1, 'Built Area': 2}
# Assign the encoded column back to df. The previous form,
# df['Area Type'].replace(..., inplace=True), is chained assignment on a
# temporary Series: pandas 2.2 warns about it and, with Copy-on-Write enabled,
# it leaves df unchanged. Labels missing from the mapping pass through
# untouched, matching what replace() did.
df['Area Type'] = df['Area Type'].map(lambda label: area_type_mapping.get(label, label))


In [18]:
# Replace categorical values with numerical values in 'City' column
city_mapping = {'Kolkata': 0, 'Mumbai': 1, 'Bangalore': 2, 'Delhi': 3, 'Chennai': 4, 'Hyderabad': 5}
# Assign the encoded column back to df rather than calling
# replace(..., inplace=True) on the temporary Series df['City']: that chained
# form warns on pandas 2.2 and does not modify df under Copy-on-Write.
# Labels missing from the mapping pass through untouched, as with replace().
df['City'] = df['City'].map(lambda label: city_mapping.get(label, label))


In [19]:
# Replace categorical values with numerical values in 'Furnishing Status' column
furnishing_mapping = {'Unfurnished': 0, 'Semi-Furnished': 1, 'Furnished': 2}
# Assign the encoded column back to df rather than calling
# replace(..., inplace=True) on the temporary Series: that chained form warns
# on pandas 2.2 and does not modify df under Copy-on-Write.
# Labels missing from the mapping pass through untouched, as with replace().
df['Furnishing Status'] = df['Furnishing Status'].map(
    lambda label: furnishing_mapping.get(label, label)
)


In [20]:
# Replace categorical values with numerical values in 'Tenant Preferred' column
tenant_mapping = {'Bachelors/Family': 0, 'Bachelors': 1, 'Family': 2}
# Assign the encoded column back to df rather than calling
# replace(..., inplace=True) on the temporary Series: that chained form warns
# on pandas 2.2 and does not modify df under Copy-on-Write.
# Labels missing from the mapping pass through untouched, as with replace().
df['Tenant Preferred'] = df['Tenant Preferred'].map(
    lambda label: tenant_mapping.get(label, label)
)


In [21]:
# Replace categorical values with numerical values in 'Point of Contact' column
point_of_contact_mapping = {'Contact Owner': 0, 'Contact Agent': 1, 'Contact Builder': 2}
# Assign the encoded column back to df rather than calling
# replace(..., inplace=True) on the temporary Series: that chained form warns
# on pandas 2.2 and does not modify df under Copy-on-Write.
# Labels missing from the mapping pass through untouched, as with replace().
df['Point of Contact'] = df['Point of Contact'].map(
    lambda label: point_of_contact_mapping.get(label, label)
)


In [22]:
# Preview the first five rows now that the categorical columns are encoded.
df.iloc[:5]


Unnamed: 0,BHK,Rent,Size,Area Type,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2,10000,1100,0,0,0,0,2,0
1,2,20000,800,0,0,1,0,1,0
2,2,17000,1000,0,0,1,0,1,0
3,2,10000,800,0,0,0,0,1,0
4,2,7500,850,1,0,0,1,1,0


In [23]:
# Feature matrix: every column except the target 'Rent'.
X = df.drop(columns=["Rent"])


In [24]:
# Target vector: the 'Rent' column.
y = df.loc[:, 'Rent']


In [25]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# split is reproducible. train_test_split is already imported in the first
# cell, so the duplicate import that used to sit here has been removed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [26]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied to the test split, so no test-set statistics enter the fit.
# StandardScaler is imported with the other dependencies in the first cell.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [27]:
# Fit an ordinary least-squares baseline on the scaled training features.
# LinearRegression is imported with the other dependencies in the first cell.
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train_scaled, y_train)


In [28]:
# Fit a random forest on the same scaled training matrix the other models use;
# random_state fixes the forest's randomness so results are repeatable.
# RandomForestRegressor is already imported in the first cell, so the duplicate
# import that used to sit here has been removed.
random_forest_model = RandomForestRegressor(random_state=42)
random_forest_model.fit(X_train_scaled, y_train)


In [29]:
# Fit a linear-kernel support vector regressor on the scaled training features.
# SVR is imported with the other dependencies in the first cell.
# NOTE(review): C and epsilon are left at their defaults while y is in raw rent
# units; that may be why this model scores far below the others -- confirm.
svr_model = SVR(kernel='linear')
svr_model.fit(X_train_scaled, y_train)


In [30]:
# Evaluate the linear regression on the held-out split.
# mean_squared_error and r2_score are already imported in the first cell, so
# the duplicate import that used to sit here has been removed.
y_pred_linear = linear_reg_model.predict(X_test_scaled)
mse_linear = mean_squared_error(y_test, y_pred_linear)  # computed but not printed below
r2_linear = r2_score(y_test, y_pred_linear)

print("Linear Regression - R2 Score:", r2_linear)


Linear Regression - R2 Score: 0.5316244480410482


In [31]:
# Score the random forest on the held-out split.
y_pred_rf = random_forest_model.predict(X_test_scaled)
r2_rf = r2_score(y_true=y_test, y_pred=y_pred_rf)
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)

print("Random Forest Regression - R2 Score:", r2_rf)


Random Forest Regression - R2 Score: 0.7531467660907413


In [32]:
# Score the support vector regressor on the held-out split.
y_pred_svr = svr_model.predict(X_test_scaled)
r2_svr = r2_score(y_true=y_test, y_pred=y_pred_svr)
mse_svr = mean_squared_error(y_true=y_test, y_pred=y_pred_svr)

print("SVR - R2 Score:", r2_svr)


SVR - R2 Score: 0.13693992953705025


In [None]:
def _read_code(prompt, valid_codes):
    """Prompt for an integer category code and check that it is allowed.

    Args:
        prompt: Text shown to the user by input().
        valid_codes: Collection of integers accepted for this feature.

    Returns:
        The integer the user entered.

    Raises:
        ValueError: If the text is not an integer or is not in valid_codes.
    """
    code = int(input(prompt))
    if code not in valid_codes:
        raise ValueError(f"Invalid code {code}; expected one of {sorted(valid_codes)}")
    return code


# Input numerical values for the features
# Category codes are checked against the ranges listed in each prompt.
BHK = int(input("Enter the number of bedrooms (BHK): "))
Size = float(input("Enter the size of the property: "))
Area_Type = _read_code("Enter the area type (0 for Super Area, 1 for Carpet Area, 2 for Built Area): ", {0, 1, 2})
City = _read_code("Enter the city (0 for Kolkata, 1 for Mumbai, 2 for Bangalore, 3 for Delhi, 4 for Chennai, 5 for Hyderabad): ", {0, 1, 2, 3, 4, 5})
Furnishing_Status = _read_code("Enter the furnishing status (0 for Unfurnished, 1 for Semi-Furnished, 2 for Furnished): ", {0, 1, 2})
Tenant_Preferred = _read_code("Enter the tenant preference (0 for Bachelors/Family, 1 for Bachelors, 2 for Family): ", {0, 1, 2})
Bathroom = int(input("Enter the number of bathrooms: "))
Point_of_Contact = _read_code("Enter the point of contact (0 for Contact Owner, 1 for Contact Agent, 2 for Contact Builder): ", {0, 1, 2})

# Build a one-row DataFrame labelled with the training feature columns. The
# scaler was fitted on a DataFrame, so passing a bare ndarray triggers
# scikit-learn's feature-name warning and skips its column-name check. The
# values are listed in the same order as X's columns.
input_data = pd.DataFrame(
    [[BHK, Size, Area_Type, City, Furnishing_Status, Tenant_Preferred, Bathroom, Point_of_Contact]],
    columns=X.columns,
)

# Scale the input data
input_data_scaled = scaler.transform(input_data)

# Use the trained Random Forest Regression model to predict the rent
predicted_rent = random_forest_model.predict(input_data_scaled)

print("Predicted rent:", predicted_rent[0])


In [None]:
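#import pickle

# Optional (currently disabled): save the trained Random Forest Regression model
# to 'model.pkl' for reuse outside this notebook. Uncomment the lines below to
# enable. Only unpickle files from sources you trust.
#with open('model.pkl', 'wb') as f:
    #pickle.dump(random_forest_model, f)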


In [None]:
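#import pickle

# Optional (currently disabled): save the fitted scaler to 'scaler.pkl'.
# Predictions in this notebook scale their inputs with this scaler before
# calling the model, so save it together with the model. Uncomment to enable.
#with open('scaler.pkl', 'wb') as f:
    #pickle.dump(scaler, f)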
