In [1]:
import pandas as pd 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [24]:
rents = pd.read_csv('House_Rent_Dataset.csv')
rents.head()

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner


In [25]:
rents.rename(columns = {'BHK' : 'N0. of Rooms'},inplace = True)

In [26]:
rents.head()

Unnamed: 0,Posted On,N0. of Rooms,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner


In [27]:
rents.isnull().sum()

Posted On            0
N0. of Rooms         0
Rent                 0
Size                 0
Floor                0
Area Type            0
Area Locality        0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
Point of Contact     0
dtype: int64

In [28]:
rents['Area Type'].unique()

array(['Super Area', 'Carpet Area', 'Built Area'], dtype=object)

In [29]:
rents['City'].unique()

array(['Kolkata', 'Mumbai', 'Bangalore', 'Delhi', 'Chennai', 'Hyderabad'],
      dtype=object)

In [30]:
rents['Furnishing Status'].unique()

array(['Unfurnished', 'Semi-Furnished', 'Furnished'], dtype=object)

In [31]:
dict1 = {'Bachelors/Family' : 'Mixed'}
rents['Tenant Preferred'] = rents['Tenant Preferred'].replace(dict1)

In [32]:
rents['Tenant Preferred'].unique()

array(['Mixed', 'Bachelors', 'Family'], dtype=object)

In [33]:
rents['Bathroom'].unique()

array([ 2,  1,  3,  5,  4,  6,  7, 10], dtype=int64)

In [34]:
rents['Point of Contact'].unique()

array(['Contact Owner', 'Contact Agent', 'Contact Builder'], dtype=object)

In [35]:
rents = pd.get_dummies(rents, columns = ['City'], drop_first = True)

In [None]:
rents

In [36]:
bool_columns= ['City_Chennai', 'City_Delhi', 'City_Hyderabad','City_Kolkata','City_Mumbai']

rents[bool_columns]= rents[bool_columns].astype(int)

In [37]:
rents.head()

Unnamed: 0,Posted On,N0. of Rooms,Rent,Size,Floor,Area Type,Area Locality,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,City_Chennai,City_Delhi,City_Hyderabad,City_Kolkata,City_Mumbai
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Unfurnished,Mixed,2,Contact Owner,0,0,0,1,0
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Semi-Furnished,Mixed,1,Contact Owner,0,0,0,1,0
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Semi-Furnished,Mixed,1,Contact Owner,0,0,0,1,0
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Unfurnished,Mixed,1,Contact Owner,0,0,0,1,0
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Unfurnished,Bachelors,1,Contact Owner,0,0,0,1,0


In [38]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
rents['Size'] = scaler.fit_transform(rents[['Size']]) 

In [39]:
X = rents[['Size', 'Bathroom','City_Chennai', 'City_Delhi', 'City_Hyderabad','City_Kolkata','City_Mumbai']]
y = rents['Rent']

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [41]:
model = LinearRegression()
model.fit(X_train, y_train)

In [42]:
ypred = model.predict(X_test)

In [None]:
ypred

In [43]:
from sklearn.metrics import mean_squared_error, r2_score

In [44]:
MSE = mean_squared_error(ypred, y_test)
MSE

1907703463.5956383

In [45]:
r2 = r2_score(ypred, y_test)
r2

-0.005009770213483256

In [49]:
corr = rents[['Rent','Size','City_Chennai', 'City_Delhi', 'City_Hyderabad','City_Kolkata','City_Mumbai', 'Bathroom']].corr()
corr

Unnamed: 0,Rent,Size,City_Chennai,City_Delhi,City_Hyderabad,City_Kolkata,City_Mumbai,Bathroom
Rent,1.0,0.413551,-0.082361,-0.027072,-0.087465,-0.105322,0.327038,0.441215
Size,0.413551,1.0,0.04869,-0.10915,0.16352,-0.100068,-0.049287,0.740703
City_Chennai,-0.082361,0.04869,1.0,-0.18376,-0.227448,-0.169369,-0.243983,0.017944
City_Delhi,-0.027072,-0.10915,-0.18376,1.0,-0.180834,-0.134658,-0.19398,-0.048824
City_Hyderabad,-0.087465,0.16352,-0.227448,-0.180834,1.0,-0.166672,-0.240098,0.098374
City_Kolkata,-0.105322,-0.100068,-0.169369,-0.134658,-0.166672,1.0,-0.178788,-0.212933
City_Mumbai,0.327038,-0.049287,-0.243983,-0.19398,-0.240098,-0.178788,1.0,0.18429
Bathroom,0.441215,0.740703,0.017944,-0.048824,0.098374,-0.212933,0.18429,1.0


In [46]:
X_train_sm = sm.add_constant(X_train) # Adding a constant for the intercept
model_sm = sm.OLS(y_train, X_train_sm).fit()
print(model_sm.summary())

                            OLS Regression Results                            
Dep. Variable:                   Rent   R-squared:                       0.280
Model:                            OLS   Adj. R-squared:                  0.279
Method:                 Least Squares   F-statistic:                     210.7
Date:                Wed, 07 Aug 2024   Prob (F-statistic):          6.52e-265
Time:                        09:41:52   Log-Likelihood:                -47685.
No. Observations:                3796   AIC:                         9.539e+04
Df Residuals:                    3788   BIC:                         9.544e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const           -593.5484   4444.250     -0.