In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.metrics import r2_score

In [51]:
df = pd.read_csv(r'C:\Users\admin\Downloads\Housing.csv')
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [53]:
df.duplicated().sum()

0

In [54]:
df.shape

(545, 13)

In [55]:
dummy_mainroad = pd.get_dummies(df.mainroad, prefix="mainroad")
dummy_mainroad = dummy_mainroad.astype(int)

dummy_guestroom = pd.get_dummies(df.guestroom, prefix="guestroom")
dummy_guestroom = dummy_guestroom.astype(int)

dummy_basement = pd.get_dummies(df.basement, prefix="basement")
dummy_basement = dummy_basement.astype(int)

dummy_hotwaterheating = pd.get_dummies(df.hotwaterheating, prefix="hotwaterheating")
dummy_hotwaterheating = dummy_hotwaterheating.astype(int)

dummy_airconditioning = pd.get_dummies(df.airconditioning, prefix="airconditioning")
dummy_airconditioning = dummy_airconditioning.astype(int)

dummy_prefarea = pd.get_dummies(df.prefarea, prefix="prefarea")
dummy_prefarea = dummy_prefarea.astype(int)

dummy_furnishingstatus = pd.get_dummies(df.furnishingstatus, prefix="furnishingstatus")
dummy_furnishingstatus = dummy_furnishingstatus.astype(int)

df_cleaned = pd.concat([df, dummy_mainroad, dummy_guestroom, dummy_basement, dummy_hotwaterheating, dummy_airconditioning, dummy_prefarea, dummy_furnishingstatus ], axis=1)
df_cleaned = df_cleaned.drop(['mainroad', 'guestroom', 'basement', 'hotwaterheating','airconditioning','prefarea','furnishingstatus'], axis = 1)


In [56]:
df_cleaned.head(5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,...,basement_yes,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,0,1,1,0,...,0,1,0,0,1,0,1,1,0,0
1,12250000,8960,4,4,4,3,0,1,1,0,...,0,1,0,0,1,1,0,1,0,0
2,12250000,9960,3,2,2,2,0,1,1,0,...,1,1,0,1,0,0,1,0,1,0
3,12215000,7500,4,2,2,3,0,1,1,0,...,1,1,0,0,1,0,1,1,0,0
4,11410000,7420,4,1,2,2,0,1,0,1,...,1,1,0,0,1,1,0,1,0,0


In [57]:
y= df_cleaned.price.values
x= df_cleaned.drop(columns=['price'])

In [58]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [59]:
lr = LinearRegression()
dt = DecisionTreeRegressor()
rf = RandomForestRegressor()

models = [lr,dt,rf]

In [60]:
for model in models:
    print(f'For {model} : ')
    model.fit(x_train,y_train)
    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)
    print()
    
    print('For training dataset : ')
    mae_train = mean_absolute_error(y_train,y_pred_train)
    print('mae train : ',mae_train)
    mse_train = mean_squared_error(y_train,y_pred_train)
    print('rmse train : ',np.sqrt(mse_train))
    print()
    
    print('For testing dataset :')
    mae_test = mean_absolute_error(y_test,y_pred_test)
    print('mae test : ',mae_test)
    mse_test = mean_squared_error(y_test,y_pred_test)
    print('rmse test : ',np.sqrt(mse_test))
    
    print()
    print('score : ',model.score(x_train,y_train))
    print('*'*80)
    print()
    

For LinearRegression() : 

For training dataset : 
mae train :  719242.893672472
rmse train :  984051.9236507413

For testing dataset :
mae test :  970043.4039201636
rmse test :  1324506.960091438

score :  0.6859438988560158
********************************************************************************

For DecisionTreeRegressor() : 

For training dataset : 
mae train :  8107.798165137615
rmse train :  67088.47540457372

For testing dataset :
mae test :  1197610.0917431193
rmse test :  1638260.462119232

score :  0.9985402884288594
********************************************************************************

For RandomForestRegressor() : 

For training dataset : 
mae train :  282956.37133027526
rmse train :  403287.90123693936

For testing dataset :
mae test :  1010462.4287461775
rmse test :  1369547.5732705637

score :  0.947252535077978
********************************************************************************

