Importing the dependencies

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


In [35]:
df = pd.read_csv('./Housing.csv')
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [36]:
df.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [37]:
df.isna().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

<p style='color: red; font-size: 16px'>Since there isnt any null values and data is clean we move on to training the regression model </p>

# ***Training the model***

In [6]:
df.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [7]:
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


In [8]:
Y = df['price']
X = df.drop(['mainroad',
                           'guestroom', 
                           'basement', 
                           'hotwaterheating', 
                           'airconditioning', 
                           'prefarea', 
                           'furnishingstatus'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [9]:

poly = PolynomialFeatures(degree=3, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Using Linear and Polynomial to see which gives better accuracy

In [10]:
LinearModel = LinearRegression()
PolyModel = LinearRegression()
LinearModel.fit(X_train, y_train)
PolyModel.fit(X_train_poly,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [11]:
linear_y_pred = LinearModel.predict(X_test)
poly_y_pred = PolyModel.predict(X_test_poly)

In [12]:
mse = mean_squared_error(y_test, linear_y_pred)
mae = mean_absolute_error(y_test, linear_y_pred)
r2 = r2_score(y_test, linear_y_pred)
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 3.366994820058195e-19
Mean Absolute Error: 3.097288378881752e-10
R^2 Score: 1.0


In [13]:
mse = mean_squared_error(y_test, poly_y_pred)
mae = mean_absolute_error(y_test, poly_y_pred)
r2 = r2_score(y_test, poly_y_pred)
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 56689013216.06698
Mean Absolute Error: 127775.9346106634
R^2 Score: 0.9887846027549239


Linear Regression gave 100% accuracy while Poly Regression gave 98% when degree is 3. 

Let us observe the results when the polynomial degree is changed:

In [26]:
for i in range(1,8):
    print(f"When degree = {i}")
    poly = PolynomialFeatures(degree=i, include_bias=False)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    PolyModel = LinearRegression()
    PolyModel.fit(X_train_poly,y_train)
    poly_y_pred = PolyModel.predict(X_test_poly)

    mse = mean_squared_error(y_test, poly_y_pred)
    mae = mean_absolute_error(y_test, poly_y_pred)
    r2 = r2_score(y_test, poly_y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R^2 Score: {r2} \n\n")
    

When degree = 1
Mean Squared Error: 3.366994820058195e-19
Mean Absolute Error: 3.097288378881752e-10
R^2 Score: 1.0 


When degree = 2
Mean Squared Error: 1.569992944313692e-11
Mean Absolute Error: 2.5347375028997387e-06
R^2 Score: 1.0 


When degree = 3
Mean Squared Error: 56689013216.06698
Mean Absolute Error: 127775.9346106634
R^2 Score: 0.9887846027549239 


When degree = 4
Mean Squared Error: 435222178147.12305
Mean Absolute Error: 355725.9412967548
R^2 Score: 0.9138953151436431 


When degree = 5
Mean Squared Error: 1718986616772.3406
Mean Absolute Error: 635203.4461890543
R^2 Score: 0.6599143877740461 


When degree = 6
Mean Squared Error: 5301909701614.745
Mean Absolute Error: 921485.3232386934
R^2 Score: -0.04893382487524911 


When degree = 7
Mean Squared Error: 13706868942198.645
Mean Absolute Error: 1264105.5908500287
R^2 Score: -1.7117773171854198 




<b>Findings:</b> <br>
When we removed the non numerical columns for features the dataset showed a more linear nature rather then polynomial.
Now lets train the model with all the non-numerical features included as well

In [41]:
y = df['price']

categorical_cols = ['mainroad', 'guestroom', 'basement', 
                    'hotwaterheating', 'airconditioning', 
                    'prefarea', 'furnishingstatus']

X = df.drop(columns=['price'])
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True, dtype=int)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [47]:
for i in range(1,8):
    print(f"When degree = {i}")
    poly = PolynomialFeatures(degree=i, include_bias=False)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    PolyModel = LinearRegression()
    PolyModel.fit(X_train_poly,y_train)
    poly_y_pred = PolyModel.predict(X_test_poly)

    mse = mean_squared_error(y_test, poly_y_pred)
    mae = mean_absolute_error(y_test, poly_y_pred)
    r2 = r2_score(y_test, poly_y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R^2 Score: {r2} \n\n")
    

When degree = 1
Mean Squared Error: 1754318687330.663
Mean Absolute Error: 970043.4039201637
R^2 Score: 0.6529242642153186 


When degree = 2
Mean Squared Error: 1916484375792.3271
Mean Absolute Error: 1042927.6354577113
R^2 Score: 0.6208412817741398 


When degree = 3
Mean Squared Error: 3933215173753.171
Mean Absolute Error: 1310964.8513385758
R^2 Score: 0.22184973557626475 


When degree = 4
Mean Squared Error: 2567432469380.2944
Mean Absolute Error: 1158233.4240929952
R^2 Score: 0.4920572186667419 


When degree = 5
Mean Squared Error: 4792212989551.613
Mean Absolute Error: 1406043.2346376663
R^2 Score: 0.051904958091554376 


When degree = 6
Mean Squared Error: 7757485732327.744
Mean Absolute Error: 1621770.8186882192
R^2 Score: -0.5347468437924336 


When degree = 7
Mean Squared Error: 7313858586335.205
Mean Absolute Error: 1543075.020647675
R^2 Score: -0.44697931374138 




<b>Findings: </b> <br>
Non including the non numerical values gave better accuracy compared to when it was included.
For the current data set, higher degree of polynomial give less accuracy in both cases