<a href="https://colab.research.google.com/github/AbhishekMhamane/ml-learning/blob/main/projects/house-price-prediction/house_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the libraries

In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing the dataset

In [69]:
dataset = pd.read_csv("Housing.csv")
dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [71]:
X = dataset.iloc[:,1:]
y = dataset.iloc[:,0]
# X.drop(['area','bedrooms'], axis=1, inplace=True)
print(X.head())
print(y.head())

   area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  7420         4          2        3      yes        no       no   
1  8960         4          4        4      yes        no       no   
2  9960         3          2        2      yes        no      yes   
3  7500         4          2        2      yes        no      yes   
4  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  
0    13300000
1    12250000
2    12250000
3    12215000
4    11410000
Name: price, dtype: int64


# Pandas profiling


In [None]:
pip install ydata-profiling



In [None]:
from ydata_profiling import ProfileReport
prof = ProfileReport(dataset)
prof.to_file(output_file='output.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/13 [00:00<?, ?it/s][A
 23%|██▎       | 3/13 [00:00<00:00, 14.83it/s][A
 46%|████▌     | 6/13 [00:00<00:00, 19.97it/s][A
 69%|██████▉   | 9/13 [00:00<00:00, 22.36it/s][A
100%|██████████| 13/13 [00:00<00:00, 22.51it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Data pre-processing



* Replace missing values (No missing values present in dataset)
* Handle Categorical values (Using encoding)
* Outlier Detection and handling
* Split data into train and test
* Feature Scaling

## Encoding categorical data


In [72]:
# Handle Categorical values (Encoding)
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(transformers=[
    ('onehotencoder', OneHotEncoder(drop='first'),['mainroad','guestroom','basement','hotwaterheating','airconditioning','prefarea']),
    ('ordinalencoder', OrdinalEncoder(categories=[['furnished','semi-furnished','unfurnished']]), ['furnishingstatus'])
], remainder='passthrough')

x_transformed = ct.fit_transform(X)
X = np.array(x_transformed)
y = np.array(y)
np.set_printoptions(threshold=np.inf, suppress=True)
print(X[:5,:])
print(y[:5])

[[   1.    0.    0.    0.    1.    1.    0. 7420.    4.    2.    3.    2.]
 [   1.    0.    0.    0.    1.    0.    0. 8960.    4.    4.    4.    3.]
 [   1.    0.    1.    0.    0.    1.    1. 9960.    3.    2.    2.    2.]
 [   1.    0.    1.    0.    1.    1.    0. 7500.    4.    2.    2.    3.]
 [   1.    1.    1.    0.    1.    0.    0. 7420.    4.    1.    2.    2.]]
[13300000 12250000 12250000 12215000 11410000]


## Split data into train and test sets

In [73]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train[:5,:])
print(y_train[:5])
print(X_test[:5,:])
print(y_test[:5])

[[   1.    0.    0.    0.    0.    0.    2. 3620.    2.    1.    1.    0.]
 [   1.    0.    0.    0.    0.    0.    2. 4000.    2.    1.    1.    0.]
 [   0.    0.    0.    0.    0.    0.    2. 3040.    2.    1.    1.    0.]
 [   1.    0.    0.    0.    0.    0.    2. 3600.    2.    1.    1.    0.]
 [   1.    0.    0.    0.    0.    0.    1. 9860.    3.    1.    1.    0.]]
[1750000 2695000 2870000 2590000 4515000]
[[    1.     0.     0.     0.     0.     0.     0.  4000.     3.     1.
      2.     1.]
 [    1.     0.     1.     0.     0.     1.     0.  9620.     3.     1.
      1.     2.]
 [    1.     0.     0.     0.     1.     0.     1.  3460.     4.     1.
      2.     0.]
 [    1.     0.     1.     1.     0.     0.     0. 13200.     2.     1.
      1.     1.]
 [    0.     0.     0.     0.     0.     0.     2.  3660.     4.     1.
      2.     0.]]
[4585000 6083000 4007500 6930000 2940000]


## Feature Scaling

In [74]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[:,7:] = sc.fit_transform(X_train_scaled[:,7:])
X_test_scaled[:,7:] = sc.transform(X_test_scaled[:,7:])
print(X_train_scaled[:5,:])
print(X_test_scaled[:5,:])

[[ 1.    0.    0.    0.    0.    0.    2.   -0.72 -1.29 -0.57 -0.93 -0.82]
 [ 1.    0.    0.    0.    0.    0.    2.   -0.54 -1.29 -0.57 -0.93 -0.82]
 [ 0.    0.    0.    0.    0.    0.    2.   -0.99 -1.29 -0.57 -0.93 -0.82]
 [ 1.    0.    0.    0.    0.    0.    2.   -0.73 -1.29 -0.57 -0.93 -0.82]
 [ 1.    0.    0.    0.    0.    0.    1.    2.2   0.05 -0.57 -0.93 -0.82]]
[[ 1.    0.    0.    0.    0.    0.    0.   -0.54  0.05 -0.57  0.21  0.33]
 [ 1.    0.    1.    0.    0.    1.    0.    2.09  0.05 -0.57 -0.93  1.47]
 [ 1.    0.    0.    0.    1.    0.    1.   -0.79  1.4  -0.57  0.21 -0.82]
 [ 1.    0.    1.    1.    0.    0.    0.    3.77 -1.29 -0.57 -0.93  0.33]
 [ 0.    0.    0.    0.    0.    0.    2.   -0.7   1.4  -0.57  0.21 -0.82]]


# Multiple Linear Regression

In [75]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train_scaled, y_train)

y_pred = reg.predict(X_test_scaled)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)), 1))

print(reg.coef_)
print(reg.intercept_)

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error

print(r2_score(y_test, y_pred))
print(mean_absolute_percentage_error(y_test,y_pred))

[[ 4019080.7   4585000.  ]
 [ 6258021.77  6083000.  ]
 [ 4392719.13  4007500.  ]
 [ 7335082.22  6930000.  ]
 [ 2892857.57  2940000.  ]
 [ 7084655.58  6195000.  ]
 [ 3258634.74  3535000.  ]
 [ 3196499.16  2940000.  ]
 [ 3520106.01  3500000.  ]
 [ 8359610.29  7980000.  ]
 [ 6646926.36  6755000.  ]
 [ 3788844.63  3990000.  ]
 [ 3734072.96  3150000.  ]
 [ 4616010.41  3290000.  ]
 [ 3946124.94  4130000.  ]
 [ 2023177.47  2660000.  ]
 [ 3975554.49  4410000.  ]
 [ 3624716.97  3710000.  ]
 [ 3210282.74  3360000.  ]
 [ 4668411.38  4270000.  ]
 [ 5893719.66  5005000.  ]
 [ 6415683.84  5383000.  ]
 [ 4657036.56  6440000.  ]
 [ 2704096.11  1890000.  ]
 [ 5356909.86  6125000.  ]
 [ 5740682.57  5460000.  ]
 [ 5337586.48  5803000.  ]
 [ 5444859.97  4620000.  ]
 [ 5702144.57  5530000.  ]
 [ 5865872.3   5950000.  ]
 [ 3307327.31  4305000.  ]
 [ 6299491.03  3640000.  ]
 [ 7123479.96  5250000.  ]
 [ 2960326.87  3325000.  ]
 [ 4424323.19  3703000.  ]
 [ 5125081.86  4753000.  ]
 [ 4947693.93  9100000.  ]
 

# Polynomial Regression

In [76]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 4)
X_poly = poly_reg.fit_transform(X_train)
reg = LinearRegression()
reg.fit(X_poly, y_train)

y_pred = reg.predict(poly_reg.transform(X_test))
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

[[  4228039.99   4585000.  ]
 [  6634889.09   6083000.  ]
 [  4401807.49   4007500.  ]
 [-55393006.57   6930000.  ]
 [  3091892.42   2940000.  ]
 [  6768890.81   6195000.  ]
 [  2948660.95   3535000.  ]
 [  3294843.96   2940000.  ]
 [  3266977.83   3500000.  ]
 [  6920404.23   7980000.  ]
 [  5684404.83   6755000.  ]
 [  3941160.27   3990000.  ]
 [  3797258.9    3150000.  ]
 [  4799342.83   3290000.  ]
 [  4104367.36   4130000.  ]
 [  2725399.31   2660000.  ]
 [  3932760.49   4410000.  ]
 [  4294530.4    3710000.  ]
 [  3553084.86   3360000.  ]
 [  4133718.55   4270000.  ]
 [  6657841.96   5005000.  ]
 [  1651277.45   5383000.  ]
 [  4507600.4    6440000.  ]
 [  2794691.36   1890000.  ]
 [  6117135.86   6125000.  ]
 [  5344981.72   5460000.  ]
 [  6871994.31   5803000.  ]
 [  6423881.4    4620000.  ]
 [  5437059.25   5530000.  ]
 [  5168413.79   5950000.  ]
 [  3757347.78   4305000.  ]
 [  4956701.58   3640000.  ]
 [  8852334.38   5250000.  ]
 [  3138704.15   3325000.  ]
 [  5173028.12

-12.023165290715617

# Decision Trees Regression


In [None]:
from sklearn.tree import DecisionTreeRegressor
decReg = DecisionTreeRegressor(random_state = 0)
decReg.fit(X_train, y_train)

y_pred = decReg.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

print(r2_score(y_test, y_pred))

[[ 4907000.  4585000.]
 [ 5250000.  6083000.]
 [ 5145000.  4007500.]
 [ 5600000.  6930000.]
 [ 2940000.  2940000.]
 [ 6650000.  6195000.]
 [ 3430000.  3535000.]
 [ 4193000.  2940000.]
 [ 5250000.  3500000.]
 [ 6790000.  7980000.]
 [ 5775000.  6755000.]
 [ 3500000.  3990000.]
 [ 4473000.  3150000.]
 [ 4480000.  3290000.]
 [ 4480000.  4130000.]
 [ 2100000.  2660000.]
 [ 4900000.  4410000.]
 [ 4480000.  3710000.]
 [ 3640000.  3360000.]
 [ 3640000.  4270000.]
 [ 7210000.  5005000.]
 [ 9681000.  5383000.]
 [ 5740000.  6440000.]
 [ 2240000.  1890000.]
 [ 6510000.  6125000.]
 [ 6405000.  5460000.]
 [ 5229000.  5803000.]
 [ 5810000.  4620000.]
 [ 4095000.  5530000.]
 [ 5110000.  5950000.]
 [ 3605000.  4305000.]
 [ 4900000.  3640000.]
 [ 5950000.  5250000.]
 [ 3423000.  3325000.]
 [ 3640000.  3703000.]
 [ 5950000.  4753000.]
 [ 5495000.  9100000.]
 [ 3360000.  3500000.]
 [ 3150000.  3150000.]
 [ 4200000.  4270000.]
 [ 8575000.  8960000.]
 [ 5600000.  4060000.]
 [ 5740000.  5740000.]
 [ 4200000.

# Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor
ranReg = RandomForestRegressor(n_estimators = 10, random_state = 0)
ranReg.fit(X_train, y_train)

y_pred = ranReg.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))
print(r2_score(y_test, y_pred))

[[ 4369400.  4585000.]
 [ 5592300.  6083000.]
 [ 4581500.  4007500.]
 [ 6661200.  6930000.]
 [ 3004750.  2940000.]
 [ 7083300.  6195000.]
 [ 3147375.  3535000.]
 [ 3736600.  2940000.]
 [ 3654000.  3500000.]
 [ 7605500.  7980000.]
 [ 6051500.  6755000.]
 [ 3689000.  3990000.]
 [ 4199300.  3150000.]
 [ 4856600.  3290000.]
 [ 4873400.  4130000.]
 [ 2240000.  2660000.]
 [ 4264400.  4410000.]
 [ 4743900.  3710000.]
 [ 3514000.  3360000.]
 [ 3584000.  4270000.]
 [ 7329000.  5005000.]
 [ 6267100.  5383000.]
 [ 4560150.  6440000.]
 [ 2352000.  1890000.]
 [ 6428800.  6125000.]
 [ 4712400.  5460000.]
 [ 5640600.  5803000.]
 [ 5233200.  4620000.]
 [ 4702600.  5530000.]
 [ 5758900.  5950000.]
 [ 3628800.  4305000.]
 [ 4847500.  3640000.]
 [ 6674500.  5250000.]
 [ 3229800.  3325000.]
 [ 5117000.  3703000.]
 [ 4297300.  4753000.]
 [ 6263600.  9100000.]
 [ 3720500.  3500000.]
 [ 3503500.  3150000.]
 [ 4134900.  4270000.]
 [ 8354094.  8960000.]
 [ 6534500.  4060000.]
 [ 6839000.  5740000.]
 [ 3803800.

# Support Vector regression

In [None]:
sc_y = StandardScaler()
y_train_scaled = sc_y.fit_transform(y_train.reshape(-1,1))
y_test_scaled = sc_y.transform(y_test.reshape(-1,1))

print(y_train_scaled[:5,])
print(y_test_scaled[:5])

from sklearn.svm import SVR
reg = SVR(kernel = 'rbf')
reg.fit(X_train_scaled, y_train_scaled)

y_pred = sc_y.inverse_transform(reg.predict(X_test_scaled).reshape(-1,1))
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))
print(r2_score(y_test, y_pred))

[[-1.59]
 [-1.09]
 [-1.  ]
 [-1.15]
 [-0.14]]
[[-0.1 ]
 [ 0.68]
 [-0.4 ]
 [ 1.13]
 [-0.96]]
[[ 4264232.42  4585000.  ]
 [ 6068580.13  6083000.  ]
 [ 3991888.36  4007500.  ]
 [ 5718467.26  6930000.  ]
 [ 2764353.92  2940000.  ]
 [ 6558021.14  6195000.  ]
 [ 3733922.3   3535000.  ]
 [ 3253913.79  2940000.  ]
 [ 3291241.01  3500000.  ]
 [ 7688278.77  7980000.  ]
 [ 6037100.47  6755000.  ]
 [ 3927438.48  3990000.  ]
 [ 3978992.14  3150000.  ]
 [ 5113476.37  3290000.  ]
 [ 4621978.68  4130000.  ]
 [ 2521466.2   2660000.  ]
 [ 4136037.33  4410000.  ]
 [ 4228826.88  3710000.  ]
 [ 3538854.1   3360000.  ]
 [ 3874047.93  4270000.  ]
 [ 5998326.87  5005000.  ]
 [ 5558940.25  5383000.  ]
 [ 4595366.28  6440000.  ]
 [ 2528678.6   1890000.  ]
 [ 6091159.15  6125000.  ]
 [ 5409889.16  5460000.  ]
 [ 5453248.64  5803000.  ]
 [ 5641817.08  4620000.  ]
 [ 4967186.05  5530000.  ]
 [ 5508766.36  5950000.  ]
 [ 3402711.75  4305000.  ]
 [ 4918042.27  3640000.  ]
 [ 6549431.01  5250000.  ]
 [ 2921090.55  33

  y = column_or_1d(y, warn=True)
