# Housing Price Model

## Importing Libraries and Data

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
data = pd.read_csv(r"D:\Utkarsh Mathur\Career\Data Science\Datasets\melbourne housing snapshot\melb_data.csv")

## Data Pre-processing and  Preparation

In [3]:
data.isnull().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [4]:
data.Car = data.Car.fillna(data.Car.median())
data.isnull().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

Initially the data has 20 features and a price columns. But we need to remove some of the uncessary and overlapping features so as to make a better data.<br>
<br>
I'm decribing why all the features are used:<br>
1) **Rooms, Bathrooms and Car**:- Because they contains the basic information of the built of the house.<br>
2) **Type**:- Because this carries the information of the type of property.<br>
3) **Price**:- Because it's the ultimate aim to predic prices.<br>
4) **Distance**:- Because it gives the estimate of location of the house and vicinity from CBD (City Center).<br>
5) **Landsize**:- It conveys the size of the property.<br>
6) **Lattitude and Longtitude**:- Because they give the exact location of the house eliminating many covarient features like Region, Postcode and Suburb.<br>
7) **PropertyCount**:- This is giving the estimate of quality of the neighbourhood

In [5]:
data1 = data.drop(['Suburb','Address','Method','BuildingArea','Bedroom2','Postcode', 'YearBuilt','SellerG', 'CouncilArea','Regionname', 'Date'], axis=1)
data1.head()

Unnamed: 0,Rooms,Type,Price,Distance,Bathroom,Car,Landsize,Lattitude,Longtitude,Propertycount
0,2,h,1480000.0,2.5,1.0,1.0,202.0,-37.7996,144.9984,4019.0
1,2,h,1035000.0,2.5,1.0,0.0,156.0,-37.8079,144.9934,4019.0
2,3,h,1465000.0,2.5,2.0,0.0,134.0,-37.8093,144.9944,4019.0
3,3,h,850000.0,2.5,2.0,1.0,94.0,-37.7969,144.9969,4019.0
4,4,h,1600000.0,2.5,1.0,2.0,120.0,-37.8072,144.9941,4019.0


In [6]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()

In [7]:
data1['Type'] = lb.fit_transform(data1['Type'])
data1.head()

Unnamed: 0,Rooms,Type,Price,Distance,Bathroom,Car,Landsize,Lattitude,Longtitude,Propertycount
0,2,0,1480000.0,2.5,1.0,1.0,202.0,-37.7996,144.9984,4019.0
1,2,0,1035000.0,2.5,1.0,0.0,156.0,-37.8079,144.9934,4019.0
2,3,0,1465000.0,2.5,2.0,0.0,134.0,-37.8093,144.9944,4019.0
3,3,0,850000.0,2.5,2.0,1.0,94.0,-37.7969,144.9969,4019.0
4,4,0,1600000.0,2.5,1.0,2.0,120.0,-37.8072,144.9941,4019.0


In [8]:
data1.Lattitude = data1.Lattitude - data1.Lattitude.mean()
data1.Longtitude = data1.Longtitude - data1.Longtitude.mean() 

## Test Train Splits

In [9]:
y = data1.iloc[:, 3].values

In [10]:
X = data1.drop(['Price'],axis=1).values

In [11]:
from sklearn.neighbors import LocalOutlierFactor
lof = LocalOutlierFactor(novelty=True)
# lof.fit(X)

In [12]:
X.shape

(13580, 9)

In [13]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
# X = sc_X.fit_transform(X)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=0)
x_train.shape, x_test.shape

((12222, 9), (1358, 9))

## Model - Polynomial Regression

Here in building my model I'm using Polynomial Multiple Linear Regresssion with a degree of 3.

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
lr = LinearRegression(normalize = False)

In [17]:
lr.fit(poly.fit_transform(x_train),y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
pred = lr.predict(poly.fit_transform(x_test))

In [19]:
from sklearn.metrics import r2_score
r2_score(y_test,pred)

0.999999999999997

## Model - Lasso Regression

In [20]:
from sklearn.linear_model import Lasso
clf = Lasso(alpha=0.1)

In [21]:
clf.fit(x_train,y_train)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [22]:
pred1 = clf.predict(x_test)

In [23]:
r2_score(y_test,pred1)

0.9999916206262129

## Model - Decision Tree Regressor

In [24]:
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor(random_state=12)

In [25]:
dtr.fit(x_train,y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=12, splitter='best')

In [26]:
pred2 = dtr.predict(x_test)

In [27]:
r2_score(y_test,pred2)

0.9999942091830432