In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [21]:
# Load the data
data = pd.read_csv('data/melb_data.csv')
print(data.shape)
data.head(5)

(13580, 21)


Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [22]:
(data.dtypes == 'object').value_counts()

False    13
True      8
dtype: int64

## Get Numerical Variables

In [23]:
melTarget = data.drop(columns=['Price'], axis =1)

In [24]:
X = melTarget.select_dtypes(exclude=['object'])

In [25]:
X.shape

(13580, 12)

In [26]:
y = data['Price']
y.shape

(13580,)

In [27]:
X.isnull().sum()

Rooms               0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
Lattitude           0
Longtitude          0
Propertycount       0
dtype: int64

In [42]:
print(X.isnull().sum()[X.isnull().sum()>0])

Car               62
BuildingArea    6450
YearBuilt       5375
dtype: int64


### Linear Regression Model

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [30]:
def cal_error(X_train, y_train, X_test, y_test):
    model = LinearRegression()
    model.fit(X_train, y_train)
    prediction =  model.predict(X_test)
    error = np.sqrt(mean_squared_error(y_test, prediction))
    return error

## Drop columns with null Value

In [31]:
cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
cols_with_missing

['Car', 'BuildingArea', 'YearBuilt']

In [32]:
reduced_X = X.drop(columns= cols_with_missing, axis=1)

In [33]:
# Divide data into training and validation subsets
X_train, X_test, y_train, y_test = train_test_split(reduced_X, y, train_size=0.8, test_size=0.2,random_state=0)
        

In [34]:
cal_error(X_train, y_train, X_test, y_test)


439866.7023444372


### replace the missing values with mean of the column 
SimpleImputer()

In [35]:
from sklearn.impute import SimpleImputer

In [36]:
X_simpleImp = SimpleImputer().fit_transform(X)
X_simpleImp.shape

(13580, 12)

In [37]:
# Divide data into training and validation subsets
X_train, X_test, y_train, y_test = train_test_split(X_simpleImp, y, train_size=0.8, test_size=0.2,random_state=0)

In [43]:
cal_error(X_train, y_train, X_test, y_test)


426697.91323775664


### replace the missing values with ...
SimpleImputer()

In [47]:
X_simpleImp_median = SimpleImputer(strategy='median').fit_transform(X)
X_simpleImp_median.shape

(13580, 12)

In [52]:
# Divide data into training and validation subsets
X_train, X_test, y_train, y_test = train_test_split(X_simpleImp_median, y, train_size=0.8, test_size=0.2,random_state=0)

In [53]:
cal_error(X_train, y_train, X_test, y_test)


426718.92375815933

## Categorical variables

In [67]:
s = (data.dtypes == object)
obj_cols = s[s].index
print(obj_cols)

Index(['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date', 'CouncilArea',
       'Regionname'],
      dtype='object')


In [68]:
X_data = data.drop(columns=['Price'], axis =1)

### Approach1: Drop Categorical value

In [69]:
X = X_data.drop(columns=obj_cols, axis =1)

### Approach2: Label Encoding

In [86]:
from sklearn.preprocessing import LabelEncoder

In [79]:
X_data_labelEnc = X_data.copy()
X_data_labelEnc.head(3)

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0


In [83]:
for col in ['Type', 'Method','Regionname']:
    print(col)
    X_data_labelEnc[col] = LabelEncoder().fit_transform(X_data[col])

Type
Method
Regionname


In [85]:
X_data_labelEnc.head(3)

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,0,12794,2,0,1,23,45,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,2,4019.0
1,0,5943,2,0,1,23,47,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,2,4019.0
2,0,9814,3,0,3,23,48,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,2,4019.0


### Approach 3 (One-Hot Encoding)

In [87]:
from sklearn.preprocessing import OneHotEncoder

In [110]:
enc = OneHotEncoder(sparse=False,)
X_oneHot = pd.DataFrame(enc.fit_transform(X_data[['Type', 'Method','Regionname']]))


In [112]:
enc.categories_

[array(['h', 't', 'u'], dtype=object),
 array(['PI', 'S', 'SA', 'SP', 'VB'], dtype=object),
 array(['Eastern Metropolitan', 'Eastern Victoria',
        'Northern Metropolitan', 'Northern Victoria',
        'South-Eastern Metropolitan', 'Southern Metropolitan',
        'Western Metropolitan', 'Western Victoria'], dtype=object)]

In [117]:
len(enc.categories_[0])+len(enc.categories_[1])+len(enc.categories_[2])

16

In [96]:
X_oneHot.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [100]:
pd.get_dummies(X_data['Type'], drop_first=True).head(3)

Unnamed: 0,t,u
0,0,0
1,0,0
2,0,0


In [101]:
pd.get_dummies(X_data['Method'], drop_first=True).head(3)

Unnamed: 0,S,SA,SP,VB
0,1,0,0,0
1,1,0,0,0
2,0,0,1,0


In [102]:
pd.get_dummies(X_data['Regionname'], drop_first=True).head(3)

Unnamed: 0,Eastern Victoria,Northern Metropolitan,Northern Victoria,South-Eastern Metropolitan,Southern Metropolitan,Western Metropolitan,Western Victoria
0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0
