In [10]:
import math
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

%matplotlib inline

In [46]:
# Load the housing data and separate the target (Price) from the features.
data = pd.read_csv('melb_data.csv')

y = data.Price
X = data.drop(columns=['Price'])

# 80/20 train/validation split; the fixed random_state keeps it reproducible.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Drop every column that has at least one missing value in the training split.
# The frames are rebound instead of using inplace=True, so the objects returned
# by train_test_split are never mutated behind the reader's back.
cols_with_missing = [
    name for name in X_train_full.columns if X_train_full[name].isnull().any()
]
X_train_full = X_train_full.drop(columns=cols_with_missing)
X_valid_full = X_valid_full.drop(columns=cols_with_missing)

# Categorical (object) columns with fewer than 10 distinct training values.
low_cardinality_cols = [
    name
    for name in X_train_full.columns
    if X_train_full[name].dtype == 'object' and X_train_full[name].nunique() < 10
]

# Columns stored as int64 or float64.
numerical_cols = [
    name
    for name in X_train_full.columns
    if X_train_full[name].dtype in ['int64', 'float64']
]

# Keep only the selected columns, categorical first, then numerical.
my_cols = low_cardinality_cols + numerical_cols

X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [22]:
# Preview the first five training rows (categorical columns come first).
X_train.head(n=5)

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [34]:
# Boolean mask over the columns of X_train: True where the dtype is 'object'.
s = X_train.dtypes == 'object'
# Names of the columns flagged by the mask, in their original order.
object_cols = [column for column, is_object in s.items() if is_object]

In [70]:
# Display the list of object-dtype (categorical) column names.
object_cols

['Type', 'Method', 'Regionname']

In [41]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def data_scorer(X_train, X_valid, y_train, y_valid, n_estimators=100, random_state=0):
    """Fit a random forest on the training data and return its validation MAE.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_valid, y_valid
        Features and target used to evaluate the fitted model.
    n_estimators : int, default 100
        Passed to ``RandomForestRegressor`` as ``n_estimators``.
    random_state : int, default 0
        Passed to ``RandomForestRegressor`` as ``random_state`` so repeated
        calls with the same data give the same score.

    Returns
    -------
    The value of ``mean_absolute_error(y_valid, predictions)``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [47]:
# Approach 1: drop the categorical columns.
# Both splits keep only their non-object columns before scoring.
drop_X_train, drop_X_valid = (
    frame.select_dtypes(exclude=['object']) for frame in (X_train, X_valid)
)

data_scorer(drop_X_train, drop_X_valid, y_train, y_valid)

175703.48185157913

In [51]:
from sklearn.preprocessing import OrdinalEncoder

# Approach 2: ordinal encoding.
# Work on copies so the original splits stay unchanged.
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# With default settings, transform() raises on a category that was not seen
# during fit. Encode such categories as -1 so a value that only appears in the
# validation split cannot crash the cell.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Fit on the training split only, then reuse the fitted mapping for validation.
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

data_scorer(label_X_train, label_X_valid, y_train, y_valid)


165936.40548390493

In [56]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder shared by the cells below: dense output, and categories
# unseen during fit are ignored instead of raising.
OH_encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
)

In [69]:
# Approach 3: one-hot encoding.
# Fit on the training split, reuse the fitted encoder on the validation split.
# Each encoded frame takes the row index of its source split, and its
# positional column labels become 'encoded_0', 'encoded_1', ...
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[object_cols]), index=X_train.index
).add_prefix('encoded_')
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[object_cols]), index=X_valid.index
).add_prefix('encoded_')

# Remaining columns once the categorical ones are removed.
num_X_train = X_train.drop(object_cols, axis='columns')
num_X_valid = X_valid.drop(object_cols, axis='columns')

# Side-by-side join of the remaining columns and the one-hot columns.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis='columns')
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis='columns')

data_scorer(OH_X_train, OH_X_valid, y_train, y_valid)


166089.4893009678

In [74]:
# Inspect the one-hot encoding of a single column.
# A throwaway encoder is used on purpose: calling fit_transform on the shared
# OH_encoder would re-fit it on 'Regionname' only and break any later
# OH_encoder.transform(...) call on the full set of categorical columns.
pd.DataFrame(
    OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    .fit_transform(X_train[['Regionname']])
)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
10859,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10860,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10861,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10862,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
