In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
data = pd.read_csv('melb_data.csv')
y = data.Price
X = data.drop(['Price'], axis = 1)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X,y, train_size=0.8, test_size=0.2,random_state=0)
# Drop columns with missing values
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]
X_train_full.drop(cols_with_missing, axis = 1, inplace = True)
X_valid_full.drop(cols_with_missing, axis = 1, inplace = True)
# Cardinality is the unique value in column, category low cardinality 
low_cardinalitys_cols =  [col for col in X_train_full.columns if X_train_full[col].nunique() < 10 and X_train_full[col].dtypes == 'object']
# display(low_cardinalitys_cols)
numerical_cols = [col for col in X_train_full.columns if X_train_full[col].dtypes in ['int64','float64']]
# display(numerical_cols)
used_cols = numerical_cols + low_cardinalitys_cols
X_train = X_train_full[used_cols].copy()
X_valid = X_valid_full[used_cols].copy()
X_train.head()


Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
12167,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0,u,S,Southern Metropolitan
6524,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0,h,SA,Western Metropolitan
8413,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0,h,S,Western Metropolitan
2919,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0,u,SP,Northern Metropolitan
6043,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0,h,S,Western Metropolitan


In [14]:
# Get list of categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)
print(object_cols)

['Type', 'Method', 'Regionname']


In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Funtion to evaluate the result return
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(preds, y_valid)

In [16]:
# Approach 1 drop columns with non-numerical values
drop_X_train = X_train.select_dtypes(exclude =['object'])
drop_X_valid = X_valid.select_dtypes(exclude =['object'])
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

175029.99762229118


In [17]:
from sklearn.preprocessing import OrdinalEncoder
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])
display(X_train)
display(label_X_train)
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))


Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
12167,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.98670,13240.0,u,S,Southern Metropolitan
6524,2,8.0,3016.0,2.0,2.0,193.0,-37.85800,144.90050,6380.0,h,SA,Western Metropolitan
8413,3,12.6,3020.0,3.0,1.0,555.0,-37.79880,144.82200,3755.0,h,S,Western Metropolitan
2919,3,13.0,3046.0,3.0,1.0,265.0,-37.70830,144.91580,8870.0,u,SP,Northern Metropolitan
6043,3,13.3,3020.0,3.0,1.0,673.0,-37.76230,144.82720,4217.0,h,S,Western Metropolitan
...,...,...,...,...,...,...,...,...,...,...,...,...
13123,3,5.2,3056.0,3.0,1.0,212.0,-37.77695,144.95785,11918.0,h,SP,Northern Metropolitan
3264,3,10.5,3081.0,3.0,1.0,748.0,-37.74160,145.04810,2947.0,h,S,Eastern Metropolitan
9845,4,6.7,3058.0,4.0,2.0,441.0,-37.73572,144.97256,11204.0,h,PI,Northern Metropolitan
10799,3,12.0,3073.0,3.0,1.0,606.0,-37.72057,145.02615,21650.0,h,S,Northern Metropolitan


Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
12167,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.98670,13240.0,2.0,1.0,5.0
6524,2,8.0,3016.0,2.0,2.0,193.0,-37.85800,144.90050,6380.0,0.0,2.0,6.0
8413,3,12.6,3020.0,3.0,1.0,555.0,-37.79880,144.82200,3755.0,0.0,1.0,6.0
2919,3,13.0,3046.0,3.0,1.0,265.0,-37.70830,144.91580,8870.0,2.0,3.0,2.0
6043,3,13.3,3020.0,3.0,1.0,673.0,-37.76230,144.82720,4217.0,0.0,1.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
13123,3,5.2,3056.0,3.0,1.0,212.0,-37.77695,144.95785,11918.0,0.0,3.0,2.0
3264,3,10.5,3081.0,3.0,1.0,748.0,-37.74160,145.04810,2947.0,0.0,1.0,0.0
9845,4,6.7,3058.0,4.0,2.0,441.0,-37.73572,144.97256,11204.0,0.0,0.0,2.0
10799,3,12.0,3073.0,3.0,1.0,606.0,-37.72057,145.02615,21650.0,0.0,1.0,2.0


166340.29729206115


In [25]:
from sklearn.preprocessing import OneHotEncoder
# Apply one_hot encoder to each columns with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
Numerical_train = X_train.drop(object_cols, axis = 1)
Numerical_valid = X_valid.drop(object_cols, axis = 1)
OH_cols_train = pd.concat([Numerical_train, OH_cols_train], axis = 1)
OH_cols_valid = pd.concat([Numerical_valid, OH_cols_valid], axis = 1)
OH_cols_train.columns = OH_cols_train.columns.astype(str)
OH_cols_valid.columns = OH_cols_valid.columns.astype(str)
print(score_dataset(OH_cols_train, OH_cols_valid, y_train, y_valid))


165178.61892822076
