In [1]:
from sklearn.preprocessing import OneHotEncoder

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the Melbourne housing data; `Price` is the prediction target and
# every other column is a candidate feature.
data = pd.read_csv("./csv/melb_data.csv")

y = data.Price
X = data.drop(['Price'], axis=1)

# Hold out 20% of the rows for testing.  The seed is pinned so that a
# "Restart Kernel -> Run All" reproduces the exact same split; without it the
# rows (and therefore which columns contain missing values, which categories
# the encoder sees, ...) change on every run.
RANDOM_SEED = 0
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=RANDOM_SEED
)

In [7]:
# Columns with at least one null in the training split are removed from both
# splits, so train and test keep an identical set of features.
cols_with_missing_value = X_train.columns[X_train.isnull().any().to_numpy()].tolist()
X_train = X_train.drop(columns=cols_with_missing_value)
X_test = X_test.drop(columns=cols_with_missing_value)

Keep the numerical columns and the categorical columns with low cardinality; drop all other columns.

In [10]:
# dtypes treated as numerical features
numerical_dtypes = ['int64', 'float64']

# numerical columns, in their original column order
numerical_cols = [
    name
    for name, kind in X_train.dtypes.items()
    if kind in numerical_dtypes
]

# text (object) columns with fewer than 10 distinct values in the training split
low_cardinality_cols = [
    name
    for name, kind in X_train.dtypes.items()
    if kind == 'object' and X_train[name].nunique() < 10
]

# restrict both splits to these two groups (categoricals first, then numericals);
# .copy() gives independent frames rather than views of the originals
final_cols = low_cardinality_cols + numerical_cols
X_train = X_train.loc[:, final_cols].copy()
X_test = X_test.loc[:, final_cols].copy()

Get the list of categorical (object-dtype) columns that remain in the training data.

In [11]:
# Names of the columns whose dtype is `object`, in column order
object_cols = X_train.columns[(X_train.dtypes == 'object').to_numpy()].tolist()

print(object_cols)

['Type', 'Method', 'Regionname']


One Hot Encoder

In [14]:
# Build the encoder.
#   handle_unknown='ignore' -> a category seen only at transform time is
#                              encoded as all zeros instead of raising.
#   sparse_output=False     -> return a dense NumPy array rather than a SciPy
#                              sparse matrix, so it can be wrapped in a DataFrame.
# The keyword was called `sparse` before scikit-learn 1.2 and the old name was
# removed in 1.4 (passing it raises TypeError), so try the current name first
# and fall back to the legacy one on older installations.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Learn the categories from the training split only, then apply the same
# mapping to the test split.
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[object_cols]))

# The encoder returns plain arrays, so the new frames get a fresh 0..n-1 index;
# restore the original row labels so the concat below aligns row by row.
OH_cols_train.index = X_train.index
OH_cols_test.index = X_test.index

# Remove the original categorical columns (they are replaced by the encoded ones).
# NOTE: this rebinds X_train / X_test, so this cell cannot be re-run on its own
# without re-running the cells above it first.
X_train = X_train.drop(object_cols, axis=1)
X_test = X_test.drop(object_cols, axis=1)

# Numerical columns followed by the one-hot encoded columns
OH_X_train = pd.concat([X_train, OH_cols_train], axis=1)
OH_X_test = pd.concat([X_test, OH_cols_test], axis=1)

# The encoded columns are labelled with integers (0, 1, ...); cast every label
# to str so the frame does not mix string and integer column names.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

