# Kaggle Categorical Feature Encoding Challenge
#### *Binary classification, with every feature a categorical*

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pandas.api.types import CategoricalDtype 

## Load data

In [4]:
train_raw = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Combine Datasets during FE

In [5]:
train_raw = train_raw.drop(['target'], axis = 1)
train = train_raw.append(test)

## Feature Engineering

#### Check unique values per feature between train and test sets

In [53]:
print("column: train unique values - test unique values")
for column in train.columns:
    print(column + ": " + str(len(train[:300000][column].unique())) + " --> " + str(len(train[300001:][column].unique())))

column: train unique values - test unique values
id: 300000 --> 199999
bin_0: 2 --> 2
bin_1: 2 --> 2
bin_2: 2 --> 2
bin_3: 2 --> 2
bin_4: 2 --> 2
nom_0: 3 --> 3
nom_1: 6 --> 6
nom_2: 6 --> 6
nom_3: 6 --> 6
nom_4: 4 --> 4
nom_5: 222 --> 222
nom_6: 522 --> 522
nom_7: 1220 --> 1219
nom_8: 2215 --> 2214
nom_9: 11981 --> 11839
ord_0: 3 --> 3
ord_1: 5 --> 5
ord_2: 6 --> 6
ord_3: 15 --> 15
ord_4: 26 --> 26
ord_5: 192 --> 192
day: 7 --> 7
month: 12 --> 12


#### Label Encode to binary features

In [54]:
mapping_bins34 = {'T' : 1, 'F' : 0, 'Y' : 1, 'N' : 0}

train['bin_3'] = train['bin_3'].map(mapping_bins34)
test['bin_3'] = test['bin_3'].map(mapping_bins34)

train['bin_4'] = train['bin_4'].map(mapping_bins34)
test['bin_4'] = test['bin_4'].map(mapping_bins34)

#### One Hot Encoding to nominal features

In [55]:
#One hot encode nominal features
low_card_nom = train.columns[train.columns.str.startswith('nom')]

join_train = pd.get_dummies(train[low_card_nom])
join_test = pd.get_dummies(train[low_card_nom])
train = train.drop(low_card_nom, axis = 1)
train = train.join(join_train)
test = test.drop(low_card_nom, axis = 1)
test = test.join(join_train)

#### Hashing for nominal features with high cardinality

In [40]:
# from sklearn.feature_extraction import FeatureHasher

In [47]:
# high_card_nom = ['nom_5','nom_6','nom_7','nom_8','nom_9']

# train_hash = train.copy()
# for col in high_card_nom:
#     train_hash[col]=train_hash[col].astype('str')
# hashing=FeatureHasher(input_type='string')

# train=hashing.transform(train_hash.values)

#### Ordinal encoding for ordinal features with low cardinality

In [37]:
for col in ['ord_1','ord_2']:
    print(col)
    print(train[col].unique())

ord_1
[4 2 0 1 3]
ord_2
['Cold' 'Hot' 'Lava Hot' 'Boiling Hot' 'Freezing' 'Warm']


In [28]:
ord1_dic = {'Novice' : 0, 'Contributor' : 1, 'Expert' : 2, 'Master' : 3, 'Grandmaster' : 4}
ord2_dic = {'Freezing' : 0, 'Cold' : 1, 'Warm' : 2, 'Hot' : 3, 'Boiling Hot' : 4, 'Lava Hot' : 5}

train['ord_1'] = train['ord_1'].map(ord1_dic)
train['ord_2'] = train['ord_2'].map(ord2_dic)

#### For alphabetic ordinal features, sort then map to new values

In [24]:
#sort ord_3 alphabetically then insert into dic then map new values
ord3 = sorted(train['ord_3'].unique())
ord3_dic = {}

i=1
for letter in ord3:
    ord3_dic[letter] = i
    i += 1
    
train['ord_3'] = train['ord_3'].map(ord3_dic)

In [40]:
#sort ord_4 alphabetically then insert into dic then map new values
ord4 = sorted(train['ord_4'].unique())
ord4_dic = {}

i=1
for letter in ord4:
    ord4_dic[letter] = i
    i += 1
    
train['ord_4'] = train['ord_4'].map(ord4_dic)

In [41]:
#sort ord_5 alphabetically then insert into dic then map new values
ord5 = sorted(train['ord_5'].unique())
ord5_dic = {}

i=1
for letter in ord5:
    ord5_dic[letter] = i
    i += 1
    
train['ord_5'] = train['ord_5'].map(ord5_dic)

## Split to Train/Test

In [17]:
train_x = train.drop(['target'], axis = 1)
train_y = train['target']

X_train,X_test,y_train,y_test = train_test_split(train_x,train_y,test_size=0.2)