In [1]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

# Read the training and test splits (in that order) from CSV files that are
# resolved relative to the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [2]:
%%time

# Separate the label and the row identifiers from the feature frames.
# `pop` returns the column and deletes it from the frame in place, so running
# this cell a second time without reloading the data raises a KeyError.
target = train.pop('target')
train_id = train.pop('id')
test_id = test.pop('id')

# Preview the remaining feature columns.
train.head()

Wall time: 96.9 ms


Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,...,c389000ab,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,...,4cd920251,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,...,de9c9f684,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,...,4ade6ab69,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,...,cb43ab175,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8


In [3]:
# Transform the cyclical features into two-dimensional sin-cos features
# https://www.kaggle.com/avanwyk/encoding-cyclical-features-for-deep-learning
def cyclical_encode(data, col, max_val):
    """Add sine and cosine encodings of a cyclical column to ``data``.

    Each value of ``data[col]`` is converted to the angle
    ``2 * pi * value / max_val``; the sine and cosine of that angle are
    written to new columns named ``col + '_sin'`` and ``col + '_cos'``.

    Parameters
    ----------
    data : frame-like object supporting ``data[col]`` access and assignment.
        It is modified in place.
    col : str
        Name of the column to encode. The column itself is left untouched.
    max_val : number
        Divisor used to scale the values onto one full turn.

    Returns
    -------
    The same ``data`` object that was passed in, now carrying the two
    additional columns.
    """
    # Same left-to-right evaluation order as writing the expression inline.
    angle = 2 * np.pi * data[col] / max_val
    for suffix, trig in (('_sin', np.sin), ('_cos', np.cos)):
        data[col + suffix] = trig(angle)
    return data

# Encode each cyclical column with its divisor (7 for 'day', 12 for 'month'),
# processing the training frame before the test frame each time.
for cyc_col, period in (('day', 7), ('month', 12)):
    train = cyclical_encode(train, cyc_col, period)
    test = cyclical_encode(test, cyc_col, period)

# The raw columns are now represented by their sin/cos pairs; remove them
# from both frames in place.
for frame in (train, test):
    frame.drop(columns=['day', 'month'], inplace=True)

train.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day_sin,day_cos,month_sin,month_cos
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,...,2,Grandmaster,Cold,h,D,kr,0.9749279,-0.222521,0.866025,0.5
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,...,1,Grandmaster,Hot,a,A,bF,-2.449294e-16,1.0,-0.866025,-0.5
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,...,1,Expert,Lava Hot,h,R,Jc,-2.449294e-16,1.0,0.866025,0.5
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,...,1,Grandmaster,Boiling Hot,i,D,kW,0.9749279,-0.222521,0.5,0.866025
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,...,1,Grandmaster,Freezing,a,R,qP,-2.449294e-16,1.0,-0.866025,-0.5


In [4]:
# Ordinal encoders for ord_1 to ord_4, which have few unique values.
# Every level is mapped to its 1-based position in the ordered list below.
mapper_ord_1 = {
    level: rank
    for rank, level in enumerate(
        ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster'], start=1)
}

mapper_ord_2 = {
    level: rank
    for rank, level in enumerate(
        ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot'], start=1)
}

# Covers the first 15 lowercase letters: 'a' -> 1 through 'o' -> 15.
mapper_ord_3 = {
    letter: rank
    for rank, letter in enumerate(string.ascii_lowercase[:15], start=1)
}

# Covers all 26 uppercase letters: 'A' -> 1 through 'Z' -> 26.
mapper_ord_4 = {
    letter: rank
    for rank, letter in enumerate(string.ascii_uppercase, start=1)
}

# https://www.kaggle.com/asimandia/let-s-try-some-feature-engineering
# Count encoding for ord_1: each level is replaced by the number of rows that
# carry it across the training and test frames combined.
traintest = pd.concat([train, test])
ord_1_counts = traintest['ord_1'].value_counts().to_dict()
for frame in (train, test):
    frame['ord_1_count'] = frame['ord_1'].map(ord_1_counts)

# Ordinal-encode ord_1..ord_4 into new '<col>_oe' columns, then drop the raw
# string columns from both frames.
#
# Series.map performs a direct dictionary lookup per value and yields a
# numeric column. Series.replace with a dict was being used as a lookup
# table: it is considerably slower on large frames and depends on an implicit
# object-to-integer downcast that recent pandas releases deprecate.
# Note: with map, a value that is absent from the mapper becomes NaN instead
# of being passed through as its original string.
for col, mapper in zip(['ord_1', 'ord_2', 'ord_3', 'ord_4'],
                       [mapper_ord_1, mapper_ord_2, mapper_ord_3, mapper_ord_4]):
    for frame in (train, test):
        frame[col + '_oe'] = frame[col].map(mapper)
        frame.drop(columns=col, inplace=True)

SyntaxError: invalid syntax (<ipython-input-4-b0d345cc966f>, line 28)