# V 0.2

- Kaggle score: 0.797
- AUC on the hold-out split (`X_test`): 0.793

In [1]:
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, Reshape, Dense, \
                                    Concatenate, BatchNormalization, Dropout
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils
import tensorflow as tf

import os
import pandas as pd
import numpy as np
import gc

import seaborn as sns
import matplotlib.pyplot as plt

from pandas.api.types import CategoricalDtype 

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn import preprocessing

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Directory the competition CSVs are read from, and directory the submission file is written to.
# Both end with '/' because later cells build file paths by plain string concatenation.
path_input = './input/'
path_output = './output/'

In [3]:
# Sanity check: show which files are available in the input directory.
print(os.listdir(path_input))

['sample_submission.csv', 'test.csv', 'train.csv']


In [4]:
# Load the labelled training set, the unlabelled test set and the submission template.
train, test, submission = (
    pd.read_csv(path_input + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [5]:
# The row identifier is not used as a feature; remove it from both frames.
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

In [6]:
# Preview the first rows of the training frame after dropping the id column.
train.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [7]:
# List the remaining columns (features plus the target).
train.columns

Index(['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1', 'nom_2',
       'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9', 'ord_0',
       'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5', 'day', 'month', 'target'],
      dtype='object')

In [8]:
 # Distinct values of ord_3, used to define its category order further down.
 train.ord_3.unique()

array(['h', 'a', 'i', 'j', 'g', 'e', 'd', 'b', 'k', 'f', 'l', 'n', 'o',
       'c', 'm'], dtype=object)

In [9]:
 # Distinct values of ord_0, which is already numeric.
 train.ord_0.unique()

array([2, 1, 3], dtype=int64)

## Binary Features

In [10]:
# bin_3 uses T/F and bin_4 uses Y/N; map both to 1/0 in train and test.
bin_dict = {'T': 1, 'F': 0, 'Y': 1, 'N': 0}

for frame in (train, test):
    for col in ('bin_3', 'bin_4'):
        frame[col] = frame[col].map(bin_dict)

In [11]:
# Confirm that bin_3 and bin_4 now hold 0/1 values.
train.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,1,1,Green,Triangle,Snake,Finland,Bassoon,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,0,1,0,1,1,Green,Trapezoid,Hamster,Russia,Piano,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,0,0,0,0,1,Blue,Trapezoid,Lion,Russia,Theremin,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,0,1,0,0,1,Red,Trapezoid,Snake,Canada,Oboe,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,0,0,0,0,0,Red,Trapezoid,Lion,Canada,Oboe,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


## Nominal Features 

In [12]:
# Tag the test rows with a sentinel target so both frames can be encoded
# together and separated again afterwards.
test['target'] = 'test'
df = pd.concat([train, test], axis=0, sort=False)

# One-hot encode nom_0 .. nom_4, dropping the first level of each column.
nominal_cols = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']
print(f'Shape before dummy transformation: {df.shape}')
df = pd.get_dummies(df, columns=nominal_cols, prefix=nominal_cols, drop_first=True)
print(f'Shape after dummy transformation: {df.shape}')


Shape before dummy transformation: (500000, 24)
Shape after dummy transformation: (500000, 39)


In [13]:
# Separate the combined frame back into train and test using the sentinel target.
is_test = df['target'] == 'test'
# Take an explicit copy: later cells assign columns on `train`, and assigning on a
# filtered slice of `df` triggers pandas' SettingWithCopyWarning.
train = df[~is_test].copy()
test = df[is_test].drop('target', axis=1)
# The combined frame is no longer needed; release its memory.
del df
gc.collect()

89

In [14]:
# Preview the training frame after one-hot encoding nom_0 .. nom_4.
train.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_5,nom_6,nom_7,nom_8,nom_9,...,nom_2_Lion,nom_2_Snake,nom_3_China,nom_3_Costa Rica,nom_3_Finland,nom_3_India,nom_3_Russia,nom_4_Oboe,nom_4_Piano,nom_4_Theremin
0,0,0,0,1,1,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,...,0,1,0,0,1,0,0,0,0,0
1,0,1,0,1,1,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21,...,0,0,0,0,0,0,1,0,1,0
2,0,0,0,0,1,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0,...,1,0,0,0,0,0,1,0,0,1
3,0,1,0,0,1,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71,...,0,1,0,0,0,0,0,1,0,0
4,0,0,0,0,0,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7,...,1,0,0,0,0,0,0,1,0,0


## Ordinal Features

In [15]:
# Explicit rank order (lowest to highest) for each ordinal feature.
ord_1 = CategoricalDtype(
    categories=['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster'],
    ordered=True,
)
ord_2 = CategoricalDtype(
    categories=['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot'],
    ordered=True,
)
# ord_3 and ord_4 are single letters ranked alphabetically.
ord_3 = CategoricalDtype(categories=list('abcdefghijklmno'), ordered=True)
ord_4 = CategoricalDtype(categories=list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'), ordered=True)

In [16]:
# Cast each ordinal column to its ordered categorical dtype, in train and then in test.
ordinal_dtypes = {'ord_1': ord_1, 'ord_2': ord_2, 'ord_3': ord_3, 'ord_4': ord_4}

for frame in (train, test):
    for col, cat_dtype in ordinal_dtypes.items():
        frame[col] = frame[col].astype(cat_dtype)

In [17]:
# Check that ord_4 is now an ordered categorical.
train.ord_4.head()

0    D
1    A
2    R
3    D
4    R
Name: ord_4, dtype: category
Categories (26, object): [A < B < C < D ... W < X < Y < Z]

In [18]:
# Replace each ordered categorical with its integer code (its position in the
# category order), in train and then in test.
ordinal_cols = ['ord_1', 'ord_2', 'ord_3', 'ord_4']

for frame in (train, test):
    for col in ordinal_cols:
        frame[col] = frame[col].cat.codes

In [19]:
# Spot-check the integer-coded ordinal columns.
train[['ord_0', 'ord_1', 'ord_2', 'ord_3']].head()

Unnamed: 0,ord_0,ord_1,ord_2,ord_3
0,2,4,1,7
1,1,4,3,0
2,1,2,5,7
3,1,4,4,8
4,1,4,0,0


## Label Encoding Features

In [20]:
# Stack train and test (test rows tagged with a sentinel target) so that each
# encoder is fitted on the categories of both frames.
test['target'] = 'test'
df = pd.concat([train, test], axis=0, sort=False)

# Columns that get a plain integer label per distinct value.
lencoder = ['ord_5', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9', 'day', 'month']

for feat in lencoder:
    lbl_enc = preprocessing.LabelEncoder()
    df[feat] = lbl_enc.fit_transform(df[feat].to_numpy())

In [21]:
# Separate the combined frame back into train and test using the sentinel target.
is_test = df['target'] == 'test'
# Take an explicit copy: `train` is modified afterwards, and modifying a filtered
# slice of `df` triggers pandas' SettingWithCopyWarning.
train = df[~is_test].copy()
test = df[is_test].drop('target', axis=1)

## Modelling

In [22]:
# The target column shared a frame with the 'test' sentinel string, so it is
# object-typed here; cast it back to integers before handing it to the model.
y = train['target'].astype(int)
# Rebind instead of dropping in place: `train` may be a slice of `df`, and an
# in-place drop on a slice emits SettingWithCopyWarning.
train = train.drop(columns=['target'])
df = df.drop(columns=['target'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [23]:
# Every training column is fed to the model as a categorical input, except
# identifiers, the target and the cyclical date columns if they are present.
excluded_cols = {"id", "target", 'day_sin', 'day_cos', 'month_sin', 'month_cos'}
features = [col for col in train.columns if col not in excluded_cols]
#cont_cols = [x for x in train.columns if x in ['day_sin', 'day_cos', 'month_sin', 'month_cos']]

In [24]:
# Hold out 2% of the labelled rows for validation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.02, random_state = 12)

In [25]:
def auc(y_true, y_pred):
    """ROC AUC metric for Keras, computed with scikit-learn via ``tf.py_function``.

    The callbacks in this notebook monitor ``'val_auc'``, so this function's
    name must stay ``auc`` for them to find the metric.

    Args:
        y_true: tensor of ground-truth labels for the current batch.
        y_pred: tensor of predicted scores for the current batch.

    Returns:
        A ``tf.double`` tensor holding the batch ROC AUC, or 0.5 when the score
        cannot be computed for the batch.
    """
    def fallback_auc(y_true, y_pred):
        try:
            return roc_auc_score(y_true, y_pred)
        except ValueError:
            # roc_auc_score raises ValueError when the AUC is undefined (for
            # example, a batch that contains a single class). Report the chance
            # level instead. Other exceptions are real bugs and must propagate,
            # which the previous bare `except:` prevented.
            return 0.5
    return tf.py_function(fallback_auc, (y_true, y_pred), tf.double)

In [49]:
def create_model(data, catcols, embed_dropout=0.3, hidden_units=(300, 300),
                 hidden_dropout=0.3, max_embed_dim=50):
    """Build and compile an entity-embedding network with one input per column.

    Each column in ``catcols`` gets its own ``Input`` and ``Embedding``. The
    embeddings are concatenated and passed through a stack of
    Dense -> Dropout -> BatchNormalization blocks, ending in one sigmoid unit.

    Args:
        data: DataFrame used only to count the distinct values per column,
            which determines each embedding's vocabulary size and width.
        catcols: names of the integer-coded columns to embed. The model expects
            its inputs as a list of arrays in this same order.
        embed_dropout: SpatialDropout1D rate applied to every embedding.
        hidden_units: width of each hidden Dense layer, in order.
        hidden_dropout: Dropout rate applied after each hidden Dense layer.
        max_embed_dim: upper bound on the width of any single embedding.

    Returns:
        A compiled ``Model`` (binary cross-entropy loss, Adam optimizer, ``auc`` metric).

    Raises:
        ValueError: if ``catcols`` is empty.
    """
    catcols = list(catcols)
    if not catcols:
        raise ValueError("catcols must contain at least one column name")

    inputs = []
    embedded = []
    for c in catcols:
        num_unique_values = int(data[c].nunique())
        # Embedding width: half the cardinality (rounded up), capped at max_embed_dim.
        embed_dim = int(min(np.ceil(num_unique_values / 2), max_embed_dim))

        inp = Input(shape=(1,))
        # One extra row beyond the distinct count, so codes that start at 1
        # rather than 0 still index inside the table.
        # NOTE(review): assumes codes never exceed the distinct count - confirm for new columns.
        out = Embedding(num_unique_values + 1, embed_dim)(inp)
        out = SpatialDropout1D(embed_dropout)(out)
        # (batch, 1, embed_dim) -> (batch, embed_dim)
        out = Reshape(target_shape=(embed_dim,))(out)
        inputs.append(inp)
        embedded.append(out)

    # Concatenate needs at least two tensors; a single embedding is used as is.
    x = Concatenate()(embedded) if len(embedded) > 1 else embedded[0]
    x = BatchNormalization()(x)

    for units in hidden_units:
        x = Dense(units, activation="relu")(x)
        x = Dropout(hidden_dropout)(x)
        x = BatchNormalization()(x)

    output = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=inputs, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[auc])

    return model

In [50]:
# Size the embeddings from `df` (train and test combined) so that categories
# appearing in either frame are counted.
model = create_model(df, features)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_77 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_78 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_79 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_80 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_81 (

In [51]:
# Stop once validation AUC has not improved by at least 0.001 for 5 epochs,
# and restore the weights of the best epoch.
early_stopping = callbacks.EarlyStopping(monitor='val_auc', min_delta=0.001, patience=5,
                                 verbose=1, mode='max', baseline=None, restore_best_weights=True)

In [52]:
# Halve the learning rate when validation AUC plateaus for 3 epochs, down to a floor of 1e-6.
rlr = callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.5,
                                      patience=3, min_lr=1e-6, mode='max', verbose=1)

In [None]:
def _to_model_inputs(frame, cols):
    """Return frame[cols] as a list of 1-D arrays, one per column, in `cols` order.

    The multi-input model takes one array per embedded column. The 2-D value
    matrix is built once here instead of once per column.
    """
    values = frame.loc[:, cols].values
    return [values[:, k] for k in range(values.shape[1])]


# NOTE: X_train and X_test are rebound from DataFrames to lists, so this cell
# cannot be re-run without re-running the train/validation split first.
X_train = _to_model_inputs(X_train, features)
X_test = _to_model_inputs(X_test, features)
test_model = _to_model_inputs(test, features)

In [54]:
# Train for up to 100 epochs; the callbacks stop early and lower the learning
# rate based on AUC measured on the hold-out split.
model.fit(  X_train, y_train,
            validation_data = (X_test, y_test),
            callbacks = [early_stopping, rlr],
            batch_size = 256, epochs = 100, verbose = 1)

Train on 294000 samples, validate on 6000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 6/100
Epoch 00006: early stopping


<tensorflow.python.keras.callbacks.History at 0x1a19e3aa2b0>

In [55]:
# Score the hold-out split with the trained model.
y_pred = model.predict(X_test, verbose = 1)



In [56]:
# ROC AUC on the hold-out split: labels as integers, predictions flattened to one score per row.
holdout_labels = np.array(y_test).astype(int)
holdout_scores = y_pred.reshape(y_pred.shape[0])
roc_auc_score(holdout_labels, holdout_scores)

0.7880006599979703

## Submission

In [None]:
# Predict on the competition test set.
y_preds = model.predict(test_model, verbose = 1)

In [None]:
# The model ends in a single sigmoid unit, so predictions come back with one
# column; flatten them to one score per row before storing them.
submission['target'] = np.ravel(y_preds)
# Create the output directory if it does not exist, so the write cannot fail on a fresh checkout.
os.makedirs(path_output, exist_ok=True)
submission.to_csv(path_output + 'DL_1_2.csv', index = False)

In [None]:
# Mean predicted score, as a quick sanity check on the submission.
submission.target.mean()

In [None]:
# Preview the first rows of the submission frame.
submission.head()