%tensorflow_version 2.x 

In [4]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tqdm import tqdm

tf.__version__

'2.0.0'

In [5]:
# Hyper-parameters (values identified by grid search)
params = dict(
    PCA_comp=None,      # number of PCA components to keep, None disables PCA
    activation='elu',   # activation used in the hidden layers
    amsgrad=True,       # Adam: use the AMSGrad variant
    batchsize=512,      # mini-batch size
    beta1=0.5,          # Adam: beta_1
    beta2=0.99,         # Adam: beta_2
    epoch=30,           # number of training epochs
    lr=0.0001,          # Adam: learning rate
    nfold=5,            # number of folds for the cross validation
)

 

## Load Data

In [6]:
# Read the train and test csv files into dataframes
train_path = '../data/generated/train_eng.csv'
test_path = '../data/generated/test_eng.csv'
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

In [7]:
# Check that there are no null values: print the total count for train, then test
for frame in (train, test):
    print(frame.isnull().sum().sum())

0
0


## Apply KFold Split
Instead of looping on `kf.split` during training, keep the nfold test splits as folds and merge 4 of them to build the train data (the remaining one is used for validation).

In [8]:
# Shuffle with a fixed seed and keep only the held-out (test) part of every split:
# each entry of train_folds is one fold of the training data.
kf = KFold(n_splits=params['nfold'], shuffle=True, random_state=750)
train_folds = [train.iloc[held_out_idx, :] for _, held_out_idx in kf.split(train)]

for fold in train_folds:
  print(fold.shape)

(4968, 141)
(4968, 141)
(4968, 141)
(4968, 141)
(4968, 141)


In [9]:
def PowerData(df, apply=False):
    '''
    Optionally add power features for every non categorical column.

    df:
        dataframe holding the features; columns whose name starts with
        'Cat_' are considered categorical and are never expanded
    apply:
        if False (default) the dataframe is returned as is, i.e. the feature
        expansion is switched off; if True a new dataframe is returned with,
        for each non categorical column, its square (_x2), its cube (_x3)
        and its square root (_x0_5) appended
    '''
    # The expansion is opt-in: by default the data goes through unchanged
    if not apply:
        return df

    square_columns = [col for col in df.columns[~df.columns.str.startswith('Cat_')]]

    print('Nb columns to square :', len(square_columns))

    # Build all the new columns first and concatenate once: the caller's
    # dataframe is left untouched and is not grown column by column.
    powered = {}
    for col in square_columns:
        powered[f'{col}_x2'] = df[col]**2
        powered[f'{col}_x3'] = df[col]**3
        # NOTE: the square root is NaN for negative values
        powered[f'{col}_x0_5'] = df[col]**0.5

    return pd.concat([df, pd.DataFrame(powered, index=df.index)], axis=1)

Build a normalization helper that applies a standard scaler and, if required, PCA.

In [10]:
def NormalizeData(train, CVorTest, PCA_comp = None, ScaleCat = False):
    '''
    Normalize data using a standard scaler, then optionally reduce it with PCA.
    The scaler and the PCA are fitted on train only and applied to both
    dataframes. The dataframes passed in are not modified.

    train:
        dataframe used to fit the scaler and the PCA, and transformed by them
    CVorTest:
        dataframe transformed by the scaler and the PCA fitted on train
    PCA_comp:
        Number of PCA components to keep, if None, PCA is not applied
    ScaleCat:
        Scale or not the categorical columns (name starting with 'Cat_')
        with the standard scaler

    Returns the transformed train and CVorTest as two arrays.
    '''
    # Work on copies so the caller's dataframes are left untouched
    train = train.copy()
    CVorTest = CVorTest.copy()

    if ScaleCat:
        scale_columns = list(train.columns)
    else:
        scale_columns = list(train.columns[~train.columns.str.startswith('Cat_')])

    #perform feature scaling
    print('Nb columns to scale :', len(scale_columns))
    # The scaler cannot be fitted on zero columns: skip scaling in that case
    if scale_columns:
        sc = StandardScaler()
        # Replace the whole columns instead of writing through .loc: integer
        # columns then simply become float columns, with no in-place upcasting.
        train[scale_columns] = sc.fit_transform(train[scale_columns])
        CVorTest[scale_columns] = sc.transform(CVorTest[scale_columns])

    if PCA_comp is None:
        return train.values, CVorTest.values

    pca = PCA(PCA_comp)
    train = pca.fit_transform(train)
    CVorTest = pca.transform(CVorTest)

    return train, CVorTest

In [11]:
def BuildModel(nb_features, activation = 'leakyRelu', Batchnorm=True, layers_dim=None):
  '''
    Build NN model : a stack of Dense hidden layers followed by a Dense(4)
    layer and a softmax activation
    nb_features:
      number of features in input
    activation:
      Activation to use in hidden layer : leakyRelu, relu, sigmoid, linear, elu
    Batchnorm:
      Apply or not batchnorm in hidden layers
    layers_dim:
      list with the number of units of each hidden layer, if None the
      default architecture [512, 1024, 256, 128, 128, 128, 64] is used
  '''
  if layers_dim is None:
    layers_dim = [512, 1024, 256, 128, 128, 128, 64]

  model = tf.keras.models.Sequential()

  # Only the first layer of the model has to declare the input shape; the
  # kwargs are emptied once they have been consumed by that first layer.
  first_layer_kwargs = {'input_shape': (nb_features,)}

  for dim in layers_dim:
    model.add(tf.keras.layers.Dense(dim, **first_layer_kwargs))
    first_layer_kwargs = {}
    if Batchnorm:
      model.add(tf.keras.layers.BatchNormalization())
    if activation == 'leakyRelu':
      model.add(tf.keras.layers.LeakyReLU(alpha=0.05))
    else:
      model.add(tf.keras.layers.Activation(activation))

  # Output : 4 units followed by a softmax (kept as a separate layer so the
  # values before the softmax stay reachable through model.layers[-2])
  model.add(tf.keras.layers.Dense(4, **first_layer_kwargs))
  model.add(tf.keras.layers.Activation('softmax'))

  return model

In [12]:
#metrics to compare model with Kaggle scoring
def f1_macro(y_true, y_pred):
    '''
    Macro-averaged F1 score : the F1 score is computed for each class
    separately and the per-class scores are then averaged.

    y_true:
        one-hot encoded labels, one column per class
    y_pred:
        predicted probabilities, one column per class; they are rounded, so a
        class only counts as predicted when its probability is above 0.5
    '''
    y_true = tf.cast(y_true, tf.float64)
    y_pred = tf.cast(tf.round(y_pred), tf.float64)

    # axis=0 : count over the samples, keeping one value per class
    TP = tf.cast(tf.math.count_nonzero(y_pred * y_true, axis=0), tf.float64)
    FP = tf.cast(tf.math.count_nonzero(y_pred * (y_true - 1), axis=0), tf.float64)
    FN = tf.cast(tf.math.count_nonzero((y_pred - 1) * y_true, axis=0), tf.float64)

    # divide_no_nan returns 0 when a class has no prediction / no sample
    precision = tf.math.divide_no_nan(TP, TP + FP)
    recall = tf.math.divide_no_nan(TP, TP + FN)
    f1 = tf.math.divide_no_nan(2 * precision * recall,precision + recall)
    # mean of the per-class F1 scores
    return tf.reduce_mean(f1)

def f1_micro(y_true, y_pred):
    '''
    Micro-averaged F1 score : true positives, false positives and false
    negatives are pooled over all the classes before computing a single F1.

    y_true:
        one-hot encoded labels, one column per class
    y_pred:
        predicted probabilities, one column per class; they are rounded, so a
        class only counts as predicted when its probability is above 0.5
    '''
    y_true = tf.cast(y_true, tf.float64)
    y_pred = tf.cast(tf.round(y_pred), tf.float64)

    # axis=None : count over every sample and every class at once
    TP = tf.cast(tf.math.count_nonzero(y_pred * y_true, axis=None), tf.float64)
    FP = tf.cast(tf.math.count_nonzero(y_pred * (y_true - 1), axis=None), tf.float64)
    FN = tf.cast(tf.math.count_nonzero((y_pred - 1) * y_true, axis=None), tf.float64)

    # divide_no_nan returns 0 when there is no prediction / no positive sample
    precision = tf.math.divide_no_nan(TP, TP + FP)
    recall = tf.math.divide_no_nan(TP, TP + FN)
    f1 = tf.math.divide_no_nan(2 * precision * recall,precision + recall)
    # f1 is already a scalar here, reduce_mean leaves it unchanged
    return tf.reduce_mean(f1)

## Performing the KFold Cross Validation

In [None]:
# K-fold cross validation : each fold is used once as validation set while the
# model is trained from scratch on the other folds. Loss and F1 scores measured
# on the validation fold are stored per fold and averaged at the end.
nfolds = params['nfold']
loss_CV_folds = [None for i in range(nfolds)]
f1_macro_CV_folds = [None for i in range(nfolds)]
f1_micro_CV_folds = [None for i in range(nfolds)]

for i in tqdm(range(nfolds)):
  print(f'Performing KFold iteration on chunk {i+1}/{nfolds}')
  CV_block = i
  train_blocks = [j for j in range(nfolds) if j != CV_block]

  # NOTE: `train` is rebound here (dataframe, then array); the folds
  # themselves are only read through copies and stay intact in train_folds
  train = pd.concat([train_folds[j].copy() for j in train_blocks])
  CV = train_folds[CV_block].copy()

  #extract labels & convert to categorical
  labels_train = pd.get_dummies(train['label'])
  # align the CV columns on the training ones so both one-hot encodings have
  # the same classes in the same order, even if a class is missing from the fold
  labels_CV = pd.get_dummies(CV['label']).reindex(columns=labels_train.columns, fill_value=0)

  #remove labels from train
  train = train.drop(columns=['label'])
  CV = CV.drop(columns=['label'])

  #apply normalization
  train, CV = NormalizeData(PowerData(train), PowerData(CV),PCA_comp=params['PCA_comp'])

  print('train shape', train.shape, 'train label shape', labels_train.shape)
  print('CV shape', CV.shape, 'CV label shape', labels_CV.shape)
  STEPS_PER_EPOCH = len(train) // params['batchsize']
  print('Epochs :', params['epoch'],'Batch size :', params['batchsize'], 'Step per epochs :', STEPS_PER_EPOCH)


  #build model
  model = BuildModel(train.shape[1], activation=params['activation'])
  adam = tf.keras.optimizers.Adam(learning_rate=params['lr'], beta_1=params['beta1'], beta_2=params['beta2'], amsgrad=params['amsgrad'])
  # pass the optimizer instance : the string 'adam' would build a default Adam
  # and ignore the learning rate, betas and amsgrad configured in params
  model.compile(adam,loss='categorical_crossentropy', metrics=[f1_macro, f1_micro])

  #run model
  hist = model.fit(train, labels_train.values, validation_data=(CV, labels_CV.values), 
          epochs = params['epoch'], verbose = 1, batch_size=params['batchsize'])
  
  # evaluate returns the loss followed by the metrics in compile order
  result = model.evaluate(CV, labels_CV.values, batch_size=params['batchsize'])
  loss_CV_folds[i] = result[0]
  f1_macro_CV_folds[i] = result[1]
  f1_micro_CV_folds[i] = result[2]

print('CV loss', loss_CV_folds, 'mean', np.asarray(loss_CV_folds).mean())
print('CV FScore macro', f1_macro_CV_folds, 'mean', np.asarray(f1_macro_CV_folds).mean())
print('CV FScore micro', f1_micro_CV_folds, 'mean', np.asarray(f1_micro_CV_folds).mean())

  0%|          | 0/5 [00:00<?, ?it/s]

Performing KFold iteration on chunk 1/5
Nb columns to scale : 58
train shape (19872, 140) train label shape (19872, 4)
CV shape (4968, 140) CV shape (4968, 4)
Epochs : 30 Batch size : 512 Step per epochs : 38
Train on 19872 samples, validate on 4968 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


## Perform Training on full data

In [None]:
# Rebuild the full training set by stacking every fold
fold_copies = [train_folds[k].copy() for k in range(params['nfold'])]
train = pd.concat(fold_copies)

# One-hot encode the labels, then keep only the features in train
labels_train = pd.get_dummies(train['label'])
train = train.drop(columns=['label'])

In [None]:
# Display the shape of the test set before it goes through the normalization
test.shape

In [None]:
# Apply normalization: the scaler (and the PCA if enabled) is fitted on the full
# training set and applied to both train and test. Both names are rebound to the
# arrays returned by NormalizeData.
train, test = NormalizeData(PowerData(train), PowerData(test),PCA_comp=params['PCA_comp'])

In [None]:
# Summary of the training setup: data shapes, then epochs / batch size and the
# number of full batches per epoch (integer division, a partial batch is not counted)
print('train shape', train.shape, 'train label shape', labels_train.shape)
STEPS_PER_EPOCH = len(train) // params['batchsize']
print('Epochs :', params['epoch'],'Batch size :', params['batchsize'], 'Step per epochs :', STEPS_PER_EPOCH)

In [None]:
# Build a fresh network whose input size is the number of columns of the
# normalized training data, then display its layers
model = BuildModel(train.shape[1], activation=params['activation'])
model.summary()

In [None]:
# Adam optimizer configured from the tuned hyper-parameters
adam = tf.keras.optimizers.Adam(
    learning_rate=params['lr'],
    beta_1=params['beta1'],
    beta_2=params['beta2'],
    amsgrad=params['amsgrad'],
)
# Same loss and metrics as in the cross validation
model.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=[f1_macro, f1_micro],
)

In [None]:
# Train on the full training set; no validation data is passed here, so only
# the training loss and metrics are reported
hist = model.fit(train, labels_train.values, epochs = params['epoch'], batch_size=params['batchsize'])

In [None]:
# Loss and F1 metrics measured on the training data itself (not on held-out data)
model.evaluate(train, labels_train.values, verbose=1, batch_size=params['batchsize'])

## Build Submission

In [None]:
# Display the shape of the test data that is about to be fed to the model
test.shape

In [None]:
#create submission file
preds = model.predict(test)
# Pick the most probable class of every sample. Do not round before the argmax:
# rounding zeroes every row whose best probability is <= 0.5, and the argmax of
# an all-zero row is always position 0.
class_idx = preds.argmax(axis=1)
# Map the column positions back to the label values used for the one-hot targets
pred_labels = labels_train.columns.to_numpy()[class_idx]
pred_df = pd.DataFrame(pred_labels, columns=['label'])
pred_df.to_csv('submissionNN.csv', index=True, index_label='Id')

## Get model logits for ensembling
In the end, the logits are a bit complex to use for ensembling with other models, as the range of the logits (values before the softmax) is different for each model.

In [None]:
# Model sharing the trained layers but stopping at the second-to-last layer,
# i.e. the Dense(4) output that BuildModel feeds into the softmax activation
modelTest = tf.keras.Model(inputs=model.inputs, outputs=model.layers[-2].output)
testpred = modelTest.predict(test)
# Compare, for the first test row, the values before the softmax with the
# probabilities predicted by the full model
print(testpred[0])
print(preds[0])

In [None]:
# Divide each row of pre-softmax outputs by its own sum (the result is only displayed).
# NOTE(review): these values can be negative and a row sum can be close to zero, so
# this is not a probability normalization — confirm this is the intended scaling.
testpred / np.sum(testpred, -1).reshape(-1, 1)