Table of contents:

1. [Import relevant libraries](#Libraries)
2. [Load files](#Load)
3. [Append Datasets with level-1 predictions](#Import)
4. [NN Model training](#Train)
5. [Model predictions](#Predictions)
6. [Improve model performance](#Conclusion)

<a name = "Libraries"></a>
## 1. Import relevant libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.tensorboard import SummaryWriter
from os.path import exists

  import pandas.util.testing as tm


<a name = "Load"></a>
## 2. Load files

In [2]:
lgbm_df = pd.read_csv('LGBM_predictions.csv')
xgb_df = pd.read_csv('XGB_predictions.csv')
catboost_df = pd.read_csv('CatBoost_predictions.csv')
stacking_df = pd.read_csv('Stacking_predictions.csv')
cnn_df = pd.read_csv('CNN_predictions.csv')

lgbm_train_df = pd.read_csv('LGBM_train_predictions.csv')
xgb_train_df = pd.read_csv('XGB_train_predictions.csv')
catboost_train_df = pd.read_csv('CatBoost_train_predictions.csv')
stacking_train_df = pd.read_csv('Stacking_train_predictions.csv')
cnn_train_df = pd.read_csv('CNN_train_predictions.csv')

<a name = "Import"></a>
## 3. Append Datasets and Dataset Generation

In [3]:
train_df = pd.read_csv('Train_OHE.csv') # Shuffled
train_df = train_df.sort_values('Sample_ID', ascending=True).reset_index(drop=True) # Unshuffled
train_df['Predicted_LGBM'] = lgbm_train_df[["Label"]] # Unshuffled
train_df['Predicted_XGB'] = xgb_train_df[["Label"]] # Unshuffled
train_df['Predicted_CatBoost'] = catboost_train_df[["Label"]] # Unshuffled
train_df['Predicted_stack'] = stacking_train_df[["Label"]] # Unshuffled
train_df['Predicted_CNN'] = cnn_train_df[["Label"]] # Unshuffled
train_df = train_df.sample(frac=1, random_state=2022).reset_index(drop=True) # Shuffled

train_y_df = train_df[['Label']]
train_y = train_y_df.to_numpy()

train_X_df = train_df.drop(['Label', 'Sample_ID'], axis=1)
train_X = train_X_df.to_numpy()

In [4]:
split_ratio = 0.8

class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y).squeeze()

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        sample = self.X[idx].float() # Both, the data and model parameters, should have the same dtype.
        label = self.y[idx]
        return sample, label # returns Tensor([channels, H, W])

full_dataset = CustomDataset(train_X, train_y)

train_size = int(split_ratio * len(full_dataset))
val_size = len(full_dataset) - train_size

train_dataset = torch.utils.data.Subset(full_dataset, range(train_size))
val_dataset = torch.utils.data.Subset(full_dataset, range(train_size, train_size + val_size))

<a name = "Train"></a>
## 4. Model training

#### Model architecture

In [5]:
device = torch.device('cpu')
if torch.cuda.is_available():
    print("CUDA is available")
    device = torch.device('cuda')
    
fc_size = 3550
class StackedNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.input = nn.Linear(355, fc_size)
        self.fc1a = nn.Linear(fc_size, fc_size)
        self.fc1b = nn.Linear(fc_size, fc_size)
        self.fc1c = nn.Linear(fc_size, fc_size)
        self.fc1d = nn.Linear(fc_size, fc_size)
        self.fc1e = nn.Linear(fc_size, fc_size)
        self.fc2 = nn.Linear(fc_size, 2244)
        self.fc3 = nn.Linear(2244, 244)
        self.output = nn.Linear(244, 2)

    def forward(self, x):
        x = F.relu(self.input(x))
        x = F.relu(self.fc1a(x))
        x = F.relu(self.fc1b(x))
        x = F.relu(self.fc1c(x))
        x = F.relu(self.fc1d(x))
        x = F.relu(self.fc1e(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.output(x)
        return x    

PATH = 'nn_submission.pth'
model = StackedNet()
if exists(PATH):   
    model.load_state_dict(torch.load(PATH))
model.to(device)
print(model)

CUDA is available
StackedNet(
  (input): Linear(in_features=355, out_features=3550, bias=True)
  (fc1a): Linear(in_features=3550, out_features=3550, bias=True)
  (fc1b): Linear(in_features=3550, out_features=3550, bias=True)
  (fc1c): Linear(in_features=3550, out_features=3550, bias=True)
  (fc1d): Linear(in_features=3550, out_features=3550, bias=True)
  (fc1e): Linear(in_features=3550, out_features=3550, bias=True)
  (fc2): Linear(in_features=3550, out_features=2244, bias=True)
  (fc3): Linear(in_features=2244, out_features=244, bias=True)
  (output): Linear(in_features=244, out_features=2, bias=True)
)


#### Training loop

In [6]:
# Both, the data and model parameters, should have the same dtype.

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.8)

batch_size = 512
num_epochs = 100
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)

writer = SummaryWriter()

for epoch in range(num_epochs):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader, start=0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if epoch % 5 == 4 and i == 0:
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss:.6f}')
            writer.add_scalar("train/loss", running_loss, epoch)
            running_loss = 0.0

writer.close()
print('Finished Training')

PATH = 'nn_submission.pth'
torch.save(model.state_dict(), PATH)

[5,     1] loss: 0.014567
[10,     1] loss: 0.006932
[15,     1] loss: 0.003552
[20,     1] loss: 0.011510
[25,     1] loss: 0.002045
[30,     1] loss: 0.001578
[35,     1] loss: 0.001386
[40,     1] loss: 0.011793
[45,     1] loss: 0.011910
[50,     1] loss: 0.000910
Finished Training


In [7]:
PATH = 'nn_submission.pth'
trained_model = StackedNet()
trained_model.load_state_dict(torch.load(PATH))

labels = val_dataset[:][1]
samples = val_dataset[:][0]
with torch.no_grad():
    outputs = trained_model(samples)
    _, val_predictions = torch.max(outputs.data, 1)

print(classification_report(labels, val_predictions))

              precision    recall  f1-score   support

           0       0.91      0.91      0.91      1649
           1       0.71      0.71      0.71       524

    accuracy                           0.86      2173
   macro avg       0.81      0.81      0.81      2173
weighted avg       0.86      0.86      0.86      2173



<a name = "Predictions"></a>
## 5. Model predictions

#### NN test predictions

In [8]:
test_df = pd.read_csv('Test_OHE.csv')

test_X_df = test_df.drop(['Sample_ID'], axis=1)
test_X_df['Predicted_LGBM'] = lgbm_df[["Label"]]
test_X_df['Predicted_XGB'] = xgb_df[["Label"]]
test_X_df['Predicted_CatBoost'] = catboost_df[["Label"]]
test_X_df['Predicted_stack'] = stacking_df[["Label"]]
test_X_df['Predicted_CNN'] = cnn_df[["Label"]]
test_X = test_X_df.to_numpy()

class TestDataset(Dataset):
    def __init__(self, X):
        self.X = torch.from_numpy(X)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sample = self.X[idx].float() # Both, the data and model parameters, should have the same dtype.
        return sample # returns Tensor([channels, H, W])
    
test_dataset = TestDataset(test_X)

with torch.no_grad():
    test_outputs = trained_model(test_dataset[:])
    _, test_predictions = torch.max(test_outputs.data, 1)

pd.DataFrame({'Sample_ID': test_df.Sample_ID, 'Label': test_predictions}).to_csv("NN_predictions.csv", index=None)

<a name = "Conclusion"></a>
## 6. To improve model performance

- Apply SMOTE and Tomek to the Neural Network, along with using a wider array of models trained under varying hyperparameters and datasets.
- Try other Network architectures.
- Use stratified cross-validation for the data split.
- Try some heavier feature engineering. Generate new features from max, min, range, ratio of max:min, ratios of featureX:featurey, means, medians, calculate metrics for each horizontal and vertical strip, count number of unique categorical features for each category, try binning features into categories.
- Plot learning curves to see where improves can be made.

In [9]:
foo = pd.read_csv("NN_predictions.csv")
print(len(foo))
foo.head()

5430


Unnamed: 0,Sample_ID,Label
0,10865,1
1,10866,0
2,10867,0
3,10868,0
4,10869,1
