# Single Simple Model Notebook
This is a notebook to test a single model.
I.e., no multiple runs (bootstrap), no ensembling, and no parameter scan.

# Fetch Git Repo

In [1]:
# Set log in info
!git config --global user.email "joel.w.ottosson@gmail.com"
!git config --global user.name "AllaVinner"
# Access token for you account
token = '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' # Refresh after 90 days
# Set repo info
username = 'LukasGardberg'
repo = 'sinch-kaggle'
!git clone https://{token}@github.com/{username}/{repo}
%cd {repo}

Cloning into 'sinch-kaggle'...
remote: Enumerating objects: 59, done.[K
remote: Counting objects: 100% (59/59), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 59 (delta 20), reused 48 (delta 11), pack-reused 0[K
Unpacking objects: 100% (59/59), done.
/content/sinch-kaggle


# Imports

In [2]:
!pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.9.3-py3-none-any.whl (419 kB)
[K     |████████████████████████████████| 419 kB 6.2 MB/s 
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.9.3


In [3]:
# Imports
import os
import numpy as np
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import plotly.express as px
from tqdm import trange
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchmetrics import F1Score
from torchmetrics.functional import f1_score, recall, precision
from components.datasets import SinchDataset
from components import models, spliters, datasets
import importlib

In [97]:
# Development helper: uncomment to re-import `components.spliters` after editing
# the module on disk, so the change is picked up without restarting the kernel.
#importlib.reload(spliters)

<module 'components.spliters' from '/content/sinch-kaggle/components/spliters.py'>

# Constants

In [4]:
# Folder containing the dataset files, resolved against the current working directory
DATAPATH = os.path.join(os.getcwd(), 'data')

# Run on the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


# Load Data

In [5]:
# Feature matrix, laid out as samples x dims
features_path = os.path.join(DATAPATH, 'X_train.npy')
X_train = np.load(features_path)

# Label table; the class label of every sample lives in the `Predicted` column
labels_path = os.path.join(DATAPATH, 'y_train.csv')
Ydf = pd.read_csv(labels_path)
Y_train = Ydf['Predicted'].to_numpy()

# Problem size derived from the loaded data
nbr_classes = np.unique(Y_train).size  # number of distinct labels
nbr_dims = X_train.shape[1]            # feature dimensions per sample


# Train loop

In [124]:
# Train a single model on one train/validation split and record, for every
# optimisation step, the training loss, the validation loss and the validation
# macro-F1. After training, compute per-class and macro metrics on the
# validation set.

# --- Split strategy (alternatives kept for quick switching) -----------------
#spliter = spliters.NFromEveryClassSpliter(X_train, Y_train, n=2)
spliter = spliters.SimpleSpliter(X_train, Y_train, test_size=0.20)
#spliter = spliters.FractionFromEveryClassSpliter(X_train, Y_train, fraction=0.20)

# --- Values to record --------------------------------------------------------
# One entry per optimisation step. `iteration` is declared but not filled in
# this cell.
iteration, train_loss, val_loss, val_f1 = [], [], [], []

# --- Run configuration -------------------------------------------------------
n_epochs = 50

# --- Split data --------------------------------------------------------------
x_train, x_val, y_train, y_val = spliter.split()

# Count the number of instances of each class in both partitions
train_count = [0 for _ in range(nbr_classes)]
for label in y_train:
    train_count[label] += 1

val_count = [0 for _ in range(nbr_classes)]
for label in y_val:
    val_count[label] += 1

# --- Data loaders ------------------------------------------------------------
batch_size = 64
#train_dataset = datasets.OversamplingDataset(x_train, y_train)
train_dataset = datasets.SinchDataset(x_train, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = SinchDataset(x_val, y_val)

# Move the validation tensors to the compute device once, instead of on every
# step. Without this the forward pass fails on a CUDA runtime because the model
# lives on `device` while the data stays on the CPU.
x_val_dev = val_dataset.x.to(device)
y_val_dev = val_dataset.y.to(device)

# --- Model -------------------------------------------------------------------
model = models.LinearNet(nbr_dims, nbr_classes, dropout_rate=0.3).to(device)
#model = models.LinearNet(nbr_dims, nbr_classes, layer_size=128, dropout_rate=0.3).to(device)

# --- Criterion ---------------------------------------------------------------
criterion = nn.CrossEntropyLoss()

# --- Optimizer ---------------------------------------------------------------
learning_rate = 0.001
decay = 0
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=decay)

# --- Training ----------------------------------------------------------------
with trange(n_epochs) as pbar:
    for epoch in pbar:
        pbar.set_description(f"Epoch {epoch}")
        for x, y in train_dataloader:
            # Batches must live on the same device as the model
            x, y = x.to(device), y.to(device)

            # Reset gradient, forward pass, backprop, parameter update
            optimizer.zero_grad()
            z = model(x)
            loss = criterion(z, y)
            loss.backward()
            optimizer.step()

            # Store a plain float: keeping the tensor would hold on to device
            # memory and produce object-dtype columns later on.
            train_loss.append(loss.item())

            # Validate after every step. eval() switches the model out of
            # training mode (the model is built with dropout_rate=0.3);
            # no_grad() avoids building an autograd graph for the full
            # validation set on every iteration.
            model.eval()
            with torch.no_grad():
                z_pred = model(x_val_dev)
                step_val_loss = criterion(z_pred, y_val_dev)
                # Use hard predictions, exactly like the final evaluation below
                eval_score = f1_score(target=y_val_dev, preds=torch.argmax(z_pred, dim=1),
                                      average='macro', num_classes=nbr_classes).item()
            model.train()

            # Save validation data
            val_loss.append(step_val_loss.item())
            val_f1.append(eval_score)

# --- Final evaluation --------------------------------------------------------
model.eval()
with torch.no_grad():
    # Bring the results back to the CPU so they can be handed to pandas/plotly
    z_pred = model(x_val_dev).cpu()
y_pred = torch.argmax(z_pred, dim=1)
y_val_cpu = y_val_dev.cpu()

# Per-class metrics (average="none") and the macro-averaged F1
eval_class_f1 = f1_score(num_classes=nbr_classes, average="none", target=y_val_cpu, preds=y_pred)
eval_class_precision = precision(num_classes=nbr_classes, average="none", target=y_val_cpu, preds=y_pred)
eval_class_recall = recall(num_classes=nbr_classes, average="none", target=y_val_cpu, preds=y_pred)
eval_f1 = f1_score(num_classes=nbr_classes, average="macro", target=y_val_cpu, preds=y_pred)

Epoch 49: 100%|██████████| 50/50 [00:08<00:00,  6.12it/s]


In [125]:
# Report the macro-averaged validation F1 of the trained model
final_f1 = eval_f1.item()
print('Final eval F1: ', final_f1)

Final eval F1:  0.505726158618927


In [126]:
# Collect the recorded training history and the per-class evaluation results
# into data frames for plotting.

# One row per optimisation step. The loss histories may hold 0-dim tensors;
# float() turns those (and plain numbers) into Python floats so the columns get
# a numeric dtype instead of `object`.
iteration_df = pd.DataFrame({
    'iteration': range(len(train_loss)),
    'train_loss': [float(value) for value in train_loss],
    'val_loss': [float(value) for value in val_loss],
    'val_f1': [float(value) for value in val_f1],
})

# One row per class. The per-class metrics are torch tensors; convert them to
# numpy arrays explicitly rather than relying on pandas' implicit handling.
class_df = pd.DataFrame({
    'label': range(nbr_classes),
    'num_train': train_count,
    'num_val': val_count,
    'precision': eval_class_precision.detach().cpu().numpy(),
    'recall': eval_class_recall.detach().cpu().numpy(),
    'f1': eval_class_f1.detach().cpu().numpy(),
})

# Visualizations

## Development over iterations

In [127]:
# Plot the training history: both losses on the primary y-axis and the
# validation F1 on the secondary y-axis, against the optimisation step.
fig = make_subplots(specs=[[{"secondary_y": True}]])

df = iteration_df

# (column, legend name, draw on secondary axis)
history_traces = [
    ('train_loss', "train loss", False),
    ('val_loss', "val loss", False),
    ('val_f1', "val f1", True),
]
for column, legend_name, on_secondary in history_traces:
    fig.add_trace(
        go.Scatter(y=df[column], x=df['iteration'], name=legend_name),
        secondary_y=on_secondary)

# A figure should be readable on its own: label it and both of its y-axes
fig.update_layout(title_text="Training history")
fig.update_xaxes(title_text="Iteration")
fig.update_yaxes(title_text="Loss", secondary_y=False)
fig.update_yaxes(title_text="F1", secondary_y=True)

fig.show()


## Evaluation of Classes

In [128]:
# Per-class evaluation plot: classes are ordered by ascending F1; bars show F1,
# markers show precision and recall (primary axis) and the number of validation
# samples of each class (secondary axis).

# Rank the classes by F1; `order` is the position on the x-axis
tempdf = class_df.sort_values('f1').assign(order=range(nbr_classes))

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Every trace shares the same hover box, fed from the columns below (the index
# in `customdata[i]` follows this column order)
hover_columns = ['label', 'f1', 'precision', 'recall', 'num_val', 'num_train', 'order']
custom_data = list(zip(*[tempdf[name] for name in hover_columns]))
hover_string = """<br>Label: %{customdata[0]}
                  <br>F1: %{customdata[1]:.2f}
                  <br>Pre: %{customdata[2]:.2f}
                  <br>Rec: %{customdata[3]:.2f}
                  <br>Num Val: %{customdata[4]}
                  <br>num Train: %{customdata[5]}
                  <br>Order: %{customdata[6]}"""

fig.add_trace(
    go.Bar(y=tempdf['f1'], x=tempdf['order'], name="f1",
           customdata=custom_data,
           hovertemplate=hover_string),
    secondary_y=False,
)

# (column, legend name, marker colour, marker symbol, draw on secondary axis)
marker_traces = [
    ('num_val', "Num Val", "#000000", 'square-cross', True),
    ('precision', "Precision", "#0000ff", 'star', False),
    ('recall', "Recall", "#888800", 'triangle-up', False),
]
for column, legend_name, colour, symbol, on_secondary in marker_traces:
    fig.add_trace(
        go.Scatter(y=tempdf[column], x=tempdf['order'], name=legend_name,
                   mode='markers', marker=dict(color=colour, symbol=symbol, size=8),
                   customdata=custom_data,
                   hovertemplate=hover_string),
        secondary_y=on_secondary,
    )

# A figure should be readable on its own: label it and both of its y-axes
fig.update_layout(title_text="Per-class evaluation")
fig.update_xaxes(title_text="Class rank (ascending F1)")
fig.update_yaxes(title_text="F1 / precision / recall", secondary_y=False)
fig.update_yaxes(title_text="Number of validation samples", secondary_y=True)

fig.show()
