# Classification avec Transformer (Encoder seul)

In [13]:
# !pip install dill

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
import torchvision.datasets as datasets
import torchvision.transforms as transforms

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score, roc_auc_score

import sys
sys.path.append('..')
from utils import *

## Chargement et prétraitement des données (seulement en anglais)

In [2]:
toots = pd.read_csv('../data/100k_en_toots_labeled.csv').dropna()

ys, nb_ys = np.unique(toots['y'], return_counts=True)
print('Taille des toots(toutes les langues)', len(toots))
print(f'Nb pos({ys[1]}): {nb_ys[1]} -- Nb neg({ys[0]}): {nb_ys[0]} -- total: {nb_ys.sum()}')

# toots en anglais
itoots_en = toots['language'] == 'en'
toots_en = toots[itoots_en]
toots_en = toots_en[['content', 'y']]

ys, nb_ys = np.unique(toots_en['y'], return_counts=True)
print('Taille des toots en anglais', len(toots_en))
print(f'Nb pos({ys[1]}): {nb_ys[1]} -- Nb neg({ys[0]}): {nb_ys[0]} -- total: {nb_ys.sum()}')

toots_en.head()

Taille des toots(toutes les langues) 93273
Nb pos(1): 1569 -- Nb neg(0): 91704 -- total: 93273
Taille des toots en anglais 47465
Nb pos(1): 1011 -- Nb neg(0): 46454 -- total: 47465


Unnamed: 0,content,y
0,Chikmagalur Tourist Places: Your Ultimate Guid...,0
1,"Dancing Adélie Penguins, McMurdo Sound, Antar...",0
2,2 Macdonald trip leaving Burrard Station @ Bay...,0
3,"Here you go seekers, some more good music (the...",0
11,The Future of Nuclear Energy in a Carbon-Const...,0


## Prétraitement et sauvegarde des embeddings (20 minutes)

In [6]:
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('all-MiniLM-L6-v2')

# # Encoder les toots
# embeddings = model.encode(toots_en['content'].tolist())

Batches:   0%|          | 0/1484 [00:00<?, ?it/s]

In [14]:
# y = toots_en['y'].to_list()
# save_object('../data/embedding_100k_en_toots_labeled_eng.dill', [(embi,yi) for embi,yi in zip(embeddings, y)])

## Charger les embeddings déjà prétraités et enregistrés

mettre les données des embeddings dans: src/data/embedding_100k_en_toots_labeled_eng.csv

In [6]:
data = load_object('../data/embedding_100k_en_toots_labeled_eng.pkl')

## DataLoader

In [7]:
class MyData(Dataset):
  def __init__(self, data):
    self.data = data

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    return self.data[idx]

dataset = MyData(data)

splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_indices, test_indices = next(iter(splitter.split(dataset, [label for _, label in data])))

# Use the indices to create training and testing datasets
train_dataset = torch.utils.data.Subset(dataset, train_indices)
test_dataset = torch.utils.data.Subset(dataset, test_indices)

# Create DataLoaders for iterating over the training and testing sets
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# Données équilibrées
# equilibrer train
ipos = [i for i,(d,y) in enumerate(train_dataset)  if y==1]
ineg = [i for i,(d,y) in enumerate(train_dataset)  if y==0]
np.random.shuffle(ineg)
train_pos = torch.utils.data.Subset(train_dataset, ipos)
train_neg = torch.utils.data.Subset(train_dataset, ineg[:len(ipos)])
train_dataset_eq = torch.utils.data.ConcatDataset([train_pos,train_neg])
# equilibrer test
ipos = [i for i,(d,y) in enumerate(test_dataset)  if y==1]
ineg = [i for i,(d,y) in enumerate(test_dataset)  if y==0]
np.random.shuffle(ineg)
train_pos = torch.utils.data.Subset(test_dataset, ipos)
train_neg = torch.utils.data.Subset(test_dataset, ineg[:len(ipos)])
test_dataset_eq = torch.utils.data.ConcatDataset([train_pos,train_neg])

train_dataloader_eq = DataLoader(train_dataset_eq, batch_size=batch_size, shuffle=True)
test_dataloader_eq = DataLoader(test_dataset_eq, batch_size=batch_size, shuffle=False)

In [87]:
class Net(nn.Module):
  def __init__(self, dim=384):
    super(Net, self).__init__()
    
    self.main = nn.Sequential(
      nn.Linear(dim, 2 * dim),
      nn.ReLU(),
      nn.Linear(2 * dim, 4 * dim),
      nn.ReLU(),
      nn.Linear(4 * dim, 2 * dim),
      nn.ReLU(),
      nn.Linear(2 * dim, dim),
      nn.ReLU(),
      nn.Linear(dim, 1),
      nn.Sigmoid()
    )
  
  def forward(self, x):
    return self.main(x)

class BNet(nn.Module):
  def __init__(self, dim=384):
    super(BNet, self).__init__()
    
    self.main = nn.Sequential(
      nn.Linear(dim, 2 * dim),
      nn.BatchNorm2d(dim * 2),
      nn.ReLU(),
      nn.Linear(2 * dim, 4 * dim),
      nn.BatchNorm2d(dim * 4),
      nn.ReLU(),
      nn.Linear(4 * dim, 2 * dim),
      nn.BatchNorm2d(dim * 2),
      nn.ReLU(),
      nn.Linear(2 * dim, dim),
      nn.BatchNorm2d(dim),
      nn.ReLU(),
      nn.Linear(dim, 1),
      nn.Sigmoid()
    )
  
  def forward(self, x):
    return self.main(x)

class F1Loss(nn.Module):
  def __init__(self, epsilon=1e-7):
    super(F1Loss, self).__init__()
    self.epsilon = epsilon

  def forward(self, y_true, y_pred):
    # Calculez les vrais positifs, faux positifs et faux négatifs
    tp = torch.sum(y_true * y_pred)
    fp = torch.sum((1 - y_true) * y_pred)
    fn = torch.sum(y_true * (1 - y_pred))

    # Calculez la précision, le rappel et le F1 score
    precision = tp / (tp + fp + self.epsilon)
    recall = tp / (tp + fn + self.epsilon)
    f1 = 2 * (precision * recall) / (precision + recall + self.epsilon)

    # Utilisez 1 - F1 comme la perte (car PyTorch minimise)
    loss = 1 - f1
    return loss

class F1MacroLoss(nn.Module):
    def __init__(self, ):
        super(F1MacroLoss, self).__init__()
        self.f1_loss = F1Loss()

    def forward(self, y_true, y_pred):
      f1_p = 1 - self.f1_loss(y_true, y_pred)
      f1_n = 1 - self.f1_loss((1-y_true), (1-y_pred))
      # print(f'--F1 score (positifs): {f1_p}')
      # print(f'--F1 score (négatifs): {f1_n}')
      return 1 - (f1_p + f1_n) / 2

class RaucAuc(nn.Module):
  def __init__(self, ):
    super(RaucAuc, self).__init__()

  def forward(self, lyhat, ly):
    # Convertir les listes en tenseurs PyTorch
    predictions_prob = torch.tensor(predictions_prob)
    targets = torch.tensor(targets)

    # Utiliser scikit-learn pour calculer l'aire sous la courbe ROC (ROC-AUC)
    roc_auc = roc_auc_score(np.array(lyhat), np.array(ly))

    loss = 1 - roc_auc
    return loss

def balanced_accuracy(lyhat, ly):
  lyhat, ly = torch.tensor(lyhat), torch.tensor(ly)

  ipos, ineg = ly == 1, ly == 0
  ly_pos, ly_neg = ly[ipos], ly[ineg]
  lyhat_pos, lyhat_neg = lyhat[ipos], lyhat[ineg]

  tp = (lyhat_pos == ly_pos).sum()
  tn = (lyhat_neg == ly_neg).sum()
  
  sensitive_p = tp / len(ly_pos)
  sensitive_n = tn / len(ly_neg)

  ba = (sensitive_p + sensitive_n) / 2
  
  return ba

def f1_macro(lyhat, ly):

  # Utiliser scikit-learn pour calculer le F1 score macro
  f1_macro = f1_score(np.array(ly), np.array(lyhat), average='macro')

  return f1_macro

def f_roc_auc(predictions_prob, targets):
  # Convertir les listes en tenseurs PyTorch
  predictions_prob = torch.tensor(predictions_prob)
  targets = torch.tensor(targets)

  # Utiliser scikit-learn pour calculer l'aire sous la courbe ROC (ROC-AUC)
  roc_auc = roc_auc_score(targets.numpy(), predictions_prob.numpy())

  return roc_auc

In [89]:
def apprentissage(net, criterion, optimizer, train_dl, test_dl, epochs):
  train_loss, test_loss = [], []
  for epoch in range(epochs):
    print(f"Epoch [{(epoch+1):4d}/{epochs:4d}] ")
    lep = []
    net.train()
    for batch_idx, (batch, y) in enumerate(train_dl):

      yhat = net(batch).squeeze(dim=1)
      y = y.float()
      loss = criterion(y, yhat)
      
      net.zero_grad()
      loss.backward()
      optimizer.step()
      
      # save loss train
      lep.append(loss.detach().numpy())
      
      
      yhat_label = torch.tensor([1 if yihat > 0.5 else 0 for yihat in yhat])
      ipos, ineg = y == 1, y == 0
      y_pos, y_neg = y[ipos], y[ineg]
      yhat_pos, yhat_neg = yhat_label[ipos], yhat_label[ineg]
      if len(y_pos)>0:
        pred_pos = (yhat_pos == y_pos).sum()/ len(y_pos)
      else:
        pred_pos = - 0.01
      if len(y_neg)>0:
        pred_neg = (yhat_neg == y_neg).sum()/ len(y_neg)
      else:
        pred_neg = - 0.01
      print(
        f"\ttrain batch [{(batch_idx+1):4d}/{len(train_dl):4d}] - "
        f"Loss : {loss:.4f} - "
        f"TP/P : {(100*pred_pos):.2f} - "
        f"TN/N : {(100*pred_neg):.2f}", end="\r"
      )
    print(
      f"\ttrain batch [{(batch_idx+1):4d}/{len(train_dl):4d}] - "
      f"Loss : {loss:.4f} - "
      f"TP/P : {(100*pred_pos):.2f} - "
      f"TN/N : {(100*pred_neg):.2f}"
    )
    train_loss.append(np.mean(lep))
    
    lep = []
    net.eval()
    for batch_idx, (batch, y) in enumerate(test_dl):

      yhat = net(batch).squeeze(dim=1)
      loss = criterion(y, yhat)
      lep.append(loss.detach().numpy())
      
      yhat_label = torch.tensor([1 if yihat > 0.5 else 0 for yihat in yhat])
      ipos, ineg = y == 1, y == 0
      y_pos, y_neg = y[ipos], y[ineg]
      yhat_pos, yhat_neg = yhat_label[ipos], yhat_label[ineg]
      if len(y_pos)>0:
        pred_pos = (yhat_pos == y_pos).sum()/ len(y_pos)
      if len(y_neg)>0:
        pred_neg = (yhat_neg == y_neg).sum()/ len(y_neg)
      print(
        f"\ttest batch [{(batch_idx+1):4d}/{len(test_dl):4d}] - "
        f"Loss : {loss:.4f} - "
        f"TP/P : {(100*pred_pos):.2f} - "
        f"TN/N : {(100*pred_neg):.2f}", end="\r"
      )
    print(
      f"\ttest batch [{(batch_idx+1):4d}/{len(test_dl):4d}] - "
      f"Loss : {loss:.4f} - "
      f"TP/P : {(100*pred_pos):.2f} - "
      f"TN/N : {(100*pred_neg):.2f}"
    )
    test_loss.append(np.mean(lep))
  
  return train_loss, test_loss

In [81]:
def prediction(net, dl):
  net.eval()
  lyhat, ly = [], []
  for batch_idx, (batch, y) in enumerate(dl):
    
    yhat = net(batch).squeeze(dim=1)
    ly.extend(y)
    lyhat.extend([1 if yihat > 0.5 else 0 for yihat in yhat])

  lyhat, ly = torch.tensor(lyhat), torch.tensor(ly)

  ipos, ineg = ly == 1, ly == 0
  ly_pos, ly_neg = ly[ipos], ly[ineg]
  lyhat_pos, lyhat_neg = lyhat[ipos], lyhat[ineg]

  print(f'Taux de prediction total: {(100*((lyhat == ly).sum()/ len(ly))):.2f}%')
  print(f'Taux de prediction sur les pos: {(100*((lyhat_pos == ly_pos).sum()/ len(ly_pos))):.2f}%')
  print(f'Taux de prediction sur les neg: {(100*((lyhat_neg == ly_neg).sum()/ len(ly_neg))):.2f}%')
  
  f1_loss = F1Loss()
  print(f'F1 score (positifs): {(100*(1-f1_loss(ly, lyhat))):.2f}%')
  print(f'F1 score (négatifs): {(100*(1-f1_loss(1-ly, 1-lyhat))):.2f}%')
  
  f1_macro_loss = F1MacroLoss()
  sc = 1-f1_macro_loss(ly, lyhat)
  print(f'f1 macro score : {(100*sc):.2f}%')

  # scores
  tp = (lyhat_pos == ly_pos).sum()
  tn = (lyhat_neg == ly_neg).sum()
  fp = (lyhat_pos != ly_pos).sum()
  fn = (lyhat_neg != ly_neg).sum()

  accuracy_score = (tp + tn) / (tp + tn + fp + fn)
  precision_score = tp / (tp + fp)
  recall_score = tp / (tp + fn)
  f1_score = 2*precision_score*recall_score / (precision_score + recall_score)

  print(f'\naccuracy score: {(100*accuracy_score):.2f}%')
  print(f'precision score: {(100*precision_score):.2f}%')
  print(f'recall score: {(100*recall_score):.2f}%')
  print(f'f1 score: {(100*f1_score):.2f}%')
  
  return lyhat


## Apprentissage avec des classes non équilibrées avec MSELoss

In [90]:
net = Net()
optimizer = torch.optim.Adam(net.parameters(), lr=3e-4)
# criterion = F1Loss()
criterion = F1MacroLoss()

epochs = 7

In [91]:
train_loss, test_loss = apprentissage(net, criterion, optimizer, train_dataloader, test_dataloader, epochs)

Epoch [   1/   7] 
	train batch [1187/1187] - Loss : 0.5000 - TP/P : -1.00 - TN/N : 100.000
	test batch [ 297/ 297] - Loss : 0.5120 - TP/P : 0.00 - TN/N : 95.24070
Epoch [   2/   7] 
	train batch [1187/1187] - Loss : 0.5000 - TP/P : -1.00 - TN/N : 100.000
	test batch [ 297/ 297] - Loss : 0.5000 - TP/P : 0.00 - TN/N : 100.0000
Epoch [   3/   7] 
	train batch [1187/1187] - Loss : 0.5000 - TP/P : -1.00 - TN/N : 100.000
	test batch [ 297/ 297] - Loss : 0.5004 - TP/P : 0.00 - TN/N : 100.0050
Epoch [   4/   7] 
	train batch [1187/1187] - Loss : 0.5128 - TP/P : -1.00 - TN/N : 95.0050
	test batch [ 297/ 297] - Loss : 0.5001 - TP/P : 100.00 - TN/N : 100.00
Epoch [   5/   7] 
	train batch [1187/1187] - Loss : 0.0000 - TP/P : 100.00 - TN/N : 100.00
	test batch [ 297/ 297] - Loss : 0.5000 - TP/P : 0.00 - TN/N : 100.0050
Epoch [   6/   7] 
	train batch [1187/1187] - Loss : 0.5000 - TP/P : -1.00 - TN/N : 100.000
	test batch [ 297/ 297] - Loss : 0.5000 - TP/P : 0.00 - TN/N : 100.0070
Epoch [   7/   7

### Taux de bonnes prédiction dans train

In [92]:
lyhat = prediction(net, train_dataloader)

Taux de prediction total: 97.72%
Taux de prediction sur les pos: 10.88%
Taux de prediction sur les neg: 99.61%
F1 score (positifs): 16.87%
F1 score (négatifs): 98.84%
f1 macro score : 57.86%

accuracy score: 97.72%
precision score: 10.88%
recall score: 37.61%
f1 score: 16.87%


### Taux de bonnes prédiction dans test

In [93]:
lyhat = prediction(net, test_dataloader)

Taux de prediction total: 97.74%
Taux de prediction sur les pos: 11.39%
Taux de prediction sur les neg: 99.61%
F1 score (positifs): 17.62%
F1 score (négatifs): 98.85%
f1 macro score : 58.24%

accuracy score: 97.74%
precision score: 11.39%
recall score: 38.98%
f1 score: 17.62%


## Apprentissage avec des classes non équilibrées avec f1_loss

In [59]:
net_f1 = Net()
optimizer_f1 = torch.optim.Adam(net_f1.parameters(), lr=3e-4)
criterion_f1 = F1Loss()
# criterion = nn.MSELoss()

epochs = 7

In [60]:
train_loss, test_loss = apprentissage(net_f1, criterion_f1, optimizer_f1, train_dataloader, test_dataloader, epochs)


Epoch [   1/   7] 
	train batch [1187/1187] - Loss : 1.0000 - TP/P : -1.00 - TN/N : 100.000
	test batch [ 297/ 297] - Loss : 1.0000 - TP/P : 0.00 - TN/N : 95.24000
Epoch [   2/   7] 
	train batch [1187/1187] - Loss : 1.0000 - TP/P : -1.00 - TN/N : 100.000
	test batch [ 297/ 297] - Loss : 1.0000 - TP/P : 0.00 - TN/N : 95.24000
Epoch [   3/   7] 
	train batch [1187/1187] - Loss : 1.0000 - TP/P : -1.00 - TN/N : 95.0000
	test batch [ 297/ 297] - Loss : 1.0000 - TP/P : 0.00 - TN/N : 95.24000
Epoch [   4/   7] 
	train batch [1187/1187] - Loss : 1.0000 - TP/P : -1.00 - TN/N : 100.000
	test batch [ 297/ 297] - Loss : 1.0000 - TP/P : 0.00 - TN/N : 95.24000
Epoch [   5/   7] 
	train batch [1187/1187] - Loss : 1.0000 - TP/P : -1.00 - TN/N : 100.000
	test batch [ 297/ 297] - Loss : 1.0000 - TP/P : 0.00 - TN/N : 95.24000
Epoch [   6/   7] 
	train batch [1187/1187] - Loss : 1.0000 - TP/P : -1.00 - TN/N : 100.000
	test batch [ 297/ 297] - Loss : 1.0000 - TP/P : 0.00 - TN/N : 95.24000
Epoch [   7/   7

### Taux de bonnes prédiction dans train

In [61]:
lyhat = prediction(net_f1, train_dataloader)

Taux de prediction total: 96.22%
Taux de prediction sur les pos: 26.45%
Taux de prediction sur les neg: 97.74%
F1 score: 22.97%
f1 macro score: 39.48%

accuracy score: 96.22%
precision score: 26.45%
recall score: 20.30%
f1 score: 22.97%


### Taux de bonnes prédiction dans test

In [62]:
lyhat = prediction(net_f1, test_dataloader)

Taux de prediction total: 96.05%
Taux de prediction sur les pos: 22.28%
Taux de prediction sur les neg: 97.65%
F1 score: 19.35%
f1 macro score: 41.33%

accuracy score: 96.05%
precision score: 22.28%
recall score: 17.11%
f1 score: 19.35%


## Apprentissage avec des classes équilibrées

In [63]:
net_eq = Net()
optimizer_eq = torch.optim.Adam(net_eq.parameters(), lr=3e-4)
# criterion = F1Loss()
criterion_eq = nn.MSELoss()

epochs = 7
train_loss = []
test_loss = []

In [64]:
train_loss, test_loss = apprentissage(net_eq, criterion_eq, optimizer_eq, train_dataloader_eq, test_dataloader_eq, epochs)

Epoch [   1/   7] 
	train batch [  51/  51] - Loss : 0.1766 - TP/P : 83.33 - TN/N : 66.672
	test batch [  13/  13] - Loss : 0.1479 - TP/P : 90.00 - TN/N : 85.00
Epoch [   2/   7] 
	train batch [  51/  51] - Loss : 0.1552 - TP/P : 90.00 - TN/N : 62.5000
	test batch [  13/  13] - Loss : 0.1224 - TP/P : 80.00 - TN/N : 85.00
Epoch [   3/   7] 
	train batch [  51/  51] - Loss : 0.0686 - TP/P : 100.00 - TN/N : 90.000
	test batch [  13/  13] - Loss : 0.1895 - TP/P : 90.00 - TN/N : 80.00
Epoch [   4/   7] 
	train batch [  51/  51] - Loss : 0.1156 - TP/P : 100.00 - TN/N : 75.000
	test batch [  13/  13] - Loss : 0.1876 - TP/P : 80.00 - TN/N : 80.00
Epoch [   5/   7] 
	train batch [  51/  51] - Loss : 0.0010 - TP/P : 100.00 - TN/N : 100.00
	test batch [  13/  13] - Loss : 0.2054 - TP/P : 80.00 - TN/N : 80.000
Epoch [   6/   7] 
	train batch [  51/  51] - Loss : 0.0558 - TP/P : 100.00 - TN/N : 90.000
	test batch [  13/  13] - Loss : 0.2643 - TP/P : 80.00 - TN/N : 65.00
Epoch [   7/   7] 
	train ba

### Taux de bonnes prédiction dans train avec équilibre

In [71]:
lyhat = prediction(net_eq, train_dataloader_eq)

Taux de prediction total: 98.45%
Taux de prediction sur les pos: 98.76%
Taux de prediction sur les neg: 98.15%
F1 score (positifs): 98.46%
F1 score (négatifs): 98.45%
f1 macro score : 1.55%

accuracy score: 98.45%
precision score: 98.76%
recall score: 98.16%
f1 score: 98.46%


### Taux de bonnes prédiction dans train

In [72]:
lyhat = prediction(net_eq, train_dataloader)

Taux de prediction total: 75.12%
Taux de prediction sur les pos: 98.76%
Taux de prediction sur les neg: 74.61%
F1 score (positifs): 14.47%
F1 score (négatifs): 85.44%
f1 macro score : 50.04%

accuracy score: 75.12%
precision score: 98.76%
recall score: 7.81%
f1 score: 14.47%


### Taux de bonnes prédiction dans test avec equilibre

In [67]:
lyhat = prediction(net_eq, test_dataloader_eq)

Taux de prediction total: 80.20%
Taux de prediction sur les pos: 87.13%
Taux de prediction sur les neg: 73.27%
F1 score: 81.48%
f1 macro score: 19.90%

accuracy score: 80.20%
precision score: 87.13%
recall score: 76.52%
f1 score: 81.48%


### Taux de bonnes prédiction dans test

In [None]:
lyhat = prediction(net_eq, test_dataloader)

Taux de prediction total: 72.38%
Taux de prediction sur les pos: 87.62%
Taux de prediction sur les neg: 72.05%
F1 score: 11.90%

accuracy score: 72.38%
precision score: 87.62%
recall score: 6.38%
f1 score: 11.90%
