# Beevibe - Binary tutorial


## Manage Packages

### Install packages

In [None]:
# Install the latest Transformers directly from the GitHub repository (for ModernBERT)
! pip install git+https://github.com/huggingface/transformers --quiet

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-bkazmkfq
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-bkazmkfq
  Resolved https://github.com/huggingface/transformers to commit 62db3e6ed67a74cc1ed1436acd9973915c0a4475
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.49.0.dev0-py3-none-any.whl size=10627237 sha256=71d9670d0164a0101591d94947c94e26517d29711671bb5b94704b26414e6b1d
  Stored in directory: /tmp/pip-ephem-wheel-cache-d8fcvk1u/wheels/04/a3/f1/b88775f8e1665827525b19ac7590250f1038d947067beba9fb
Successfully built transformer

In [None]:
# Install Beevibe 0.1.0.dev13 from TestPyPI (the regular PyPI index is added as an extra index)
! pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ beevibe==0.1.0.dev13 --quiet

In [None]:
# Install watermark, used in this notebook to print environment and package versions
! pip install watermark --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.6 MB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.6 MB[0m [31m25.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25h

### Import packages

In [None]:
import sys
import pandas as pd
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import matthews_corrcoef
import numpy as np
from watermark import watermark
import torch.nn as nn
from torch.optim import AdamW
from beevibe import BeeTrainer, BeeMLMClassifier


## GPU Card

In [None]:
# Show the NVIDIA GPU(s) visible to this session via nvidia-smi
!nvidia-smi

Sun Feb  2 16:07:35 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Drive Directory


In [None]:
# Project path: put the parent directory first on the module search path
sys.path.insert(0, "..")

## Package versions

In [None]:
# Print the environment summary produced by watermark (timestamp, Python/IPython, hardware)
print(watermark())

Last updated: 2025-02-02T16:07:35.273874+00:00

Python implementation: CPython
Python version       : 3.11.11
IPython version      : 7.34.0

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.1.85+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 8
Architecture: 64bit



In [None]:
# Report the installed version of each library listed below
packages_to_report = "pandas,numpy,scipy,sklearn,torch,transformers,tokenizers,sentencepiece,datasets,beevibe"
print(watermark(packages=packages_to_report))

pandas       : 2.2.2
numpy        : 1.26.4
scipy        : 1.13.1
sklearn      : 1.6.1
torch        : 2.5.1+cu124
transformers : 4.49.0.dev0
tokenizers   : 0.21.0
sentencepiece: 0.2.0
datasets     : 3.2.0
beevibe      : 0.1.0.dev13



## Load dataset

### Get Train & Test

In [None]:
# CSV file to load for each split
data_files = {"train": "elegana_train_v0_1.csv", "test": "elegana_test_v0_1.csv"}

# The files use "|" as column separator
dataset = load_dataset("Franbul/elegana_relation_client_FR", data_files=data_files, sep="|")

# Convert both splits to pandas DataFrames
pd_train = dataset["train"].to_pandas()
pd_test = dataset["test"].to_pandas()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

### Get Themes

In [None]:
# CSV file holding the themes table, loaded as a single "themes" split
data_files = {"themes": "elegana_themes_v0_1.csv"}

# Same repository and "|" separator as for the train / test files
dataset = load_dataset("Franbul/elegana_relation_client_FR", data_files=data_files, sep="|")

# Convert the themes split to a pandas DataFrame
pd_themes = dataset["themes"].to_pandas()


Generating themes split: 0 examples [00:00, ? examples/s]

### Merge data

In [None]:
# Attach the theme columns to the train and test rows (left join on THEME keeps every row)
pd_data = pd_train.merge(pd_themes, on="THEME", how="left")
pd_data_test = pd_test.merge(pd_themes, on="THEME", how="left")

# Optional: uncomment to work on a 200-row sample of the training data
#pd_data = pd_data.sample(200, random_state=1811)

In [None]:
# (rows, columns) of the merged training frame
pd_data.shape

(2364, 10)

In [None]:
# Preview the first rows of the merged training frame
pd_data.head()

Unnamed: 0,CLIENT,CONSEILLER,THEME,DESCRIPTION,2_CLASSES,5_CLASSES,LABEL_1,LABEL_2,LABEL_3,LABEL_4
0,Quelle est la taille de ce chapeau ?,La taille de ce chapeau est de 58 cm de circon...,Demande d'informations produit,Questions spécifiques sur les caractéristiques...,Informations et services spécialisés,Conseils et informations produits,Produit,Caractéristique,Spécifique,Demande
1,Quels sont vos délais de livraison pour les co...,Nos délais de livraison pour les commandes en ...,Demande d'informations sur les achats en gros,Conditions et possibilités pour les achats en ...,Informations et services spécialisés,"Services exclusifs, programmes et personnalisa...",Gros,Condition,Professionnel,Possibilité
2,Nous avons besoin de tenues pour un événement ...,"Oui, en fonction du thème de l'événement, nous...",Demande de conseils pour les achats de groupe,Conseils pour effectuer des achats groupés (po...,Informations et services spécialisés,Conseils et informations produits,Achat,Groupe,Événement,Mariage
3,Le gilet que j'ai commandé est trop petit. Pui...,Je suis désolé d'apprendre que la taille du gi...,Échange de produit,"Demande d'échange pour un autre taille, couleu...",Support client opérationnel,"Commandes, livraison et suivi",Produit,Taille,Couleur,Option
4,Est-ce que vous proposez des remises pour les ...,"Oui, nous offrons des remises aux professionne...",Demande d'informations sur les achats en gros,Conditions et possibilités pour les achats en ...,Informations et services spécialisés,"Services exclusifs, programmes et personnalisa...",Gros,Condition,Professionnel,Possibilité


### Get texts & labels for training

In [None]:
# Number of training rows for each value of the 2_CLASSES column
pd_data['2_CLASSES'].value_counts()

Unnamed: 0_level_0,count
2_CLASSES,Unnamed: 1_level_1
Informations et services spécialisés,1629
Support client opérationnel,735


In [None]:
# Binary target: encode the distinct values of the "2_CLASSES" column as integer ids
classes = np.unique(pd_data['2_CLASSES'])
le_esg = LabelEncoder().fit(classes)

# Training set: class names indexed by encoded id, client texts and integer labels
labels_names = classes[le_esg.transform(classes)]
texts = pd_data["CLIENT"].values.tolist()
labels = le_esg.transform(pd_data['2_CLASSES']).tolist()
classes = np.unique(labels)  # from here on `classes` holds the distinct encoded ids
print(f"Train : Nb texts:{len(texts)}, Nb labels:{len(labels)}, Nb classes:{len(classes)}")

# Test set: client texts and labels encoded with the same fitted encoder
test_texts = pd_data_test["CLIENT"].values.tolist()
test_labels = le_esg.transform(pd_data_test['2_CLASSES']).tolist()
print(f"Test  : Nb texts:{len(test_texts)}, Nb labels:{len(test_labels)}, Nb classes:{len(np.unique(test_labels))}")


Train : Nb texts:2364, Nb labels:2364, Nb classes:2
Test  : Nb texts:591, Nb labels:591, Nb classes:2


## Holdout

### camembert-base

In [None]:
# Settings forwarded to trainer.holdout() below
num_epochs = 30
batch_size = 8
patience = 3
min_delta = 0.001
val_size = 0.3

# One output per class name
num_labels = len(labels_names)

# Classification model for "camembert-base"
model = BeeMLMClassifier(model_name="camembert-base", num_labels=num_labels)

# Trainer created with the model only (no LoRA or optimizer argument is passed here)
trainer = BeeTrainer(model=model)

# Train over a holdout split with early stopping
ret = trainer.holdout(
    texts=texts,
    labels=labels,
    val_size=val_size,
    num_epochs=num_epochs,
    batch_size=batch_size,
    patience=patience,
    min_delta=min_delta,
    balanced=True,
)

# Free CPU/GPU memory
trainer.release_model()

Device : cuda
Use optimizer : Adam
 - {'lr': 1e-05}
No scheduler used
Epoch 0/29, Train Loss: 0.5328, Val Loss: 0.3725, Val MCC: 0.7017, lr: 1.000e-05
Epoch 1/29, Train Loss: 0.2945, Val Loss: 0.2411, Val MCC: 0.8348, lr: 1.000e-05
Epoch 2/29, Train Loss: 0.1922, Val Loss: 0.1900, Val MCC: 0.8649, lr: 1.000e-05
Epoch 3/29, Train Loss: 0.1505, Val Loss: 0.1753, Val MCC: 0.8655, lr: 1.000e-05
Epoch 4/29, Train Loss: 0.1119, Val Loss: 0.2032, Val MCC: 0.8657, lr: 1.000e-05
Epoch 5/29, Train Loss: 0.0996, Val Loss: 0.1142, Val MCC: 0.9236, lr: 1.000e-05
Epoch 6/29, Train Loss: 0.0669, Val Loss: 0.1160, Val MCC: 0.9201, lr: 1.000e-05
Epoch 7/29, Train Loss: 0.0560, Val Loss: 0.1210, Val MCC: 0.9101, lr: 1.000e-05
Epoch 8/29, Train Loss: 0.0451, Val Loss: 0.1081, Val MCC: 0.9142, lr: 1.000e-05
Epoch 9/29, Train Loss: 0.0450, Val Loss: 0.1574, Val MCC: 0.8824, lr: 1.000e-05
Epoch 10/29, Train Loss: 0.0543, Val Loss: 0.1262, Val MCC: 0.9164, lr: 1.000e-05
Epoch 11/29, Train Loss: 0.0437, Val L

### camembertav2-base


In [None]:
# Settings forwarded to trainer.holdout() below
num_epochs = 30
batch_size = 8
patience = 3
min_delta = 0.001
val_size = 0.3

# One output per class name
num_labels = len(labels_names)

# Classification model for "almanach/camembertav2-base"
model = BeeMLMClassifier(model_name="almanach/camembertav2-base", num_labels=num_labels)

# Trainer created with the model only (no LoRA or optimizer argument is passed here)
trainer = BeeTrainer(model=model)

# Train over a holdout split with early stopping
ret = trainer.holdout(
    texts=texts,
    labels=labels,
    val_size=val_size,
    num_epochs=num_epochs,
    batch_size=batch_size,
    patience=patience,
    min_delta=min_delta,
    balanced=True,
)

# Free CPU/GPU memory
trainer.release_model()

Device : cuda
Use optimizer : Adam
 - {'lr': 1e-05}
No scheduler used
Epoch 0/29, Train Loss: 0.4886, Val Loss: 0.4939, Val MCC: 0.6011, lr: 1.000e-05
Epoch 1/29, Train Loss: 0.2485, Val Loss: 0.2814, Val MCC: 0.7832, lr: 1.000e-05
Epoch 2/29, Train Loss: 0.1581, Val Loss: 0.1787, Val MCC: 0.8557, lr: 1.000e-05
Epoch 3/29, Train Loss: 0.0791, Val Loss: 0.1271, Val MCC: 0.9010, lr: 1.000e-05
Epoch 4/29, Train Loss: 0.0628, Val Loss: 0.1529, Val MCC: 0.8938, lr: 1.000e-05
Epoch 5/29, Train Loss: 0.0391, Val Loss: 0.1757, Val MCC: 0.8815, lr: 1.000e-05
Epoch 6/29, Train Loss: 0.0314, Val Loss: 0.1775, Val MCC: 0.8834, lr: 1.000e-05
Best epoch: 3, Best loss: 0.1271


** Global metrics :

 - accuracy: 0.9577
 - f1_macro: 0.9505
 - f1_micro: 0.9577
 - f1_weighted: 0.9578
 - mcc: 0.9010


** Per-Classes metrics :

Class          Precision      Recall          F1     Support
Class 0           0.9714      0.9675      0.9695         492
Class 1           0.9273      0.9358      0.9315         21

### ModernBERT-base

In [None]:
# Settings forwarded to trainer.holdout() below
num_epochs = 30
batch_size = 8
patience = 3
min_delta = 0.001
val_size = 0.3

# One output per class name
num_labels = len(labels_names)

# Classification model for "answerdotai/ModernBERT-base"
model = BeeMLMClassifier(model_name="answerdotai/ModernBERT-base", num_labels=num_labels)

# Trainer created with the model only (no LoRA or optimizer argument is passed here)
trainer = BeeTrainer(model=model)

# Train over a holdout split with early stopping
ret = trainer.holdout(
    texts=texts,
    labels=labels,
    val_size=val_size,
    num_epochs=num_epochs,
    batch_size=batch_size,
    patience=patience,
    min_delta=min_delta,
    balanced=True,
)

# Free CPU/GPU memory
trainer.release_model()

Device : cuda
Use optimizer : Adam
 - {'lr': 1e-05}
No scheduler used
Epoch 0/29, Train Loss: 0.6853, Val Loss: 0.6278, Val MCC: 0.1190, lr: 1.000e-05
Epoch 1/29, Train Loss: 0.4851, Val Loss: 0.3439, Val MCC: 0.7177, lr: 1.000e-05
Epoch 2/29, Train Loss: 0.2688, Val Loss: 0.4197, Val MCC: 0.6229, lr: 1.000e-05
Epoch 3/29, Train Loss: 0.1980, Val Loss: 0.3832, Val MCC: 0.6696, lr: 1.000e-05
Epoch 4/29, Train Loss: 0.1193, Val Loss: 0.3511, Val MCC: 0.7050, lr: 1.000e-05
Best epoch: 1, Best loss: 0.3439


** Global metrics :

 - accuracy: 0.8676
 - f1_macro: 0.8533
 - f1_micro: 0.8676
 - f1_weighted: 0.8710
 - mcc: 0.7177


** Per-Classes metrics :

Class          Precision      Recall          F1     Support
Class 0           0.9523      0.8516      0.8991         492
Class 1           0.7296      0.9037      0.8074         218


** Confusion Matrix (FN/Row - FP/Col):

            Class 0  Class 1 
Class 0     419      73    
Class 1     21       197   
Elapsed time: 00:07:01


### ModernBERT-Embed-BE

In [None]:
# Settings forwarded to trainer.holdout() below
num_epochs = 30
batch_size = 8
patience = 3
min_delta = 0.001
val_size = 0.3

# One output per class name
num_labels = len(labels_names)

# Classification model for "Parallia/Fairly-Multilingual-ModernBERT-Embed-BE"
# NOTE(review): in the recorded run, this cell asked interactively whether to run the
# repository's custom code; the prompt suggests `trust_remote_code=True` to avoid it.
# TODO confirm whether BeeMLMClassifier accepts or forwards that argument.
model = BeeMLMClassifier(
    model_name="Parallia/Fairly-Multilingual-ModernBERT-Embed-BE",
    num_labels=num_labels,
)

# Trainer created with the model only (no LoRA or optimizer argument is passed here)
trainer = BeeTrainer(model=model)

# Train over a holdout split with early stopping
ret = trainer.holdout(
    texts=texts,
    labels=labels,
    val_size=val_size,
    num_epochs=num_epochs,
    batch_size=batch_size,
    patience=patience,
    min_delta=min_delta,
    balanced=True,
)

# Free CPU/GPU memory
trainer.release_model()

Device : cuda


The repository for Parallia/Fairly-Multilingual-ModernBERT-Embed-BE contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/Parallia/Fairly-Multilingual-ModernBERT-Embed-BE.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Use optimizer : Adam
 - {'lr': 1e-05}
No scheduler used
Epoch 0/29, Train Loss: 0.4023, Val Loss: 0.2955, Val MCC: 0.7639, lr: 1.000e-05
Epoch 1/29, Train Loss: 0.1589, Val Loss: 0.1642, Val MCC: 0.8782, lr: 1.000e-05
Epoch 2/29, Train Loss: 0.0469, Val Loss: 0.1223, Val MCC: 0.9044, lr: 1.000e-05
Epoch 3/29, Train Loss: 0.0168, Val Loss: 0.1295, Val MCC: 0.8798, lr: 1.000e-05
Epoch 4/29, Train Loss: 0.0089, Val Loss: 0.1326, Val MCC: 0.9020, lr: 1.000e-05
Epoch 5/29, Train Loss: 0.0188, Val Loss: 0.1330, Val MCC: 0.9059, lr: 1.000e-05
Best epoch: 2, Best loss: 0.1223


** Global metrics :

 - accuracy: 0.9592
 - f1_macro: 0.9522
 - f1_micro: 0.9592
 - f1_weighted: 0.9592
 - mcc: 0.9044


** Per-Classes metrics :

Class          Precision      Recall          F1     Support
Class 0           0.9734      0.9675      0.9704         492
Class 1           0.9276      0.9404      0.9339         218


** Confusion Matrix (FN/Row - FP/Col):

            Class 0  Class 1 
Class 0     476      

### ModernBERT-Embed-BE-FR

In [None]:
# Settings forwarded to trainer.holdout() below
num_epochs = 30
batch_size = 8
patience = 3
min_delta = 0.001
val_size = 0.3

# One output per class name
num_labels = len(labels_names)

# Classification model for "Parallia/Fairly-Multilingual-ModernBERT-Embed-BE-FR"
model = BeeMLMClassifier(
    model_name="Parallia/Fairly-Multilingual-ModernBERT-Embed-BE-FR",
    num_labels=num_labels,
)

# Trainer created with the model only (no LoRA or optimizer argument is passed here)
trainer = BeeTrainer(model=model)

# Train over a holdout split with early stopping
ret = trainer.holdout(
    texts=texts,
    labels=labels,
    val_size=val_size,
    num_epochs=num_epochs,
    batch_size=batch_size,
    patience=patience,
    min_delta=min_delta,
    balanced=True,
)

# Free CPU/GPU memory
trainer.release_model()

Device : cuda
Use optimizer : Adam
 - {'lr': 1e-05}
No scheduler used
Epoch 0/29, Train Loss: 0.4924, Val Loss: 0.3909, Val MCC: 0.6552, lr: 1.000e-05
Epoch 1/29, Train Loss: 0.2484, Val Loss: 0.2886, Val MCC: 0.7475, lr: 1.000e-05
Epoch 2/29, Train Loss: 0.1430, Val Loss: 0.2317, Val MCC: 0.7894, lr: 1.000e-05
Epoch 3/29, Train Loss: 0.0553, Val Loss: 0.1442, Val MCC: 0.8799, lr: 1.000e-05
Epoch 4/29, Train Loss: 0.0418, Val Loss: 0.1419, Val MCC: 0.8975, lr: 1.000e-05
Epoch 5/29, Train Loss: 0.0293, Val Loss: 0.1435, Val MCC: 0.9069, lr: 1.000e-05
Epoch 6/29, Train Loss: 0.0154, Val Loss: 0.2048, Val MCC: 0.8878, lr: 1.000e-05
Epoch 7/29, Train Loss: 0.0047, Val Loss: 0.1628, Val MCC: 0.9103, lr: 1.000e-05
Best epoch: 4, Best loss: 0.1419


** Global metrics :

 - accuracy: 0.9563
 - f1_macro: 0.9488
 - f1_micro: 0.9563
 - f1_weighted: 0.9564
 - mcc: 0.8975


** Per-Classes metrics :

Class          Precision      Recall          F1     Support
Class 0           0.9695      0.9675   

### distilcamembert-base

In [None]:
# Settings forwarded to trainer.holdout() below
num_epochs = 30
batch_size = 8
patience = 3
min_delta = 0.001
val_size = 0.3

# One output per class name
num_labels = len(labels_names)

# Classification model for "cmarkea/distilcamembert-base"
model = BeeMLMClassifier(model_name="cmarkea/distilcamembert-base", num_labels=num_labels)

# Trainer created with the model only (no LoRA or optimizer argument is passed here)
trainer = BeeTrainer(model=model)

# Train over a holdout split with early stopping
ret = trainer.holdout(
    texts=texts,
    labels=labels,
    val_size=val_size,
    num_epochs=num_epochs,
    batch_size=batch_size,
    patience=patience,
    min_delta=min_delta,
    balanced=True,
)

# Free CPU/GPU memory
trainer.release_model()

Device : cuda
Use optimizer : Adam
 - {'lr': 1e-05}
No scheduler used
Epoch 0/29, Train Loss: 0.4851, Val Loss: 0.3733, Val MCC: 0.6838, lr: 1.000e-05
Epoch 1/29, Train Loss: 0.2568, Val Loss: 0.2477, Val MCC: 0.7755, lr: 1.000e-05
Epoch 2/29, Train Loss: 0.1588, Val Loss: 0.1545, Val MCC: 0.8878, lr: 1.000e-05
Epoch 3/29, Train Loss: 0.1090, Val Loss: 0.1801, Val MCC: 0.8567, lr: 1.000e-05
Epoch 4/29, Train Loss: 0.0786, Val Loss: 0.1426, Val MCC: 0.8957, lr: 1.000e-05
Epoch 5/29, Train Loss: 0.0602, Val Loss: 0.1872, Val MCC: 0.8561, lr: 1.000e-05
Epoch 6/29, Train Loss: 0.0442, Val Loss: 0.1283, Val MCC: 0.9105, lr: 1.000e-05
Epoch 7/29, Train Loss: 0.0301, Val Loss: 0.1552, Val MCC: 0.9033, lr: 1.000e-05
Epoch 8/29, Train Loss: 0.0294, Val Loss: 0.1392, Val MCC: 0.9041, lr: 1.000e-05
Epoch 9/29, Train Loss: 0.0125, Val Loss: 0.1582, Val MCC: 0.9082, lr: 1.000e-05
Best epoch: 6, Best loss: 0.1283


** Global metrics :

 - accuracy: 0.9620
 - f1_macro: 0.9553
 - f1_micro: 0.9620
 - f

#### Modify head and add AdamW

In [None]:
# Settings forwarded to trainer.holdout() below
num_epochs = 30
batch_size = 8
patience = 3
min_delta = 0.001
val_size = 0.3

# One output per class name
num_labels = len(labels_names)

# Custom classification head: 768 -> 512 -> 256 -> num_labels
head_layer_configs = [
    {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
    {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
    {"input_size": 256, "output_size": num_labels},
]

# Classification model for "cmarkea/distilcamembert-base" with the custom head
model = BeeMLMClassifier(
    model_name="cmarkea/distilcamembert-base",
    num_labels=num_labels,
    head_layers=head_layer_configs,
)

# Trainer with class names and AdamW as optimizer class (no LoRA argument is passed here)
trainer = BeeTrainer(model=model, labels_names=labels_names, optimizer_class=AdamW)

# Train over a holdout split with early stopping
ret = trainer.holdout(
    texts=texts,
    labels=labels,
    val_size=val_size,
    num_epochs=num_epochs,
    batch_size=batch_size,
    patience=patience,
    min_delta=min_delta,
    balanced=True,
)

# Free CPU/GPU memory
trainer.release_model()


Device : cuda
Use optimizer : AdamW
 - {'lr': 1e-05}
No scheduler used
Epoch 0/29, Train Loss: 0.5381, Val Loss: 0.4064, Val MCC: 0.6387, lr: 1.000e-05
Epoch 1/29, Train Loss: 0.3453, Val Loss: 0.2999, Val MCC: 0.7631, lr: 1.000e-05
Epoch 2/29, Train Loss: 0.2393, Val Loss: 0.2141, Val MCC: 0.8196, lr: 1.000e-05
Epoch 3/29, Train Loss: 0.1622, Val Loss: 0.1543, Val MCC: 0.8670, lr: 1.000e-05
Epoch 4/29, Train Loss: 0.1104, Val Loss: 0.1318, Val MCC: 0.8951, lr: 1.000e-05
Epoch 5/29, Train Loss: 0.0737, Val Loss: 0.1564, Val MCC: 0.8749, lr: 1.000e-05
Epoch 6/29, Train Loss: 0.0482, Val Loss: 0.1259, Val MCC: 0.8969, lr: 1.000e-05
Epoch 7/29, Train Loss: 0.0412, Val Loss: 0.1327, Val MCC: 0.9009, lr: 1.000e-05
Epoch 8/29, Train Loss: 0.0368, Val Loss: 0.1562, Val MCC: 0.8857, lr: 1.000e-05
Epoch 9/29, Train Loss: 0.0255, Val Loss: 0.1767, Val MCC: 0.8770, lr: 1.000e-05
Best epoch: 6, Best loss: 0.1259


** Global metrics :

 - accuracy: 0.9549
 - f1_macro: 0.9479
 - f1_micro: 0.9549
 - 

#### Add a LoRA configuration

In [None]:
# Settings forwarded to trainer.holdout() below
num_epochs = 30
batch_size = 8
patience = 3
min_delta = 0.001
val_size = 0.3

# One output per class name
num_labels = len(labels_names)

# Custom classification head: 768 -> 512 -> 256 -> num_labels
head_layer_configs = [
    {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
    {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
    {"input_size": 256, "output_size": num_labels},
]

# Classification model for "cmarkea/distilcamembert-base" with the custom head
# (alternative left by the author: "camembert-base")
model = BeeMLMClassifier(
    model_name="cmarkea/distilcamembert-base",
    num_labels=num_labels,
    head_layers=head_layer_configs,
)

# Trainer with class names, AdamW and LoRA enabled through the lora_* settings below
trainer = BeeTrainer(
    model=model,
    labels_names=labels_names,
    optimizer_class=AdamW,
    use_lora=True,
    lora_r=64,
    lora_alpha=128,
    lora_dropout=0.01,
)

# Train over a holdout split with early stopping
ret = trainer.holdout(
    texts=texts,
    labels=labels,
    val_size=val_size,
    num_epochs=num_epochs,
    batch_size=batch_size,
    patience=patience,
    min_delta=min_delta,
    balanced=True,
)

# Free CPU/GPU memory
trainer.release_model()

Device : cuda
Using Lora
Target modules : ['base_model.encoder.layer.0.attention.self.query', 'base_model.encoder.layer.0.attention.self.key', 'base_model.encoder.layer.0.attention.output.dense', 'base_model.encoder.layer.0.intermediate.dense', 'base_model.encoder.layer.0.output.dense', 'base_model.encoder.layer.1.attention.self.query', 'base_model.encoder.layer.1.attention.self.key', 'base_model.encoder.layer.1.attention.output.dense', 'base_model.encoder.layer.1.intermediate.dense', 'base_model.encoder.layer.1.output.dense', 'base_model.encoder.layer.2.attention.self.query', 'base_model.encoder.layer.2.attention.self.key', 'base_model.encoder.layer.2.attention.output.dense', 'base_model.encoder.layer.2.intermediate.dense', 'base_model.encoder.layer.2.output.dense', 'base_model.encoder.layer.3.attention.self.query', 'base_model.encoder.layer.3.attention.self.key', 'base_model.encoder.layer.3.attention.output.dense', 'base_model.encoder.layer.3.intermediate.dense', 'base_model.encoder.

#### Try multiple heads

In [None]:
# Large list of possible classification heads

heads_network_patterns = [

    # Pattern 0: One layer Network
    [
        {"input_size": 768, "output_size": num_labels, "activation": None},
    ],

    # Pattern 1: Basic Feedforward Network
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 2: Shallow Network with Dropout
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.2},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 3: Batch-Normalized Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 4: Wide Hidden Layers
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.GELU},
        {"input_size": 1024, "output_size": 512, "activation": nn.GELU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 5: Layer Normalization
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 6: Minimalistic Network
    [
        {"input_size": 768, "output_size": 128, "activation": nn.Tanh},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 7: Deep Feedforward Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 8: Dropout Regularization
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 9: Deep Residual Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "residual": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 10: Compact Network
    [
        {"input_size": 768, "output_size": 64, "activation": nn.SiLU},
        {"input_size": 64, "output_size": num_labels, "activation": None},
    ],

    # Pattern 11: Fully Connected Bottleneck Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 12: Dense Network with No Activation in Final Layer
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 13: Layer Normalization with High Dropout
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU, "layer_norm": True, "dropout_rate": 0.5},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "layer_norm": True, "dropout_rate": 0.5},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 14: Gated Activation with Dropout
    [
        {"input_size": 768, "output_size": 512, "activation": nn.GELU, "dropout_rate": 0.2},
        {"input_size": 512, "output_size": 256, "activation": nn.SiLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 15: Progressive Layer Expansion
    [
        {"input_size": 768, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 16: Deep Wide Network
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 17: Dense Residual Network with Skip Connections
    [
        {"input_size": 768, "output_size": 768, "activation": nn.ReLU, "residual": True},
        {"input_size": 768, "output_size": 768, "activation": nn.ReLU, "residual": True},
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 18: Alternating Normalization
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 19: Advanced Progressive Shrinkage
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 20: Feature Extractor with Sparse Hidden Units
    [
        {"input_size": 768, "output_size": 64, "activation": nn.ReLU},
        {"input_size": 64, "output_size": 32, "activation": nn.ReLU},
        {"input_size": 32, "output_size": num_labels, "activation": None},
    ],

    # Pattern 21: Multi-Layer Sparse Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 64, "activation": nn.ReLU},
        {"input_size": 64, "output_size": num_labels, "activation": None},
    ],

    # Pattern 22: Alternating Dropout Intensities
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.2},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.5},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.3},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 23: High-Dimensional Bottleneck with Residuals
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.GELU, "residual": True},
        {"input_size": 1024, "output_size": 256, "activation": nn.GELU, "residual": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 24: Fully Dense Network with Gradient Clipping
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": num_labels, "activation": None},
    ],

    # Pattern 25: Deeply Layer-Normalized Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 26: Modular Feedforward Blocks
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 27: High-Frequency Regularization Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.1},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.1},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.1},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 28: Alternating Activations Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.SiLU},
        {"input_size": 256, "output_size": 128, "activation": nn.GELU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 29: Sequential Dropout and Bottleneck
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 128, "output_size": 64, "activation": nn.ReLU},
        {"input_size": 64, "output_size": num_labels, "activation": None},
    ],

    # Pattern 30: Multi-Head Modular Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 31: Sparse Progressive Expansion
    [
        {"input_size": 768, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 32: Residual Shrinking Layers
    [
        {"input_size": 768, "output_size": 512, "activation": nn.GELU, "residual": True},
        {"input_size": 512, "output_size": 256, "activation": nn.GELU, "residual": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 33: Dual Activation Fusion
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.SiLU},
        {"input_size": 256, "output_size": 128, "activation": nn.GELU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 34: Deeply Narrow Network
    [
        {"input_size": 768, "output_size": 128, "activation": nn.Tanh},
        {"input_size": 128, "output_size": 64, "activation": nn.Tanh},
        {"input_size": 64, "output_size": 32, "activation": nn.Tanh},
        {"input_size": 32, "output_size": num_labels, "activation": None},
    ],

    # Pattern 35: Cyclic Dropout Network
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.2},
        {"input_size": 256, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 36: Wide Bottleneck Layers
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": num_labels, "activation": None},
    ],

    # Pattern 37: Alternating Sparse Connectivity
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 38: Progressive Layer Normalization
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 39: Activation Modulated Network
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.GELU},
        {"input_size": 128, "output_size": 64, "activation": nn.Tanh},
        {"input_size": 64, "output_size": num_labels, "activation": None},
    ],

    # Pattern 40: Double Residual Connections
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "residual": True},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 41: Dense Alternating Widths
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 42: Multi-Normalization Layers
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 43: Sparse Expansion with Skip
    [
        {"input_size": 768, "output_size": 64, "activation": nn.ReLU},
        {"input_size": 64, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 64, "activation": nn.ReLU},
        {"input_size": 64, "output_size": num_labels, "activation": None},
    ],

    # Pattern 44: Mixed Activations Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.SiLU},
        {"input_size": 512, "output_size": 128, "activation": nn.GELU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 45: Dense Shallow Layers
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 46: Split-and-Merge Architecture
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 47: Gradient Clipping Network
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": num_labels, "activation": None},
    ],

    # Pattern 48: High-Dropout Bottleneck
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.5},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.5},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 49: Alternating Nonlinearities
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.Tanh},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 50: Wide-to-Narrow Progressive Network
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

]

In [None]:
# Benchmark the first `nb_first_heads` candidate classification heads with a
# holdout split, keeping one result dict per head for the comparison cells below.
nb_first_heads = 20

# Trainer parameters
num_epochs = 30
batch_size = 8
patience = 3
min_delta = 0.001
val_size = 0.3

# Get number of classes to predict
num_labels = len(labels_names)

# One holdout result per evaluated pattern; index i here matches
# heads_network_patterns[i] because the slice below starts at 0.
save_results = []

# Try different heads
for i, head_network_pattern in enumerate(heads_network_patterns[:nb_first_heads]):

    print(f"Pattern {i}")

    # Create Classification Model with the candidate head
    model = BeeMLMClassifier(
        model_name="cmarkea/distilcamembert-base",
        num_labels=num_labels,
        head_layers=head_network_pattern,
    )

    # Create Trainer with Lora parameters
    trainer = BeeTrainer(
        model=model,
        labels_names=labels_names,
        optimizer_class=AdamW,
        use_lora=True,
        lora_r=64,
        lora_alpha=128,
        lora_dropout=0.01,
        verbose=False,
    )

    try:
        # Train over a Holdout with Earlystopping
        ret = trainer.holdout(
            texts=texts,
            labels=labels,
            val_size=val_size,
            num_epochs=num_epochs,
            batch_size=batch_size,
            patience=patience,
            min_delta=min_delta,
            balanced=True,
        )

        # Reports the MCC stored in the last entry of 'val_metrics'.
        # NOTE(review): presumably this is the last epoch that ran; confirm
        # whether that is also the best epoch when early stopping triggers.
        print(f"  - MCC: {ret.get('val_metrics')[-1].get('mcc')}\n")

        save_results.append(ret)
    finally:
        # Free CPU/GPU memory even when training a pattern fails, so a single
        # bad head configuration does not leave the model resident for the
        # rest of the session.
        trainer.release_model()


Pattern 0
  - MCC: 0.8664800576874168

Pattern 1
  - MCC: 0.8915078506582653

Pattern 2
  - MCC: 0.8989510600044248

Pattern 3
  - MCC: 0.901526932729356

Pattern 4
  - MCC: 0.8245886564924707

Pattern 5
  - MCC: 0.9094196874538816

Pattern 6
  - MCC: 0.8593536848714783

Pattern 7
  - MCC: 0.831353776399996

Pattern 8
  - MCC: 0.8600392775817288

Pattern 9
  - MCC: 0.8310210716550828

Pattern 10
  - MCC: 0.8974157269985179

Pattern 11
  - MCC: 0.8714323175633598

Pattern 12
  - MCC: 0.8791926307378476

Pattern 13
  - MCC: 0.8466672558728766

Pattern 14
  - MCC: 0.8209634920219804

Pattern 15
  - MCC: 0.8597878237468666

Pattern 16
  - MCC: 0.8858564957148359

Pattern 17
  - MCC: 0.8451189122501

Pattern 18
  - MCC: 0.8577746624335467

Pattern 19
  - MCC: 0.831353776399996



In [None]:
# Pick the head whose last recorded validation MCC is the highest.
pattern_index = np.argmax(
    [result.get('val_metrics')[-1].get('mcc') for result in save_results]
)
best_result = save_results[pattern_index]
pattern_mcc = best_result.get('val_metrics')[-1].get('mcc')

# save_results follows the order of heads_network_patterns, so the same index
# retrieves the matching head definition.
print(
    "Best pattern : ",
    heads_network_patterns[pattern_index],
    f"MCC:{pattern_mcc}",
    sep="\n",
)

Best pattern : 
[{'input_size': 768, 'output_size': 256, 'activation': <class 'torch.nn.modules.activation.ReLU'>, 'layer_norm': True}, {'input_size': 256, 'output_size': 128, 'activation': <class 'torch.nn.modules.activation.ReLU'>, 'layer_norm': True}, {'input_size': 128, 'output_size': 2, 'activation': None}]
MCC:0.9094196874538816


In [None]:
# Rank the evaluated heads by their last recorded validation MCC (descending)
# and display the three best ones.
arr = np.array(
    [result.get('val_metrics')[-1].get('mcc') for result in save_results]
)
sorted_indexes = np.argsort(arr)[::-1]
top_3_indexes = sorted_indexes[:3]

for i, ind in enumerate(top_3_indexes):
    pattern_mcc = save_results[ind].get('val_metrics')[-1].get('mcc')
    # A single print with a newline separator emits the same lines as one
    # print call per item; the trailing "\n" item yields the blank lines
    # between entries.
    print(
        f"Top {i} pattern",
        "Best pattern : ",
        heads_network_patterns[ind],
        f"MCC:{pattern_mcc}",
        "\n",
        sep="\n",
    )


Top 0 pattern
Best pattern : 
[{'input_size': 768, 'output_size': 256, 'activation': <class 'torch.nn.modules.activation.ReLU'>, 'layer_norm': True}, {'input_size': 256, 'output_size': 128, 'activation': <class 'torch.nn.modules.activation.ReLU'>, 'layer_norm': True}, {'input_size': 128, 'output_size': 2, 'activation': None}]
MCC:0.9094196874538816


Top 1 pattern
Best pattern : 
[{'input_size': 768, 'output_size': 512, 'activation': <class 'torch.nn.modules.activation.ReLU'>, 'batch_norm': True}, {'input_size': 512, 'output_size': 256, 'activation': <class 'torch.nn.modules.activation.ReLU'>, 'batch_norm': True}, {'input_size': 256, 'output_size': 2, 'activation': None}]
MCC:0.901526932729356


Top 2 pattern
Best pattern : 
[{'input_size': 768, 'output_size': 256, 'activation': <class 'torch.nn.modules.activation.ReLU'>, 'dropout_rate': 0.2}, {'input_size': 256, 'output_size': 2, 'activation': None}]
MCC:0.8989510600044248




## Cross-validation

In [None]:
# Cross-validation settings
n_splits = 5
num_epochs = 30
batch_size = 8
patience = 3
min_delta = 0.001

# Number of classes to predict
num_labels = len(labels_names)

# Custom classification head: 768 -> 512 -> 256 -> num_labels, with batch
# normalization on the first layer and layer normalization on the second.
head_layer_configs = [
    {
        "input_size": 768,
        "output_size": 512,
        "activation": nn.ReLU,
        "batch_norm": True,
    },
    {
        "input_size": 512,
        "output_size": 256,
        "activation": nn.ReLU,
        "layer_norm": True,
    },
    {
        "input_size": 256,
        "output_size": num_labels,
    },
]

# Classification model built on the "cmarkea/distilcamembert-base" checkpoint
model = BeeMLMClassifier(
    model_name="cmarkea/distilcamembert-base",
    num_labels=num_labels,
    head_layers=head_layer_configs,
)

# Trainer configured with LoRA
trainer = BeeTrainer(
    model=model,
    labels_names=labels_names,
    optimizer_class=AdamW,
    use_lora=True,
    lora_r=64,
    lora_alpha=128,
    lora_dropout=0.01,
)

# Cross-validate with early stopping (patience / min_delta)
rets = trainer.cross_validation(
    texts=texts,
    labels=labels,
    n_splits=n_splits,
    num_epochs=num_epochs,
    batch_size=batch_size,
    patience=patience,
    min_delta=min_delta,
    balanced=True,
)

# Free CPU/GPU memory
trainer.release_model()

Device : cuda


> Fold 1
Using Lora
Target modules : ['base_model.encoder.layer.0.attention.self.query', 'base_model.encoder.layer.0.attention.self.key', 'base_model.encoder.layer.0.attention.output.dense', 'base_model.encoder.layer.0.intermediate.dense', 'base_model.encoder.layer.0.output.dense', 'base_model.encoder.layer.1.attention.self.query', 'base_model.encoder.layer.1.attention.self.key', 'base_model.encoder.layer.1.attention.output.dense', 'base_model.encoder.layer.1.intermediate.dense', 'base_model.encoder.layer.1.output.dense', 'base_model.encoder.layer.2.attention.self.query', 'base_model.encoder.layer.2.attention.self.key', 'base_model.encoder.layer.2.attention.output.dense', 'base_model.encoder.layer.2.intermediate.dense', 'base_model.encoder.layer.2.output.dense', 'base_model.encoder.layer.3.attention.self.query', 'base_model.encoder.layer.3.attention.self.key', 'base_model.encoder.layer.3.attention.output.dense', 'base_model.encoder.layer.3.intermediate.dense', 'base_mod

In [None]:
# Gather the "best_epoch" value reported by each cross-validation fold and keep
# the largest one; the full-training cell reuses it as its epoch budget.
epochs_list = [fold.get("best_epoch") for fold in rets.get("cv_folds")]
max_epoch = int(np.max(epochs_list))
print(f"CV Folds max epoch: {max_epoch}")

CV Folds max epoch: 16


## Full Training

In [None]:
# Full-training settings: train on all the data (train_size = 1.0) for the
# number of epochs derived from the cross-validation folds.
train_size = 1.0
batch_size = 8
num_epochs = max_epoch

# Number of classes to predict
num_labels = len(labels_names)

# Custom classification head: 768 -> 512 -> 256 -> num_labels, with batch
# normalization on the first layer and layer normalization on the second.
head_layer_configs = [
    {
        "input_size": 768,
        "output_size": 512,
        "activation": nn.ReLU,
        "batch_norm": True,
    },
    {
        "input_size": 512,
        "output_size": 256,
        "activation": nn.ReLU,
        "layer_norm": True,
    },
    {
        "input_size": 256,
        "output_size": num_labels,
    },
]

# Classification model built on the "cmarkea/distilcamembert-base" checkpoint
model = BeeMLMClassifier(
    model_name="cmarkea/distilcamembert-base",
    num_labels=num_labels,
    head_layers=head_layer_configs,
)

# Trainer configured with LoRA
trainer = BeeTrainer(
    model=model,
    labels_names=labels_names,
    optimizer_class=AdamW,
    use_lora=True,
    lora_r=64,
    lora_alpha=128,
    lora_dropout=0.01,
)

# Train the model
ret = trainer.train(
    texts=texts,
    labels=labels,
    train_size=train_size,
    num_epochs=num_epochs,
    batch_size=batch_size,
    balanced=True,
)

# Persist the trained model and its adapter
trainer.save_model("./model-binary_v1")
trainer.save_adaptater("./adaptater-binary_v1")

# Free CPU/GPU memory
trainer.release_model()

Device : cuda
Using Lora
Target modules : ['base_model.encoder.layer.0.attention.self.query', 'base_model.encoder.layer.0.attention.self.key', 'base_model.encoder.layer.0.attention.output.dense', 'base_model.encoder.layer.0.intermediate.dense', 'base_model.encoder.layer.0.output.dense', 'base_model.encoder.layer.1.attention.self.query', 'base_model.encoder.layer.1.attention.self.key', 'base_model.encoder.layer.1.attention.output.dense', 'base_model.encoder.layer.1.intermediate.dense', 'base_model.encoder.layer.1.output.dense', 'base_model.encoder.layer.2.attention.self.query', 'base_model.encoder.layer.2.attention.self.key', 'base_model.encoder.layer.2.attention.output.dense', 'base_model.encoder.layer.2.intermediate.dense', 'base_model.encoder.layer.2.output.dense', 'base_model.encoder.layer.3.attention.self.query', 'base_model.encoder.layer.3.attention.self.key', 'base_model.encoder.layer.3.attention.output.dense', 'base_model.encoder.layer.3.intermediate.dense', 'base_model.encoder.

## Predictions

### Reload model

In [None]:
# Reload the classifier from the directory written by trainer.save_model(...)
# in the full-training cell.
bee_mlm_model = BeeMLMClassifier.load_model_safetensors(
    "./model-binary_v1"
)

### Predictions

In [None]:
# Predict the test texts on CPU in batches of 50, then score the predictions
# against the test labels with the Matthews correlation coefficient.
y_preds = bee_mlm_model.predict(
    test_texts,
    batch_size=50,
    device="cpu",
)
mcc = matthews_corrcoef(test_labels, y_preds)
print(mcc)

0.8405947518771781


# End of game