# Beevibe - Tests Trainer


## Manage Packages

### Import package

In [1]:
import sys
import pandas as pd
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import matthews_corrcoef
import numpy as np
from watermark import watermark
import torch.nn as nn
from torch.optim import AdamW
from beevibe import BeeTrainer, BeeMLMClassifier, HuggingFaceHub


  from .autonotebook import tqdm as notebook_tqdm


## GPU Card

In [2]:
!nvidia-smi

/bin/bash: nvidia-smi: command not found


## Drive Directory

**Warning:** please adjust the path below to match the location of your notebook.

In [3]:
# Path to the project: put the parent directory first on the import search path
sys.path.insert(0, "..")

## Packages versions

In [4]:
print(watermark())

Last updated: 2025-02-09T17:19:32.413125+00:00

Python implementation: CPython
Python version       : 3.12.1
IPython version      : 8.30.0

Compiler    : GCC 9.4.0
OS          : Linux
Release     : 6.5.0-1025-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit



In [5]:
print(watermark(packages="pandas,numpy,scipy,sklearn,torch,transformers,tokenizers,sentencepiece,datasets,beevibe"))

pandas       : 2.2.3
numpy        : 2.2.1
scipy        : 1.14.1
sklearn      : 1.6.0
torch        : 2.5.1
transformers : 4.47.1
tokenizers   : 0.21.0
sentencepiece: 0.2.0
datasets     : 3.2.0
beevibe      : 0.1.0.dev13



## Load dataset

### Get Train & Test

In [6]:
# CSV files of the train and test splits inside the Hub dataset repository
data_files = dict(
    train="elegana_train_v0_1.csv",
    test="elegana_test_v0_1.csv",
)

# `sep="|"` is handed to the loader together with the file mapping
dataset = load_dataset("Franbul/elegana_relation_client_FR", data_files=data_files, sep="|")

# One pandas DataFrame per split
pd_train, pd_test = (dataset[split].to_pandas() for split in ("train", "test"))

### Get Themes

In [7]:
# The themes table is a separate CSV file of the same Hub dataset repository
data_files = dict(themes="elegana_themes_v0_1.csv")

# Same loader call as for the train/test splits (`sep="|"` included)
dataset = load_dataset("Franbul/elegana_relation_client_FR", data_files=data_files, sep="|")

# Themes as a pandas DataFrame
pd_themes = dataset["themes"].to_pandas()


### Merge data

In [8]:
# Attach the theme columns to every test row (left join on THEME)
pd_data_test = pd_test.merge(pd_themes, on="THEME", how="left")

# Same join for the training rows, then keep a reproducible sample of 200 rows
pd_data = pd_train.merge(pd_themes, on="THEME", how="left").sample(200, random_state=1811)

In [9]:
pd_data.shape

(200, 10)

In [10]:
pd_data.head()

Unnamed: 0,CLIENT,CONSEILLER,THEME,DESCRIPTION,2_CLASSES,5_CLASSES,LABEL_1,LABEL_2,LABEL_3,LABEL_4
320,"Je veux offrir un cadeau à ma meilleure amie, ...","Pour un style bohème, nos robes à motifs, nos ...",Demande de conseils pour cadeaux,Aide pour choisir le cadeau parfait.,Informations et services spécialisés,"Services exclusifs, programmes et personnalisa...",Cadeau,Idée,Choix,Spécial
1518,Votre collaboration avec l'artiste X sera-t-el...,"Oui, notre collaboration avec l'artiste X sera...",Informations sur les collaborations artistiques,Détails sur les partenariats avec des artistes...,Informations et services spécialisés,"Engagements, événements et initiatives",Collaboration,Artistique,Partenariat,Créatif
1322,"Mon père a un style classique, quel cadeau pou...",Je vous recommande de choisir parmi nos articl...,Demande de conseils pour cadeaux,Aide pour choisir le cadeau parfait.,Informations et services spécialisés,"Services exclusifs, programmes et personnalisa...",Cadeau,Idée,Choix,Spécial
1797,Le site ne me permet pas de modifier mon mode ...,Vous pouvez modifier votre mode de paiement lo...,Demande d'assistance pour un achat,Aide pour effectuer un achat sur le site.,Informations et services spécialisés,"Services exclusifs, programmes et personnalisa...",Achat,Site,Service,Client
2308,Comment nettoyer ma veste en daim ?,"Pour nettoyer votre veste en daim, utilisez un...",Demande de conseils d'entretien spécifiques,Conseils pour l'entretien de produits délicats.,Informations et services spécialisés,Conseils et informations produits,Spécifique,Conseil,Produit,Délicat


### Get texts & labels for training

In [None]:
pd_data['5_CLASSES'].value_counts()

5_CLASSES
Services exclusifs, programmes et personnalisations    90
Engagements, événements et initiatives                 46
Commandes, livraison et suivi                          31
Conseils et informations produits                      24
Assistance technique et support immédiat                9
Name: count, dtype: int64

In [12]:
# Fit the label encoder on the distinct "5_CLASSES" values of the training sample
classes = np.unique(pd_data["5_CLASSES"])
le_esg = LabelEncoder().fit(classes)

# Training set: class names ordered by encoded id, integer labels, customer texts
labels_names = le_esg.classes_.copy()
labels = le_esg.transform(pd_data["5_CLASSES"]).tolist()
classes = np.unique(labels)  # from here on `classes` holds the distinct integer ids
texts = pd_data["CLIENT"].to_numpy().tolist()
print(f"Train : Nb texts:{len(texts)}, Nb labels:{len(labels)}, Nb classes:{len(classes)}")

# Test set: same encoder, so ids are consistent with the training labels
test_labels = le_esg.transform(pd_data_test["5_CLASSES"]).tolist()
test_texts = pd_data_test["CLIENT"].to_numpy().tolist()
print(f"Test  : Nb texts:{len(test_texts)}, Nb labels:{len(test_labels)}, Nb classes:{len(np.unique(test_labels))}")


Train : Nb texts:200, Nb labels:200, Nb classes:5
Test  : Nb texts:591, Nb labels:591, Nb classes:5


## Test train, save, load

In [14]:
# Smoke test: train a classifier for a single epoch; the next cells save and reload it.

# Minimal training settings
num_epochs = 1
batch_size = 2 # 8
train_size = 0.15
# NOTE(review): the variable name is misspelled ("treshold"). It is passed as
# `loss_threshold=` below, while the "Test Train with threshold" cell passes
# `loss_treshold=` — confirm which keyword BeeTrainer.train actually accepts.
loss_treshold = 1.3

# One output per class name
num_labels = len(labels_names)

# Create the classification model from the "cmarkea/distilcamembert-base" checkpoint
model = BeeMLMClassifier(
    model_name = "cmarkea/distilcamembert-base",
    num_labels = num_labels,
)

# Create the trainer (no LoRA arguments are passed in this cell)
trainer = BeeTrainer(model=model,
                            labels_names=labels_names,
                            )

# Train the model
ret = trainer.train(texts=texts,
                    labels=labels,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    loss_threshold=loss_treshold,
                    balanced=True
                    )

Device : cpu
Use optimizer : Adam
 - {'lr': 1e-05}
No scheduler used
Epoch 0/0, Training Loss: 1.5447
Elapsed time: 00:00:18


In [16]:
model.save_model_safetensors(save_path="../model-test_v2")

In [17]:
bee_mlm_model_loaded = BeeMLMClassifier.load_model_safetensors("../model-test_v2")

## Test train_test_split

In [14]:
# Scratch check: reconcile a standalone `lr` with the "lr" entry of an optimizer
# parameter dict, where 1e-5 plays the role of the "lr left at its default" value.
lr = 1e-6
lr_dict = {"lr": 1e-4}

if lr == 1e-5 and lr_dict.get("lr") is not None:
    # lr is still the default and the dict carries a value: the dict wins
    lr = lr_dict["lr"]
else:
    # lr was set explicitly, or the dict has no usable value: lr wins
    lr_dict["lr"] = lr

print(lr, lr_dict)

1e-06 {'lr': 1e-06}


In [8]:
from sklearn.model_selection import train_test_split

# Scratch check of how train_test_split behaves when test_size is None.
# The toy inputs are kept under their own names so that the real `texts` / `labels`
# read by the training and prediction cells are not overwritten by this cell.
toy_texts = [1, 2, 3, 4, 5]
toy_labels = [1, 2, 3, 4, 5]
val_size = None
seed = 1811

# With test_size=None (and no train_size) scikit-learn falls back to a 25% test share,
# which for 5 items gives 3 training and 2 validation elements.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    toy_texts, toy_labels, test_size=val_size, shuffle=True, random_state=seed
)

print(len(train_texts), len(val_texts), len(train_labels), len(val_labels))
print(train_texts,train_labels)
print(val_texts,val_labels)


3 2 3 2
[2, 4, 1] [2, 4, 1]
[5, 3] [5, 3]


## Tests HF Hub

In [18]:
# Get HuggingFace token
hf_hub = HuggingFaceHub()
hf_token = hf_hub.load_hf_token()

In [19]:
# Save model to hub
repo_name = "Franbul/distill-camembert-esrs-v0"
hf_hub.save_to_hf_hub(
    directory_path="../model-test_v2",
    repo_name=repo_name,
    token=hf_token
    )

In [20]:
# Load model from hub
repo_name = "Franbul/distill-camembert-esrs-v0"
hf_hub.load_from_hf_hub(
    directory_path="../model-test_v2_bis",
    repo_name=repo_name,
    token=hf_token
    )

In [21]:
bee_mlm_model_loaded = BeeMLMClassifier.load_model_safetensors("../model-test_v2_bis")

In [22]:
help(bee_mlm_model_loaded)

Help on BeeMLMClassifier in module beevibe.core.models object:

class BeeMLMClassifier(BeeBaseModel)
 |  BeeMLMClassifier(model_name: str, num_labels: int, head_layers: list[dict] = None)
 |
 |  A custom model for sequence classification with a flexible linear stack on top of a pretrained transformer.
 |
 |  Method resolution order:
 |      BeeMLMClassifier
 |      BeeBaseModel
 |      torch.nn.modules.module.Module
 |      builtins.object
 |
 |  Methods defined here:
 |
 |  __init__(self, model_name: str, num_labels: int, head_layers: list[dict] = None)
 |      Initializes the CustomModel class.
 |
 |      Args:
 |          model_name (str): The name of the pretrained model.
 |          num_labels (int): The number of labels for classification.
 |          head_layers (list of dict): Configuration for custom layers.
 |
 |  forward(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, labels: torch.Tensor = None, inputs_embeds: Optional[torch.FloatTensor] = None

In [23]:
bee_mlm_model_loaded.model_name

'cmarkea/distilcamembert-base'

In [24]:
bee_mlm_model_loaded.num_labels

5

In [25]:
bee_mlm_model_loaded.head_layers

[{'input_size': 768, 'output_size': 5}]

## Test Train with threshold

In [14]:
# Train for a single epoch with a loss threshold, before the save / reload / predict cells.

# Minimal training settings
num_epochs = 1
batch_size = 2 # 8
train_size = 0.15
# NOTE(review): "treshold" is misspelled, both in this variable and in the
# `loss_treshold=` keyword below; the "Test train, save, load" cell passes
# `loss_threshold=` instead — confirm which keyword BeeTrainer.train actually accepts.
loss_treshold = 1.3

# One output per class name
num_labels = len(labels_names)

# Create the classification model from the "cmarkea/distilcamembert-base" checkpoint
model = BeeMLMClassifier(
    model_name = "cmarkea/distilcamembert-base",
    num_labels = num_labels,
)

# Create the trainer (no LoRA arguments are passed in this cell)
trainer = BeeTrainer(model=model,
                            labels_names=labels_names,
                            )

# Train the model
ret = trainer.train(texts=texts,
                    labels=labels,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    loss_treshold=loss_treshold,
                    balanced=True
                    )

Device : cpu
Use optimizer : Adam
 - {'lr': 1e-05}
No scheduler used
Epoch 0/0, Training Loss: 1.5447
Elapsed time: 00:00:16


In [15]:
trainer.save_model("../model-test_v0")

layer_config [{'input_size': 768, 'output_size': 5, 'activation': None}]


In [16]:
trainer.release_model()

In [17]:
bee_mlm_model = BeeMLMClassifier.load_model_safetensors("../model-test_v0")

In [18]:
y_preds = bee_mlm_model.predict(texts, batch_size=50, device="cpu")
mcc = matthews_corrcoef(labels, y_preds)
print(mcc)

0.10963781686019765


In [19]:
# Get HuggingFace token
hf_hub = HuggingFaceHub()
hf_token = hf_hub.load_hf_token()

In [21]:
# Save model to hub
repo_name = "Franbul/distill-camembert-esrs-v0"
hf_hub.save_to_hf_hub(
    directory_path="../model-test_v0",
    repo_name=repo_name,
    token=hf_token
    )

In [22]:
# Load model from hub
repo_name = "Franbul/distill-camembert-esrs-v0"
hf_hub.load_from_hf_hub(
    directory_path="../model-test_v1",
    repo_name=repo_name,
    token=hf_token
    )

In [23]:
bee_mlm_model = BeeMLMClassifier.load_model_safetensors("../model-test_v1")

In [24]:
y_preds = bee_mlm_model.predict(texts, batch_size=50, device="cpu")
mcc = matthews_corrcoef(labels, y_preds)
print(mcc)

0.10963781686019765


## Test Holdout with Validation Set

In [14]:
# Holdout run with an explicit validation set taken from the test data.

# Minimal training settings and early-stopping parameters
num_epochs = 1
batch_size = 2 # 8
patience = 3
min_delta = 0.001
val_size = 0.8

# Get number of classes to predict
num_labels = len(labels_names)

# Create the classification model from the "cmarkea/distilcamembert-base" checkpoint
model = BeeMLMClassifier(
    model_name = "cmarkea/distilcamembert-base",
    num_labels = num_labels,
)

# Create the trainer (no LoRA arguments; labels_names is deliberately left disabled)
trainer = BeeTrainer(model=model,
                    #labels_names=labels_names,
                    )

# Train over a holdout with early stopping.
# Both `val_size` and an explicit validation set are passed; the recorded run log
# states that the given validation texts and labels are the ones used.
ret = trainer.holdout(texts=texts[:200],
                    labels=labels[:200],
                    val_size=val_size,
                    val_texts=test_texts[:100],
                    val_labels=test_labels[:100],
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    patience=patience,
                    min_delta=min_delta,
                    balanced=True
                    )

# Free CPU/GPU memory
trainer.release_model()

Device : cpu
Holdout uses the given validation texts and labels
Use optimizer : Adam
 - {'lr': 1e-05}
No scheduler used
Epoch 0/0, Train Loss: 1.5167, Val Loss: 1.3120, Val MCC: 0.3784, lr: 1.000e-05
Best epoch: 0, Best loss: 1.3120


** Global metrics :

 - accuracy: 0.6400
 - f1_macro: 0.2848
 - f1_micro: 0.6400
 - f1_weighted: 0.5355
 - mcc: 0.3784


** Per-Classes metrics :

Class          Precision      Recall          F1     Support
Class 0           0.0000      0.0000      0.0000           4
Class 1           0.0000      0.0000      0.0000          10
Class 2           0.0000      0.0000      0.0000          13
Class 3           0.7857      0.5789      0.6667          19
Class 4           0.6163      0.9815      0.7571          54


** Confusion Matrix (FN/Row - FP/Col):

            Class 0  Class 1  Class 2  Class 3  Class 4 
Class 0      0        0        0        0        4    
Class 1      0        0        0        0       10    
Class 2      0        0        0        2  

## Test Multilabel Training

In [16]:
# Multilabel smoke test. As recorded, this cell fails with:
#   ValueError: Target size (torch.Size([2])) must be the same as input size (torch.Size([2, 5]))
# NOTE(review): `labels` holds one integer class id per text, whereas with multilabel=True
# the loss apparently expects one value per class for every text (2 texts x 5 classes in a
# batch) — presumably the labels must be multi-hot encoded first; confirm against BeeTrainer.

# Minimal training settings
num_epochs = 1
batch_size = 2 # 8
train_size = 0.15
# NOTE(review): "treshold" is misspelled here and in the `loss_treshold=` keyword below;
# confirm the keyword name against BeeTrainer.train.
loss_treshold = 1.3

# One output per class name
num_labels = len(labels_names)

# Create the classification model from the "cmarkea/distilcamembert-base" checkpoint
model = BeeMLMClassifier(
    model_name = "cmarkea/distilcamembert-base",
    num_labels = num_labels,
)

# Create the trainer in multilabel mode (no LoRA arguments are passed in this cell)
trainer = BeeTrainer(model=model,
                    labels_names=labels_names,
                    multilabel=True
                    )

# Train the model (class balancing is left disabled here)
ret = trainer.train(texts=texts,
                    labels=labels,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    loss_treshold=loss_treshold,
                    #balanced=True
                    )

Device : cpu
Use optimizer : Adam
 - {'lr': 1e-05}
No scheduler used


ValueError: Target size (torch.Size([2])) must be the same as input size (torch.Size([2, 5]))

## Holdout

In [13]:
# Create custom model : Normalized Layers
#head_layer_configs = [
#        {"input_size": 768, "output_size": num_labels, "activation": None},
#    ]

### Starter code

In [14]:
# Baseline holdout run: default classification head, default trainer settings.

# Minimal training settings and early-stopping parameters
num_epochs = 1
batch_size = 2 # 8
patience = 3
min_delta = 0.001
val_size = 0.8

# Get number of classes to predict
num_labels = len(labels_names)

# Create the classification model from "camembert-base"
model = BeeMLMClassifier(
    model_name = "camembert-base",
    num_labels = num_labels,
)

# Create the trainer (no LoRA arguments; labels_names is deliberately left disabled)
trainer = BeeTrainer(model=model,
                    #labels_names=labels_names,
                    )

# Train over a holdout with early stopping; no validation set is given, only `val_size`
ret = trainer.holdout(texts=texts,
                    labels=labels,
                    val_size=val_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    patience=patience,
                    min_delta=min_delta,
                    balanced=True
                    )

# Free CPU/GPU memory
trainer.release_model()

Device : cpu
Holdout creates a 0.8 % validation texts and labels from train
Use optimizer : Adam
 - {'lr': 1e-05}
No scheduler used
Epoch 0/0, Train Loss: 1.6187, Val Loss: 1.5694, Val MCC: 0.0086, lr: 1.000e-05
Best epoch: 0, Best loss: 1.5694


** Global metrics :

 - accuracy: 0.4437
 - f1_macro: 0.1388
 - f1_micro: 0.4437
 - f1_weighted: 0.2873
 - mcc: 0.0086


** Per-Classes metrics :

Class          Precision      Recall          F1     Support
Class 0           0.0000      0.0000      0.0000           8
Class 1           0.3333      0.0455      0.0800          22
Class 2           0.0000      0.0000      0.0000          21
Class 3           0.0000      0.0000      0.0000          37
Class 4           0.4487      0.9722      0.6140          72


** Confusion Matrix (FN/Row - FP/Col):

            Class 0  Class 1  Class 2  Class 3  Class 4 
Class 0      0        0        0        0        8    
Class 1      0        1        0        0       21    
Class 2      0        1        

### Modify classification head

In [16]:
# Holdout run with a custom three-layer classification head.

# Minimal training settings and early-stopping parameters
num_epochs = 1
batch_size = 2 # 8
patience = 3
min_delta = 0.001
val_size = 0.8

# Get number of classes to predict
num_labels = len(labels_names)

# Define a custom classification head: 768 -> 512 -> 256 -> num_labels.
# The first two layers use ReLU, with the batch_norm / layer_norm flag set respectively;
# the last layer only sets its sizes.
head_layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_labels},
    ]

# Create the classification model from "camembert-base" with the custom head
model = BeeMLMClassifier(
    model_name = "camembert-base",
    num_labels = num_labels,
    head_layers=head_layer_configs
)

# Create the trainer (no LoRA arguments are passed in this cell)
trainer = BeeTrainer(model=model,
                     labels_names=labels_names,
                     )

# Train over a holdout with early stopping
ret = trainer.holdout(texts=texts,
                    labels=labels,
                    val_size=val_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    patience=patience,
                    min_delta=min_delta,
                    balanced=True
                    )

# Free CPU/GPU memory
trainer.release_model()


Device : cpu


Holdout creates a 0.8 % validation texts and labels from train
Use optimizer : Adam
 - {'lr': 1e-05}
No scheduler used
Epoch 0/0, Train Loss: 1.6052, Val Loss: 1.6484, Val MCC: -0.0883, lr: 1.000e-05
Best epoch: 0, Best loss: 1.6484


** Global metrics :

 - accuracy: 0.1437
 - f1_macro: 0.0793
 - f1_micro: 0.1437
 - f1_weighted: 0.0767
 - mcc: -0.0883


** Per-Classes metrics :

Class                                                      Precision      Recall          F1     Support
Assistance technique et support immédiat                      0.0000      0.0000      0.0000           8
Commandes, livraison et suivi                                 0.0000      0.0000      0.0000          22
Conseils et informations produits                             0.1017      0.2857      0.1500          21
Engagements, événements et initiatives                        0.1683      0.4595      0.2464          37
Services exclusifs, programmes et personnalisations           0.0000      0.0000      0.0000

### Use AdamW

In [17]:
# Holdout run with the custom head and AdamW as the optimizer class.

# Minimal training settings and early-stopping parameters
num_epochs = 1
batch_size = 2 # 8
patience = 3
min_delta = 0.001
val_size = 0.8

# Get number of classes to predict
num_labels = len(labels_names)

# Define a custom classification head: 768 -> 512 -> 256 -> num_labels
head_layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_labels},
    ]

# Create the classification model from "camembert-base" with the custom head
model = BeeMLMClassifier(
    model_name = "camembert-base",
    num_labels = num_labels,
    head_layers=head_layer_configs
)

# Create the trainer with AdamW as optimizer class (no LoRA arguments in this cell)
trainer = BeeTrainer(model=model,
                    labels_names=labels_names,
                    optimizer_class=AdamW
                    )

# Train over a holdout with early stopping
ret = trainer.holdout(texts=texts,
                    labels=labels,
                    val_size=val_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    patience=patience,
                    min_delta=min_delta,
                    balanced=True
                    )

# Free CPU/GPU memory
trainer.release_model()

Device : cpu
Holdout creates a 0.8 % validation texts and labels from train
Use optimizer : AdamW
 - {'lr': 1e-05}
No scheduler used
Epoch 0/0, Train Loss: 1.6052, Val Loss: 1.6485, Val MCC: -0.0870, lr: 1.000e-05
Best epoch: 0, Best loss: 1.6485


** Global metrics :

 - accuracy: 0.1437
 - f1_macro: 0.0793
 - f1_micro: 0.1437
 - f1_weighted: 0.0768
 - mcc: -0.0870


** Per-Classes metrics :

Class                                                      Precision      Recall          F1     Support
Assistance technique et support immédiat                      0.0000      0.0000      0.0000           8
Commandes, livraison et suivi                                 0.0000      0.0000      0.0000          22
Conseils et informations produits                             0.1000      0.2857      0.1481          21
Engagements, événements et initiatives                        0.1700      0.4595      0.2482          37
Services exclusifs, programmes et personnalisations           0.0000      0.00

### Add a Lora configuration

In [18]:
# Holdout run with the custom head, AdamW and a LoRA configuration.

# Minimal training settings and early-stopping parameters
num_epochs = 1
batch_size = 2 # 8
patience = 3
min_delta = 0.001
val_size = 0.8

# Get number of classes to predict
num_labels = len(labels_names)

# Define a custom classification head: 768 -> 512 -> 256 -> num_labels
head_layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_labels},
    ]

# Create the classification model from "camembert-base" with the custom head
model = BeeMLMClassifier(
    model_name = "camembert-base",
    num_labels = num_labels,
    head_layers=head_layer_configs
)

# Create the trainer with LoRA enabled (r=64, alpha=128, dropout=0.01) and AdamW
trainer = BeeTrainer(model=model,
                            labels_names=labels_names,
                            optimizer_class=AdamW,
                            use_lora=True,
                            lora_r = 64,
                            lora_alpha= 128,
                            lora_dropout = 0.01,
                            )

# Train over a holdout with early stopping
ret = trainer.holdout(texts=texts,
                    labels=labels,
                    val_size=val_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    patience=patience,
                    min_delta=min_delta,
                    balanced=True
                    )

# Free CPU/GPU memory
trainer.release_model()

Device : cpu
Holdout creates a 0.8 % validation texts and labels from train
Using Lora
Target modules : ['base_model.encoder.layer.0.attention.self.query', 'base_model.encoder.layer.0.attention.self.key', 'base_model.encoder.layer.0.attention.output.dense', 'base_model.encoder.layer.0.intermediate.dense', 'base_model.encoder.layer.0.output.dense', 'base_model.encoder.layer.1.attention.self.query', 'base_model.encoder.layer.1.attention.self.key', 'base_model.encoder.layer.1.attention.output.dense', 'base_model.encoder.layer.1.intermediate.dense', 'base_model.encoder.layer.1.output.dense', 'base_model.encoder.layer.2.attention.self.query', 'base_model.encoder.layer.2.attention.self.key', 'base_model.encoder.layer.2.attention.output.dense', 'base_model.encoder.layer.2.intermediate.dense', 'base_model.encoder.layer.2.output.dense', 'base_model.encoder.layer.3.attention.self.query', 'base_model.encoder.layer.3.attention.self.key', 'base_model.encoder.layer.3.attention.output.dense', 'base_m

### 

## Cross-validation

In [19]:
# 3-split cross-validation with the custom head, AdamW and LoRA.

# Minimal training settings, early-stopping parameters and number of splits
num_epochs = 1
batch_size = 2 # 8
patience = 3
min_delta = 0.001
n_splits = 3

# Get number of classes to predict
num_labels = len(labels_names)

# Define a custom classification head: 768 -> 512 -> 256 -> num_labels
head_layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_labels},
    ]

# Create the classification model from "camembert-base" with the custom head
model = BeeMLMClassifier(
    model_name = "camembert-base",
    num_labels = num_labels,
    head_layers=head_layer_configs
)

# Create the trainer with LoRA enabled (r=64, alpha=128, dropout=0.01) and AdamW
trainer = BeeTrainer(model=model,
                            labels_names=labels_names,
                            optimizer_class=AdamW,
                            use_lora=True,
                            lora_r = 64,
                            lora_alpha= 128,
                            lora_dropout = 0.01,
                            )

# Cross-validate with early stopping; the result is kept in `rets`
rets = trainer.cross_validation(texts=texts,
              labels=labels,
              n_splits=n_splits,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True)

# Free CPU/GPU memory
trainer.release_model()

Device : cpu


> Fold 1
Using Lora
Target modules : ['base_model.encoder.layer.0.attention.self.query', 'base_model.encoder.layer.0.attention.self.key', 'base_model.encoder.layer.0.attention.output.dense', 'base_model.encoder.layer.0.intermediate.dense', 'base_model.encoder.layer.0.output.dense', 'base_model.encoder.layer.1.attention.self.query', 'base_model.encoder.layer.1.attention.self.key', 'base_model.encoder.layer.1.attention.output.dense', 'base_model.encoder.layer.1.intermediate.dense', 'base_model.encoder.layer.1.output.dense', 'base_model.encoder.layer.2.attention.self.query', 'base_model.encoder.layer.2.attention.self.key', 'base_model.encoder.layer.2.attention.output.dense', 'base_model.encoder.layer.2.intermediate.dense', 'base_model.encoder.layer.2.output.dense', 'base_model.encoder.layer.3.attention.self.query', 'base_model.encoder.layer.3.attention.self.key', 'base_model.encoder.layer.3.attention.output.dense', 'base_model.encoder.layer.3.intermediate.dense', 'base_mode

## Full Training

In [26]:
# Full training with the custom head, AdamW and LoRA, then save the model and its adapter.

# Minimal training settings
num_epochs = 1
batch_size = 2 # 8
train_size = 0.15

# One output per class name
num_labels = len(labels_names)

# Define a custom classification head: 768 -> 512 -> 256 -> num_labels
head_layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_labels},
    ]

# Create the classification model from "camembert-base" with the custom head
model = BeeMLMClassifier(
    model_name = "camembert-base",
    num_labels = num_labels,
    head_layers=head_layer_configs
)

# Create the trainer with LoRA enabled (r=64, alpha=128, dropout=0.01) and AdamW
trainer = BeeTrainer(model=model,
                            labels_names=labels_names,
                            optimizer_class=AdamW,
                            use_lora=True,
                            lora_r = 64,
                            lora_alpha= 128,
                            lora_dropout = 0.01,
                            )

# Train the model
ret = trainer.train(texts=texts,
                    labels=labels,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    balanced=True
                    )

# Save the model and the adapter (paths are relative to the notebook's working directory;
# "./model-multiclass_v1" is the path the "Reload model" cell loads from)
trainer.save_model("./model-multiclass_v1")
trainer.save_adaptater("./adaptater-multiclass_v1")

# Free CPU/GPU memory
trainer.release_model()

Device : cpu


Using Lora
Target modules : ['base_model.encoder.layer.0.attention.self.query', 'base_model.encoder.layer.0.attention.self.key', 'base_model.encoder.layer.0.attention.output.dense', 'base_model.encoder.layer.0.intermediate.dense', 'base_model.encoder.layer.0.output.dense', 'base_model.encoder.layer.1.attention.self.query', 'base_model.encoder.layer.1.attention.self.key', 'base_model.encoder.layer.1.attention.output.dense', 'base_model.encoder.layer.1.intermediate.dense', 'base_model.encoder.layer.1.output.dense', 'base_model.encoder.layer.2.attention.self.query', 'base_model.encoder.layer.2.attention.self.key', 'base_model.encoder.layer.2.attention.output.dense', 'base_model.encoder.layer.2.intermediate.dense', 'base_model.encoder.layer.2.output.dense', 'base_model.encoder.layer.3.attention.self.query', 'base_model.encoder.layer.3.attention.self.key', 'base_model.encoder.layer.3.attention.output.dense', 'base_model.encoder.layer.3.intermediate.dense', 'base_model.encoder.layer.3.output

## Predictions

### Reload model

In [13]:
# Load the trained model
bee_mlm_model = BeeMLMClassifier.load_model_safetensors("./model-multiclass_v1")

### Test predictions

In [14]:
y_preds = bee_mlm_model.predict(test_texts, batch_size=50, device="cpu")
mcc = matthews_corrcoef(test_labels, y_preds)
print(mcc)

-0.0421334602516051


## Test Classification Head

In [15]:
num_labels = 3

network_patterns = [

    # Pattern 1: Basic Feedforward Network
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 2: Shallow Network with Dropout
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.2},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 3: Batch-Normalized Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 4: Wide Hidden Layers
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.GELU},
        {"input_size": 1024, "output_size": 512, "activation": nn.GELU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 5: Layer Normalization
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 6: Minimalistic Network
    [
        {"input_size": 768, "output_size": 128, "activation": nn.Tanh},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 7: Deep Feedforward Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 8: Dropout Regularization
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 9: Deep Residual Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "residual": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 10: Compact Network
    [
        {"input_size": 768, "output_size": 64, "activation": nn.SiLU},
        {"input_size": 64, "output_size": num_labels, "activation": None},
    ],

    # Pattern 11: Fully Connected Bottleneck Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 12: Dense Network with No Activation in Final Layer
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 13: Layer Normalization with High Dropout
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU, "layer_norm": True, "dropout_rate": 0.5},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "layer_norm": True, "dropout_rate": 0.5},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 14: Gated Activation with Dropout
    [
        {"input_size": 768, "output_size": 512, "activation": nn.GELU, "dropout_rate": 0.2},
        {"input_size": 512, "output_size": 256, "activation": nn.SiLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 15: Progressive Layer Expansion
    [
        {"input_size": 768, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 16: Deep Wide Network
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 17: Dense Residual Network with Skip Connections
    [
        {"input_size": 768, "output_size": 768, "activation": nn.ReLU, "residual": True},
        {"input_size": 768, "output_size": 768, "activation": nn.ReLU, "residual": True},
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 18: Alternating Normalization
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 19: Advanced Progressive Shrinkage
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 20: Feature Extractor with Sparse Hidden Units
    [
        {"input_size": 768, "output_size": 64, "activation": nn.ReLU},
        {"input_size": 64, "output_size": 32, "activation": nn.ReLU},
        {"input_size": 32, "output_size": num_labels, "activation": None},
    ],

    # Pattern 21: Multi-Layer Sparse Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 64, "activation": nn.ReLU},
        {"input_size": 64, "output_size": num_labels, "activation": None},
    ],

    # Pattern 22: Alternating Dropout Intensities
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.2},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.5},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.3},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 23: High-Dimensional Bottleneck with Residuals
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.GELU, "residual": True},
        {"input_size": 1024, "output_size": 256, "activation": nn.GELU, "residual": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 24: Fully Dense Network with Gradient Clipping
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": num_labels, "activation": None},
    ],

    # Pattern 25: Deeply Layer-Normalized Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 26: Modular Feedforward Blocks
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 27: High-Frequency Regularization Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.1},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.1},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.1},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 28: Alternating Activations Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.SiLU},
        {"input_size": 256, "output_size": 128, "activation": nn.GELU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 29: Sequential Dropout and Bottleneck
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 128, "output_size": 64, "activation": nn.ReLU},
        {"input_size": 64, "output_size": num_labels, "activation": None},
    ],

    # Pattern 30: Multi-Head Modular Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 31: Sparse Progressive Expansion
    [
        {"input_size": 768, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 32: Residual Shrinking Layers
    [
        {"input_size": 768, "output_size": 512, "activation": nn.GELU, "residual": True},
        {"input_size": 512, "output_size": 256, "activation": nn.GELU, "residual": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 33: Dual Activation Fusion
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.SiLU},
        {"input_size": 256, "output_size": 128, "activation": nn.GELU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 34: Deeply Narrow Network
    [
        {"input_size": 768, "output_size": 128, "activation": nn.Tanh},
        {"input_size": 128, "output_size": 64, "activation": nn.Tanh},
        {"input_size": 64, "output_size": 32, "activation": nn.Tanh},
        {"input_size": 32, "output_size": num_labels, "activation": None},
    ],

    # Pattern 35: Cyclic Dropout Network
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.2},
        {"input_size": 256, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 36: Wide Bottleneck Layers
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": num_labels, "activation": None},
    ],

    # Pattern 37: Alternating Sparse Connectivity
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 38: Progressive Layer Normalization
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 39: Activation Modulated Network
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.GELU},
        {"input_size": 128, "output_size": 64, "activation": nn.Tanh},
        {"input_size": 64, "output_size": num_labels, "activation": None},
    ],

    # Pattern 40: Double Residual Connections
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "residual": True},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 41: Dense Alternating Widths
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 42: Multi-Normalization Layers
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 43: Sparse Expansion with Skip
    [
        {"input_size": 768, "output_size": 64, "activation": nn.ReLU},
        {"input_size": 64, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 64, "activation": nn.ReLU},
        {"input_size": 64, "output_size": num_labels, "activation": None},
    ],

    # Pattern 44: Mixed Activations Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.SiLU},
        {"input_size": 512, "output_size": 128, "activation": nn.GELU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 45: Dense Shallow Layers
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 46: Split-and-Merge Architecture
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 47: Gradient Clipping Network
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": num_labels, "activation": None},
    ],

    # Pattern 48: High-Dropout Bottleneck
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.5},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.5},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 49: Alternating Nonlinearities
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.Tanh},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 50: Wide-to-Narrow Progressive Network
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

]

In [23]:

# Round-trip check for head-layer configs.
# Step 1 turns each activation class into its class name (plain-data form).
# Step 2 resolves those names back to classes looked up on torch.nn.
# NOTE(review): this rebinds `network_patterns`; if an earlier cell defined a
# list under the same name, it is replaced by this single pattern.
network_patterns = [
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],
]

for custom_layer_configs in network_patterns:

    # Class -> name: replace the activation class by its __name__; a layer whose
    # activation is None loses the "activation" key entirely.
    result = [
        {
            k: (v.__name__ if k == "activation" and v else v)
            for k, v in layer.items()
            if not (k == "activation" and v is None)
        }
        for layer in custom_layer_configs
    ]

    print(result)

    # Name -> class: every layer gets an "activation" entry again, set to None
    # when the serialized layer carried no activation name.
    layer_configs = [
        {**layer, "activation": getattr(nn, layer.get("activation")) if layer.get("activation") else None}
        for layer in result
    ]

    print(layer_configs)


[{'input_size': 768, 'output_size': 512, 'activation': 'ReLU', 'batch_norm': True}, {'input_size': 512, 'output_size': 256, 'activation': 'ReLU', 'layer_norm': True}, {'input_size': 256, 'output_size': 3}]
[{'input_size': 768, 'output_size': 512, 'activation': <class 'torch.nn.modules.activation.ReLU'>, 'batch_norm': True}, {'input_size': 512, 'output_size': 256, 'activation': <class 'torch.nn.modules.activation.ReLU'>, 'layer_norm': True}, {'input_size': 256, 'output_size': 3, 'activation': None}]


In [26]:
# Serialized head configuration: activations are stored as torch.nn class names,
# and the last layer carries no "activation" entry at all.
config = {
    "head_layer_config": [
        {"input_size": 768, "output_size": 512, "activation": "ReLU", "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": "ReLU", "layer_norm": True},
        {"input_size": 256, "output_size": 3},
    ]
}

# Copy each layer spec and set its "activation" entry to the torch.nn class of
# that name, or to None when the spec has no (or an empty) activation.
# An existing "activation" key keeps its position; a missing one is appended.
layer_configs = [
    dict(
        spec,
        activation=getattr(nn, spec["activation"]) if spec.get("activation") else None,
    )
    for spec in config.get("head_layer_config")
]

print(layer_configs)


[{'input_size': 768, 'output_size': 512, 'activation': <class 'torch.nn.modules.activation.ReLU'>, 'batch_norm': True}, {'input_size': 512, 'output_size': 256, 'activation': <class 'torch.nn.modules.activation.ReLU'>, 'layer_norm': True}, {'input_size': 256, 'output_size': 3, 'activation': None}]


In [27]:
# Rebuild head-layer configs from a serialized config whose activations are
# stored as class names. Names are resolved to classes on torch.nn; a layer
# whose activation is None has the "activation" key removed instead of kept.
# `nn` is provided by the notebook's import cell (`import torch.nn as nn`),
# so it is not imported again here.
config = {
    "head_layer_config": [
        {'input_size': 768, 'output_size': 512, 'activation': 'ReLU', 'batch_norm': True},
        {'input_size': 512, 'output_size': 256, 'activation': 'ReLU', 'layer_norm': True},
        {'input_size': 256, 'output_size': 3, 'activation': None}
    ]
}

layer_configs = [
    {
        key: (getattr(nn, value) if key == "activation" and value else value)
        for key, value in layer.items()
        # Only an explicit None activation is dropped; all other entries pass through.
        if not (key == "activation" and value is None)
    }
    for layer in config.get("head_layer_config")
]

print(layer_configs)

[{'input_size': 768, 'output_size': 512, 'activation': <class 'torch.nn.modules.activation.ReLU'>, 'batch_norm': True}, {'input_size': 512, 'output_size': 256, 'activation': <class 'torch.nn.modules.activation.ReLU'>, 'layer_norm': True}, {'input_size': 256, 'output_size': 3}]


# End of game