# Multiclass tutorial


# Manage Packages

## Import package

In [20]:
import gc
import sys
import pandas as pd
import numpy as np
from watermark import watermark
import torch.nn as nn
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from beevibe import BeeTrainer, BeeMLMClassifier
#from beevibe import AIHelper

## GPU Card

In [21]:
!nvidia-smi

/bin/bash: nvidia-smi: command not found


## Drive Directory

<!> Please adjust to your notebook path

In [22]:
# Path sur le projet
sys.path.insert(0, "..")

## Packages versions

In [23]:
print(watermark())

Last updated: 2025-01-18T19:24:34.987326+00:00

Python implementation: CPython
Python version       : 3.12.1
IPython version      : 8.30.0

Compiler    : GCC 9.4.0
OS          : Linux
Release     : 6.5.0-1025-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit



In [24]:
print(watermark(packages="pandas,numpy,scipy,sklearn,torch,transformers,tokenizers,sentencepiece,datasets,beevibe"))

pandas       : 2.2.3
numpy        : 2.2.1
scipy        : 1.14.1
sklearn      : 1.6.0
torch        : 2.5.1
transformers : 4.47.1
tokenizers   : 0.21.0
sentencepiece: 0.2.0
datasets     : 3.2.0
beevibe      : 0.1.0.dev11



## Global parameters

In [25]:
SEED = 1811
EXCEL_PATH = '../dataset/elegana'
CSV_FILE_NAME_TRAIN = "elegana_train_v0_1.csv"
CSV_FILE_NAME_TEST = "elegana_test_v0_1.csv"
CSV_FILE_NAME_THEMES = "elegana_themes_v0_1.csv"

# Chargement des données

In [26]:
pd_train = pd.read_csv(EXCEL_PATH+'/'+CSV_FILE_NAME_TRAIN, sep="|")
pd_themes = pd.read_csv(EXCEL_PATH+'/'+CSV_FILE_NAME_THEMES, sep="|")
pd_data = pd.merge(pd_train, pd_themes, on="THEME", how='left')
pd_data = pd_data.sample(200, random_state=1811)

## Get Train

In [27]:
pd_data.columns

Index(['CLIENT', 'CONSEILLER', 'THEME', 'DESCRIPTION', '2_CLASSES',
       '5_CLASSES', 'LABEL_1', 'LABEL_2', 'LABEL_3', 'LABEL_4'],
      dtype='object')

In [28]:
from sklearn.preprocessing import LabelEncoder

# Get THEME Labels
classes = np.unique(pd_data['5_CLASSES'])
le_esg = LabelEncoder()
le_esg.fit(classes)
#le_esg.transform(classes)
classes_names = classes[le_esg.transform(classes)]
labels = le_esg.transform(pd_data['5_CLASSES']).tolist() #[:100]).tolist()
classes = np.unique(labels)


# Get sentences
texts = pd_data["CLIENT"].values.tolist() #[:100]
len(texts), len(labels), len(classes)

(200, 200, 5)

In [29]:
pd_data['5_CLASSES'].value_counts()

5_CLASSES
Services exclusifs, programmes et personnalisations    90
Engagements, événements et initiatives                 46
Commandes, livraison et suivi                          31
Conseils et informations produits                      24
Assistance technique et support immédiat                9
Name: count, dtype: int64

# Helper

In [11]:
! pip uninstall httpx -y
! pip install httpx #==0.27.2

Found existing installation: httpx 0.28.1
Uninstalling httpx-0.28.1:
  Successfully uninstalled httpx-0.28.1
Collecting httpx
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Using cached httpx-0.28.1-py3-none-any.whl (73 kB)
Installing collected packages: httpx
Successfully installed httpx-0.28.1


In [None]:
! pip uninstall openai -y
#! pip install openai==1.56.1
! pip install openai #==0.28

In [None]:
from watermark import watermark
print(watermark(packages="openai,httpx"))

In [21]:
from transformers import AutoModel, AutoTokenizer

max_len = 128
model_name = "camembert-base"


preprocessing_config = {
    "add_special_tokens": True,
    "truncation": True,
    "padding": "max_length",
    "max_length": max_len,
    "return_token_type_ids": False,
    "return_attention_mask": True,
    "return_tensors": "pt",
}

tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=model_name,
            clean_up_tokenization_spaces=True)


raw_reviews = ["Hello Word !"]

encoded_batch_1 = tokenizer(
    raw_reviews,
    **preprocessing_config
)

encoded_batch_2 = tokenizer.encode_plus(
    raw_reviews[0],
    **preprocessing_config
)



In [19]:
encoded_batch

{'input_ids': tensor([[    5,  9774, 15975,    83,     6,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,  

In [None]:
from openai import OpenAI
import configparser
import httpx

class CustomHTTPClient(httpx.Client):
    def __init__(self, *args, **kwargs):
        kwargs.pop("proxy", None)  # Remove the 'proxies' argument if present
        super().__init__(*args, **kwargs)


config_file = "config.ini"
config = configparser.ConfigParser()
config.read(config_file)
api_key = config.get("openai", "api_key", fallback=None)

client = OpenAI(
    http_client=CustomHTTPClient(),
    api_key=api_key,
)

question = "What is Flair ?"

messages = [
    {"role": "system", "content": "You are a friendly assistant helping developers understand a Python project."},
    {"role": "system", "content": "You answer questions about the Flair GitHub project you will find here: https://github.com/flairNLP/flair."},
    {"role": "system", "content": "Don't answer questions outside the scope of this project."},
    {"role": "user", "content": question},
]

response = client.chat.completions.create(
    messages=messages,
    model="gpt-4o-2024-08-06",
)

answer = response.choices[0].message.content

print(answer)


In [None]:
pwd

In [None]:
Beehelp = AIHelper()

In [None]:
import sys
import textwrap
from IPython.display import display, Markdown

response = Beehelp.help("Can you display the README of the project ?")
wrapper = textwrap.TextWrapper(width=50, replace_whitespace=False, drop_whitespace=False)
wrapped_lines = [wrapper.fill(line) for line in response.splitlines()]
wrapped_text = "\n".join(wrapped_lines)
display(Markdown(wrapped_text))


# Training

### Nouveau

In [None]:
def model():
    model_name = "bert-base-uncased"
    num_labels = 3
    layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.1},
        {"input_size": 512, "output_size": num_labels, "activation": None, "dropout_rate": None},
    ]
    return BeeMLMClassifier(model_name=model_name, num_labels=num_labels, layer_configs=layer_configs)

In [1]:
from beevibe import BeeMLMClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from torch import nn 
def model():
    model_name = "bert-base-uncased"
    num_labels = 3
    layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.1},
        {"input_size": 512, "output_size": num_labels, "activation": None, "dropout_rate": None},
    ]
    return BeeMLMClassifier(model_name=model_name, num_labels=num_labels, layer_configs=layer_configs)

In [5]:
model = model()

In [14]:
from unittest.mock import MagicMock

def test_save_and_load_model(tmp_path, model):
    # Mock base model and classifier
    model.base_model = MagicMock()
    model.classifier = MagicMock()

    # Mock valid layer_configs with serializable activations
    model.layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 3, "activation": None},
    ]

    save_directory = tmp_path / "model_dir"
    model.save_model_safetensors(save_directory)

    assert (save_directory / "model.safetensors").exists()
    assert (save_directory / "config.json").exists()

    # Load model
    loaded_model = BeeMLMClassifier.load_model_safetensors(save_directory)
    assert loaded_model.model_name == model.model_name
    assert loaded_model.num_labels == model.num_labels
    assert loaded_model.layer_configs[0]["activation"] == nn.ReLU
    

In [15]:
from pathlib import Path
tmp_path = Path("./")
test_save_and_load_model(tmp_path=tmp_path, model=model)

RuntimeError: Error(s) in loading state_dict for BertModel:
	Missing key(s) in state_dict: "embeddings.word_embeddings.weight", "embeddings.position_embeddings.weight", "embeddings.token_type_embeddings.weight", "embeddings.LayerNorm.weight", "embeddings.LayerNorm.bias", "encoder.layer.0.attention.self.query.weight", "encoder.layer.0.attention.self.query.bias", "encoder.layer.0.attention.self.key.weight", "encoder.layer.0.attention.self.key.bias", "encoder.layer.0.attention.self.value.weight", "encoder.layer.0.attention.self.value.bias", "encoder.layer.0.attention.output.dense.weight", "encoder.layer.0.attention.output.dense.bias", "encoder.layer.0.attention.output.LayerNorm.weight", "encoder.layer.0.attention.output.LayerNorm.bias", "encoder.layer.0.intermediate.dense.weight", "encoder.layer.0.intermediate.dense.bias", "encoder.layer.0.output.dense.weight", "encoder.layer.0.output.dense.bias", "encoder.layer.0.output.LayerNorm.weight", "encoder.layer.0.output.LayerNorm.bias", "encoder.layer.1.attention.self.query.weight", "encoder.layer.1.attention.self.query.bias", "encoder.layer.1.attention.self.key.weight", "encoder.layer.1.attention.self.key.bias", "encoder.layer.1.attention.self.value.weight", "encoder.layer.1.attention.self.value.bias", "encoder.layer.1.attention.output.dense.weight", "encoder.layer.1.attention.output.dense.bias", "encoder.layer.1.attention.output.LayerNorm.weight", "encoder.layer.1.attention.output.LayerNorm.bias", "encoder.layer.1.intermediate.dense.weight", "encoder.layer.1.intermediate.dense.bias", "encoder.layer.1.output.dense.weight", "encoder.layer.1.output.dense.bias", "encoder.layer.1.output.LayerNorm.weight", "encoder.layer.1.output.LayerNorm.bias", "encoder.layer.2.attention.self.query.weight", "encoder.layer.2.attention.self.query.bias", "encoder.layer.2.attention.self.key.weight", "encoder.layer.2.attention.self.key.bias", "encoder.layer.2.attention.self.value.weight", "encoder.layer.2.attention.self.value.bias", "encoder.layer.2.attention.output.dense.weight", "encoder.layer.2.attention.output.dense.bias", "encoder.layer.2.attention.output.LayerNorm.weight", "encoder.layer.2.attention.output.LayerNorm.bias", "encoder.layer.2.intermediate.dense.weight", "encoder.layer.2.intermediate.dense.bias", "encoder.layer.2.output.dense.weight", "encoder.layer.2.output.dense.bias", "encoder.layer.2.output.LayerNorm.weight", "encoder.layer.2.output.LayerNorm.bias", "encoder.layer.3.attention.self.query.weight", "encoder.layer.3.attention.self.query.bias", "encoder.layer.3.attention.self.key.weight", "encoder.layer.3.attention.self.key.bias", "encoder.layer.3.attention.self.value.weight", "encoder.layer.3.attention.self.value.bias", "encoder.layer.3.attention.output.dense.weight", "encoder.layer.3.attention.output.dense.bias", "encoder.layer.3.attention.output.LayerNorm.weight", "encoder.layer.3.attention.output.LayerNorm.bias", "encoder.layer.3.intermediate.dense.weight", "encoder.layer.3.intermediate.dense.bias", "encoder.layer.3.output.dense.weight", "encoder.layer.3.output.dense.bias", "encoder.layer.3.output.LayerNorm.weight", "encoder.layer.3.output.LayerNorm.bias", "encoder.layer.4.attention.self.query.weight", "encoder.layer.4.attention.self.query.bias", "encoder.layer.4.attention.self.key.weight", "encoder.layer.4.attention.self.key.bias", "encoder.layer.4.attention.self.value.weight", "encoder.layer.4.attention.self.value.bias", "encoder.layer.4.attention.output.dense.weight", "encoder.layer.4.attention.output.dense.bias", "encoder.layer.4.attention.output.LayerNorm.weight", "encoder.layer.4.attention.output.LayerNorm.bias", "encoder.layer.4.intermediate.dense.weight", "encoder.layer.4.intermediate.dense.bias", "encoder.layer.4.output.dense.weight", "encoder.layer.4.output.dense.bias", "encoder.layer.4.output.LayerNorm.weight", "encoder.layer.4.output.LayerNorm.bias", "encoder.layer.5.attention.self.query.weight", "encoder.layer.5.attention.self.query.bias", "encoder.layer.5.attention.self.key.weight", "encoder.layer.5.attention.self.key.bias", "encoder.layer.5.attention.self.value.weight", "encoder.layer.5.attention.self.value.bias", "encoder.layer.5.attention.output.dense.weight", "encoder.layer.5.attention.output.dense.bias", "encoder.layer.5.attention.output.LayerNorm.weight", "encoder.layer.5.attention.output.LayerNorm.bias", "encoder.layer.5.intermediate.dense.weight", "encoder.layer.5.intermediate.dense.bias", "encoder.layer.5.output.dense.weight", "encoder.layer.5.output.dense.bias", "encoder.layer.5.output.LayerNorm.weight", "encoder.layer.5.output.LayerNorm.bias", "encoder.layer.6.attention.self.query.weight", "encoder.layer.6.attention.self.query.bias", "encoder.layer.6.attention.self.key.weight", "encoder.layer.6.attention.self.key.bias", "encoder.layer.6.attention.self.value.weight", "encoder.layer.6.attention.self.value.bias", "encoder.layer.6.attention.output.dense.weight", "encoder.layer.6.attention.output.dense.bias", "encoder.layer.6.attention.output.LayerNorm.weight", "encoder.layer.6.attention.output.LayerNorm.bias", "encoder.layer.6.intermediate.dense.weight", "encoder.layer.6.intermediate.dense.bias", "encoder.layer.6.output.dense.weight", "encoder.layer.6.output.dense.bias", "encoder.layer.6.output.LayerNorm.weight", "encoder.layer.6.output.LayerNorm.bias", "encoder.layer.7.attention.self.query.weight", "encoder.layer.7.attention.self.query.bias", "encoder.layer.7.attention.self.key.weight", "encoder.layer.7.attention.self.key.bias", "encoder.layer.7.attention.self.value.weight", "encoder.layer.7.attention.self.value.bias", "encoder.layer.7.attention.output.dense.weight", "encoder.layer.7.attention.output.dense.bias", "encoder.layer.7.attention.output.LayerNorm.weight", "encoder.layer.7.attention.output.LayerNorm.bias", "encoder.layer.7.intermediate.dense.weight", "encoder.layer.7.intermediate.dense.bias", "encoder.layer.7.output.dense.weight", "encoder.layer.7.output.dense.bias", "encoder.layer.7.output.LayerNorm.weight", "encoder.layer.7.output.LayerNorm.bias", "encoder.layer.8.attention.self.query.weight", "encoder.layer.8.attention.self.query.bias", "encoder.layer.8.attention.self.key.weight", "encoder.layer.8.attention.self.key.bias", "encoder.layer.8.attention.self.value.weight", "encoder.layer.8.attention.self.value.bias", "encoder.layer.8.attention.output.dense.weight", "encoder.layer.8.attention.output.dense.bias", "encoder.layer.8.attention.output.LayerNorm.weight", "encoder.layer.8.attention.output.LayerNorm.bias", "encoder.layer.8.intermediate.dense.weight", "encoder.layer.8.intermediate.dense.bias", "encoder.layer.8.output.dense.weight", "encoder.layer.8.output.dense.bias", "encoder.layer.8.output.LayerNorm.weight", "encoder.layer.8.output.LayerNorm.bias", "encoder.layer.9.attention.self.query.weight", "encoder.layer.9.attention.self.query.bias", "encoder.layer.9.attention.self.key.weight", "encoder.layer.9.attention.self.key.bias", "encoder.layer.9.attention.self.value.weight", "encoder.layer.9.attention.self.value.bias", "encoder.layer.9.attention.output.dense.weight", "encoder.layer.9.attention.output.dense.bias", "encoder.layer.9.attention.output.LayerNorm.weight", "encoder.layer.9.attention.output.LayerNorm.bias", "encoder.layer.9.intermediate.dense.weight", "encoder.layer.9.intermediate.dense.bias", "encoder.layer.9.output.dense.weight", "encoder.layer.9.output.dense.bias", "encoder.layer.9.output.LayerNorm.weight", "encoder.layer.9.output.LayerNorm.bias", "encoder.layer.10.attention.self.query.weight", "encoder.layer.10.attention.self.query.bias", "encoder.layer.10.attention.self.key.weight", "encoder.layer.10.attention.self.key.bias", "encoder.layer.10.attention.self.value.weight", "encoder.layer.10.attention.self.value.bias", "encoder.layer.10.attention.output.dense.weight", "encoder.layer.10.attention.output.dense.bias", "encoder.layer.10.attention.output.LayerNorm.weight", "encoder.layer.10.attention.output.LayerNorm.bias", "encoder.layer.10.intermediate.dense.weight", "encoder.layer.10.intermediate.dense.bias", "encoder.layer.10.output.dense.weight", "encoder.layer.10.output.dense.bias", "encoder.layer.10.output.LayerNorm.weight", "encoder.layer.10.output.LayerNorm.bias", "encoder.layer.11.attention.self.query.weight", "encoder.layer.11.attention.self.query.bias", "encoder.layer.11.attention.self.key.weight", "encoder.layer.11.attention.self.key.bias", "encoder.layer.11.attention.self.value.weight", "encoder.layer.11.attention.self.value.bias", "encoder.layer.11.attention.output.dense.weight", "encoder.layer.11.attention.output.dense.bias", "encoder.layer.11.attention.output.LayerNorm.weight", "encoder.layer.11.attention.output.LayerNorm.bias", "encoder.layer.11.intermediate.dense.weight", "encoder.layer.11.intermediate.dense.bias", "encoder.layer.11.output.dense.weight", "encoder.layer.11.output.dense.bias", "encoder.layer.11.output.LayerNorm.weight", "encoder.layer.11.output.LayerNorm.bias", "pooler.dense.weight", "pooler.dense.bias". 

#### Training

In [16]:
num_epochs = 4
batch_size = 2 # 8
train_size = 0.1

num_classes = len(classes_names)

# Create custom model : Normalized Layers
custom_layer_configs = [
        {"input_size": 768, "output_size": num_classes, "activation": None},
    ]

model = BeeCustomMaskModelForClassification(
    model_name = "camembert-base",
    num_labels = num_classes,
    layer_configs=custom_layer_configs
)

trainer = MultiClassTrainer(model=model,
                            classes_names=classes_names,
                            optimizer_class=AdamW,
                            use_lora=True,
                            lora_r = 64,
                            lora_alpha= 128,
                            lora_dropout = 0.01,
                            #quantization_type="4bits",
                            )

ret = trainer.train(texts=texts,
                    labels=labels,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    )

trainer.save_model("./model_v2")
trainer.save_adaptater("./lora_v2")

NameError: name 'classes_names' is not defined

In [12]:
loaded_model = BeeCustomMaskModelForClassification.load_model_safetensors("./model_v2")

In [13]:
preds = loaded_model.predict(texts)

In [14]:
len(preds)

200

In [15]:
preds

[2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 4,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 4,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 4,
 2,
 2,
 2,
 2,
 2,
 4,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 4,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 4,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 4,
 2,
 2,
 4,
 2,
 2,
 2,
 4,
 4,
 2,
 4,
 2,
 2,
 2,
 2,
 4,
 2,
 4,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 4,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 4,
 2,
 2,
 2]

In [30]:
num_epochs = 4
batch_size = 2 # 8
train_size = 0.1

num_classes = len(classes_names)

# Create custom model : Normalized Layers
custom_layer_configs = [
        {"input_size": 768, "output_size": num_classes, "activation": None},
    ]

model = BeeMLMClassifier(
    model_name = "camembert-base",
    num_labels = num_classes,
    layer_configs=custom_layer_configs
)

trainer = BeeTrainer(model=model,
                            classes_names=classes_names,
                            optimizer_class=AdamW,
                            use_lora=True,
                            lora_r = 64,
                            lora_alpha= 128,
                            lora_dropout = 0.01,
                            #quantization_type="4bits",
                            )

ret = trainer.train(texts=texts,
                    labels=labels,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    )

trainer.save_model("./model_v2")
trainer.save_adaptater("./lora_v2")

Device : cpu
Using Lora
Target modules : ['base_model.encoder.layer.0.attention.self.query', 'base_model.encoder.layer.0.attention.self.key', 'base_model.encoder.layer.0.attention.output.dense', 'base_model.encoder.layer.0.intermediate.dense', 'base_model.encoder.layer.0.output.dense', 'base_model.encoder.layer.1.attention.self.query', 'base_model.encoder.layer.1.attention.self.key', 'base_model.encoder.layer.1.attention.output.dense', 'base_model.encoder.layer.1.intermediate.dense', 'base_model.encoder.layer.1.output.dense', 'base_model.encoder.layer.2.attention.self.query', 'base_model.encoder.layer.2.attention.self.key', 'base_model.encoder.layer.2.attention.output.dense', 'base_model.encoder.layer.2.intermediate.dense', 'base_model.encoder.layer.2.output.dense', 'base_model.encoder.layer.3.attention.self.query', 'base_model.encoder.layer.3.attention.self.key', 'base_model.encoder.layer.3.attention.output.dense', 'base_model.encoder.layer.3.intermediate.dense', 'base_model.encoder.l

TypeError: Object of type ndarray is not JSON serializable

In [42]:
[{**layer, "activation": layer["activation"].__name__ if layer["activation"] else None} for layer in model.layer_configs]

[{'input_size': 768, 'output_size': 5, 'activation': None}]

In [32]:
config = {
    "model_name": model.model_name,
    "num_labels": model.num_labels,
    "classes_names": model.classes_names,
    "multilabel": model.multilabel,
    "layer_configs": [
        {**layer, "activation": layer["activation"].__name__ if layer["activation"] else None}
        for layer in model.layer_configs
    ],
}

In [33]:
config

{'model_name': 'camembert-base',
 'num_labels': 5,
 'classes_names': array(['Assistance technique et support immédiat',
        'Commandes, livraison et suivi',
        'Conseils et informations produits',
        'Engagements, événements et initiatives',
        'Services exclusifs, programmes et personnalisations'],
       dtype=object),
 'multilabel': False,
 'layer_configs': [{'input_size': 768, 'output_size': 5, 'activation': None}]}

In [36]:
config = {
    "model_name": model.model_name,
    "num_labels": model.num_labels,
    "classes_names": model.classes_names,
    "multilabel": model.multilabel,
    "layer_configs": model.layer_configs,
}

In [37]:
config

{'model_name': 'camembert-base',
 'num_labels': 5,
 'classes_names': array(['Assistance technique et support immédiat',
        'Commandes, livraison et suivi',
        'Conseils et informations produits',
        'Engagements, événements et initiatives',
        'Services exclusifs, programmes et personnalisations'],
       dtype=object),
 'multilabel': False,
 'layer_configs': [{'input_size': 768, 'output_size': 5, 'activation': None}]}

In [39]:
import json, os

In [40]:
save_directory = "./"
with open(os.path.join(save_directory, "test_config.json"), "w") as f:
    json.dump(config, f)

TypeError: Object of type ndarray is not JSON serializable

In [45]:
custom_layer_configs = [
    {"input_size": 768, "output_size": 5, "activation": nn.ReLU},
]

result = [
    {k: (v.__name__ if k == "activation" and v else v) for k, v in layer.items() if not (k == "activation" and v is None)}
    for layer in custom_layer_configs
]


print(result)

[{'input_size': 768, 'output_size': 5, 'activation': 'ReLU'}]


In [31]:
trainer.save_model("./model_v2")

TypeError: Object of type ndarray is not JSON serializable

In [12]:
loaded_model = BeeMLMClassifier.load_model_safetensors("./model_v2")

In [13]:
preds = loaded_model.predict(texts)

In [15]:
len(preds)

200

In [20]:
#loaded_model.layer_configs

In [12]:
from beevibe import HFTokenizer, TextDatasetMC
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

preprocessing_config = {
    "add_special_tokens": True,
    "truncation": True,
    "padding": "max_length",
    "max_length": 128,
    "return_token_type_ids": False,
    "return_attention_mask": True,
    "return_tensors": "pt",
        }

hftokenizer = HFTokenizer(preprocessing_config).from_pretrained(
    "camembert-base",
    clean_up_tokenization_spaces=True
)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=(1.0 - 0.2),
    shuffle=True,
    random_state=1811,
)
print(len(texts), len(labels))
print(texts[:5], labels[:5])  # Print a small sample

train_dataset = TextDatasetMC(
    train_texts, train_labels, hftokenizer #.tokenizer
)
print(f"Dataset length: {len(train_dataset)}")
print(train_dataset[0])  # Print the first item in the dataset


train_loader = DataLoader(train_dataset, batch_size=128, drop_last=True)
for batch in train_loader:
    print(batch)
    break  # Stop after the first batch

device = "cpu"

for i, batch in enumerate(train_loader):
    print(f"Batch {i} contents:")
    print(batch)  # Print the entire batch dictionary

    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["label"].to(device)

    print(f"input_ids shape: {input_ids.shape}")
    print(f"attention_mask shape: {attention_mask.shape}")
    print(f"labels shape: {labels.shape}")


200 200
['Je veux offrir un cadeau à ma meilleure amie, elle a un style bohème. Que pouvez-vous me suggérer ?', "Votre collaboration avec l'artiste X sera-t-elle disponible à l'international ?", 'Mon père a un style classique, quel cadeau pourrais-je lui offrir de votre marque ?', 'Le site ne me permet pas de modifier mon mode de paiement.', 'Comment nettoyer ma veste en daim ?'] [4, 3, 4, 4, 2]
Dataset length: 40
{'text': "Je trouve que vous utilisez trop d'emballage pour vos produits. Pouvez-vous réduire la quantité ?", 'input_ids': tensor([    5,   100,   396,    27,    39,  5451,   237,    18,    11, 10183,
           24,   140,   336,     9, 19214,    26,   315,  2020,    13,  2329,
          106,     6,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1

In [12]:
! ls -lia ./model_v0
# 38177540
# 38177540

total 469468
1573320 drwxrwxrwx+ 2 codespace codespace      4096 Jan  6 14:14 .
1310802 drwxrwxrwx+ 4 codespace root           4096 Jan  6 13:44 ..
1573321 -rw-rw-rw-  1 codespace codespace      5088 Jan  6 13:44 README.md
1573323 -rw-rw-rw-  1 codespace codespace       738 Jan  6 13:44 adapter_config.json
1573322 -rw-rw-rw-  1 codespace codespace  38175436 Jan  6 13:44 adapter_model.safetensors
1573329 -rw-rw-rw-  1 codespace codespace       988 Jan  6 14:14 config.pth
1573328 -rw-rw-rw-  1 codespace codespace 442525684 Jan  6 14:14 model.safetensors


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
! cat ./model_v0/README.md

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


---
base_model: camembert-base
library_name: peft
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->



## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->



- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use


In [23]:
num_epochs = 1
batch_size = 2 # 8
train_size = 0.1

num_classes = len(classes_names)

# Create custom model : Normalized Layers
custom_layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_classes, "activation": None},
    ]

custom_layer_configs = [
        {"input_size": 768, "output_size": num_classes, "activation": None},
    ]

model = BeeCustomMaskModelForClassification(
    model_name = "camembert-base",
    num_labels = num_classes,
    layer_configs=custom_layer_configs
)

trainer = MultiClassTrainer(model=model,
                            classes_names=classes_names,
                            optimizer_class=AdamW
                            )

ret = trainer.train(texts=texts,
                    labels=labels,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    )

Device : cpu
Use optimizer : AdamW
 - {'lr': 1e-05}
Use scheduler : ReduceLROnPlateau
 - {'mode': 'min', 'factor': 0.8, 'patience': 2}
Epoch 0/0, Training Loss: 1.6298
ReduceLROnPlateau scheduler can't be used without validation losses


In [None]:
num_epochs = 1
batch_size = 2 # 8
train_size = 0.1

model = "camembert-base"

trainer = MultiClassTrainer(model=model,
                            classes_names=classes_names,
                            optimizer_class=Adam,
                            optimizer_params={"lr": 0.005},
                            scheduler_class=ReduceLROnPlateau,
                            scheduler_params={"mode":"min", "factor":0.5, "patience":3}
                            )

ret = trainer.train(texts=texts,
                    labels=labels,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    )

In [None]:
num_epochs = 1
batch_size = 2 # 8
train_size = 0.1

model = "camembert-base"

trainer = MultiClassTrainer(model=model,
                            classes_names=classes_names,
                            optimizer_class=AdamW
                            )

ret = trainer.train(texts=texts,
                    labels=labels,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    )

In [None]:
num_epochs = 1
batch_size = 2 # 8
train_size = 0.1

model = BeeSimpleMaskModelForClassification(
    model_name = "camembert-base",
    num_labels = len(classes_names)
)

trainer = MultiClassTrainer(model=model,
                            classes_names=classes_names,
                            optimizer_class=AdamW
                            )

ret = trainer.train(texts=texts,
                    labels=labels,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    )

In [None]:
num_epochs = 1
batch_size = 2 # 8
train_size = 0.1

model = BeeSimpleMaskModelForClassification(
    model_name = "camembert-base",
    num_labels = len(classes_names)
)

trainer = MultiClassTrainer(model=model,
                            classes_names=classes_names,
                            )

ret = trainer.train(texts=texts,
                    labels=labels,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size
                    )

### Ancien

In [None]:
num_epochs = 1
batch_size = 2 # 8
train_size = 0.1

model_name = "camembert-base" #"google-bert/bert-large-cased-whole-word-masking" #"google-bert/bert-base-multilingual-cased" #"google-bert/bert-base-uncased" #"camembert-base" # "almanach/camembertv2-base"

trainer = MultiClassTrainer(model_name=model_name,
                            classes_names=classes_names,
                            )

ret = trainer.train(texts=texts,
                    labels=labels,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size
                    )

### Test Optimizer parameter

In [8]:
import torch
import torch.nn as nn
from torch.optim import Adam

# Define a simple model
class SimpleModel(nn.Module):
    def __init__(self, optimizer_class, optimizer_params):
        super(SimpleModel, self).__init__()
        self.optimizer_class = optimizer_class
        self.optimizer_params = optimizer_params
        self.optimizer = None

        self.fc1 = nn.Linear(10, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(x))

    def create_optimizer(self, model_params):
        # Instantiate the optimizer when compiling
        self.optimizer = self.optimizer_class(**self.optimizer_params, params=model_params)


optimizer_class = Adam  # or another optimizer class
optimizer_params = {"lr": 0.001, "betas": (0.9, 0.999)}

model = SimpleModel(optimizer_class, optimizer_params)

model_params = model.parameters()

model.create_optimizer(model_params)


### Test Trainer parameters

In [None]:
classes_names

In [None]:
num_epochs = 1
batch_size = 2 # 8
train_size = 0.1

model_name = "camembert-base" #"google-bert/bert-large-cased-whole-word-masking" #"google-bert/bert-base-multilingual-cased" #"google-bert/bert-base-uncased" #"camembert-base" # "almanach/camembertv2-base"

trainer = MultiClassTrainer(model_name=model_name,
                            classes_names=classes_names.tolist(),
                            use_lora=True,
                            device="cpu"
                            )

### Test Pydantic

In [11]:
from beevibe.utils.validator import DatasetConfig

In [None]:
try:
    config = DatasetConfig(seed=1.0, texts=["1"], labels=[3])
except TypeError as e:
    print(e)  # "seed must be an integer."
    sys.exit(1)  # Stop execution


### Test Model Forward

In [None]:
num_epochs = 1
batch_size = 2 # 8
train_size = 0.1


model_name = "camembert-base" #"google-bert/bert-large-cased-whole-word-masking" #"google-bert/bert-base-multilingual-cased" #"google-bert/bert-base-uncased" #"camembert-base" # "almanach/camembertv2-base"
num_classes = len(classes)

# Create custom model : Normalized Layers
custom_layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_classes, "activation": None},
    ]

def custom_model_creator(model_name, num_classes) :
  model = BVCustomMaskModelForClassification(
    model_name=model_name,
    num_labels=num_classes,
    layer_configs=custom_layer_configs
  )
  return model


trainer = MultiClassTrainer(model_name=model_name,
                            model_creator=custom_model_creator,
                            classes_names=classes_names,
                            using_lora=True
                            )


ret = trainer.train(texts=texts,
                    labels=labels,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size
                    )


In [None]:
for name, param in trainer.model.named_parameters():
    print(name)

In [11]:
model_name = "camembert-base" #"google-bert/bert-large-cased-whole-word-masking" #"google-bert/bert-base-multilingual-cased" #"google-bert/bert-base-uncased" #"camembert-base" # "almanach/camembertv2-base"
num_classes = len(classes)

# Create custom model : Normalized Layers
custom_layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_classes, "activation": None},
    ]

model = BVCustomMaskModelForClassification(
model_name=model_name,
num_labels=num_classes,
layer_configs=custom_layer_configs
)

from peft import LoraConfig, TaskType

def find_target_modules(model):
    target_modules = []
    for name, module in model.named_modules():
        if any(keyword in name for keyword in ["query", "key", "dense"]):
            target_modules.append(name)
    return target_modules

target_modules = find_target_modules(model)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,                        # Rank of low-rank decomposition
    lora_alpha=32,              # Scaling factor
    target_modules=target_modules,  # Target modules for LoRA
    lora_dropout=0.1,           # Dropout for regularization
    bias="none"                 # Do not fine-tune biases
)


from peft import get_peft_model
model = get_peft_model(model, lora_config)



In [None]:
model.config

In [22]:
model_name = "camembert-base"
base_model = AutoModel.from_pretrained(model_name)

target_modules = find_target_modules(base_model)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,                        # Rank of low-rank decomposition
    lora_alpha=32,              # Scaling factor
    target_modules=target_modules,  # Target modules for LoRA
    lora_dropout=0.1,           # Dropout for regularization
    bias="none"                 # Do not fine-tune biases
)


from peft import get_peft_model
peft_model = get_peft_model(base_model, lora_config)


In [None]:
peft_model.config

In [None]:
# Training features
num_epochs = 15 # 15
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8 # 8

# Model features
model_name = "camembert-base" # "almanach/camembertv2-base"
num_classes = len(classes)

# Create optimizer :AdamW
custom_optimizer_params = {
    'lr': 1e-5,
}

def custom_optimizer_creator(model, **params):
    return AdamW(model.parameters(), lr=params.get('lr'))

# Create custom model : Normalized Layers
custom_layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_classes, "activation": None},
    ]

def custom_model_creator(model_name, num_classes) :
  model = BVCustomMaskModelForClassification(
    model_name=model_name,
    num_labels=num_classes,
    layer_configs=custom_layer_configs
  )
  return model

# Create Multi-classes trainer
print("Create Trainer")
trainer = MultiClassTrainer(model_name=model_name,
                            optimizer_creator=custom_optimizer_creator,
                            optimizer_params=custom_optimizer_params,
                            model_creator=custom_model_creator,
                            classes_names=classes_names,
                            using_lora=True
                            )


# Train an Holdout
print("Train with early stopping ..")
ret = trainer.holdout(texts=texts,
              labels=labels,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True
              )

# Free CPU/GPU memory
trainer.release_model()

### Test QLora Quantization

In [None]:
num_epochs = 1
batch_size = 2 # 8
train_size = 0.1

model_name = "camembert-base" #"google-bert/bert-large-cased-whole-word-masking" #"google-bert/bert-base-multilingual-cased" #"google-bert/bert-base-uncased" #"camembert-base" # "almanach/camembertv2-base"

trainer = MultiClassTrainer(model_name=model_name,
                            classes_names=classes_names.tolist(),
                            quantization_type="4bit",
                            use_lora=True,
                            device="cpu"
                            )

In [None]:
trainer.quantization_config

In [None]:
num_epochs = 1
batch_size = 2 # 8
train_size = 0.1


model_name = "camembert-base" #"google-bert/bert-large-cased-whole-word-masking" #"google-bert/bert-base-multilingual-cased" #"google-bert/bert-base-uncased" #"camembert-base" # "almanach/camembertv2-base"
num_classes = len(classes)

# Create custom model : Normalized Layers
custom_layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_classes, "activation": None},
    ]

def custom_model_creator(model_name, num_classes, quantization_config) :
  model = BeeCustomMaskModelForClassification(
    model_name=model_name,
    num_labels=num_classes,
    layer_configs=custom_layer_configs,
    quantization_config=quantization_config
  )
  return model


trainer = MultiClassTrainer(model_name=model_name,
                            model_creator=custom_model_creator,
                            classes_names=classes_names,
                            quantization_type="4bit",
                            use_lora=True,
                            )


ret = trainer.train(texts=texts,
                    labels=labels,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size
                    )

In [None]:
# Training features
num_epochs = 1 # 15
patience = 3
min_delta = 0.001
batch_size = 2 # 8

# Model features
model_name = "camembert-base" # "almanach/camembertv2-base"
num_classes = len(classes)

# Create optimizer :AdamW
custom_optimizer_params = {
    'lr': 1e-5,
}

def custom_optimizer_creator(model, **params):
    return AdamW(model.parameters(), lr=params.get('lr'))

# Create custom model : Normalized Layers
custom_layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_classes, "activation": None},
    ]

def custom_model_creator(model_name, num_classes) :
  model = BeeCustomMaskModelForClassification(
    model_name=model_name,
    num_labels=num_classes,
    layer_configs=custom_layer_configs
  )
  return model

# Create Multi-classes trainer
print("Create Trainer")
trainer = MultiClassTrainer(model_name=model_name,
                            optimizer_creator=custom_optimizer_creator,
                            optimizer_params=custom_optimizer_params,
                            model_creator=custom_model_creator,
                            classes_names=classes_names,
                            )


# Train an Holdout
print("Train with early stopping ..")
ret = trainer.train(texts=texts,
              labels=labels,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True
              )

# Free CPU/GPU memory
trainer.release_model()

# Holdout

In [None]:
# Training features
num_epochs = 1 # 15
patience = 3
min_delta = 0.001
val_size = 0.8
batch_size = 2 # 8

# Model features
model_name = "camembert-base" # "almanach/camembertv2-base"
num_classes = len(classes)

# Create optimizer :AdamW
custom_optimizer_params = {
    'lr': 1e-5,
}

def custom_optimizer_creator(model, **params):
    return AdamW(model.parameters(), lr=params.get('lr'))

# Create custom model : Normalized Layers
custom_layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_classes, "activation": None},
    ]

def custom_model_creator(model_name, num_classes) :
  model = BVCustomMaskModelForClassification(
    model_name=model_name,
    num_labels=num_classes,
    layer_configs=custom_layer_configs
  )
  return model

# Create Multi-classes trainer
print("Create Trainer")
trainer = MultiClassTrainer(model_name=model_name,
                            optimizer_creator=custom_optimizer_creator,
                            optimizer_params=custom_optimizer_params,
                            model_creator=custom_model_creator,
                            classes_names=classes_names,
                            )


# Train an Holdout
print("Train with early stopping ..")
ret = trainer.holdout(texts=texts,
              labels=labels,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True
              )

# Free CPU/GPU memory
trainer.release_model()

# Cross Validation

In [None]:
# Training features
num_epochs = 1 # 15
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 2 # 8
n_splits = 3

# Model features
model_name = "camembert-base" # "almanach/camembertv2-base"
num_classes = len(classes)

# Create optimizer :AdamW
custom_optimizer_params = {
    'lr': 1e-5,
}

def custom_optimizer_creator(model, **params):
    return AdamW(model.parameters(), lr=params.get('lr'))

# Create custom model : Normalized Layers
custom_layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_classes, "activation": None},
    ]

def custom_model_creator(model_name, num_classes) :
  model = CustomModelParams(
    model_name=model_name,
    num_labels=num_classes,
    layer_configs=custom_layer_configs
  )
  return model

# Create Multi-classes trainer
print("Create Trainer")
trainer = MultiClassTrainer(model_name=model_name,
                            optimizer_creator=custom_optimizer_creator,
                            optimizer_params=custom_optimizer_params,
                            model_creator=custom_model_creator,
                            classes_names=classes_names,
                            )


# Train an Holdout
print("Train with early stopping ..")
rets = trainer.cross_validation(texts=texts,
              labels=labels,
              n_splits=n_splits,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True
              )


# Free CPU/GPU memory
trainer.release_model()

# End of game

### ESRS multi-classes

In [None]:
# Training features
num_epochs = 15
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

# Model features
model_name = "camembert-base"
num_classes = len(classes_ESRS)
classes_names = classes_names_ESRS
labels = labels_ESRS

# Create optimizer :AdamW
custom_optimizer_params = {
    'lr': 1e-5,
}

def custom_optimizer_creator(model, **params):
    return AdamW(model.parameters(), lr=params.get('lr'))


# Create custom model : Normalized Layers
custom_layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_classes, "activation": None},
    ]

def custom_model_creator(model_name, num_labels) :
  model = CustomModelParams(
    model_name=model_name,
    num_labels=num_labels,
    layer_configs=custom_layer_configs
  )
  return model

# Create Multi-classes trainer
print("Create Trainer")
trainer = MultiClassTrainer(model_name=model_name,
                            optimizer_creator=custom_optimizer_creator,
                            optimizer_params=custom_optimizer_params,
                            model_creator=custom_model_creator,
                            classes_names=classes_names,
                            )


# Train an Holdout
print("Train with early stopping ..")
ret = trainer.holdout(texts=texts,
              labels=labels,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True
              )

# Free GPU memory
print("Release model ..  ")
trainer.model = None
gc.collect()
if device.startswith("cuda"):
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

print("\n")



# Cross validation

### ESG multi-classes

In [None]:
# Training features
num_epochs = 15
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8
n_splits = 3


# Model features
model_name = "camembert-base"
num_classes = len(classes_ESG)
classes_names = classes_names_ESG
labels = labels_ESG

# Create optimizer :AdamW
custom_optimizer_params = {
    'lr': 1e-5,
}

def custom_optimizer_creator(model, **params):
    return AdamW(model.parameters(), lr=params.get('lr'))


# Create custom model : Normalized Layers
custom_layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_classes, "activation": None},
    ]

def custom_model_creator(model_name, num_labels) :
  model = CustomModelParams(
    model_name=model_name,
    num_labels=num_labels,
    layer_configs=custom_layer_configs
  )
  return model

# Create Multi-classes trainer
print("Create Trainer")
trainer = MultiClassTrainer(model_name=model_name,
                            optimizer_creator=custom_optimizer_creator,
                            optimizer_params=custom_optimizer_params,
                            model_creator=custom_model_creator,
                            classes_names=classes_names,
                            )


# Train an Holdout
print("Train with early stopping ..")
rets = trainer.cross_validation(texts=texts,
              labels=labels,
              n_splits=n_splits,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True
              )


# Free GPU memory
print("Release model ..  ")
trainer.model = None
gc.collect()
if device.startswith("cuda"):
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

print("\n")



### ESRS multi-classes

In [None]:
# Training features
num_epochs = 15
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8
n_splits = 3


# Model features
model_name = "camembert-base"
num_classes = len(classes_ESRS)
classes_names = classes_names_ESRS
labels = labels_ESRS

# Create optimizer :AdamW
custom_optimizer_params = {
    'lr': 1e-5,
}

def custom_optimizer_creator(model, **params):
    return AdamW(model.parameters(), lr=params.get('lr'))


# Create custom model : Normalized Layers
custom_layer_configs = [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_classes, "activation": None},
    ]

def custom_model_creator(model_name, num_labels) :
  model = CustomModelParams(
    model_name=model_name,
    num_labels=num_labels,
    layer_configs=custom_layer_configs
  )
  return model

# Create Multi-classes trainer
print("Create Trainer")
trainer = MultiClassTrainer(model_name=model_name,
                            optimizer_creator=custom_optimizer_creator,
                            optimizer_params=custom_optimizer_params,
                            model_creator=custom_model_creator,
                            classes_names=classes_names,
                            )


# Train an Holdout
print("Train with early stopping ..")
rets = trainer.cross_validation(texts=texts,
              labels=labels,
              n_splits=n_splits,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True
              )


# Free GPU memory
print("Release model ..  ")
trainer.model = None
gc.collect()
if device.startswith("cuda"):
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

print("\n")



# All the tests

## Train

#### multi-label

In [None]:
np.unique(labels_ESG)

In [None]:
len(texts), len(labels_ESG), len(labels_ESRS), len(classes_ESG), len(labels_ESRS), len(classes_ESRS)

In [None]:
num_epochs = 1
batch_size = 8
train_size = 0.7

trainer = MultiClassTrainer(model_name="camembert-base",
                            classes_names=classes_name_ESG,
                            multilabel=True
                            )

ret = trainer.train(texts=texts,
                    labels=labels_ESG,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size
                    )

#### multi-class

In [None]:
num_epochs = 1
batch_size = 8
train_size = 0.7

trainer = MultiClassTrainer(model_name="camembert-base",
                            num_classes=len(np.unique(labels_ESG)),
                            classes_names=labels_ESG,
                            )

ret = trainer.train(texts=texts,
                    labels=labels_ESG,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size
                    )


In [None]:
num_epochs = 5
batch_size = 8
train_size = 0.7

trainer = MultiClassTrainer(model_name="camembert-base",
                            num_classes=len(np.unique(labels)))

ret = trainer.train(texts=texts,
                    labels=labels,
                    train_size=train_size,
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    balanced=True
                    )

## Save

In [None]:
trainer.save(path="/content/custom_model") #"/content/drive/MyDrive/Colab_Datasets/elegana/synthetic")

## Load

In [None]:
trainer = MultiClassTrainer(model_name="camembert-base",
                            num_classes=len(np.unique(labels))
                            )
trainer.load("/content/custom_model")

## Predict

In [None]:
# Tester la durée et la paralélisation
reviews=test_texts[:300]  # 34 sec
preds = trainer.predict(reviews)
print(preds[10:40])
print(test_labels[10:40])

## Metrics

In [None]:
y_true = test_labels[:300] #[10:40]
y_pred = preds[:300] #[10:40]
target_names = le.inverse_transform(np.unique(test_labels))

report = classification_report(y_true, y_pred, output_dict=True)
print(report)

report = classification_report(y_true, y_pred)
print(report)


In [None]:
confusion_matrix(y_true, y_pred)

In [None]:
#tn, fp, fn, tp =
confusion_matrix(y_true, y_pred).ravel()

In [None]:
from sklearn.metrics import matthews_corrcoef
matthews_corrcoef(y_true, y_pred)

In [None]:
from sklearn.metrics import f1_score
print("f1 macro", f1_score(y_true, y_pred, average='macro'))
print("f1 micro", f1_score(y_true, y_pred, average='micro'))
print("f1 weighted", f1_score(y_true, y_pred, average='weighted'))
print("f1 per class", f1_score(y_true, y_pred, average=None))

In [None]:
from sklearn.metrics import precision_recall_fscore_support
prfs = precision_recall_fscore_support(y_true, y_pred, average=None)
print("precision", prfs[0])
print("recall", prfs[1])
print("f1", prfs[2])

In [None]:
def calculate_metrics(y_true, y_pred):
  res = {}

  prfs = precision_recall_fscore_support(y_true, y_pred, average=None)
  res["class_precision"] = prfs[0]
  res["class_recall"] = prfs[1]
  res["class_f1"] = prfs[2]
  res["class_support"] = prfs[3]

  res["f1_macro"] = f1_score(y_true, y_pred, average='macro')
  res["f1_micro"] = f1_score(y_true, y_pred, average='micro')
  res["f1_weighted"] = f1_score(y_true, y_pred, average='weighted')

  res["accuracy"] = accuracy_score(y_true, y_pred)
  res["mcc"] = matthews_corrcoef(y_true, y_pred)

  res["confusion_matrix"] = confusion_matrix(y_true, y_pred) #.ravel()

  return res

calculate_metrics(y_true, y_pred)


## Holdout ESG

In [None]:
len(np.unique(labels_ESG))

### without classes balancing

In [None]:
num_epochs = 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

trainer = MultiClassTrainer(model_name="camembert-base",
                            num_classes=len(np.unique(labels_ESG)),
                            classes_names=classes_names_ESG,
                            )

ret = trainer.holdout(texts=texts,
              labels=labels_ESG,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta)


### with classes balancing

In [None]:
num_epochs = 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

trainer = MultiClassTrainer(model_name="camembert-base",
                            num_classes=len(np.unique(labels_ESG)),
                            classes_names=classes_names_ESG,
                            )

ret = trainer.holdout(texts=texts,
              labels=labels_ESG,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True)


### with AdamW

In [None]:
num_epochs = 15 # 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8


custom_optimizer_params = {
    'lr': 1e-5,
}

def custom_optimizer_creator(model, **params):
    return AdamW(model.parameters(), lr=params.get('lr'))


trainer = MultiClassTrainer(model_name="camembert-base",
                                   classes_names=classes_names_ESG,
                                   optimizer_creator=custom_optimizer_creator,
                                   optimizer_params=custom_optimizer_params,
                                   #scheduler_creator=custom_scheduler_creator,
                                   #scheduler_params=custom_scheduler_params
                                   )


ret = trainer.holdout(texts=texts,
              labels=labels_ESG,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True)


### with AdamW and StepLR

In [None]:
num_epochs = 1 #15 # 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8


custom_optimizer_params = {
    'lr': 1e-5,
}

def custom_optimizer_creator(model, **params):
    return AdamW(model.parameters(), lr=params.get('lr'))


custom_scheduler_params = {
    'step_size': 2,
    'gamma': 0.1,
}

def custom_scheduler_creator(optimizer, **params):
    return StepLR(optimizer,
                  step_size=params.get('step_size'),
                  gamma=params.get('gamma'),
                  )

trainer = MultiClassTrainer(model_name="camembert-base",
                                   classes_names=classes_names_ESG,
                                   optimizer_creator=custom_optimizer_creator,
                                   optimizer_params=custom_optimizer_params,
                                   scheduler_creator=custom_scheduler_creator,
                                   scheduler_params=custom_scheduler_params,
                                   scheduler_needs_loss = False,
                                   )


ret = trainer.holdout(texts=texts,
              labels=labels_ESG,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True)


In [None]:
ret

In [None]:
num_epochs = 15 # 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8


custom_optimizer_params = {
    'lr': 1e-5,
}

def custom_optimizer_creator(model, **params):
    return AdamW(model.parameters(), lr=params.get('lr'))


custom_scheduler_params = {
    'step_size': 10,
    'gamma': 0.1,
}

def custom_scheduler_creator(optimizer, **params):
    return StepLR(optimizer,
                  step_size=params.get('step_size'),
                  gamma=params.get('gamma'),
                  )

trainer = MultiClassTrainer(model_name="camembert-base",
                                   classes_names=classes_names_ESG,
                                   optimizer_creator=custom_optimizer_creator,
                                   optimizer_params=custom_optimizer_params,
                                   scheduler_creator=custom_scheduler_creator,
                                   scheduler_params=custom_scheduler_params,
                                   scheduler_needs_loss = False,
                                   )


ret = trainer.holdout(texts=texts,
              labels=labels_ESG,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True)


### with custom model

#### Class

In [None]:
# OK
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel

# DropConnect class implementation
class DropConnect(nn.Module):
    def __init__(self, layer, dropconnect_rate):
        super(DropConnect, self).__init__()
        self.layer = layer
        self.dropconnect_rate = dropconnect_rate

    def forward(self, x):
        if self.training:  # Only apply DropConnect during training
            # Ensure that the layer weight dimensions match
            mask = torch.rand_like(self.layer.weight) > self.dropconnect_rate
            weight = self.layer.weight * mask
            return F.linear(x, weight, self.layer.bias)
        else:
            return self.layer(x)  # No DropConnect during inference

# The CustomModelBis class with flexible layer config
class CustomModelBis(nn.Module):
    def __init__(self, model_name, num_labels, layer_configs):
        super(CustomModelBis, self).__init__()
        self.model_name = model_name
        self.num_labels = num_labels
        self.base_model = AutoModel.from_pretrained(self.model_name)

        # Build custom linear stack
        self.linear_relu_stack = self._build_custom_stack(layer_configs)

    def _build_custom_stack(self, layer_configs):
        layers = []
        previous_size = layer_configs[0]['input_size']  # This should be 768 for camembert base

        for config in layer_configs:
            output_size = config['output_size']
            activation = config.get('activation')
            dropout_rate = config.get('dropout_rate')
            dropconnect_rate = config.get('dropconnect_rate')
            batch_norm = config.get('batch_norm', False)
            layer_norm = config.get('layer_norm', False)
            residual = config.get('residual', False)

            # Add linear layer
            linear_layer = nn.Linear(previous_size, output_size)
            layers.append(linear_layer)

            # Apply batch normalization
            if batch_norm:
                layers.append(nn.BatchNorm1d(output_size))

            # Apply layer normalization
            if layer_norm:
                layers.append(nn.LayerNorm(output_size))

            # Apply activation (instantiate the activation function if provided)
            if activation:
                layers.append(activation())  # Instantiate activation function

            # Apply dropout
            if dropout_rate:
                layers.append(nn.Dropout(dropout_rate))

            # Apply DropConnect
            if dropconnect_rate:
                layers.append(DropConnect(linear_layer, dropconnect_rate))

            # Handle residual connections (to be implemented)
            if residual:
                pass

            previous_size = output_size  # Update input size for the next layer

        return nn.Sequential(*layers)

    def forward(self, input_ids, attention_mask, labels=None):
        # Extract features from base model
        outputs = self.base_model(input_ids, attention_mask=attention_mask)
        embeddings = outputs[0][:, 0]  # CLS token representation (shape: [batch_size, 768])

        # Pass embeddings through custom stack
        logits = self.linear_relu_stack(embeddings)

        return transformers.modeling_outputs.SequenceClassifierOutput(logits=logits)

#### Patterns

##### Pattern short list 1

In [None]:
num_labels = 4

network_patterns_names = [
    "Pattern 1: Basic Deep Feedforward Network",
    "Pattern 3: Batch-Normalized Network",
    "Pattern 4: Residual Connections",
    "Pattern 5: Wide Hidden Layers",
    "Pattern 6: Layer Normalization",
    "Pattern 7: Dropout Regularization",
    "Pattern 8: Minimalistic Network",
    "Pattern 9: Deep Residual Network"
]


network_patterns = [
    # Pattern 1: Basic Deep Feedforward Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.01},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.01},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 3: Batch-Normalized Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 4: Residual Connections
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 5: Wide Hidden Layers
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.GELU, "dropout_rate": 0.3},
        {"input_size": 1024, "output_size": 512, "activation": nn.GELU, "dropout_rate": 0.3},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 6: Layer Normalization
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 7: Dropout Regularization
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 8: Minimalistic Network
    [
        {"input_size": 768, "output_size": 128, "activation": nn.Tanh},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 9: Deep Residual Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "residual": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 10: Intermediate DropConnect
    #[
    #    {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropconnect_rate": 0.2},
    #    {"input_size": 512, "output_size": 128, "activation": nn.ReLU},
    #    {"input_size": 128, "output_size": num_labels, "activation": None},
    #],
]

##### Pattern long list 2

In [None]:
num_labels = 4

network_patterns_names = [
    "Pattern 1: Basic Feedforward Network",
    "Pattern 2: Shallow Network with Dropout",
    "Pattern 3: Batch-Normalized Network",
    "Pattern 4: Wide Hidden Layers",
    "Pattern 5: Layer Normalization",
    "Pattern 6: Minimalistic Network",
    "Pattern 7: Deep Feedforward Network",
    "Pattern 8: Dropout Regularization",
    "Pattern 9: Deep Residual Network",
    "Pattern 10: Compact Network",
    "Pattern 11: Fully Connected Bottleneck Network",
    "Pattern 12: Dense Network with No Activation in Final Layer",
    "Pattern 13: Layer Normalization with High Dropout",
    "Pattern 14: Gated Activation with Dropout",
    "Pattern 15: Progressive Layer Expansion",
    "Pattern 16: Deep Wide Network",
    "Pattern 17: Dense Residual Network with Skip Connections",
    "Pattern 18: Alternating Normalization",
    "Pattern 19: Advanced Progressive Shrinkage",
    "Pattern 20: Feature Extractor with Sparse Hidden Units",
    "Pattern 21: Multi-Layer Sparse Network",
    "Pattern 22: Alternating Dropout Intensities",
    "Pattern 23: High-Dimensional Bottleneck with Residuals",
    "Pattern 24: Fully Dense Network with Gradient Clipping",
    "Pattern 25: Deeply Layer-Normalized Network",
    "Pattern 26: Modular Feedforward Blocks",
    "Pattern 27: High-Frequency Regularization Network",
    "Pattern 28: Alternating Activations Network",
    "Pattern 29: Sequential Dropout and Bottleneck",
    "Pattern 30: Multi-Head Modular Network",
    "Pattern 31: Sparse Progressive Expansion",
    "Pattern 32: Residual Shrinking Layers",
    "Pattern 33: Dual Activation Fusion",
    "Pattern 34: Deeply Narrow Network",
    "Pattern 35: Cyclic Dropout Network",
    "Pattern 36: Wide Bottleneck Layers",
    "Pattern 37: Alternating Sparse Connectivity",
    "Pattern 38: Progressive Layer Normalization",
    "Pattern 39: Activation Modulated Network",
    "Pattern 40: Double Residual Connections",
    "Pattern 41: Dense Alternating Widths",
    "Pattern 42: Multi-Normalization Layers",
    "Pattern 43: Sparse Expansion with Skip",
    "Pattern 44: Mixed Activations Network",
    "Pattern 45: Dense Shallow Layers",
    "Pattern 46: Split-and-Merge Architecture",
    "Pattern 47: Gradient Clipping Network",
    "Pattern 48: High-Dropout Bottleneck",
    "Pattern 49: Alternating Nonlinearities",
    "Pattern 50: Wide-to-Narrow Progressive Network",
]

network_patterns = [

    # Pattern 1: Basic Feedforward Network
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 2: Shallow Network with Dropout
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.2},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 3: Batch-Normalized Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 4: Wide Hidden Layers
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.GELU},
        {"input_size": 1024, "output_size": 512, "activation": nn.GELU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 5: Layer Normalization
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 6: Minimalistic Network
    [
        {"input_size": 768, "output_size": 128, "activation": nn.Tanh},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 7: Deep Feedforward Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 8: Dropout Regularization
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 9: Deep Residual Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "residual": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 10: Compact Network
    [
        {"input_size": 768, "output_size": 64, "activation": nn.SiLU},
        {"input_size": 64, "output_size": num_labels, "activation": None},
    ],

    # Pattern 11: Fully Connected Bottleneck Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 12: Dense Network with No Activation in Final Layer
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 13: Layer Normalization with High Dropout
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU, "layer_norm": True, "dropout_rate": 0.5},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "layer_norm": True, "dropout_rate": 0.5},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 14: Gated Activation with Dropout
    [
        {"input_size": 768, "output_size": 512, "activation": nn.GELU, "dropout_rate": 0.2},
        {"input_size": 512, "output_size": 256, "activation": nn.SiLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 15: Progressive Layer Expansion
    [
        {"input_size": 768, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 16: Deep Wide Network
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 17: Dense Residual Network with Skip Connections
    [
        {"input_size": 768, "output_size": 768, "activation": nn.ReLU, "residual": True},
        {"input_size": 768, "output_size": 768, "activation": nn.ReLU, "residual": True},
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 18: Alternating Normalization
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 19: Advanced Progressive Shrinkage
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 20: Feature Extractor with Sparse Hidden Units
    [
        {"input_size": 768, "output_size": 64, "activation": nn.ReLU},
        {"input_size": 64, "output_size": 32, "activation": nn.ReLU},
        {"input_size": 32, "output_size": num_labels, "activation": None},
    ],

    # Pattern 21: Multi-Layer Sparse Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 64, "activation": nn.ReLU},
        {"input_size": 64, "output_size": num_labels, "activation": None},
    ],

    # Pattern 22: Alternating Dropout Intensities
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.2},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.5},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.3},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 23: High-Dimensional Bottleneck with Residuals
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.GELU, "residual": True},
        {"input_size": 1024, "output_size": 256, "activation": nn.GELU, "residual": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 24: Fully Dense Network with Gradient Clipping
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": num_labels, "activation": None},
    ],

    # Pattern 25: Deeply Layer-Normalized Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 26: Modular Feedforward Blocks
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 27: High-Frequency Regularization Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.1},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.1},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.1},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 28: Alternating Activations Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.SiLU},
        {"input_size": 256, "output_size": 128, "activation": nn.GELU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 29: Sequential Dropout and Bottleneck
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 128, "output_size": 64, "activation": nn.ReLU},
        {"input_size": 64, "output_size": num_labels, "activation": None},
    ],

    # Pattern 30: Multi-Head Modular Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 31: Sparse Progressive Expansion
    [
        {"input_size": 768, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 32: Residual Shrinking Layers
    [
        {"input_size": 768, "output_size": 512, "activation": nn.GELU, "residual": True},
        {"input_size": 512, "output_size": 256, "activation": nn.GELU, "residual": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 33: Dual Activation Fusion
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.SiLU},
        {"input_size": 256, "output_size": 128, "activation": nn.GELU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 34: Deeply Narrow Network
    [
        {"input_size": 768, "output_size": 128, "activation": nn.Tanh},
        {"input_size": 128, "output_size": 64, "activation": nn.Tanh},
        {"input_size": 64, "output_size": 32, "activation": nn.Tanh},
        {"input_size": 32, "output_size": num_labels, "activation": None},
    ],

    # Pattern 35: Cyclic Dropout Network
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.2},
        {"input_size": 256, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 36: Wide Bottleneck Layers
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": num_labels, "activation": None},
    ],

    # Pattern 37: Alternating Sparse Connectivity
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 38: Progressive Layer Normalization
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 39: Activation Modulated Network
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.GELU},
        {"input_size": 128, "output_size": 64, "activation": nn.Tanh},
        {"input_size": 64, "output_size": num_labels, "activation": None},
    ],

    # Pattern 40: Double Residual Connections
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "residual": True},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 41: Dense Alternating Widths
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 42: Multi-Normalization Layers
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 43: Sparse Expansion with Skip
    [
        {"input_size": 768, "output_size": 64, "activation": nn.ReLU},
        {"input_size": 64, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 64, "activation": nn.ReLU},
        {"input_size": 64, "output_size": num_labels, "activation": None},
    ],

    # Pattern 44: Mixed Activations Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.SiLU},
        {"input_size": 512, "output_size": 128, "activation": nn.GELU},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 45: Dense Shallow Layers
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 46: Split-and-Merge Architecture
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 47: Gradient Clipping Network
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": num_labels, "activation": None},
    ],

    # Pattern 48: High-Dropout Bottleneck
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.5},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.5},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 49: Alternating Nonlinearities
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": 128, "activation": nn.Tanh},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 50: Wide-to-Narrow Progressive Network
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.ReLU},
        {"input_size": 1024, "output_size": 512, "activation": nn.ReLU},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

]


#### 1 epoch without balancing

In [None]:
num_epochs = 15
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

num_labels = 4
model_name = "camembert-base"

for i, custom_layer_configs in enumerate(network_patterns):

  print("\n")
  print("######### ", network_patterns_names[i])
  print(custom_layer_configs)

  def custom_model_creator(model_name, num_labels) :

    model = CustomModelBis(
      model_name=model_name,
      num_labels=num_labels,
      layer_configs=custom_layer_configs
    )

    return model

  print("Create Trainer")
  trainer = MultiClassTrainer(model_name="camembert-base",
                              model_creator=custom_model_creator,
                              classes_names=classes_names_ESG,
                              )
  print("Train for 1 epoch ..")
  ret = trainer.holdout(texts=texts,
                labels=labels_ESG,
                val_size=val_size,
                num_epochs=num_epochs,
                batch_size=batch_size,
                patience=patience,
                min_delta=min_delta)

  print("Release model  ")
  trainer.model = None
  gc.collect()
  if device.startswith("cuda"):
      torch.cuda.empty_cache()
      torch.cuda.synchronize()

  print("\n")

#### 15 epoch without balancing

In [None]:
num_epochs = 15
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

num_labels = 4
model_name = "camembert-base"

for i, custom_layer_configs in enumerate(network_patterns):

  print("\n")
  print("######### ", network_patterns_names[i])
  print(custom_layer_configs)

  def custom_model_creator(model_name, num_labels) :

    model = CustomModelBis(
      model_name=model_name,
      num_labels=num_labels,
      layer_configs=custom_layer_configs
    )

    return model

  print("Create Trainer")
  trainer = MultiClassTrainer(model_name="camembert-base",
                              model_creator=custom_model_creator,
                              classes_names=classes_names_ESG,
                              )
  print("Train for 1 epoch ..")
  ret = trainer.holdout(texts=texts,
                labels=labels_ESG,
                val_size=val_size,
                num_epochs=num_epochs,
                batch_size=batch_size,
                patience=patience,
                min_delta=min_delta)

  print("Release model  ")
  trainer.model = None
  gc.collect()
  if device.startswith("cuda"):
      torch.cuda.empty_cache()
      torch.cuda.synchronize()

  print("\n")

#### 1 epoch test

##### multi-class

In [None]:
num_epochs = 1
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

num_labels = 4
model_name = "camembert-base"

i = 0
custom_layer_configs  = network_patterns[i]

print("\n")
print("######### ", network_patterns_names[i])
print(custom_layer_configs)

def custom_model_creator(model_name, num_labels) :

  model = CustomModelBis(
    model_name=model_name,
    num_labels=num_labels,
    layer_configs=custom_layer_configs
  )

  return model

print("Create Trainer")
trainer = MultiClassTrainer(model_name="camembert-base",
                            model_creator=custom_model_creator,
                            classes_names=classes_names_ESG,
                            )
print("Train for 1 epoch ..")
ret = trainer.holdout(texts=texts,
              labels=labels_ESG,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True
              )
print("\n")

##### multi-labels

In [None]:
num_epochs = 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

num_labels = 4
model_name = "camembert-base"

i = 0
custom_layer_configs  = network_patterns[i]

print("\n")
print("######### ", network_patterns_names[i])
print(custom_layer_configs)

def custom_model_creator(model_name, num_labels) :

  model = CustomModelBis(
    model_name=model_name,
    num_labels=num_labels,
    layer_configs=custom_layer_configs
  )

  return model

print("Create Trainer")
trainer = MultiClassTrainer(model_name="camembert-base",
                            model_creator=custom_model_creator,
                            classes_names=classes_names_ESG,
                            multilabel=True
                            )

print("Train for 1 epoch ..")
ret = trainer.holdout(texts=texts,
              labels=labels_ESG,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=False # !!!! manage balanced True
              )
print("\n")



In [None]:
num_epochs = 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

num_labels = 4
model_name = "camembert-base"

i = 0
custom_layer_configs  = network_patterns[i]

print("\n")
print("######### ", network_patterns_names[i])
print(custom_layer_configs)

def custom_model_creator(model_name, num_labels) :

  model = CustomModelBis(
    model_name=model_name,
    num_labels=num_labels,
    layer_configs=custom_layer_configs
  )

  return model

print("Create Trainer")
trainer = MultiClassTrainer(model_name="camembert-base",
                            model_creator=custom_model_creator,
                            classes_names=classes_names_ESG,
                            multilabel=True
                            )

print("Train for 1 epoch ..")
ret = trainer.holdout(texts=texts,
              labels=labels_ESG,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True # !!!! manage balanced True
              )
print("\n")

#### 1 epoch with balancing

In [None]:
num_epochs = 1
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

num_labels = 4
model_name = "camembert-base"

for i, custom_layer_configs in enumerate(network_patterns):

  print("\n")
  print("######### ", network_patterns_names[i])
  print(custom_layer_configs)

  def custom_model_creator(model_name, num_labels) :

    model = CustomModelBis(
      model_name=model_name,
      num_labels=num_labels,
      layer_configs=custom_layer_configs
    )

    return model

  print("Create Trainer")
  trainer = MultiClassTrainer(model_name="camembert-base",
                              model_creator=custom_model_creator,
                              classes_names=classes_names_ESG,
                              )
  print("Train for 1 epoch ..")
  ret = trainer.holdout(texts=texts,
                labels=labels_ESG,
                val_size=val_size,
                num_epochs=num_epochs,
                batch_size=batch_size,
                patience=patience,
                min_delta=min_delta,
                balanced=True
                )
  print("\n")

#### 15 epoch with balancing

MCC > 0.84 : 1 7 8 12 15 18 (best) 40 42 49

In [None]:
num_epochs = 15
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

num_labels = 4
model_name = "camembert-base"

for i, custom_layer_configs in enumerate(network_patterns):

  if i <= 18: continue

  print("\n")
  print("######### ", network_patterns_names[i])
  print(custom_layer_configs)

  def custom_model_creator(model_name, num_labels) :

    model = CustomModelBis(
      model_name=model_name,
      num_labels=num_labels,
      layer_configs=custom_layer_configs
    )

    return model

  print("Create Trainer")
  trainer = MultiClassTrainer(model_name="camembert-base",
                              model_creator=custom_model_creator,
                              classes_names=classes_names_ESG,
                              )
  print("Train for 1 epoch ..")
  ret = trainer.holdout(texts=texts,
                labels=labels_ESG,
                val_size=val_size,
                num_epochs=num_epochs,
                batch_size=batch_size,
                patience=patience,
                min_delta=min_delta,
                balanced=True
                )

  print("Release model  ")
  trainer.model = None
  gc.collect()
  if device.startswith("cuda"):
      torch.cuda.empty_cache()
      torch.cuda.synchronize()

  print("\n")

#### Plein de tests

In [None]:
# OK
import torch
import torch.nn as nn
from transformers import AutoModel

# DropConnect class implementation
class DropConnect(nn.Module):
    def __init__(self, layer, dropconnect_rate):
        super(DropConnect, self).__init__()
        self.layer = layer
        self.dropconnect_rate = dropconnect_rate

    def forward(self, x):
        if self.training:  # Only apply DropConnect during training
            # Ensure that the layer weight dimensions match
            mask = torch.rand_like(self.layer.weight) > self.dropconnect_rate
            weight = self.layer.weight * mask
            return F.linear(x, weight, self.layer.bias)
        else:
            return self.layer(x)  # No DropConnect during inference

# The CustomModelBis class with flexible layer config
class CustomModelBis(nn.Module):
    def __init__(self, model_name, num_labels, layer_configs):
        super(CustomModelBis, self).__init__()
        self.model_name = model_name
        self.num_labels = num_labels
        self.base_model = AutoModel.from_pretrained(self.model_name)

        # Build custom linear stack
        self.linear_relu_stack = self._build_custom_stack(layer_configs)

    def _build_custom_stack(self, layer_configs):
        layers = []
        previous_size = layer_configs[0]['input_size']  # This should be 768 for camembert base

        for config in layer_configs:
            output_size = config['output_size']
            activation = config.get('activation')
            dropout_rate = config.get('dropout_rate')
            dropconnect_rate = config.get('dropconnect_rate')
            batch_norm = config.get('batch_norm', False)
            layer_norm = config.get('layer_norm', False)
            residual = config.get('residual', False)

            # Add linear layer
            linear_layer = nn.Linear(previous_size, output_size)
            layers.append(linear_layer)

            # Apply batch normalization
            if batch_norm:
                layers.append(nn.BatchNorm1d(output_size))

            # Apply layer normalization
            if layer_norm:
                layers.append(nn.LayerNorm(output_size))

            # Apply activation (instantiate the activation function if provided)
            if activation:
                layers.append(activation())  # Instantiate activation function

            # Apply dropout
            if dropout_rate:
                layers.append(nn.Dropout(dropout_rate))

            # Apply DropConnect
            if dropconnect_rate:
                layers.append(DropConnect(linear_layer, dropconnect_rate))

            # Handle residual connections (to be implemented)
            if residual:
                pass

            previous_size = output_size  # Update input size for the next layer

        return nn.Sequential(*layers)

    def forward(self, input_ids, attention_mask, labels=None):
        # Extract features from base model
        outputs = self.base_model(input_ids, attention_mask=attention_mask)
        embeddings = outputs[0][:, 0]  # CLS token representation (shape: [batch_size, 768])

        # Pass embeddings through custom stack
        logits = self.linear_relu_stack(embeddings)

        return transformers.modeling_outputs.SequenceClassifierOutput(logits=logits)

In [None]:
num_labels = 4

network_patterns = [
    # Pattern 1: Basic Deep Feedforward Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.3},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.3},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 3: Batch-Normalized Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 4: Residual Connections
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 5: Wide Hidden Layers
    [
        {"input_size": 768, "output_size": 1024, "activation": nn.GELU, "dropout_rate": 0.3},
        {"input_size": 1024, "output_size": 512, "activation": nn.GELU, "dropout_rate": 0.3},
        {"input_size": 512, "output_size": num_labels, "activation": None},
    ],

    # Pattern 6: Layer Normalization
    [
        {"input_size": 768, "output_size": 256, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 256, "output_size": 128, "activation": nn.ReLU, "layer_norm": True},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 7: Dropout Regularization
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 512, "output_size": 128, "activation": nn.ReLU, "dropout_rate": 0.4},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 8: Minimalistic Network
    [
        {"input_size": 768, "output_size": 128, "activation": nn.Tanh},
        {"input_size": 128, "output_size": num_labels, "activation": None},
    ],

    # Pattern 9: Deep Residual Network
    [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": 512, "activation": nn.ReLU, "residual": True},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "residual": True},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ],

    # Pattern 10: Intermediate DropConnect
    #[
    #    {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropconnect_rate": 0.2},
    #    {"input_size": 512, "output_size": 128, "activation": nn.ReLU},
    #    {"input_size": 128, "output_size": num_labels, "activation": None},
    #],
]

In [None]:
num_labels = 4
model_name = "camembert-base"

for i, custom_layer_configs in enumerate(network_patterns):

  print("Pattern : ", i)
  print(custom_layer_configs)

  model = CustomModelBis(
    model_name=model_name,
    num_labels=num_labels,
    layer_configs=custom_layer_configs
  )


In [None]:
num_epochs = 1
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

num_labels = 4
model_name = "camembert-base"

for i, custom_layer_configs in enumerate(network_patterns):

  print("Pattern : ", i)
  print(custom_layer_configs)

  def custom_model_creator(model_name, num_labels) :

    model = CustomModelBis(
      model_name=model_name,
      num_labels=num_labels,
      layer_configs=custom_layer_configs
    )

    return model

  print("Create Trainer")
  trainer = MultiClassTrainer(model_name="camembert-base",
                              model_creator=custom_model_creator,
                              classes_names=classes_names_ESG,
                              )
  print("Train for 1 epoch ..")
  ret = trainer.holdout(texts=texts,
                labels=labels_ESG,
                val_size=val_size,
                num_epochs=num_epochs,
                batch_size=batch_size,
                patience=patience,
                min_delta=min_delta)
  print("\n")

In [None]:
custom_layer_configs =     [
        {"input_size": 768, "output_size": 512, "activation": nn.ReLU, "dropout_rate": 0.3},
        {"input_size": 512, "output_size": 256, "activation": nn.ReLU, "dropout_rate": 0.3},
        {"input_size": 256, "output_size": num_labels, "activation": None},
    ]

custom_layer_configs = [
    {"input_size": 768, "output_size": 256, "dropout_rate": 0.2, "activation": nn.ReLU, "batch_norm": True},
    {"input_size": 256, "output_size": 128, "dropout_rate": 0.3, "activation": nn.ReLU, "batch_norm": True},
    {"input_size": 128, "output_size": num_labels, "activation": None}  # Final layer with no activation
]

model = CustomModelBis(
    model_name=model_name,
    num_labels=num_labels,
    layer_configs=custom_layer_configs
  )

trainer = MultiClassTrainer(model_name="camembert-base",
                              model_creator=model,
                              classes_names=classes_names_ESG,
                              )



In [None]:
num_epochs = 1
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

n_splits = 3

rets = trainer.cross_validation(texts=texts,
              labels=labels_ESG,
              n_splits=n_splits,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta)

In [None]:
num_epochs = 1
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

ret = trainer.holdout(texts=texts,
                labels=labels_ESG,
                val_size=val_size,
                num_epochs=num_epochs,
                batch_size=batch_size,
                patience=patience,
                min_delta=min_delta)

In [None]:
class DropConnect(nn.Module):
    def __init__(self, layer, dropconnect_rate):
        super(DropConnect, self).__init__()
        self.layer = layer
        self.dropconnect_rate = dropconnect_rate

    def forward(self, x):
        print(f"Input shape to DropConnect: {x.shape}")
        if self.training:
            weight = F.dropout(self.layer.weight, p=self.dropconnect_rate, training=True)
            print(f"Weight shape after DropConnect: {weight.shape}")
            return F.linear(x, weight, self.layer.bias)
        else:
            return self.layer(x)



In [None]:
num_epochs = 1
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

num_labels = 4
model_name = "camembert-base"


def custom_model_creator(model_name, num_labels) :

  custom_layer_configs = [
    {"input_size": 768, "output_size": 256, "activation": nn.Tanh, "dropconnect_rate": 0.2},
    {"input_size": 256, "output_size": num_labels, "activation": None},
  ]

  model = CustomModelBis(
    model_name=model_name,
    num_labels=num_labels,
    layer_configs=custom_layer_configs
  )

  return model

print("Create Trainer")
trainer = MultiClassTrainer(model_name="camembert-base",
                            model_creator=custom_model_creator,
                            classes_names=classes_names_ESG,
                            )
print("Train for 1 epoch ..")
ret = trainer.holdout(texts=texts,
              labels=labels_ESG,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta)
print("\n")

In [None]:
num_epochs = 1
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

num_labels = 4
model_name = "camembert-base"

for i, custom_layer_configs in enumerate(network_patterns):

  print("Pattern : ", i)
  print(custom_layer_configs)

  def custom_model_creator(model_name, num_labels) :

    model = CustomModelBis(
      model_name=model_name,
      num_labels=num_labels,
      layer_configs=custom_layer_configs
    )

    return model

  print("Create Trainer")
  trainer = MultiClassTrainer(model_name="camembert-base",
                              model_creator=custom_model_creator,
                              classes_names=classes_names_ESG,
                              )
  print("Train for 1 epoch ..")
  ret = trainer.holdout(texts=texts,
                labels=labels_ESG,
                val_size=val_size,
                num_epochs=num_epochs,
                batch_size=batch_size,
                patience=patience,
                min_delta=min_delta)
  print("\n")



In [None]:
# OK
num_labels = 4
model_name = "camembert-base"

custom_layer_configs = [
    {"input_size": 768, "output_size": 256, "dropout_rate": 0.2, "activation": nn.ReLU, "batch_norm": True},
    {"input_size": 256, "output_size": 128, "dropout_rate": 0.3, "activation": nn.ReLU, "batch_norm": True},
    {"input_size": 128, "output_size": num_labels, "activation": None}  # Final layer with no activation
]

model = CustomModelBis(
    model_name=model_name,
    num_labels=num_labels,
    layer_configs=custom_layer_configs
)

In [None]:
# OK
num_epochs = 8
patience = 3
min_delta = 0.001
n_splits = 3
batch_size = 8

def custom_model_creator(model_name, num_labels):

    custom_layer_configs = [
        {"input_size": 768, "output_size": 256, "dropout_rate": 0.2, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 256, "output_size": 128, "dropout_rate": 0.3, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 128, "output_size": num_labels, "activation": None}  # Final layer with no activation
    ]

    model = CustomModelBis(
        model_name=model_name,
        num_labels=num_labels,
        layer_configs=custom_layer_configs
    )

    return model

trainer = MultiClassTrainer(model_name="camembert-base",
                            model_creator=custom_model_creator,
                            classes_names=classes_names_ESG,
                            )

rets = trainer.cross_validation(texts=texts,
              labels=labels_ESG,
              n_splits=n_splits,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta)

### Cross validation

In [None]:
num_epochs = 8
patience = 3
min_delta = 0.001
val_size = 0.333
batch_size = 8

trainer = MultiClassTrainer(model_name="camembert-base",
                            classes_names=classes_names_ESG,
                            )

ret = trainer.holdout(texts=texts,
              labels=labels_ESG,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              seed=12)

In [None]:
import torch
import torch.nn as nn


class CustomModelBis(nn.Module):
    def __init__(self, model_name, num_labels, layer_configs):
        """
        Args:
            model_name (str): Name of the pretrained model.
            num_labels (int): Number of output labels.
            layer_configs (list of dict): List of configurations for custom layers.
        """
        super(CustomModelBis, self).__init__()
        self.model_name = model_name
        self.num_labels = num_labels
        self.base_model = AutoModel.from_pretrained(self.model_name)

        # Build custom linear stack
        self.linear_relu_stack = self._build_custom_stack(layer_configs)

    def _build_custom_stack(self, layer_configs):
        layers = []
        previous_size = layer_configs[0]['input_size']

        for config in layer_configs:
            output_size = config['output_size']
            activation = config.get('activation')
            dropout_rate = config.get('dropout_rate')
            dropconnect_rate = config.get('dropconnect_rate')
            batch_norm = config.get('batch_norm', False)
            layer_norm = config.get('layer_norm', False)
            residual = config.get('residual', False)

            # Add linear layer
            linear_layer = nn.Linear(previous_size, output_size)
            layers.append(linear_layer)

            # Apply batch normalization
            if batch_norm:
                layers.append(nn.BatchNorm1d(output_size))

            # Apply layer normalization
            if layer_norm:
                layers.append(nn.LayerNorm(output_size))

            # Apply activation (instantiate the activation function if provided)
            if activation:
                layers.append(activation())  # Instantiate activation function

            # Apply dropout
            if dropout_rate:
                layers.append(nn.Dropout(dropout_rate))

            # Apply DropConnect
            if dropconnect_rate:
                layers.append(DropConnect(linear_layer, dropconnect_rate))

            # Handle residual connections (to be implemented)
            if residual:
                pass

            previous_size = output_size

        return nn.Sequential(*layers)


    def forward(self, input_ids, attention_mask, labels=None):
        # Extract features from base model
        outputs = self.base_model(input_ids, attention_mask=attention_mask)
        embeddings = outputs[0][:, 0]  # CLS token representation

        # Pass embeddings through custom stack
        logits = self.linear_relu_stack(embeddings)

        return transformers.modeling_outputs.SequenceClassifierOutput(logits=logits)


class DropConnect(nn.Module):
    def __init__(self, layer, dropconnect_rate):
        super(DropConnect, self).__init__()
        self.layer = layer
        self.dropconnect_rate = dropconnect_rate

    def forward(self, x):
        if self.training:  # Only apply DropConnect during training
            # Create a mask based on dropconnect_rate
            mask = torch.rand_like(self.layer.weight) > self.dropconnect_rate
            weight = self.layer.weight * mask
            return F.linear(x, weight, self.layer.bias)
        else:
            return self.layer(x)  # No DropConnect during inference


In [None]:
import torch
import torch.nn as nn


class CustomModelBis(nn.Module):
    def __init__(self, model_name, num_labels, layer_configs):
        super(CustomModelBis, self).__init__()
        self.model_name = model_name
        self.num_labels = num_labels
        self.base_model = AutoModel.from_pretrained(self.model_name)

        # Build custom linear stack
        self.linear_relu_stack = self._build_custom_stack(layer_configs)

    def _build_custom_stack(self, layer_configs):
        layers = []
        previous_size = layer_configs[0]['input_size']  # This should be 768 for camembert base

        for config in layer_configs:
            output_size = config['output_size']
            activation = config.get('activation')
            dropout_rate = config.get('dropout_rate')
            dropconnect_rate = config.get('dropconnect_rate')
            batch_norm = config.get('batch_norm', False)
            layer_norm = config.get('layer_norm', False)
            residual = config.get('residual', False)

            # Add linear layer
            linear_layer = nn.Linear(previous_size, output_size)
            layers.append(linear_layer)

            # Apply batch normalization
            if batch_norm:
                layers.append(nn.BatchNorm1d(output_size))

            # Apply layer normalization
            if layer_norm:
                layers.append(nn.LayerNorm(output_size))

            # Apply activation (instantiate the activation function if provided)
            if activation:
                layers.append(activation())  # Instantiate activation function

            # Apply dropout
            if dropout_rate:
                layers.append(nn.Dropout(dropout_rate))

            # Apply DropConnect
            if dropconnect_rate:
                layers.append(DropConnect(linear_layer, dropconnect_rate))

            # Handle residual connections (to be implemented)
            if residual:
                pass

            previous_size = output_size  # Update input size for the next layer

        return nn.Sequential(*layers)

    def forward(self, input_ids, attention_mask, labels=None):
        # Extract features from base model
        outputs = self.base_model(input_ids, attention_mask=attention_mask)
        embeddings = outputs[0][:, 0]  # CLS token representation (shape: [batch_size, 768])

        # Pass embeddings through custom stack
        logits = self.linear_relu_stack(embeddings)

        return transformers.modeling_outputs.SequenceClassifierOutput(logits=logits)


class DropConnect(nn.Module):
    def __init__(self, layer, dropconnect_rate):
        super(DropConnect, self).__init__()
        self.layer = layer
        self.dropconnect_rate = dropconnect_rate

    def forward(self, x):
        if self.training:  # Only apply DropConnect during training
            # Ensure that the layer weight dimensions match
            mask = torch.rand_like(self.layer.weight) > self.dropconnect_rate
            weight = self.layer.weight * mask
            return F.linear(x, weight, self.layer.bias)
        else:
            return self.layer(x)  # No DropConnect during inference


In [None]:
# OK
import torch
import torch.nn as nn
from transformers import AutoModel

# DropConnect class implementation
class DropConnect(nn.Module):
    def __init__(self, layer, dropconnect_rate):
        super(DropConnect, self).__init__()
        self.layer = layer
        self.dropconnect_rate = dropconnect_rate

    def forward(self, x):
        if self.training:  # Only apply DropConnect during training
            # Ensure that the layer weight dimensions match
            mask = torch.rand_like(self.layer.weight) > self.dropconnect_rate
            weight = self.layer.weight * mask
            return F.linear(x, weight, self.layer.bias)
        else:
            return self.layer(x)  # No DropConnect during inference

# The CustomModelBis class with flexible layer config
class CustomModelBis(nn.Module):
    def __init__(self, model_name, num_labels, layer_configs):
        super(CustomModelBis, self).__init__()
        self.model_name = model_name
        self.num_labels = num_labels
        self.base_model = AutoModel.from_pretrained(self.model_name)

        # Build custom linear stack
        self.linear_relu_stack = self._build_custom_stack(layer_configs)

    def _build_custom_stack(self, layer_configs):
        layers = []
        previous_size = layer_configs[0]['input_size']  # This should be 768 for camembert base

        for config in layer_configs:
            output_size = config['output_size']
            activation = config.get('activation')
            dropout_rate = config.get('dropout_rate')
            dropconnect_rate = config.get('dropconnect_rate')
            batch_norm = config.get('batch_norm', False)
            layer_norm = config.get('layer_norm', False)
            residual = config.get('residual', False)

            # Add linear layer
            linear_layer = nn.Linear(previous_size, output_size)
            layers.append(linear_layer)

            # Apply batch normalization
            if batch_norm:
                layers.append(nn.BatchNorm1d(output_size))

            # Apply layer normalization
            if layer_norm:
                layers.append(nn.LayerNorm(output_size))

            # Apply activation (instantiate the activation function if provided)
            if activation:
                layers.append(activation())  # Instantiate activation function

            # Apply dropout
            if dropout_rate:
                layers.append(nn.Dropout(dropout_rate))

            # Apply DropConnect
            if dropconnect_rate:
                layers.append(DropConnect(linear_layer, dropconnect_rate))

            # Handle residual connections (to be implemented)
            if residual:
                pass

            previous_size = output_size  # Update input size for the next layer

        return nn.Sequential(*layers)

    def forward(self, input_ids, attention_mask, labels=None):
        # Extract features from base model
        outputs = self.base_model(input_ids, attention_mask=attention_mask)
        embeddings = outputs[0][:, 0]  # CLS token representation (shape: [batch_size, 768])

        # Pass embeddings through custom stack
        logits = self.linear_relu_stack(embeddings)

        return transformers.modeling_outputs.SequenceClassifierOutput(logits=logits)


In [None]:
# OK
num_labels = 4
model_name = "camembert-base"

custom_layer_configs = [
    {"input_size": 768, "output_size": 256, "dropout_rate": 0.2, "activation": nn.ReLU, "batch_norm": True},
    {"input_size": 256, "output_size": 128, "dropout_rate": 0.3, "activation": nn.ReLU, "batch_norm": True},
    {"input_size": 128, "output_size": num_labels, "activation": None}  # Final layer with no activation
]

model = CustomModelBis(
    model_name=model_name,
    num_labels=num_labels,
    layer_configs=custom_layer_configs
)

In [None]:
# OK
num_epochs = 8
patience = 3
min_delta = 0.001
n_splits = 3
batch_size = 8

def custom_model_creator(model_name, num_labels):

    custom_layer_configs = [
        {"input_size": 768, "output_size": 256, "dropout_rate": 0.2, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 256, "output_size": 128, "dropout_rate": 0.3, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 128, "output_size": num_labels, "activation": None}  # Final layer with no activation
    ]

    model = CustomModelBis(
        model_name=model_name,
        num_labels=num_labels,
        layer_configs=custom_layer_configs
    )

    return model

trainer = MultiClassTrainer(model_name="camembert-base",
                            model_creator=custom_model_creator,
                            classes_names=classes_names_ESG,
                            )

rets = trainer.cross_validation(texts=texts,
              labels=labels_ESG,
              n_splits=n_splits,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta)

In [None]:
num_labels = 4
model_name = "camembert-base"

custom_layer_configs = [
    {"input_size": 768, "output_size": 256, "dropout_rate": 0.2, "activation": nn.ReLU, "batch_norm": True},
    {"input_size": 256, "output_size": 128, "dropconnect_rate": 0.1, "activation": nn.Tanh, "residual": True},
    {"input_size": 128, "output_size": num_labels, "activation": None}
]

model = CustomModelBis(
    model_name=model_name,
    num_labels=num_labels,
    layer_configs=custom_layer_configs
)

In [None]:
num_epochs = 8
patience = 3
min_delta = 0.001
n_splits = 3
batch_size = 8

def custom_model_creator(model_name, num_labels):

    custom_layer_configs = [
        {"input_size": 768, "output_size": 256, "dropout_rate": 0.2, "activation": nn.ReLU, "batch_norm": True},
        {"input_size": 256, "output_size": 128, "dropconnect_rate": 0.1, "activation": nn.Tanh, "residual": True},
        {"input_size": 128, "output_size": num_labels, "activation": None}
    ]

    model = CustomModelBis(
        model_name=model_name,
        num_labels=num_labels,
        layer_configs=custom_layer_configs
    )

    return model

trainer = MultiClassTrainer(model_name="camembert-base",
                            model_creator=custom_model_creator,
                            classes_names=classes_names_ESG,
                            )

rets = trainer.cross_validation(texts=texts,
              labels=labels_ESG,
              n_splits=n_splits,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta)




In [None]:
# Dropout 0.05
num_epochs = 8
patience = 3
min_delta = 0.001
n_splits = 3
batch_size = 8

def custom_model_creator(model_name, num_labels):
    return CustomModel(model_name=model_name, num_labels=num_labels)


trainer = MultiClassTrainer(model_name="camembert-base",
                            model_creator=custom_model_creator,
                            classes_names=classes_names_ESG,
                            )

rets = trainer.cross_validation(texts=texts,
              labels=labels_ESG,
              n_splits=n_splits,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta)

In [None]:
num_epochs = 8
patience = 3
min_delta = 0.001
n_splits = 3
batch_size = 8

trainer = MultiClassTrainer(model_name="camembert-base",
                            classes_names=classes_names_ESG,
                            )

rets = trainer.cross_validation(texts=texts,
              labels=labels_ESG,
              n_splits=n_splits,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta)

In [None]:
num_epochs = 8
patience = 3
min_delta = 0.001
n_splits = 3
batch_size = 8

trainer = MultiClassTrainer(model_name="camembert-base",
                            classes_names=classes_names_ESG,
                            )

rets = trainer.cross_validation(texts=texts,
              labels=labels_ESG,
              n_splits=n_splits,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True)

In [None]:
rets[0]

## Holdout ESRS

In [None]:
len(np.unique(labels_ESRS))

### without classes balancing

In [None]:
num_epochs = 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

trainer = MultiClassTrainer(model_name="camembert-base",
                            num_classes=len(np.unique(labels_ESRS)),
                            classes_names=classes_names_ESRS,
                            )

ret = trainer.holdout(texts=texts,
              labels=labels_ESRS,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta)


### with classes balancing

In [None]:
num_epochs = 15 # 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

trainer = MultiClassTrainer(model_name="camembert-base",
                            num_classes=len(np.unique(labels_ESRS)),
                            classes_names=classes_names_ESRS,
                            )

ret = trainer.holdout(texts=texts,
              labels=labels_ESRS,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True)


### with AdamW

In [None]:
num_epochs = 15 # 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8


custom_optimizer_params = {
    'lr': 1e-5,
}

def custom_optimizer_creator(model, **params):
    return AdamW(model.parameters(), lr=params.get('lr'))


trainer = MultiClassTrainer(model_name="camembert-base",
                                   classes_names=classes_names_ESRS,
                                   optimizer_creator=custom_optimizer_creator,
                                   optimizer_params=custom_optimizer_params,
                                   #scheduler_creator=custom_scheduler_creator,
                                   #scheduler_params=custom_scheduler_params
                                   )


ret = trainer.holdout(texts=texts,
              labels=labels_ESRS,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True)


### with AdamW + AnnealOnPlateau

In [None]:
num_epochs = 15 # 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8


custom_optimizer_params = {
    'lr': 1e-5,
}

def custom_optimizer_creator(model, **params):
    return AdamW(model.parameters(), lr=params.get('lr'))

custom_scheduler_params = {
    'factor': 0.5,
    'patience': 3,
    'min_lr': 1e-5,
    'verbose': False
}

def custom_scheduler_creator(optimizer, **params):
    return AnnealOnPlateau(optimizer,
                           factor=params.get('factor'),
                           patience=params.get('patience'),
                           min_lr=params.get('min_lr'),
                           verbose=params.get('verbose')
                           )

trainer = MultiClassTrainer(model_name="camembert-base",
                                   classes_names=classes_names_ESRS,
                                   optimizer_creator=custom_optimizer_creator,
                                   optimizer_params=custom_optimizer_params,
                                   scheduler_creator=custom_scheduler_creator,
                                   scheduler_params=custom_scheduler_params
                                   )

ret = trainer.holdout(texts=texts,
              labels=labels_ESRS,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True)


### with AdamW + AnnealOnPlateau

In [None]:
num_epochs = 15 # 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8


custom_optimizer_params = {
    'lr': 1e-5,
}

def custom_optimizer_creator(model, **params):
    return AdamW(model.parameters(), lr=params.get('lr'))

custom_scheduler_params = {
    'factor': 0.5,
    'patience': 2,
    'min_lr': 1e-6,
    'verbose': False
}

def custom_scheduler_creator(optimizer, **params):
    return AnnealOnPlateau(optimizer,
                           factor=params.get('factor'),
                           patience=params.get('patience'),
                           min_lr=params.get('min_lr'),
                           verbose=params.get('verbose')
                           )

trainer = MultiClassTrainer(model_name="camembert-base",
                                   classes_names=classes_names_ESRS,
                                   optimizer_creator=custom_optimizer_creator,
                                   optimizer_params=custom_optimizer_params,
                                   scheduler_creator=custom_scheduler_creator,
                                   scheduler_params=custom_scheduler_params
                                   )

ret = trainer.holdout(texts=texts,
              labels=labels_ESRS,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True)


### with AdamW and StepLR

In [None]:
num_epochs = 15 # 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8


custom_optimizer_params = {
    'lr': 1e-5,
}

def custom_optimizer_creator(model, **params):
    return AdamW(model.parameters(), lr=params.get('lr'))


custom_scheduler_params = {
    'step_size': 10,
    'gamma': 0.1,
}

def custom_scheduler_creator(optimizer, **params):
    return StepLR(optimizer,
                  step_size=params.get('step_size'),
                  gamma=params.get('gamma'),
                  )

trainer = MultiClassTrainer(model_name="camembert-base",
                                   classes_names=classes_names_ESRS,
                                   optimizer_creator=custom_optimizer_creator,
                                   optimizer_params=custom_optimizer_params,
                                   scheduler_creator=custom_scheduler_creator,
                                   scheduler_params=custom_scheduler_params,
                                   scheduler_needs_loss = False,
                                   )


ret = trainer.holdout(texts=texts,
              labels=labels_ESRS,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True)


## Holdout custom model

### With Camembert from HF

#### No scheduler

In [None]:
num_epochs = 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

def custom_model_creator(model_name, num_labels):
    return CamembertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

def custom_optimizer_creator(model, lr):
    return AdamW(model.parameters(), lr=lr)

custom_trainer = MultiClassTrainer(model_name="camembert-base",
                                   num_classes=len(np.unique(labels)),
                                   model_creator=custom_model_creator,
                                   optimizer_creator=lambda model, lr: custom_optimizer_creator(model, lr),
                                   scheduler_creator=None,
                                   scheduler_params=None
                                   )

ret = custom_trainer.holdout(texts=texts,
              labels=labels,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta,
              balanced=True)

In [None]:
num_epochs = 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

def custom_model_creator(model_name, num_labels):
    return CamembertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

def custom_optimizer_creator(model, lr):
    return AdamW(model.parameters(), lr=lr)

custom_trainer = MultiClassTrainer(model_name="camembert-base",
                                   num_classes=len(np.unique(labels)),
                                   model_creator=custom_model_creator,
                                   optimizer_creator=lambda model, lr: custom_optimizer_creator(model, lr),
                                   scheduler_creator=None,
                                   scheduler_params=None
                                   )

ret = custom_trainer.holdout(texts=texts,
              labels=labels,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta)

#### with default scheduler

In [None]:
num_epochs = 8
patience = 4
min_delta = 0.001
val_size = 0.3
batch_size = 8

def custom_model_creator(model_name, num_labels):
    return CamembertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

def custom_optimizer_creator(model, lr):
    return AdamW(model.parameters(), lr=lr)

custom_trainer = MultiClassTrainer(model_name="camembert-base",
                                   num_classes=len(np.unique(labels)),
                                   model_creator=custom_model_creator,
                                   optimizer_creator=lambda model, lr: custom_optimizer_creator(model, lr),
                                   )

ret = custom_trainer.holdout(texts=texts,
              labels=labels,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta)

In [None]:
num_epochs = 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

def custom_model_creator(model_name, num_labels):
    return CamembertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

def custom_optimizer_creator(model, lr):
    return AdamW(model.parameters(), lr=lr)

custom_trainer = MultiClassTrainer(model_name="camembert-base",
                                   num_classes=len(np.unique(labels)),
                                   model_creator=custom_model_creator,
                                   optimizer_creator=lambda model, lr: custom_optimizer_creator(model, lr),
                                   scheduler_creator=custom_scheduler_creator
                                   )

ret = custom_trainer.holdout(texts=texts,
              labels=labels,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta)

In [None]:
num_epochs = 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

def custom_model_creator(model_name, num_labels):
    return CamembertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

def custom_optimizer_creator(model, lr):
    return AdamW(model.parameters(), lr=lr)

custom_trainer = MultiClassTrainer(model_name="camembert-base",
                                   num_classes=len(np.unique(labels)),
                                   model_creator=custom_model_creator,
                                   optimizer_creator=lambda model, lr: custom_optimizer_creator(model, lr)
                                   )

ret = custom_trainer.holdout(texts=texts,
              labels=labels,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta)


In [None]:
ret

#### with custom optimizer & scheduler

##### ReduceLROnPlateau (default)

In [None]:
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

num_epochs = 10
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

def custom_model_creator(model_name, num_labels):
    return CamembertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

custom_optimizer_params = {
    'lr': 1e-5,
}

def custom_optimizer_creator(model, **params):
    return AdamW(model.parameters(), lr=params.get('lr'))

custom_scheduler_params = {
    'T_0': 3,
    'T_mult': 1,
    'eta_min': 5e-6
}

def custom_scheduler_creator(optimizer, **params):
    return CosineAnnealingWarmRestarts(optimizer,
                                        T_0=params.get('T_0'),
                                        T_mult=params.get('T_mult'),
                                        eta_min=params.get('eta_min')
                                        )

custom_trainer = MultiClassTrainer(model_name="camembert-base",
                                   num_classes=len(np.unique(labels)),
                                   model_creator=custom_model_creator,
                                   optimizer_creator=custom_optimizer_creator,
                                   optimizer_params=custom_optimizer_params,
                                   scheduler_creator=custom_scheduler_creator,
                                   scheduler_params=custom_scheduler_params
                                   )

In [None]:
custom_trainer.init_model()

In [None]:
# How to get the name of the class name of the scheduler ?
#custom_trainer.scheduler
custom_trainer.scheduler.__class__.__name__

In [None]:

num_epochs = 10
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

def custom_model_creator(model_name, num_labels):
    return CamembertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

def custom_optimizer_creator(model, lr):
    return AdamW(model.parameters(), lr=lr)

custom_scheduler_params = {
    'T_0': 3,
    'T_mult': 1,
    'eta_min': 5e-6
}

def custom_scheduler_creator(optimizer, **params):
    return CosineAnnealingWarmRestarts(optimizer,
                                        T_0=params.get('T_0'),
                                        T_mult=params.get('T_mult'),
                                        eta_min=params.get('eta_min')
                                        )

custom_trainer = MultiClassTrainer(model_name="camembert-base",
                                   num_classes=len(np.unique(labels)),
                                   model_creator=custom_model_creator,
                                   optimizer_creator=lambda model, lr: custom_optimizer_creator(model, lr),
                                   scheduler_creator=custom_scheduler_creator,
                                   scheduler_params=custom_scheduler_params
                                   )

ret = custom_trainer.holdout(texts=texts,
                             labels=labels,
                             val_size=val_size,
                             num_epochs=num_epochs,
                             batch_size=batch_size,
                             patience=patience,
                             min_delta=min_delta,
                             balanced=True)

##### CosineAnnealingWarmRestarts

In [None]:

num_epochs = 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

def custom_model_creator(model_name, num_labels):
    return CamembertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

def custom_optimizer_creator(model, lr):
    return AdamW(model.parameters(), lr=lr)

custom_scheduler_params = {
    'T_0': 3,
    'T_mult': 1,
    'eta_min': 5e-6
}

def custom_scheduler_creator(optimizer, **params):
    return CosineAnnealingWarmRestarts(optimizer,
                                        T_0=params.get('T_0'),
                                        T_mult=params.get('T_mult'),
                                        eta_min=params.get('eta_min')
                                        )

custom_trainer = MultiClassTrainer(model_name="camembert-base",
                                   num_classes=len(np.unique(labels)),
                                   model_creator=custom_model_creator,
                                   optimizer_creator=lambda model, lr: custom_optimizer_creator(model, lr),
                                   scheduler_creator=custom_scheduler_creator,
                                   scheduler_params=custom_scheduler_params
                                   )

ret = custom_trainer.holdout(texts=texts,
                             labels=labels,
                             val_size=val_size,
                             num_epochs=num_epochs,
                             batch_size=batch_size,
                             patience=patience,
                             min_delta=min_delta,
                             balanced=True)

In [None]:

num_epochs = 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

def custom_model_creator(model_name, num_labels):
    return CamembertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

def custom_optimizer_creator(model, lr):
    return AdamW(model.parameters(), lr=lr)

custom_scheduler_params = {
    'T_0': 3,
    'T_mult': 1.0,
    'eta_min': 1e-5
}

def custom_scheduler_creator(optimizer, **params):
    return CosineAnnealingWarmRestarts(optimizer,
                                        T_0=params.get('T_0'),
                                        T_mult=params.get('T_mult'),
                                        eta_min=params.get('eta_min')
                                        )

custom_trainer = MultiClassTrainer(model_name="camembert-base",
                                   num_classes=len(np.unique(labels)),
                                   model_creator=custom_model_creator,
                                   optimizer_creator=lambda model, lr: custom_optimizer_creator(model, lr),
                                   scheduler_creator=custom_scheduler_creator,
                                   scheduler_params=custom_scheduler_params
                                   )

ret = custom_trainer.holdout(texts=texts,
                             labels=labels,
                             val_size=val_size,
                             num_epochs=num_epochs,
                             batch_size=batch_size,
                             patience=patience,
                             min_delta=min_delta,
                             balanced=True)

##### CyclicLR

In [None]:
from torch.optim.lr_scheduler import CyclicLR

num_epochs = 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

def custom_model_creator(model_name, num_labels):
    return CamembertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

def custom_optimizer_creator(model, lr):
    return AdamW(model.parameters(), lr=lr)

custom_scheduler_params = {
    'base_lr': 1e-5,
    'max_lr': 5e-5,
    'step_size_up': 3
}

def custom_scheduler_creator(optimizer, **params):
    return CyclicLR(optimizer,
                    base_lr=params.get('base_lr'),
                    max_lr=params.get('max_lr'),
                    step_size_up=params.get('step_size_up')
                    )

custom_trainer = MultiClassTrainer(model_name="camembert-base",
                                   num_classes=len(np.unique(labels)),
                                   model_creator=custom_model_creator,
                                   optimizer_creator=lambda model, lr: custom_optimizer_creator(model, lr),
                                   scheduler_creator=custom_scheduler_creator,
                                   scheduler_params=custom_scheduler_params
                                   )

ret = custom_trainer.holdout(texts=texts,
                             labels=labels,
                             val_size=val_size,
                             num_epochs=num_epochs,
                             batch_size=batch_size,
                             patience=patience,
                             min_delta=min_delta,
                             balanced=True)

In [None]:

num_epochs = 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

def custom_model_creator(model_name, num_labels):
    return CamembertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

def custom_optimizer_creator(model, lr):
    return AdamW(model.parameters(), lr=lr)

custom_scheduler_params = {
    'base_lr': 1e-5,
    'max_lr': 5e-5,
}

def custom_scheduler_creator(optimizer, **params):
    return CyclicLR(optimizer,
                    base_lr=params.get('base_lr'),
                    max_lr=params.get('max_lr')
                    )

custom_trainer = MultiClassTrainer(model_name="camembert-base",
                                   num_classes=len(np.unique(labels)),
                                   model_creator=custom_model_creator,
                                   optimizer_creator=lambda model, lr: custom_optimizer_creator(model, lr),
                                   scheduler_creator=custom_scheduler_creator,
                                   scheduler_params=custom_scheduler_params
                                   )

ret = custom_trainer.holdout(texts=texts,
                             labels=labels,
                             val_size=val_size,
                             num_epochs=num_epochs,
                             batch_size=batch_size,
                             patience=patience,
                             min_delta=min_delta,
                             balanced=True)

In [None]:

num_epochs = 8
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

def custom_model_creator(model_name, num_labels):
    return CamembertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

def custom_optimizer_creator(model, lr):
    return AdamW(model.parameters(), lr=lr)

custom_scheduler_params = {
    'base_lr': 1e-6,
    'max_lr': 1e-4,
}

def custom_scheduler_creator(optimizer, **params):
    base_lr=params.get('base_lr')
    max_lr=params.get('max_lr')
    return CyclicLR(optimizer, base_lr=base_lr, max_lr=max_lr)

custom_trainer = MultiClassTrainer(model_name="camembert-base",
                                   num_classes=len(np.unique(labels)),
                                   model_creator=custom_model_creator,
                                   optimizer_creator=lambda model, lr: custom_optimizer_creator(model, lr),
                                   scheduler_creator=custom_scheduler_creator,
                                   scheduler_params=custom_scheduler_params
                                   )

ret = custom_trainer.holdout(texts=texts,
                             labels=labels,
                             val_size=val_size,
                             num_epochs=num_epochs,
                             batch_size=batch_size,
                             patience=patience,
                             min_delta=min_delta,
                             balanced=True)

##### Ret Example

In [None]:
ret

### With Custom Camembert class

In [None]:
num_epochs = 12
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

def custom_model_creator(model_name, num_labels):
    return CustomModel(model_name=model_name, num_labels=num_labels)

def custom_optimizer_creator(model, lr):
    return AdamW(model.parameters(), lr=lr)

custom_trainer = MultiClassTrainer(model_name="camembert-base",
                                   num_classes=len(np.unique(labels)),
                                   model_creator=custom_model_creator,
                                   optimizer_creator=lambda model, lr: custom_optimizer_creator(model, lr)
                                   )

ret = custom_trainer.holdout(texts=texts,
              labels=labels,
              val_size=val_size,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta)

In [None]:
num_epochs = 12
patience = 3
min_delta = 0.001
val_size = 0.3
batch_size = 8

def custom_model_creator(model_name, num_labels):
    return CustomModel(model_name=model_name, num_labels=num_labels)

def custom_optimizer_creator(model, lr):
    return AdamW(model.parameters(), lr=lr)

custom_trainer = MultiClassTrainer(model_name="camembert-base",
                                   num_classes=len(np.unique(labels)),
                                   model_creator=custom_model_creator,
                                   optimizer_creator=lambda model, lr: custom_optimizer_creator(model, lr)
                                   )

ret = custom_trainer.holdout(texts=texts,
                             labels=labels,
                             val_size=val_size,
                             num_epochs=num_epochs,
                             batch_size=batch_size,
                             patience=patience,
                             min_delta=min_delta,
                             balanced=True
                             )

In [None]:
num_epochs = 20
patience = 5
min_delta = 0.001
val_size = 0.3
batch_size = 8

def custom_model_creator(model_name, num_labels):
    return CustomModel(model_name=model_name, num_labels=num_labels)

def custom_optimizer_creator(model, lr):
    return AdamW(model.parameters(), lr=lr)

custom_trainer = MultiClassTrainer(model_name="camembert-base",
                                   num_classes=len(np.unique(labels)),
                                   model_creator=custom_model_creator,
                                   optimizer_creator=lambda model, lr: custom_optimizer_creator(model, lr),
                                   scheduler_creator=None,
                                   scheduler_params=None
                                   )

ret = custom_trainer.holdout(texts=texts,
                             labels=labels,
                             val_size=val_size,
                             num_epochs=num_epochs,
                             batch_size=batch_size,
                             patience=patience,
                             min_delta=min_delta,
                             balanced=True
                             )

## Display losses

In [None]:
ret

In [None]:
custom_trainer.display_losses(ret)

## Cross validation

In [None]:
num_epochs = 8
patience = 3
min_delta = 0.001
n_splits = 3
batch_size = 8

trainer = MultiClassTrainer(model_name="camembert-base",
                            num_classes=len(np.unique(labels)))

rets = trainer.cross_validation(texts=texts,
              labels=labels,
              n_splits=n_splits,
              num_epochs=num_epochs,
              batch_size=batch_size,
              patience=patience,
              min_delta=min_delta)

In [None]:
rets

In [None]:
print(rets[1]["val_losses"])
print(rets[1]["best_epoch"])
print(rets[1]["best_loss"])


In [None]:
print(rets[2]["val_losses"])
print(rets[2]["best_epoch"])
print(rets[2]["best_loss"])

# Release GPU Memory

In [None]:
model = None
gc.collect()
if device.startswith("cuda"):
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# End of game