In [1]:
import pandas as pd
import numpy as np
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1"
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import ast

# Activer la précision mixte pour accélérer les calculs
from tensorflow.keras.mixed_precision import set_global_policy
set_global_policy('mixed_float16')




  from .autonotebook import tqdm as notebook_tqdm


The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


**Chargement des données**

In [2]:
path = "./archive/lemmetised_clean_data.csv"
df = pd.read_csv(path, delimiter=",", quotechar='"')

**Séléction des tags et réduction du Dataset**


In [3]:
df['tags'] = df['tags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Sélectionner les 50 tags les plus fréquents (ajustable)
all_tags = [tag for tag_list in df['tags'] for tag in tag_list]
tag_counts = pd.Series(all_tags).value_counts()
num_top_tags = min(15, len(tag_counts))  # Modifier pour augmenter/diminuer
selected_tags = tag_counts.head(num_top_tags).index.tolist()

df = df[df['tags'].apply(lambda tags: any(tag in selected_tags for tag in tags))]
df['main_tag'] = df['tags'].apply(lambda tags: next((tag for tag in tags if tag in selected_tags), None))
df.dropna(subset=['clean_text'], inplace=True)

print(f"Taille du dataset après filtrage: {df.shape}")

Taille du dataset après filtrage: (64571, 8)


**Préparation des textes et etiquettes et encodage des étiquettes**

In [4]:
# Préparation des textes et des étiquettes
texts = df['clean_text'].astype(str).tolist()
labels = df['main_tag'].tolist()

# Encodage des étiquettes
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
num_classes = len(np.unique(y))

**Tokenization des textes**

In [None]:
# Chargement du tokenizer DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Réduire la séquence pour accélérer
max_len = 32  

# Tokenisation des textes
tokens = tokenizer(texts, max_length=max_len, padding=True, truncation=True, return_tensors="np")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


**Division des données en train/test**

In [6]:
# Convertir les tokens en tableaux NumPy
X = np.array(tokens['input_ids'])  # Convertir les IDs en NumPy array
y = np.array(y)  # Convertir les labels en NumPy array

# Division train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Création des datasets TensorFlow
batch_size = 64  # Augmentation de la taille des batchs
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size).prefetch(tf.data.AUTOTUNE)


**Création du modèle Transformer (BERT)**

In [7]:
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_classes)




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

**Compilation**

In [8]:
optimizer = Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

**Définition d'un Early Stopping**

In [9]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

**Entraînement**

In [10]:
model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=5,  # Ajustable
    callbacks=[early_stopping]
)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x17738bccce0>

**Évaluation**

In [11]:
loss, accuracy = model.evaluate(test_dataset)
print(f"Accuracy sur les données de test: {accuracy:.4f}")

Accuracy sur les données de test: 0.5335
