**Importation des bibilothèques**

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.regularizers import l2
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import ast

**Chargement des données**


In [2]:
# Charger les données
path = "./archive/lemmetised_clean_data.csv"
df = pd.read_csv(path, delimiter=",", quotechar='"')

**Filtrage des Tags**


In [3]:
# Vérifier et convertir les tags correctement
df['tags'] = df['tags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Compter les occurrences des tags
all_tags = [tag for tag_list in df['tags'] for tag in tag_list]
tag_counts = pd.Series(all_tags).value_counts()

# Sélectionner les 1000 tags les plus fréquents
num_top_tags = min(15, len(tag_counts))
selected_tags = tag_counts.head(num_top_tags).index.tolist()

# Filtrer les articles qui contiennent au moins un des tags sélectionnés
df = df[df['tags'].apply(lambda tags: any(tag in selected_tags for tag in tags))]

df['main_tag'] = df['tags'].apply(lambda tags: next((tag for tag in tags if tag in selected_tags), None))
df.dropna(subset=['clean_text'], inplace=True)

# Vérifier la taille du dataset après filtrage
print(f"Taille du dataset après filtrage: {df.shape}")

Taille du dataset après filtrage: (64571, 8)


**Préparation des textes et etiquettes et encodage des étiquettes**

In [4]:
# Préparation des textes et étiquettes
texts = df['clean_text'].astype(str).tolist()
labels = df['main_tag'].tolist()

# Encodage des étiquettes
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
num_classes = len(np.unique(y))

**Tokenisation et séquencement des textes puis création du tokenizer**

In [5]:
# Tokenisation et séquencement des textes
max_words = 20000  # Nombre max de mots dans le vocabulaire
max_len = 200  # Longueur max des séquences

# Création du tokenizer
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
X_sequences = tokenizer.texts_to_sequences(texts)
X_padded = pad_sequences(X_sequences, maxlen=max_len, padding='post', truncating='post')

**Division train/test**

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

**Création du modèle CNN**

In [7]:
from tensorflow.keras.regularizers import l2

model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Conv1D(filters=64, kernel_size=5, activation='relu', kernel_regularizer=l2(0.01)),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.6),
    Dense(num_classes, activation='softmax')
])




**Compilation du modèle**

In [11]:
from tensorflow.keras.metrics import CategoricalAccuracy

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy', CategoricalAccuracy(name='categorical_accuracy')]
)


**Entraînement du modèle**

In [12]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
[1m1615/1615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 75ms/step - accuracy: 0.2700 - categorical_accuracy: 0.0487 - loss: 2.3206 - val_accuracy: 0.5172 - val_categorical_accuracy: 0.0525 - val_loss: 1.4913
Epoch 2/10
[1m1615/1615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 73ms/step - accuracy: 0.4929 - categorical_accuracy: 0.0432 - loss: 1.5775 - val_accuracy: 0.5455 - val_categorical_accuracy: 0.0485 - val_loss: 1.3988
Epoch 3/10
[1m1615/1615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 71ms/step - accuracy: 0.5344 - categorical_accuracy: 0.0493 - loss: 1.4456 - val_accuracy: 0.5514 - val_categorical_accuracy: 0.0482 - val_loss: 1.3825
Epoch 4/10
[1m1615/1615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 66ms/step - accuracy: 0.5570 - categorical_accuracy: 0.0494 - loss: 1.3689 - val_accuracy: 0.5562 - val_categorical_accuracy: 0.0480 - val_loss: 1.3798
Epoch 5/10
[1m1615/1615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x1b5825e5190>

**Évaluation du modèle**

In [16]:
print(model.evaluate(X_test, y_test))
loss, precision, recall = model.evaluate(X_test, y_test)
f1_score = 2 * (precision * recall) / (precision + recall + tf.keras.backend.epsilon())

print(f"Accuracy : {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall : {recall:.4f}")
print(f"F1-score : {f1_score:.4f}")

[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.5459 - categorical_accuracy: 0.0464 - loss: 1.6869
[1.6556713581085205, 0.5476577877998352, 0.048625629395246506]
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.5459 - categorical_accuracy: 0.0464 - loss: 1.6869
Accuracy : 0.5477
Precision : 0.5477
Recall : 0.0486
F1-score : 0.0893
