In [1]:
# !pip install --upgrade datasets

In [2]:
import polars as pl
from datasets import load_dataset

ds = load_dataset("fancyzhx/ag_news")

KeyboardInterrupt: 

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [4]:
# Convertir chaque split en Polars DataFrame
train_pl = pl.DataFrame(ds["train"][:])
test_pl = pl.DataFrame(ds["test"][:])

# Affichage
print(train_pl.head())
print(test_pl.head())

shape: (5, 2)
┌─────────────────────────────────┬───────┐
│ text                            ┆ label │
│ ---                             ┆ ---   │
│ str                             ┆ i64   │
╞═════════════════════════════════╪═══════╡
│ Wall St. Bears Claw Back Into … ┆ 2     │
│ Carlyle Looks Toward Commercia… ┆ 2     │
│ Oil and Economy Cloud Stocks' … ┆ 2     │
│ Iraq Halts Oil Exports from Ma… ┆ 2     │
│ Oil prices soar to all-time re… ┆ 2     │
└─────────────────────────────────┴───────┘
shape: (5, 2)
┌─────────────────────────────────┬───────┐
│ text                            ┆ label │
│ ---                             ┆ ---   │
│ str                             ┆ i64   │
╞═════════════════════════════════╪═══════╡
│ Fears for T N pension after ta… ┆ 2     │
│ The Race is On: Second Private… ┆ 3     │
│ Ky. Company Wins Grant to Stud… ┆ 3     │
│ Prediction Unit Helps Forecast… ┆ 3     │
│ Calif. Aims to Limit Farm-Rela… ┆ 3     │
└─────────────────────────────────┴───────┘


In [5]:
label_names = ds["train"].features["label"].names
print(label_names)

train_pl = train_pl.with_columns([
    pl.col("label").map_elements(lambda x: label_names[x], return_dtype=pl.String).alias("label_name")
])
# test_pl = test_pl.with_columns([
#     pl.col("label").map_elements(lambda x: label_names[x], return_dtype=pl.String).alias("label_name")
# ])

train_pl.head()
#test_pl.head()

['World', 'Sports', 'Business', 'Sci/Tech']


text,label,label_name
str,i64,str
"""Wall St. Bears Claw Back Into …",2,"""Business"""
"""Carlyle Looks Toward Commercia…",2,"""Business"""
"""Oil and Economy Cloud Stocks' …",2,"""Business"""
"""Iraq Halts Oil Exports from Ma…",2,"""Business"""
"""Oil prices soar to all-time re…",2,"""Business"""


In [6]:
import re
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words("english"))

def clean_text(text: str) -> str:
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha() and word not in STOPWORDS]
    return tokens

[nltk_data] Downloading package punkt to /Users/hugo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/hugo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/hugo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
train_pl = train_pl.with_columns([
    pl.col("text").map_elements(clean_text, return_dtype=pl.List(pl.String)).alias("text_cleaned")
])

train_pl.head()

text,label,label_name,text_cleaned
str,i64,str,list[str]
"""Wall St. Bears Claw Back Into …",2,"""Business""","[""wall"", ""bears"", … ""green""]"
"""Carlyle Looks Toward Commercia…",2,"""Business""","[""carlyle"", ""looks"", … ""market""]"
"""Oil and Economy Cloud Stocks' …",2,"""Business""","[""oil"", ""economy"", … ""doldrums""]"
"""Iraq Halts Oil Exports from Ma…",2,"""Business""","[""iraq"", ""halts"", … ""saturday""]"
"""Oil prices soar to all-time re…",2,"""Business""","[""oil"", ""prices"", … ""elections""]"


In [8]:
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

tag_map = {
    "J": wordnet.ADJ,    # Adjectif
    "V": wordnet.VERB,   # Verbe
    "N": wordnet.NOUN,   # Nom
    "R": wordnet.ADV     # Adverbe
}

def lemmatik(seq):
    tag = pos_tag(seq)
    lenma = [
        lemmatizer.lemmatize(word, tag_map.get(tag[0], wordnet.NOUN))  # Défaut : nom
        for word, tag in tag
    ]

    return " ".join(lenma)

[nltk_data] Downloading package wordnet to /Users/hugo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/hugo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x10489bf10>>
Traceback (most recent call last):
  File "/Users/hugo/PycharmProjects/Chatbot_M1/.venv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


In [9]:
train_pl = train_pl.with_columns([
    pl.col("text_cleaned").map_elements(lemmatik, return_dtype=pl.String).alias("text_cleaned")
])

train_pl.head()

text,label,label_name,text_cleaned
str,i64,str,str
"""Wall St. Bears Claw Back Into …",2,"""Business""","""wall bear claw back black reut…"
"""Carlyle Looks Toward Commercia…",2,"""Business""","""carlyle look toward commercial…"
"""Oil and Economy Cloud Stocks' …",2,"""Business""","""oil economy cloud stock outloo…"
"""Iraq Halts Oil Exports from Ma…",2,"""Business""","""iraq halt oil export main sout…"
"""Oil prices soar to all-time re…",2,"""Business""","""oil price soar record pose new…"


In [10]:
# test_pl = test_pl.with_columns([
#     pl.col("text").map_elements(clean_text, return_dtype=pl.List(pl.String)).alias("text_cleaned")
# ])
#
# test_pl = test_pl.with_columns([
#     pl.col("text_cleaned").map_elements(lemmatik, return_dtype=pl.String).alias("text_cleaned")
# ])
#
# test_pl.head()

# Entrainement du Model ML

In [11]:
from sklearn.model_selection import train_test_split
# Séparer X et Y train_test_split
X = train_pl["text_cleaned"].to_list()
y = train_pl["label"].to_list()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)


In [None]:
import optuna
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score


def objective(trial):
    model_name = trial.suggest_categorical("model", ["LogisticRegression", "SVM"])
    max_features = trial.suggest_int("max_features", 1000, 10000, step=1000)

    vectorizer = TfidfVectorizer(max_features=max_features)

    if model_name == "LogisticRegression":
        clf = LogisticRegression(max_iter=1000, C=trial.suggest_float("C", 1e-3, 10, log=True))
    else:
        clf = LinearSVC(C=trial.suggest_float("C", 1e-3, 10, log=True), max_iter=1000)

    pipeline = Pipeline([
        ("vectorizer", vectorizer),
        ("classifier", clf)
    ])

    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    return accuracy_score(y_test, preds)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

print("Best trial:")
print(study.best_trial.params)


[I 2025-06-18 08:49:17,751] A new study created in memory with name: no-name-2cb91db1-703c-4069-953f-f0aaff39bb28
[I 2025-06-18 08:49:21,041] Trial 0 finished with value: 0.9015416666666667 and parameters: {'model': 'LogisticRegression', 'max_features': 3000, 'C': 0.2784165532260916}. Best is trial 0 with value: 0.9015416666666667.
[I 2025-06-18 08:49:24,808] Trial 1 finished with value: 0.9034583333333334 and parameters: {'model': 'LogisticRegression', 'max_features': 7000, 'C': 0.10967613259135425}. Best is trial 1 with value: 0.9034583333333334.
[I 2025-06-18 08:49:29,056] Trial 2 finished with value: 0.9094583333333334 and parameters: {'model': 'LogisticRegression', 'max_features': 6000, 'C': 0.35921989879479366}. Best is trial 2 with value: 0.9094583333333334.
[I 2025-06-18 08:49:31,770] Trial 3 finished with value: 0.9032916666666667 and parameters: {'model': 'LogisticRegression', 'max_features': 9000, 'C': 0.09898243325742442}. Best is trial 2 with value: 0.9094583333333334.
[I 

In [30]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score

# Réentraîner le meilleur modèle
best_params = study.best_trial.params
vectorizer = TfidfVectorizer(max_features=best_params["max_features"])

if best_params["model"] == "LogisticRegression":
    clf = LogisticRegression(max_iter=1000, C=best_params["C"])
else:
    clf = SVC(C=best_params["C"], kernel='linear', probability=True)

pipeline = Pipeline([
    ("vectorizer", vectorizer),
    ("classifier", clf)
])

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)

#  Metrics
print("\nClassification Report :")
print(classification_report(y_test, preds))

print("\nConfusion Matrix :")
print(confusion_matrix(y_test, preds))

# ROC AUC (pour 4 classes)
# On binarise les labels pour calculer l'AUC macro/micro
y_test_bin = label_binarize(y_test, classes=[0, 1, 2, 3])
proba = pipeline.predict_proba(X_test)

print("\nROC AUC Score (macro) :")
print(roc_auc_score(y_test_bin, proba, average='macro', multi_class='ovr'))


Classification Report :
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      6000
           1       0.95      0.98      0.96      6000
           2       0.88      0.86      0.87      6000
           3       0.87      0.88      0.88      6000

    accuracy                           0.90     24000
   macro avg       0.90      0.90      0.90     24000
weighted avg       0.90      0.90      0.90     24000


Confusion Matrix :
[[5390  197  237  176]
 [  76 5852   24   48]
 [ 204   55 5185  556]
 [ 205   55  451 5289]]

ROC AUC Score (macro) :
0.9781207199074075


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Génération de la matrice
cm = confusion_matrix(y_test, preds)
labels = [0, 1, 2, 3]  # adapte si tu as des labels autres

# Tracé avec seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)

plt.title("Matrice de confusion")
plt.xlabel("Prédictions")
plt.ylabel("Vraies classes")
plt.show()