In [None]:
!pip install datasets pandas scikit-learn


In [None]:
from datasets import load_dataset
import pandas as pd
import joblib
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


In [33]:
# Charger AG News
dataset = load_dataset('ag_news')

# Convertir en DataFrame
df = pd.DataFrame(dataset['train'])
df.head()


Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


In [34]:
# Vérifier les valeurs manquantes
print(df.isnull().sum())


text     0
label    0
dtype: int64


In [35]:
# Supprimer les lignes avec des valeurs manquantes
df = df.dropna()


In [36]:
df['desc_length'] = df['text'].apply(len) # Changed 'description' to 'text'
df = df[df['desc_length'].between(20, 500)]

In [37]:
scaler = StandardScaler()
df['desc_length_scaled'] = scaler.fit_transform(df[['desc_length']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['desc_length_scaled'] = scaler.fit_transform(df[['desc_length']])


In [38]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [39]:
# Longueur du texte
df['text_length'] = df['text'].apply(len)

# Nombre de majuscules
df['caps_count'] = df['text'].apply(lambda x: sum(1 for c in x if c.isupper()))

# Présence d’un point d’interrogation
df['has_question'] = df['text'].apply(lambda x: int('?' in x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_length'] = df['text'].apply(len)


In [40]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])

y_train = train_df['label']
y_test = test_df['label']


In [41]:
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

nb_pred = nb_model.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_pred))
print(classification_report(y_test, nb_pred))


Naive Bayes Accuracy: 0.8964243746852443
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      5982
           1       0.94      0.98      0.96      5978
           2       0.88      0.85      0.86      5986
           3       0.86      0.87      0.87      5882

    accuracy                           0.90     23828
   macro avg       0.90      0.90      0.90     23828
weighted avg       0.90      0.90      0.90     23828



In [42]:
joblib.dump(nb_model, 'naive_bayes_model.pkl')

['naive_bayes_model.pkl']

In [43]:
df.to_csv('processed_ag_news.csv', index=False)

In [44]:
# Charger le modèle Naive Bayes
nb_model = joblib.load('naive_bayes_model.pkl')

In [51]:
# Ton propre texte
text_input = ["Apple unveils new MacBook with faster M3 chip and AI features"]


In [52]:
# Transformer le texte
X_input = vectorizer.transform(text_input)

# Prédire
pred = nb_model.predict(X_input)
print("Prédiction brute:", pred)


Prédiction brute: [3]


In [53]:
# Dictionnaire des labels
label_map = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Science/Technology"
}

print("Catégorie prédite:", label_map[pred[0]])


Catégorie prédite: Science/Technology


# FIN !