# Entraînement d'un modèle qui détermine le sentiment (positif ou négatif) d'un tweet avec l'algorithme Naive Bayes

Étape 1 : Importer les bibliothèques

In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import re

Étape 2 : Charger les données

In [None]:
def load_data(file_path, label):
    """Read a tab-separated file into a DataFrame and tag every row with ``label``.

    Args:
        file_path: Path of the UTF-8 encoded ``.tsv`` file to read.
        label: Value stored in a new ``label`` column for all rows.

    Returns:
        The loaded DataFrame with the extra ``label`` column, or an empty
        DataFrame when the file is missing or any other error occurs
        (a message is printed in both cases).
    """
    # Start from the failure value; it is replaced only if loading fully succeeds.
    result = pd.DataFrame()
    try:
        loaded = pd.read_csv(file_path, sep='\t', encoding='utf-8')
        loaded['label'] = label
        result = loaded
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    except Exception as e:
        print(f"An error occurred while loading the file: {e}")
    return result


# Load the training data: one file per class.
train_pos = load_data("train_Arabic_tweets_positive_20190413.tsv", 1)  # 1 = positive
train_neg = load_data("train_Arabic_tweets_negative_20190413.tsv", 0)  # 0 = negative

# load_data returns an empty DataFrame on failure, so an empty frame means
# one of the files could not be read; stop instead of training on partial data.
if train_pos.empty or train_neg.empty:
    print("Error loading training data. Check file paths and try again.")
    # exit() is an interactive helper from the site module; raise SystemExit
    # directly and use a non-zero status so callers can detect the failure.
    raise SystemExit(1)

# ignore_index avoids duplicate row labels in the combined frame.
train_df = pd.concat([train_pos, train_neg], ignore_index=True)

# Load the test data: one file per class (1 = positive, 0 = negative).
test_pos = load_data("test_Arabic_tweets_positive_20190413.tsv", 1)
test_neg = load_data("test_Arabic_tweets_negative_20190413.tsv", 0)

# load_data returns an empty DataFrame on failure, so an empty frame means
# one of the files could not be read; stop instead of evaluating on partial data.
if test_pos.empty or test_neg.empty:
    print("Error loading test data. Check file paths and try again.")
    # exit() is an interactive helper from the site module; raise SystemExit
    # directly and use a non-zero status so callers can detect the failure.
    raise SystemExit(1)

# ignore_index avoids duplicate row labels in the combined frame.
test_df = pd.concat([test_pos, test_neg], ignore_index=True)

Étape 3 : Nettoyer le texte (en arabe)

In [None]:
# Everything that gets stripped from a tweet: @mentions, http(s) links, and any
# character that is neither in the Arabic Unicode block (U+0600-U+06FF) nor whitespace.
_NON_ARABIC_RE = re.compile(r"@\w+|http\S+|[^\u0600-\u06FF\s]")


def clean_arabic_text(text):
    """Strip mentions, links and non-Arabic characters from a tweet.

    Args:
        text: Raw tweet text. Non-string values (e.g. the NaN that pandas
            produces for an empty cell, or None) are treated as empty text.

    Returns:
        The text with all matches removed and leading/trailing whitespace
        trimmed; ``""`` for non-string input. Interior whitespace is kept as is.
    """
    # A missing cell arrives as float NaN; re.sub would raise TypeError on it.
    if not isinstance(text, str):
        return ""
    return _NON_ARABIC_RE.sub("", text).strip()

# Add a cleaned copy of the raw tweet text to both the training and test frames.
for frame in (train_df, test_df):
    frame["clean_text"] = frame["text"].apply(clean_arabic_text)

Étape 4 : Transformer en nombres (TF-IDF)

In [None]:
# Target labels for training and evaluation.
y_train = train_df["label"]
y_test = test_df["label"]

# TF-IDF over unigrams and bigrams, keeping at most 20000 features.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=20000,
)

# Fit the vocabulary on the training text only, then reuse it for the test text.
X_train = tfidf.fit_transform(train_df["clean_text"])
X_test = tfidf.transform(test_df["clean_text"])

Étape 5 : Entraîner le modèle Naive Bayes

In [None]:
# Multinomial Naive Bayes (smoothing alpha=0.5) fitted on the TF-IDF training matrix;
# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB(alpha=0.5).fit(X_train, y_train)

Étape 6 : Tester le modèle

In [None]:
# Predict a label for every test tweet and report the accuracy as a percentage.
predictions = model.predict(X_test)
accuracy_pct = accuracy_score(y_test, predictions) * 100
print(f"Précision : {accuracy_pct:.2f}%")

Précision : 76.08%


Étape 7 : Sauvegarder le modèle

In [None]:
import joblib

# Persist both fitted objects: the classifier was trained on features produced
# by this vectorizer, so the two are saved side by side.
for artifact, filename in (
    (model, "sentiment_model.pkl"),
    (tfidf, "tfidf_vectorizer.pkl"),
):
    joblib.dump(artifact, filename)

['tfidf_vectorizer.pkl']