In [3]:
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [5]:
# Load the cleaned review CSV.
# NOTE(review): "/content/..." looks like a Colab working-directory path;
# adjust DATA_PATH when running anywhere else.
DATA_PATH = "/content/amazon_reviews_clean.csv"
df = pd.read_csv(DATA_PATH)

# Drop only the rows that are missing a field the model actually uses.
# A bare dropna() would also discard rows whose NaN sits in some unrelated
# column, silently shrinking the training set if the CSV ever gains columns.
df = df.dropna(subset=["Text", "label"])

# Quick sanity check: first rows and the class balance.
print(df.head())
print(df["label"].value_counts())

      label                                               Text
0  positive  I have bought several of the Vitality canned d...
1  negative  Product arrived labeled as Jumbo Salted Peanut...
2  positive  This is a confection that has been around a fe...
3  negative  If you are looking for the secret ingredient i...
4  positive  Great taffy at a great price.  There was a wid...
label
positive    443777
negative     82037
neutral      42640
Name: count, dtype: int64


In [6]:
# Features are the raw review text; targets are the sentiment labels.
X, y = df["Text"], df["label"]

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [7]:
# Text-classification pipeline: TF-IDF features feeding a linear SVM.
pipeline = Pipeline([
    (
        "tfidf",
        TfidfVectorizer(
            lowercase=True,
            stop_words="english",
            ngram_range=(1, 2),  # unigrams and bigrams
            min_df=5,            # ignore terms seen in fewer than 5 documents
            max_df=0.9           # ignore terms seen in more than 90% of documents
        )
    ),
    (
        "clf",
        # class_weight="balanced" reweights classes inversely to their frequency.
        # random_state pins the solver's data shuffling so repeated runs fit the
        # same model; the value matches the seed used for the train/test split.
        LinearSVC(class_weight="balanced", random_state=42)
    )
])

In [8]:
# Fit the whole pipeline (vectorizer + classifier) on the training split only,
# so the held-out test rows never influence the learned vocabulary or weights.
print("Training model...")
pipeline.fit(X_train, y_train)

Training model...


In [9]:
# Score the held-out split.
y_pred = pipeline.predict(X_test)

# Compute every metric first, then report them together.
test_accuracy = accuracy_score(y_test, y_pred)
test_macro_f1 = f1_score(y_test, y_pred, average="macro")  # unweighted mean over classes
per_class_report = classification_report(y_test, y_pred)

print("\nAccuracy:", test_accuracy)
print("F1 Macro:", test_macro_f1)
print("\nClassification Report:\n")
print(per_class_report)


Accuracy: 0.9136167330747377
F1 Macro: 0.7928041884686768

Classification Report:

              precision    recall  f1-score   support

    negative       0.81      0.83      0.82     16407
     neutral       0.66      0.55      0.60      8528
    positive       0.95      0.96      0.96     88756

    accuracy                           0.91    113691
   macro avg       0.81      0.78      0.79    113691
weighted avg       0.91      0.91      0.91    113691



In [10]:
# Persist the fitted pipeline (vectorizer + classifier) as a single artifact.
# The path is defined once so the confirmation message always names the file
# that was actually written.
MODEL_PATH = "sentiment_model.pkl"
joblib.dump(pipeline, MODEL_PATH)
print(f"\n✅ Model saved as {MODEL_PATH}")


✅ Model saved as sentiment_model.pkl


In [11]:
# Hand-written smoke-test reviews, one per expected sentiment.
samples = [
    "The product works as expected. Nothing special.",
    "This is the worst purchase I have ever made.",
    "Absolutely fantastic quality and fast delivery."
]

print("\nSample predictions:")
# Predict all samples in one batched call instead of invoking the pipeline
# once per string; each text is still scored independently, so the labels
# are the same as with per-item calls.
for text, label in zip(samples, pipeline.predict(samples)):
    print(f"{text} -> {label}")


Sample predictions:
The product works as expected. Nothing special. -> neutral
This is the worst purchase I have ever made. -> negative
Absolutely fantastic quality and fast delivery. -> positive
