In [11]:
import numpy as numpy
import pandas as pd

In [12]:
df = pd.read_csv("data/Reviews.csv")
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [13]:
# Create sentiment label
def label_sentiment(score):
    if score in [1, 2]:
        return 'negative'
    elif score == 3:
        return 'neutral'
    else:
        return 'positive'

df['Sentiment'] = df['Score'].apply(label_sentiment)

In [14]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize and remove stopwords + lemmatize
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['clean_text'] = df['Text'].apply(clean_text)

[nltk_data] Downloading package stopwords to C:\Users\Durdana
[nltk_data]     Khalid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Durdana
[nltk_data]     Khalid\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['Sentiment'])

In [16]:
label_encoder.classes_  # ['negative', 'positive'] → [0, 1]

array(['negative', 'neutral', 'positive'], dtype=object)

In [17]:
df.to_csv("data/Reviews_cleaned.csv", index=False)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['clean_text'])
y = df['label_encoded']


KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[ 9741  6638]
 [ 2649 86135]]
              precision    recall  f1-score   support

    negative       0.79      0.59      0.68     16379
    positive       0.93      0.97      0.95     88784

    accuracy                           0.91    105163
   macro avg       0.86      0.78      0.81    105163
weighted avg       0.91      0.91      0.91    105163



In [None]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

# Example:
xgb = XGBClassifier()
xgb.fit(X_train_res, y_train_res)
y_pred_xgb = xgb.predict(X_test)

print(classification_report(y_test, y_pred_xgb, target_names=label_encoder.classes_))


              precision    recall  f1-score   support

    negative       0.62      0.78      0.69     16379
    positive       0.96      0.91      0.93     88784

    accuracy                           0.89    105163
   macro avg       0.79      0.85      0.81    105163
weighted avg       0.90      0.89      0.90    105163



In [None]:
model = LogisticRegression()
model.fit(X_train_res, y_train_res)

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

[[13898  2481]
 [11588 77196]]
              precision    recall  f1-score   support

    negative       0.55      0.85      0.66     16379
    positive       0.97      0.87      0.92     88784

    accuracy                           0.87    105163
   macro avg       0.76      0.86      0.79    105163
weighted avg       0.90      0.87      0.88    105163



In [None]:
import joblib
joblib.dump(xgb, "model/test_model.joblib")

['model/test_model.joblib']

In [None]:
# Save the fitted vectorizer
joblib.dump(vectorizer, "model/tfidf_vectorizer.joblib")

print("✅ TfidfVectorizer saved successfully.")


✅ TfidfVectorizer saved successfully.


In [10]:
df['Sentiment'].value_counts()

positive    443777
negative     82037
Name: Sentiment, dtype: int64