In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import warnings
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [2]:
warnings.filterwarnings('ignore')

In [3]:
df=pd.read_csv("reviews.csv")

In [4]:
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [5]:
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568428 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [7]:
null_percent = df.isnull().mean() * 100
print(null_percent)

Id                        0.000000
ProductId                 0.000000
UserId                    0.000000
ProfileName               0.004574
HelpfulnessNumerator      0.000000
HelpfulnessDenominator    0.000000
Score                     0.000000
Time                      0.000000
Summary                   0.004750
Text                      0.000000
dtype: float64


In [8]:
df = df[['Score', 'Text']]

In [9]:
df['Sentiment'] = df['Score'].apply(lambda x: 2 if x > 3 else (0 if x < 3 else 1))

In [10]:
stop_words = set(stopwords.words('english'))
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)  
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text
df['Clean_Text'] = df['Text'].apply(clean_text)

In [11]:
vectorizer = TfidfVectorizer(max_features=5000,
    ngram_range=(1, 2),        # Include unigrams + bigrams
    min_df=2,                  # Ignore terms that appear in fewer than 2 docs
    max_df=0.9,                # Ignore terms in more than 90% of docs (too common)
    stop_words='english',      # Remove common stopwords
    sublinear_tf=True)
X = vectorizer.fit_transform(df['Clean_Text'])
y = df['Sentiment']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
best_model=LogisticRegression(C=1.0, max_iter=1000, solver='liblinear')
models = {
    "Logistic Regression": best_model,
    "Naive Bayes": MultinomialNB(),
    "Linear SVC": LinearSVC(C=0.5)
}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    train_acc = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    y_test_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    print(f"\n📌 {name}")
    print(f"Training Accuracy: {train_acc:.4f}, F1 Score: {train_f1:.4f}")
    print(f"Testing  Accuracy: {test_acc:.4f}, F1 Score: {test_f1:.4f}")


📌 Logistic Regression
Training Accuracy: 0.8706, F1 Score: 0.8501
Testing  Accuracy: 0.8661, F1 Score: 0.8447

📌 Naive Bayes
Training Accuracy: 0.8254, F1 Score: 0.7762
Testing  Accuracy: 0.8266, F1 Score: 0.7769

📌 Linear SVC
Training Accuracy: 0.8705, F1 Score: 0.8475
Testing  Accuracy: 0.8664, F1 Score: 0.8424


In [32]:
sample = ["i absolutely hate that product"]
cleaned_sample = [clean_text(sample[0])]
sample_vec = vectorizer.transform(cleaned_sample)
pred = best_model.predict(sample_vec)
sentiment_map = {2:"Positive", 0:"Negative", 1:"Neutral"}
print(sentiment_map.get(pred[0], "Unknown Sentiment"))

Negative
