In [21]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

from transformers import BertTokenizer, BertModel
import torch
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report



In [2]:
df=pd.read_csv("data.csv")
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8518 entries, 0 to 8517
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Reviewer Name    8508 non-null   object 
 1   Review Title     8508 non-null   object 
 2   Place of Review  8468 non-null   object 
 3   Up Votes         8508 non-null   float64
 4   Down Votes       8508 non-null   float64
 5   Month            8053 non-null   object 
 6   Review text      8510 non-null   object 
 7   Ratings          8518 non-null   int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 532.5+ KB


In [4]:
df.describe()

Unnamed: 0,Up Votes,Down Votes,Ratings
count,8508.0,8508.0,8518.0
mean,0.391396,0.121768,4.181028
std,11.613909,3.248022,1.2622
min,0.0,0.0,1.0
25%,0.0,0.0,4.0
50%,0.0,0.0,5.0
75%,0.0,0.0,5.0
max,889.0,219.0,5.0


In [5]:
df.shape

(8518, 8)

In [6]:
df=df[['Review text','Ratings']]
df.head()

Unnamed: 0,Review text,Ratings
0,"Nice product, good quality, but price is now r...",4
1,They didn't supplied Yonex Mavis 350. Outside ...,1
2,Worst product. Damaged shuttlecocks packed in ...,1
3,"Quite O. K. , but nowadays the quality of the...",3
4,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [7]:
df.shape

(8518, 2)

In [8]:
df.isnull().sum()

Review text    8
Ratings        0
dtype: int64

In [9]:
df.dropna( inplace=True)

In [10]:
df.shape

(8510, 2)

In [11]:
df.isnull().sum()

Review text    0
Ratings        0
dtype: int64

In [12]:
df.shape

(8510, 2)

In [13]:
print(df['Ratings'].unique())

[4 1 3 5 2]


In [14]:
#Creating Sentiment Labels
""" 1-->positive
    0-->negative"""
def label_sentiment(rating):
    return 1 if rating >= 4 else 0

df['sentiment'] = df['Ratings'].apply(label_sentiment)


In [15]:
df.head()

Unnamed: 0,Review text,Ratings,sentiment
0,"Nice product, good quality, but price is now r...",4,1
1,They didn't supplied Yonex Mavis 350. Outside ...,1,0
2,Worst product. Damaged shuttlecocks packed in ...,1,0
3,"Quite O. K. , but nowadays the quality of the...",3,0
4,Over pricedJust â?¹620 ..from retailer.I didn'...,1,0


In [16]:
print(df['sentiment'].value_counts())

sentiment
1    6823
0    1687
Name: count, dtype: int64


In [17]:
# Filter negative reviews
negative_reviews = df[df['sentiment'] == 0]

# Most common words in negative reviews
from collections import Counter

all_words = " ".join(negative_reviews['Review text'].dropna()).lower().split()
common_words = Counter(all_words).most_common(20)

print("Common words in negative reviews:")
print(common_words)


Common words in negative reviews:
[('more', 1712), ('not', 526), ('the', 467), ('is', 390), ('very', 324), ('quality', 268), ('of', 254), ('shuttle', 229), ('good', 214), ('product', 212), ('goodread', 186), ('i', 185), ('bad', 183), ('this', 175), ('was', 173), ('and', 173), ('to', 168), ('for', 168), ('it', 168), ('in', 165)]


In [18]:
# Filter positive reviews
positive_reviews = df[df['sentiment'] == 1]

# Most common words in positive reviews
from collections import Counter

all_words = " ".join(positive_reviews['Review text'].dropna()).lower().split()
common_words = Counter(all_words).most_common(20)

print("Common words in positive reviews:")
print(common_words)


Common words in positive reviews:
[('more', 6868), ('good', 1738), ('goodread', 1122), ('productread', 808), ('very', 790), ('nice', 717), ('for', 703), ('is', 668), ('and', 665), ('product', 580), ('the', 579), ('niceread', 437), ('best', 389), ('quality', 348), ('i', 337), ('shuttle', 332), ('qualityread', 328), ('to', 321), ('it', 307), ('in', 299)]


In [19]:
#Data Cleaning
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # remove special characters
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]
    return " ".join(text)

# Apply cleaning
df['Cleaned_Review'] = df['Review text'].apply(clean_text)

print(df[['Review text', 'Cleaned_Review']].head())


                                         Review text  \
0  Nice product, good quality, but price is now r...   
1  They didn't supplied Yonex Mavis 350. Outside ...   
2  Worst product. Damaged shuttlecocks packed in ...   
3  Quite O. K. , but nowadays  the quality of the...   
4  Over pricedJust â?¹620 ..from retailer.I didn'...   

                                      Cleaned_Review  
0  nice product good quality price rising bad sig...  
1  supplied yonex mavis outside cover yonex ad in...  
2  worst product damaged shuttlecock packed new b...  
3  quite k nowadays quality cork like year back u...  
4  pricedjust retailer understand wat advantage b...  


In [22]:
# Apply cleaning
df['Cleaned_Review'] = df['Review text'].apply(clean_text)
y = df['sentiment']
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['Cleaned_Review'],
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [23]:
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)


In [24]:
model = LogisticRegression(max_iter=2000)

model.fit(X_train_tfidf, y_train)


In [25]:
y_pred = model.predict(X_test_tfidf)

train_acc = accuracy_score(y_train, model.predict(X_train_tfidf))
test_acc = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)

print("Train Accuracy:", train_acc)
print("Test Accuracy:", test_acc)
print("Test F1 Score:", test_f1)

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Train Accuracy: 0.9005581668625147
Test Accuracy: 0.8789659224441834
Test F1 Score: 0.9286209286209286

Classification Report:

              precision    recall  f1-score   support

           0       0.86      0.46      0.60       337
           1       0.88      0.98      0.93      1365

    accuracy                           0.88      1702
   macro avg       0.87      0.72      0.77      1702
weighted avg       0.88      0.88      0.86      1702



In [28]:
import joblib
joblib.dump(tfidf, "vectorizer.pkl")
joblib.dump(model, "best_model.pkl")

print("Model and Vectorizer Saved Successfully ✅")


Model and Vectorizer Saved Successfully ✅
