In [1]:
# Import pandas for loading and handling the dataset
import pandas as pd  
# Import numpy for numerical operations
import numpy as np 

In [3]:
# Load the Amazon reviews dataset
df = pd.read_csv("Reviews.csv")

# Display first 5 rows to understand data structure
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,sentiment_label
0,28389,B007OXJMD2,A1YNN51RNUCCUA,Jennilee R. Benda,0,0,2,1342915200,"It's okay, but not really worth buying.",I had some of this in a sampler pack. It real...,negative
1,36700,B004CLCEDE,A2YWQAFCHIO779,VR33,2,2,4,1312761600,"Great product, but kind of pricey",This is great because it doesn't have any calo...,positive
2,38719,B002DHUCEM,A1FDVT0DLJWV78,D. A. J.,6,7,4,1255651200,Great for a chocolate boost,I really like this product. I have been trying...,positive
3,16808,B001LGGH40,A15IH0ZQ87H51P,"S. Shrader ""Bio Gal""",1,1,1,1237420800,Did not like the taste of this,My guess is the other flavors of Switch taste ...,negative
4,33586,B006N3I84I,A3HPHQ9F2CCF06,"J. Taylor ""Nana""",0,0,4,1310688000,Good Coffee,I actually came to this coffee because Amazon ...,positive


In [5]:
# Select only the required columns for sentiment analysis
df = df[['Text', 'Score']]

# Check for missing values
df.isnull().sum()

Text     0
Score    0
dtype: int64

In [7]:
# Convert star ratings into sentiment labels
# Rating >= 4 → Positive (1), Rating <= 2 → Negative (0)
df = df[df['Score'] != 3]  # Remove neutral reviews

df['Sentiment'] = df['Score'].apply(lambda x: 1 if x >= 4 else 0)

In [9]:
# Import re for regular expressions (used in text cleaning)
import re  

# Import Natural Language Toolkit for stopwords and lemmatization
import nltk  

# Download stopwords (run once)
nltk.download('stopwords')
nltk.download('wordnet')

# Import stopwords list
from nltk.corpus import stopwords  

# Import lemmatizer to convert words into base form
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bbek1\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bbek1\AppData\Roaming\nltk_data...


In [11]:
# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean review text
def clean_text(text):
    # Convert text to lowercase to maintain uniformity
    text = text.lower()
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Tokenize text
    words = text.split()
    
    # Remove stopwords and apply lemmatization
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    # Join words back into sentence
    return " ".join(words)

In [13]:
# Clean all review texts
df['Clean_Text'] = df['Text'].apply(clean_text)

In [35]:
df['Clean_Text']

0        sampler pack really wasnt good couldnt taste h...
1        great doesnt calorie however kinda pricey nood...
2        really like product trying loose last ten poun...
3        guess flavor switch taste better thought overk...
4        actually came coffee amazon took coffee subscr...
                               ...                        
17176    like dogswell product purchased vitality chick...
17177    dissolved entire tab tongue disappointed disco...
17178    love highoctane peanut butter fatcalorie conte...
17179    discovered salsa vacation last year im excited...
17180    fan jasmine tea quite several year agoi first ...
Name: Clean_Text, Length: 17181, dtype: object

In [15]:
# Import train-test split to divide dataset
from sklearn.model_selection import train_test_split  

# Import TF-IDF vectorizer to convert text into numerical features
from sklearn.feature_extraction.text import TfidfVectorizer  

# Import Logistic Regression model
from sklearn.linear_model import LogisticRegression  

# Import Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB  

# Import metrics to evaluate model performance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [17]:
# Define input features and target label
X = df['Clean_Text']   # Input text
y = df['Sentiment']    # Output label

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [19]:
# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Convert text into TF-IDF numerical features
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [21]:
# Initialize Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)

# Train the model
lr_model.fit(X_train_tfidf, y_train)

# Make predictions
lr_predictions = lr_model.predict(X_test_tfidf)

In [23]:
# Print accuracy score
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_predictions))

# Print classification report
print(classification_report(y_test, lr_predictions))

# Print confusion matrix
print(confusion_matrix(y_test, lr_predictions))

Logistic Regression Accuracy: 0.8737270875763747
              precision    recall  f1-score   support

           0       0.86      0.83      0.85      1437
           1       0.88      0.90      0.89      2000

    accuracy                           0.87      3437
   macro avg       0.87      0.87      0.87      3437
weighted avg       0.87      0.87      0.87      3437

[[1194  243]
 [ 191 1809]]


In [25]:
# Initialize Multinomial Naive Bayes
nb_model = MultinomialNB()

# Train the model
nb_model.fit(X_train_tfidf, y_train)

# Make predictions
nb_predictions = nb_model.predict(X_test_tfidf)

In [27]:
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_predictions))
print(classification_report(y_test, nb_predictions))
print(confusion_matrix(y_test, nb_predictions))

Naive Bayes Accuracy: 0.8510328775094559
              precision    recall  f1-score   support

           0       0.89      0.73      0.80      1437
           1       0.83      0.94      0.88      2000

    accuracy                           0.85      3437
   macro avg       0.86      0.83      0.84      3437
weighted avg       0.86      0.85      0.85      3437

[[1048  389]
 [ 123 1877]]


In [29]:
# Function to predict sentiment for new input using Logistic Regression
def predict_sentiment(review):
    # Clean the review text
    review = clean_text(review)
    
    # Convert to TF-IDF
    review_tfidf = tfidf.transform([review])
    
    # Predict sentiment
    prediction = lr_model.predict(review_tfidf)[0]
    
    return "Positive" if prediction == 1 else "Negative"

In [31]:
predict_sentiment("This product quality is amazing and delivery was godd fast")

'Positive'

In [33]:
predict_sentiment("It is not good product, bad")

'Negative'

In [37]:
# Function to predict sentiment using Naive Bayes
def predict_sentiment_nb(review):
    # Clean the review text
    review = clean_text(review)
    
    # Convert to TF-IDF
    review_tfidf = tfidf.transform([review])
    
    # Predict sentiment using Naive Bayes
    prediction = nb_model.predict(review_tfidf)[0]
    
    return "Positive" if prediction == 1 else "Negative"

In [39]:
new_review = "This product is amazing and works perfectly!"
print("Logistic Regression Prediction:", predict_sentiment(new_review))
print("Naive Bayes Prediction:", predict_sentiment_nb(new_review))

Logistic Regression Prediction: Positive
Naive Bayes Prediction: Positive
