In [12]:
import pandas as pd

df = pd.read_csv("amazon_reviews.csv")
print(df.head())

       reviewerID        asin  reviewerName helpful  \
0  A3SBTW3WS4IQSN  B007WTAJTO           NaN  [0, 0]   
1  A18K1ODH1I2MVB  B007WTAJTO          0mie  [0, 0]   
2  A2FII3I2MBMUIA  B007WTAJTO           1K3  [0, 0]   
3   A3H99DFEG68SR  B007WTAJTO           1m2  [0, 0]   
4  A375ZM4U047O79  B007WTAJTO  2&amp;1/2Men  [0, 0]   

                                          reviewText  overall  \
0                                         No issues.      4.0   
1  Purchased this for my device, it worked as adv...      5.0   
2  it works as expected. I should have sprung for...      4.0   
3  This think has worked out great.Had a diff. br...      5.0   
4  Bought it with Retail Packaging, arrived legit...      5.0   

                                  summary  unixReviewTime  reviewTime  \
0                              Four Stars      1406073600  2014-07-23   
1                           MOAR SPACE!!!      1382659200  2013-10-25   
2               nothing to really say....      1356220800  

Positive → ratings 4 or 5

Negative → ratings 1 or 2

Neutral → rating 3 (optional — skip this for binary classification)

In [13]:
def get_sentiment(rating):
    if rating >= 4:
        return "Positive"
    elif rating <= 2:
        return "Negative"
    else:
        return None  # We’ll drop these

df['sentiment'] = df['overall'].apply(get_sentiment)
df = df.dropna(subset=['sentiment'])  # Drop neutral rows

In [14]:
#Preprocessing the text
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    text = str(text).lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    filtered = [stemmer.stem(w) for w in tokens if w not in stop_words]
    return " ".join(filtered)

df['cleaned_review'] = df['reviewText'].apply(preprocess)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ics\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ics\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
#TF-IDF and Model Training
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(df['cleaned_review'])
y = df['sentiment'].map({'Positive': 1, 'Negative': 0})  # Binary

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9465968586387434
              precision    recall  f1-score   support

           0       0.56      0.72      0.63        61
           1       0.98      0.96      0.97       894

    accuracy                           0.95       955
   macro avg       0.77      0.84      0.80       955
weighted avg       0.95      0.95      0.95       955



In [16]:
def predict_sentiment(text):
    text = preprocess(text)
    vectorized = vectorizer.transform([text])
    result = model.predict(vectorized)
    return "Positive" if result[0] == 1 else "Negative"

# Try it!
print(predict_sentiment("I love this product, it works perfectly!"))
print(predict_sentiment("Terrible, waste of money."))


Positive
Positive


In [17]:
print(df['sentiment'].value_counts())


sentiment
Positive    4449
Negative     324
Name: count, dtype: int64
