In [1]:
import numpy as np
import pandas as pd

In [2]:
dataset = pd.read_csv("amazon_review.csv")

In [3]:
dataset.head()

Unnamed: 0,rating,title,review
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50001 entries, 0 to 50000
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   rating  50001 non-null  int64 
 1   title   49996 non-null  object
 2   review  50001 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [5]:
# Check the shape and columns
print(f'Rows: {dataset.shape[0]}\nColumns: {dataset.shape[1]}')
print(f'Columns Names: {list(dataset.columns)}')

Rows: 50001
Columns: 3
Columns Names: ['rating', 'title', 'review']


In [6]:
#cleaning the dataset with missing value
dataset_cleaned = dataset.dropna(subset=['title'])
print(f'Rows: {dataset_cleaned.shape[0]}\nColumns: {dataset_cleaned.shape[1]}')
print(f'Columns Names: {list(dataset_cleaned.columns)}')

Rows: 49996
Columns: 3
Columns Names: ['rating', 'title', 'review']


In [7]:
from spacy.lang.en import English
nlp = English()

In [8]:
import string
from spacy.lang.en.stop_words import STOP_WORDS

stopwords = list(STOP_WORDS)
punctuations = string.punctuation

In [9]:
#Function to tokenize

def tokenizer(sentence):
    mytokens = nlp(sentence)
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens]
    mytokens = [word for word in mytokens if word not in stopwords and word not in punctuations]
    return mytokens

In [10]:
# Basic function to clean the text
def clean_text(text):
    return text.strip().lower()

In [11]:
# Convert rating into sentiment (positive/negative) - 1 for negative and 2 for positive
def convert_sentiment(rating):
    return 'positive' if rating == 2 else 'negative'

In [12]:
dataset_cleaned.loc[:, 'sentiment'] = dataset_cleaned['rating'].apply(convert_sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_cleaned.loc[:, 'sentiment'] = dataset_cleaned['rating'].apply(convert_sentiment)


In [13]:
dataset_cleaned.head()

Unnamed: 0,rating,title,review,sentiment
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...,positive
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...,positive
2,2,Amazing!,This soundtrack is my favorite music of all ti...,positive
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...,positive
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine...",positive


In [14]:
#Preparing the features and the labels
X = dataset_cleaned['review']
y = dataset_cleaned['sentiment']

In [15]:
#Splitting the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=66)
print(f"length of training data - {len(X_train)}")
print(f"length of testing data - {len(X_test)}")

length of training data - 39996
length of testing data - 10000


In [16]:
# Vectorizer and classifier pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [17]:
vectorizer = CountVectorizer(tokenizer=None, ngram_range=(1, 1))  # Simple bag-of-words
tfvectorizer = TfidfVectorizer(tokenizer=tokenizer)

In [18]:
#Trying the logistic regression model
from sklearn.linear_model import LogisticRegression

In [19]:
classifier = LogisticRegression(max_iter=1000)

In [20]:
#defining the lr model pipeline
# Define pipeline with vectorizer and classifier
from sklearn.pipeline import Pipeline
lr_model = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', classifier)
])

In [21]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Train the model
lr_model.fit(X_train, y_train)

# Predict on test data
lr_pred = lr_model.predict(X_test)

# Print evaluation metrics
print(f'Confusion Matrix:\n{confusion_matrix(y_test, lr_pred)}')
print(f'\nClassification Report:\n{classification_report(y_test, lr_pred)}')
print(f'Accuracy: {accuracy_score(y_test, lr_pred) * 100}%')

Confusion Matrix:
[[4147  727]
 [ 671 4455]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.85      0.86      4874
    positive       0.86      0.87      0.86      5126

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Accuracy: 86.02%


In [22]:
# Ensuring that save directory exists before saving the model
import os
import pickle
save_path = "./saved_model"
os.makedirs(save_path, exist_ok=True)

# Saving the trained Logistic Regression model
pickle.dump(lr_model, open(os.path.join(save_path, "LogisticRegression.sav"), "wb"))

In [23]:
#Trying the Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

In [24]:
# Random Forest Classifier
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    n_jobs=-1,
    random_state=66
)

rf_model = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', rf_model)
])

rf_model.fit(X_train, y_train)

# Predict on test data
rf_pred = rf_model.predict(X_test)

# Print evaluation metrics for Random Forest
print(f'Confusion Matrix:\n{confusion_matrix(y_test, rf_pred)}')
print(f'\nClassification Report:\n{classification_report(y_test, rf_pred)}')
print(f'Accuracy: {accuracy_score(y_test, rf_pred) * 100}%')

Confusion Matrix:
[[3468 1406]
 [ 686 4440]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.83      0.71      0.77      4874
    positive       0.76      0.87      0.81      5126

    accuracy                           0.79     10000
   macro avg       0.80      0.79      0.79     10000
weighted avg       0.80      0.79      0.79     10000

Accuracy: 79.08%


In [26]:
# Saving the trained Random Forest model
pickle.dump(rf_model, open(os.path.join(save_path, "RandomForest.sav"), "wb"))

In [27]:
#Example Predictions


prediction = lr_model.predict(["This product is amazing! Highly recommend it to everyone."])
print(f'Prediction: {prediction[0]}')

# Another example prediction (Random Forest)
prediction = rf_model.predict(["The product quality is poor. It broke after one use."])
print(f'Prediction: {prediction[0]}')

Prediction: positive
Prediction: negative


In [28]:
#Example prediction with a subtle positive review

prediction = lr_model.predict(["I recently tried this product and while it took me a bit of time to fully get accustomed to it, I can definitely see how it would work for others. The quality is solid, and it has a unique design that stands out compared to others I’ve used. There are a few things that could be improved for a more seamless experience, but overall, it has its strong points. It’s a good option for anyone who appreciates functionality with a touch of style."])
print(f'Prediction: {prediction[0]}')

# Another example prediction (Random Forest)
prediction = rf_model.predict(["I recently tried this product and while it took me a bit of time to fully get accustomed to it, I can definitely see how it would work for others. The quality is solid, and it has a unique design that stands out compared to others I’ve used. There are a few things that could be improved for a more seamless experience, but overall, it has its strong points. It’s a good option for anyone who appreciates functionality with a touch of style."])
print(f'Prediction: {prediction[0]}')

Prediction: positive
Prediction: positive


In [29]:
#Example prediction with a subtle negative review

prediction = lr_model.predict([" It's not terrible, but there are definitely areas where improvement could make a significant difference. It might work better for others, but for me, it didn’t quite hit the mark."])
print(f'Prediction: {prediction[0]}')

# Another example prediction (Random Forest)
prediction = rf_model.predict([" It's not terrible, but there are definitely areas where improvement could make a significant difference. It might work better for others, but for me, it didn’t quite hit the mark."])
print(f'Prediction: {prediction[0]}')

Prediction: negative
Prediction: negative
