# Bayesian inference on amazon reviews

### Part 1. Count vectorizer

In [1]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
import numpy as np
import tensorflow as tf
import nltk
from collections import Counter
import string
import nltk
import itertools
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import tensorflow_text as text
import tensorflow_hub as hub

plt.style.use('ggplot')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read in data

df = pd.read_csv('Reviews.csv')
df = df.drop(['ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary'], axis = 1)
df = df.head(50000)
df['Label'] = np.where(df['Score'] >= 3, 1, 0)
df.head()

Unnamed: 0,Id,Score,Text,Label
0,1,5,I have bought several of the Vitality canned d...,1
1,2,1,Product arrived labeled as Jumbo Salted Peanut...,0
2,3,4,This is a confection that has been around a fe...,1
3,4,2,If you are looking for the secret ingredient i...,0
4,5,5,Great taffy at a great price. There was a wid...,1


In [3]:
# Data preprocessing

reviews = list(df['Text'])
labels = list(df['Label'])

reviews = [review.lower() for review in reviews]

reviews_nopunct = []
for review in reviews:
  reviews_nopunct.append("".join([char for char in review if char not in string.punctuation]))
  
reviews = reviews_nopunct

# Tokenization, Lemmatization, Stemming, Stopwords. Label numerical encoding

reviews_tokenized = []
for review in reviews:
  splitted_review = nltk.word_tokenize(review)
  splitted_review = [word for word in splitted_review if word not in stop_words]
  splitted_review = [WordNetLemmatizer().lemmatize(w) for w in splitted_review]
  splitted_review = [PorterStemmer().stem(w).strip() for w in splitted_review]
  reviews_tokenized.append(splitted_review)
  
reviews_unrolled = list(itertools.chain(*reviews_tokenized))

reviews = [' '.join(review) for review in reviews_tokenized]

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Count vectorizer feature names examples

count_vector = CountVectorizer(binary = True, stop_words = 'english')
count_vector.fit(reviews)
count_vector.get_feature_names_out()

array(['00', '00006mgbr', '002', ..., 'çaykur', 'çelem', 'ît'],
      dtype=object)

In [5]:
# Create reviews array

reviews_array = count_vector.transform(reviews).toarray()

In [6]:
# Create a frequency matrix

frequency_matrix = pd.DataFrame(reviews_array, columns=count_vector.get_feature_names_out())
frequency_matrix

Unnamed: 0,00,00006mgbr,002,0035ounc,005,0080br,009,009br,01,010,...,zukesbr,zukess,zuma,zupa,zuppa,zwieback,zyto,çaykur,çelem,ît
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
from sklearn.model_selection import train_test_split

# Train test split
X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size = 0.2, random_state = 1)

In [8]:
# Count vectorizer feature transformation

count_vector = CountVectorizer(stop_words = 'english', binary = True)

training_data = count_vector.fit_transform(X_train)
test_data = count_vector.transform(X_test)

In [9]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB

# Multinomial Naive Bayes model predictions

naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

predictions = naive_bayes.predict(test_data)

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Model evaluation

print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.8882
Precision score:  0.8978815860945139
Recall score:  0.9789174464053062
F1 score:  0.9366500453309157


### Part 2. TF-IDF Vectorizer

In [11]:
# TF-IDF vevtorizer feature examples

tfidf_vectorizer = TfidfVectorizer(stop_words = 'english')
tfidf_vectorizer.fit(reviews)
tfidf_vectorizer.get_feature_names_out()

array(['00', '00006mgbr', '002', ..., 'çaykur', 'çelem', 'ît'],
      dtype=object)

In [12]:
reviews_array = tfidf_vectorizer.transform(reviews).toarray()
frequency_matrix = pd.DataFrame(reviews_array, columns=count_vector.get_feature_names_out())
frequency_matrix

ValueError: Shape of passed values is (50000, 49493), indices imply (50000, 43485)

In [13]:
# Train test split

X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size = 0.2, random_state = 1)

In [14]:
# TF-IDF vectorizer

count_vector = TfidfVectorizer(stop_words = 'english')

training_data = count_vector.fit_transform(X_train)
test_data = count_vector.transform(X_test)

In [15]:
# Naive Bayes model and predictions

naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

predictions = naive_bayes.predict(test_data)

In [16]:
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.8453
Precision score:  0.8451451451451452
Recall score:  1.0
F1 score:  0.9160744317257092
