# Natural Language Processing with Amazon Reviews

Developed for the second project of the Artificial Intelligence course

## Setup and Environment

After installing Python and the necessary packages, run the following code to import them:

In [1]:
%matplotlib inline

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *

## Implementation work

### Generating a data set
#### Obtaining features from the Bag-of-Words model

In [2]:
train = pd.read_csv('./data/train_preprocessed.csv')
test = pd.read_csv('./data/test_preprocessed.csv')

vectorizer = CountVectorizer()

trainArr = map(str, train['review_title'])

titleWords = vectorizer.fit_transform(trainArr).toarray()

print(titleWords.shape)
print(vectorizer.get_feature_names())

(10000, 6248)
['00', '000', '007', '04', '07', '0mbp', '10', '100', '1000', '101', '11', '115', '12', '1275', '128', '130', '14', '149', '14k', '14lb', '14th', '15', '150', '16', '17', '170', '1721', '18', '1890', '18th', '18v', '19', '1914', '1920', '1933', '1949', '1955', '1960', '1967', '1969', '1970', '1976', '1977', '1980', '1986', '1990', '1993', '1994', '1997', '1g', '1st', '20', '2000', '2001', '2003', '2004', '2005', '2006', '2007', '2008', '2008c', '2009', '2011', '2012', '2040', '20th', '213', '21st', '22', '220', '236', '24', '25', '250', '2730', '281', '2g', '2nd', '30', '300', '3000', '30oz', '321373', '33', '330', '3340', '34', '344263', '35', '360', '369', '370', '377dat', '3and', '3dequel', '3rd', '40', '400', '4105', '44', '45', '450', '45w', '4890', '4th', '50', '510', '550', '56', '5762', '58', '5mm', '5mp', '60', '6000', '64', '64bit', '65', '650', '66', '6th', '6v', '70', '72', '720', '74', '75', '76csx', '77', '773', '7th', '80', '800', '8004', '822', '8500', '90



### Obtaining the Test classes

In [3]:
y = train['class_index']

# print(X.shape, y.shape)

### Training Classifiers

In [4]:
clf = MultinomialNB()
clf.fit(titleWords, y)

testArr = map(str, test['review_title'])
testVector = vectorizer.transform(testArr).toarray()

y_pred = clf.predict(testVector)
print(y_pred)

[5 3 3 ... 2 3 5]


### Analyse Results

In [5]:
# confusion matrix
confusionMatrix = confusion_matrix(test['class_index'], y_pred)
print(confusionMatrix)

correctReviews = 0
for i in range(len(confusionMatrix)):
    correctReviews += confusionMatrix[i][i]

incorrectReviews = sum(map(sum, confusionMatrix)) - correctReviews
# print("correctReviews:", correctReviews)
# print("incorrectReviews:", incorrectReviews)

accuracy = (correctReviews / (incorrectReviews + correctReviews)) * 100

# accuracy, precision, recall, f1
# CHECK IF CONFUSION MATRIX IS ORDER FROM 1-5 RATING
print("Accuracy:", accuracy)
for i in range(5):
    colSum = 0
    for j in range(5):
        colSum += confusionMatrix[j][i]
    
    precision = (confusionMatrix[i][i] / colSum)
    recall = (confusionMatrix[i][i] / sum(confusionMatrix[i]))
    fMeasure = (2*precision*recall) / (precision + recall) 
    print("-------\n", i+1, "Rating Results:")
    print("Precision:", precision * 100, "%")
    print("Recall:", recall * 100, "%")
    print("F-measure:", fMeasure * 100, "%")

[[202  91  52  30  25]
 [106 116  89  59  30]
 [ 60  89 118  80  53]
 [ 52  52  70 118 108]
 [ 45  28  32  86 209]]
Accuracy: 38.15
-------
 1 Rating Results:
Precision: 43.44086021505376 %
Recall: 50.5 %
F-measure: 46.705202312138724 %
-------
 2 Rating Results:
Precision: 30.851063829787233 %
Recall: 28.999999999999996 %
F-measure: 29.896907216494846 %
-------
 3 Rating Results:
Precision: 32.686980609418285 %
Recall: 29.5 %
F-measure: 31.011826544021027 %
-------
 4 Rating Results:
Precision: 31.63538873994638 %
Recall: 29.5 %
F-measure: 30.53040103492885 %
-------
 5 Rating Results:
Precision: 49.1764705882353 %
Recall: 52.25 %
F-measure: 50.66666666666667 %
