# Natural Language Processing with Amazon Reviews

Developed for the second project of the Artificial Intelligence course

## Setup and Environment

After installing Python and the necessary packages, run the following code to import them:

In [63]:
%matplotlib inline

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *

## Implementation work

### Generating a data set
#### Obtaining features from the Bag-of-Words model

In [64]:
train = pd.read_csv('./data/train_preprocessed.csv')
test = pd.read_csv('./data/test_preprocessed.csv')

Please choose one of the following vectorizers:
#### Creating regular CountVectorizers (default)

In [80]:
vectorizer = CountVectorizer()

#### Creating 1-hot vectors

In [None]:
vectorizer = CountVectorizer(binary=True)

#### Creating TF-IDF vectors

In [20]:
vectorizer = TfidfVectorizer()

(10000, 6248)
['00', '000', '007', '04', '07', '0mbp', '10', '100', '1000', '101', '11', '115', '12', '1275', '128', '130', '14', '149', '14k', '14lb', '14th', '15', '150', '16', '17', '170', '1721', '18', '1890', '18th', '18v', '19', '1914', '1920', '1933', '1949', '1955', '1960', '1967', '1969', '1970', '1976', '1977', '1980', '1986', '1990', '1993', '1994', '1997', '1g', '1st', '20', '2000', '2001', '2003', '2004', '2005', '2006', '2007', '2008', '2008c', '2009', '2011', '2012', '2040', '20th', '213', '21st', '22', '220', '236', '24', '25', '250', '2730', '281', '2g', '2nd', '30', '300', '3000', '30oz', '321373', '33', '330', '3340', '34', '344263', '35', '360', '369', '370', '377dat', '3and', '3dequel', '3rd', '40', '400', '4105', '44', '45', '450', '45w', '4890', '4th', '50', '510', '550', '56', '5762', '58', '5mm', '5mp', '60', '6000', '64', '64bit', '65', '650', '66', '6th', '6v', '70', '72', '720', '74', '75', '76csx', '77', '773', '7th', '80', '800', '8004', '822', '8500', '90



#### Creating bi-gram CountVectorizors

In [75]:
vectorizer = CountVectorizer(ngram_range=(1, 2))

#### Creating bi-gram TF-IDF vectors

In [70]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

#### Creating tri-gram TF-IDF vectors

In [65]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3))

#### Transforming the dataset

In [81]:
trainArr = map(str, train['review_title'])

titleWords = vectorizer.fit_transform(trainArr).toarray()

print(titleWords.shape)
print(vectorizer.get_feature_names())

(10000, 6524)
['00', '03', '05', '09', '10', '100', '1000', '101', '1080p', '110', '112', '12', '120', '13', '130', '13355', '1370dev', '13th', '14', '140', '14k', '15', '150', '16', '17', '1738', '174', '18', '182', '1877', '18v', '19', '193', '1940', '1950', '1951', '1960', '1966', '1970', '1973', '1975', '1976', '1977', '1978', '1983', '1984', '1985', '1993', '1995', '1997', '1gb', '1mp', '1st', '20', '200', '2000', '2001', '2002', '2003', '2004', '2005', '2008', '2009', '2011', '2013', '21', '22', '222', '23', '24', '24th', '24x', '25', '26th', '2g', '2gb', '2k', '2nd', '30', '300', '3000', '31', '31259', '343', '35', '350', '360', '37', '3d', '3m', '3rd', '3x5', '40', '400', '4000', '4440', '45', '450', '4cyl', '4ghz', '4th', '4v', '50', '50hp', '51', '52', '5300', '54', '55', '550', '5500', '5th', '60', '600', '6439', '65533', '66', '6th', '70', '700', '74mb', '75', '76csx', '77', '80', '8000', '808rc4', '822', '83', '860r', '8925', '8th', '90', '900', '9466', '95', 'a57', 'aaron



### Obtaining the Test classes

In [82]:
y = train['class_index']

# print(X.shape, y.shape)

### Training Classifiers

In [83]:
clf = MultinomialNB()
clf.fit(titleWords, y)

testArr = map(str, test['review_title'])
testVector = vectorizer.transform(testArr).toarray()

y_pred = clf.predict(testVector)
print(y_pred)

[1 3 5 ... 2 2 4]


### Analyse Results

In [84]:
# confusion matrix
confusionMatrix = confusion_matrix(test['class_index'], y_pred)
print(confusionMatrix)

correctReviews = 0
for i in range(len(confusionMatrix)):
    correctReviews += confusionMatrix[i][i]

incorrectReviews = sum(map(sum, confusionMatrix)) - correctReviews
# print("correctReviews:", correctReviews)
# print("incorrectReviews:", incorrectReviews)

accuracy = (correctReviews / (incorrectReviews + correctReviews)) * 100

# accuracy, precision, recall, f1
# CHECK IF CONFUSION MATRIX IS ORDER FROM 1-5 RATING
print("Accuracy:", accuracy)
for i in range(5):
    colSum = 0
    for j in range(5):
        colSum += confusionMatrix[j][i]
    
    precision = (confusionMatrix[i][i] / colSum)
    recall = (confusionMatrix[i][i] / sum(confusionMatrix[i]))
    fMeasure = (2*precision*recall) / (precision + recall) 
    print("-------\n", i+1, "Rating Results:")
    print("Precision:", precision * 100, "%")
    print("Recall:", recall * 100, "%")
    print("F-measure:", fMeasure * 100, "%")

[[200  74  46  29  51]
 [101 131 102  38  28]
 [ 47  76 166  62  49]
 [ 43  45  78 111 123]
 [ 42  27  26  84 221]]
Accuracy: 41.449999999999996
-------
 1 Rating Results:
Precision: 46.18937644341801 %
Recall: 50.0 %
F-measure: 48.019207683073226 %
-------
 2 Rating Results:
Precision: 37.110481586402265 %
Recall: 32.75 %
F-measure: 34.79415670650731 %
-------
 3 Rating Results:
Precision: 39.71291866028708 %
Recall: 41.5 %
F-measure: 40.58679706601467 %
-------
 4 Rating Results:
Precision: 34.25925925925926 %
Recall: 27.750000000000004 %
F-measure: 30.662983425414364 %
-------
 5 Rating Results:
Precision: 46.82203389830508 %
Recall: 55.25 %
F-measure: 50.688073394495405 %
