# Natural Language Processing with Amazon Reviews

Developed for the second project of the Artificial Intelligence course

## Setup and Environment

After installing Python and the necessary packages, run the following code to import them:

In [63]:
%matplotlib inline

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *

## Implementation work

### Generating a data set
#### Obtaining features from the Bag-of-Words model

In [85]:
train = pd.read_csv('./data/train_preprocessed.csv')
test = pd.read_csv('./data/test_preprocessed.csv')

Please choose one of the following vectorizers:
#### Creating regular CountVectorizers (default)

In [86]:
titleVectorizer = CountVectorizer()
textVectorizer = CountVectorizer()

#### Creating 1-hot vectors

In [None]:
titleVectorizer = CountVectorizer(binary=True)
textVectorizer = CountVectorizer(binary=True)

#### Creating TF-IDF vectors

In [None]:
titleVectorizer = TfidfVectorizer()
textVectorizer = TfidfVectorizer()

#### Creating bi-gram CountVectorizors

In [75]:
titleVectorizer = CountVectorizer(ngram_range=(1, 2))
textVectorizer = CountVectorizer(ngram_range=(1, 2))

#### Creating bi-gram TF-IDF vectors

In [70]:
titleVectorizer = TfidfVectorizer(ngram_range=(1, 2))
textVectorizer = TfidfVectorizer(ngram_range=(1, 2))

#### Creating tri-gram TF-IDF vectors

In [65]:
titleVectorizer = TfidfVectorizer(ngram_range=(1, 3))
textVectorizer = TfidfVectorizer(ngram_range=(1, 3))

#### Transforming the dataset

In [87]:
trainTitleArr = list(map(str, train['review_title']))
trainTextArr = list(map(str, train['review_text']))

titleWords = titleVectorizer.fit_transform(trainTitleArr).toarray()
textWords = textVectorizer.fit_transform(trainTextArr).toarray()

allWords = np.array([np.concatenate((titleWords[i], textWords[i])) for i in range(len(titleWords))])

print(titleWords.shape)
# print(vectorizer.get_feature_names())
print(textWords.shape)

print(allWords.shape)

(10000, 6524)
(10000, 24318)
(10000, 30842)


### Training Classifiers

In [88]:
y = train['class_index']

clf = MultinomialNB()

# Train with allWords
clf.fit(allWords, y, sample_weight=1)

MultinomialNB()

### Prediction

In [83]:
testTitleArr = list(map(str, test['review_title']))
testTextArr = list(map(str, test['review_text']))

testTitleVector = titleVectorizer.transform(testTitleArr).toarray()
testTextVector = textVectorizer.transform(testTitleArr).toarray()

testAllWords = np.array([np.concatenate((testTitleVector[i], testTextVector[i])) for i in range(len(testTitleVector))])

y_pred = clf.predict(testAllWords)
print(y_pred)

[1 3 5 ... 2 2 4]


### Analyse Results

In [84]:
# confusion matrix
confusionMatrix = confusion_matrix(test['class_index'], y_pred)
print(confusionMatrix)

correctReviews = 0
for i in range(len(confusionMatrix)):
    correctReviews += confusionMatrix[i][i]

incorrectReviews = sum(map(sum, confusionMatrix)) - correctReviews
# print("correctReviews:", correctReviews)
# print("incorrectReviews:", incorrectReviews)

accuracy = (correctReviews / (incorrectReviews + correctReviews)) * 100

# accuracy, precision, recall, f1
# CHECK IF CONFUSION MATRIX IS ORDER FROM 1-5 RATING
print("Accuracy:", accuracy)
for i in range(5):
    colSum = 0
    for j in range(5):
        colSum += confusionMatrix[j][i]
    
    precision = (confusionMatrix[i][i] / colSum)
    recall = (confusionMatrix[i][i] / sum(confusionMatrix[i]))
    fMeasure = (2*precision*recall) / (precision + recall) 
    print("-------\n", i+1, "Rating Results:")
    print("Precision:", precision * 100, "%")
    print("Recall:", recall * 100, "%")
    print("F-measure:", fMeasure * 100, "%")

[[200  74  46  29  51]
 [101 131 102  38  28]
 [ 47  76 166  62  49]
 [ 43  45  78 111 123]
 [ 42  27  26  84 221]]
Accuracy: 41.449999999999996
-------
 1 Rating Results:
Precision: 46.18937644341801 %
Recall: 50.0 %
F-measure: 48.019207683073226 %
-------
 2 Rating Results:
Precision: 37.110481586402265 %
Recall: 32.75 %
F-measure: 34.79415670650731 %
-------
 3 Rating Results:
Precision: 39.71291866028708 %
Recall: 41.5 %
F-measure: 40.58679706601467 %
-------
 4 Rating Results:
Precision: 34.25925925925926 %
Recall: 27.750000000000004 %
F-measure: 30.662983425414364 %
-------
 5 Rating Results:
Precision: 46.82203389830508 %
Recall: 55.25 %
F-measure: 50.688073394495405 %
