# Natural Language Processing with Amazon Reviews

Developed for the second project of the Artificial Intelligence course

## Setup and Environment

After installing Python and the necessary packages, run the following code to import them:

In [1]:
%matplotlib inline

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *

MAX_FEATURES = 10000

## Implementation work

### Generating a data set
#### Obtaining features from the Bag-of-Words model

In [2]:
train = pd.read_csv('./data/train_preprocessed.csv')
test = pd.read_csv('./data/test_preprocessed.csv')

Please choose one of the following vectorizers:
#### Creating regular CountVectorizers (default)

In [86]:
titleVectorizer = CountVectorizer(max_features=MAX_FEATURES)
textVectorizer = CountVectorizer(max_features=MAX_FEATURES)

#### Creating 1-hot vectors

In [None]:
titleVectorizer = CountVectorizer(binary=True, max_features=MAX_FEATURES)
textVectorizer = CountVectorizer(binary=True, max_features=MAX_FEATURES)

#### Creating TF-IDF vectors

In [None]:
titleVectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
textVectorizer = TfidfVectorizer(max_features=MAX_FEATURES)

#### Creating bi-gram CountVectorizors

In [75]:
titleVectorizer = CountVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)
textVectorizer = CountVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)

#### Creating bi-gram TF-IDF vectors

In [70]:
titleVectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)
textVectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)

#### Creating tri-gram TF-IDF vectors

In [3]:
titleVectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=MAX_FEATURES)
textVectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=MAX_FEATURES)

#### Transforming the dataset

In [4]:
trainTitleArr = list(map(str, train['review_title']))
trainTextArr = list(map(str, train['review_text']))

titleWords = titleVectorizer.fit_transform(trainTitleArr).toarray()
textWords = textVectorizer.fit_transform(trainTextArr).toarray()

allWords = np.array([np.concatenate((titleWords[i], textWords[i])) for i in range(len(titleWords))])

print(titleWords.shape)
# print(vectorizer.get_feature_names())
print(textWords.shape)

print(allWords.shape)

(5000, 10000)
(5000, 10000)
(5000, 20000)


### Training Classifiers

In [5]:
y = train['class_index']

clf = MultinomialNB()

# Train with allWords
clf.fit(allWords, y, sample_weight=1)

MultinomialNB()

### Prediction

In [6]:
testTitleArr = list(map(str, test['review_title']))
testTextArr = list(map(str, test['review_text']))

testTitleVector = titleVectorizer.transform(testTitleArr).toarray()
testTextVector = textVectorizer.transform(testTitleArr).toarray()

testAllWords = np.array([np.concatenate((testTitleVector[i], testTextVector[i])) for i in range(len(testTitleVector))])

y_pred = clf.predict(testAllWords)
print(y_pred)

[1 1 2 2 1 1 1 1 2 1 1 2 1 1 5 3 2 1 1 1 3 2 1 5 3 1 1 5 3 2 3 1 1 1 2 5 2
 1 1 5 2 1 1 2 2 1 1 2 1 1 2 1 4 1 1 1 1 5 1 3 1 4 2 1 1 4 2 1 2 1 1 1 2 1
 4 3 2 1 1 1 1 1 1 5 1 1 2 1 2 2 1 1 1 2 5 1 3 2 1 1 3 3 1 1 1 5 1 2 1 5 1
 2 1 1 3 3 3 3 2 1 5 1 1 5 1 1 4 2 1 1 5 3 2 1 1 1 1 2 1 1 1 1 4 2 5 2 2 5
 3 2 5 2 1 1 2 2 1 2 1 1 1 5 5 4 3 1 2 1 1 3 1 3 1 1 5 1 2 2 3 4 1 1 1 1 3
 1 1 1 1 1 3 4 2 1 1 5 2 1 3 3 1 1 1 3 2 5 4 3 2 1 1 3 3 3 1 3 2 1 3 1 3 3
 5 3 2 4 2 1 1 4 3 5 1 1 2 5 2 1 1 3 2 2 3 3 5 2 2 2 3 1 2 1 2 2 2 1 1 1 3
 1 5 3 3 1 5 2 2 3 2 3 1 3 3 4 2 1 2 1 4 2 2 2 3 5 1 3 1 4 1 4 3 3 5 4 2 4
 1 4 1 1 1 2 4 1 2 1 1 2 5 1 1 1 1 4 3 1 3 2 1 3 3 4 2 2 2 2 1 5 5 1 1 4 4
 3 2 2 2 4 1 1 1 5 3 2 3 3 2 1 1 4 5 2 2 2 3 3 5 4 3 3 1 1 2 3 2 4 3 2 3 4
 4 2 5 4 5 2 1 1 5 5 2 3 2 1 3 1 1 3 1 1 2 2 1 2 4 3 1 2 2 2 3 5 1 2 4 1 1
 3 1 3 5 3 4 3 5 3 2 4 2 5 2 1 2 2 2 3 3 2 3 1 1 1 2 3 5 2 3 3 1 3 2 4 5 4
 3 4 3 2 1 2 2 3 3 2 1 3 3 1 5 2 5 3 5 4 3 3 2 3 2 3 3 3 3 5 2 2 5 5 3 3 2
 1 2 1 4 1 2 1 3 2 5 4 3 

### Analyse Results

In [7]:
# confusion matrix
confusionMatrix = confusion_matrix(test['class_index'], y_pred)
print(confusionMatrix)

correctReviews = 0
for i in range(len(confusionMatrix)):
    correctReviews += confusionMatrix[i][i]

incorrectReviews = sum(map(sum, confusionMatrix)) - correctReviews
# print("correctReviews:", correctReviews)
# print("incorrectReviews:", incorrectReviews)

accuracy = (correctReviews / (incorrectReviews + correctReviews)) * 100

# accuracy, precision, recall, f1
# CHECK IF CONFUSION MATRIX IS ORDER FROM 1-5 RATING
print("Accuracy:", accuracy)
for i in range(5):
    colSum = 0
    for j in range(5):
        colSum += confusionMatrix[j][i]
    
    precision = (confusionMatrix[i][i] / colSum)
    recall = (confusionMatrix[i][i] / sum(confusionMatrix[i]))
    fMeasure = (2*precision*recall) / (precision + recall) 
    print("-------\n", i+1, "Rating Results:")
    print("Precision:", precision * 100, "%")
    print("Recall:", recall * 100, "%")
    print("F-measure:", fMeasure * 100, "%")

[[104  43  24   9  20]
 [ 59  54  45  23  19]
 [ 32  37  73  27  31]
 [ 23  18  30  71  58]
 [ 22  15   9  46 108]]
Accuracy: 41.0
-------
 1 Rating Results:
Precision: 43.333333333333336 %
Recall: 52.0 %
F-measure: 47.27272727272728 %
-------
 2 Rating Results:
Precision: 32.33532934131736 %
Recall: 27.0 %
F-measure: 29.427792915531338 %
-------
 3 Rating Results:
Precision: 40.331491712707184 %
Recall: 36.5 %
F-measure: 38.320209973753286 %
-------
 4 Rating Results:
Precision: 40.340909090909086 %
Recall: 35.5 %
F-measure: 37.765957446808514 %
-------
 5 Rating Results:
Precision: 45.76271186440678 %
Recall: 54.0 %
F-measure: 49.54128440366973 %
