# Natural Language Processing with Amazon Reviews

Developed for the second project of the Artificial Intelligence course

## Setup and Environment

After installing Python and the necessary packages, run the following code to import them:

In [1]:
%matplotlib inline

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *

MAX_FEATURES = 10000

## Implementation work

### Generating a data set
#### Obtaining features from the Bag-of-Words model

In [2]:
train = pd.read_csv('./data/train_preprocessed.csv')
test = pd.read_csv('./data/test_preprocessed.csv')

Please choose one of the following vectorizers:
#### Creating regular CountVectorizers (default)

In [3]:
titleVectorizer = CountVectorizer(max_features=MAX_FEATURES)
textVectorizer = CountVectorizer(max_features=MAX_FEATURES)

#### Creating 1-hot vectors

In [None]:
titleVectorizer = CountVectorizer(binary=True, max_features=MAX_FEATURES)
textVectorizer = CountVectorizer(binary=True, max_features=MAX_FEATURES)

#### Creating TF-IDF vectors

In [None]:
titleVectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
textVectorizer = TfidfVectorizer(max_features=MAX_FEATURES)

#### Creating bi-gram CountVectorizors

In [75]:
titleVectorizer = CountVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)
textVectorizer = CountVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)

#### Creating bi-gram TF-IDF vectors

In [70]:
titleVectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)
textVectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)

#### Creating tri-gram TF-IDF vectors

In [3]:
titleVectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=MAX_FEATURES)
textVectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=MAX_FEATURES)

#### Transforming the dataset

In [4]:
trainTitleArr = list(map(str, train['review_title']))
trainTextArr = list(map(str, train['review_text']))

titleWords = titleVectorizer.fit_transform(trainTitleArr).toarray()
textWords = textVectorizer.fit_transform(trainTextArr).toarray()

allWords = np.array([np.concatenate((titleWords[i], textWords[i])) for i in range(len(titleWords))])

print(titleWords.shape)
print(textWords.shape)
print(allWords.shape)

(5000, 4088)
(5000, 10000)
(5000, 14088)


### Training Classifiers and Prediction

In [6]:
def predict(clf, train, test):
    y = train['class_index']
    # Train with allWords
    clf.fit(allWords, y, sample_weight=1)
    
    testTitleArr = list(map(str, test['review_title']))
    testTextArr = list(map(str, test['review_text']))

    testTitleVector = titleVectorizer.transform(testTitleArr).toarray()
    testTextVector = textVectorizer.transform(testTitleArr).toarray()

    testAllWords = np.array([np.concatenate((testTitleVector[i], testTextVector[i])) for i in range(len(testTitleVector))])

    return clf.predict(testAllWords)

# MultinomialNB()

### Analyse Results

In [7]:
def analyze(prediction, test):
    # confusion matrix
    confusionMatrix = confusion_matrix(test['class_index'], prediction)
    print("Confusion Matrix:")
    print(confusionMatrix)

    accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("Precision: ", precision_score(y_test, y_pred, average='weighted'))
    print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
    print('F1: ', f1_score(y_test, y_pred, average='weighted'))

### Multinomial Naîve Bays

In [8]:
prediction = predict(MultinomialNB(), train, test)
print("MultinomialNB Results:")
analyze(prediction, test)

MultinomialNB Results:
Confusion Matrix:
[[ 97  46  28   8  21]
 [ 51  58  38  27  26]
 [ 28  45  71  26  30]
 [ 21  26  26  68  59]
 [ 20  18  11  41 110]]
Accuracy: 40.400000000000006
-------
 1 Rating Results:
Precision: 44.70046082949309 %
Recall: 48.5 %
F-measure: 46.52278177458033 %
-------
 2 Rating Results:
Precision: 30.05181347150259 %
Recall: 28.999999999999996 %
F-measure: 29.516539440203566 %
-------
 3 Rating Results:
Precision: 40.804597701149426 %
Recall: 35.5 %
F-measure: 37.967914438502675 %
-------
 4 Rating Results:
Precision: 40.0 %
Recall: 34.0 %
F-measure: 36.75675675675676 %
-------
 5 Rating Results:
Precision: 44.71544715447154 %
Recall: 55.00000000000001 %
F-measure: 49.327354260089685 %
