# Natural Language Processing with Amazon Reviews

Developed for the second project of the Artificial Intelligence course

## Setup and Environment

After installing Python and the necessary packages, run the following code to import them:

In [20]:
%matplotlib inline

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *

MAX_FEATURES = 10000

## Implementation work

### Generating a data set
#### Obtaining features from the Bag-of-Words model

In [11]:
train = pd.read_csv('./data/train_preprocessed.csv')
test = pd.read_csv('./data/test_preprocessed.csv')

Please choose one of the following vectorizers:
#### Creating regular CountVectorizers (default)

In [3]:
titleVectorizer = CountVectorizer(max_features=MAX_FEATURES)
textVectorizer = CountVectorizer(max_features=MAX_FEATURES)

#### Creating 1-hot vectors

In [None]:
titleVectorizer = CountVectorizer(binary=True, max_features=MAX_FEATURES)
textVectorizer = CountVectorizer(binary=True, max_features=MAX_FEATURES)

#### Creating TF-IDF vectors

In [None]:
titleVectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
textVectorizer = TfidfVectorizer(max_features=MAX_FEATURES)

#### Creating bi-gram CountVectorizors

In [75]:
titleVectorizer = CountVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)
textVectorizer = CountVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)

#### Creating bi-gram TF-IDF vectors

In [12]:
titleVectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)
textVectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)

#### Creating tri-gram TF-IDF vectors

In [3]:
titleVectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=MAX_FEATURES)
textVectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=MAX_FEATURES)

#### Transforming the dataset

In [13]:
trainTitleArr = list(map(str, train['review_title']))
trainTextArr = list(map(str, train['review_text']))

titleWords = titleVectorizer.fit_transform(trainTitleArr).toarray()
textWords = textVectorizer.fit_transform(trainTextArr).toarray()

allWords = np.array([np.concatenate((titleWords[i], textWords[i])) for i in range(len(titleWords))])

print(titleWords.shape)
print(textWords.shape)
print(allWords.shape)

(5000, 2000)
(5000, 2000)
(5000, 4000)


### Training Classifiers and Prediction

In [26]:
def predict(clf, train, test):
    y = train['class_index']
    # Train with allWords
    clf.fit(allWords, y, sample_weight=1.0)
    
    testTitleArr = list(map(str, test['review_title']))
    testTextArr = list(map(str, test['review_text']))

    testTitleVector = titleVectorizer.transform(testTitleArr).toarray()
    testTextVector = textVectorizer.transform(testTitleArr).toarray()

    testAllWords = np.array([np.concatenate((testTitleVector[i], testTextVector[i])) for i in range(len(testTitleVector))])

    return clf.predict(testAllWords)

# MultinomialNB()

### Analyse Results

In [17]:
def analyze(prediction, test):
    y_test = test['class_index']

    # confusion matrix
    confusionMatrix = confusion_matrix(y_test, prediction)
    print("Confusion Matrix:")
    print(confusionMatrix)

    accuracy = accuracy_score(y_test, prediction)

    print("Accuracy: ", accuracy * 100, " %")
    print("Precision: ", precision_score(y_test, prediction, average='weighted') * 100, " %")
    print('Recall: ', recall_score(y_test, prediction, average='weighted') * 100, " %")
    print('F1: ', f1_score(y_test, prediction, average='weighted') * 100, " %")

### Multinomial Na√Æve Bays

In [21]:
prediction = predict(MultinomialNB(), train, test)
print("MultinomialNB Results:")
analyze(prediction, test)

MultinomialNB Results:
Confusion Matrix:
[[100  47  17  10  26]
 [ 66  41  47  27  19]
 [ 36  41  71  24  28]
 [ 28  19  24  73  56]
 [ 22  13  13  41 111]]
Accuracy:  39.6  %
Precision:  38.87834673471681  %
Recall:  39.6  %
F1:  38.90447816993883  %


### Random Forest

In [22]:
prediction = predict(RandomForestClassifier(), train, test)
print("RandomForestClassifier Results:")
analyze(prediction, test)

RandomForestClassifier Results:
Confusion Matrix:
[[ 61  22  64  26  27]
 [ 30  36  86  23  25]
 [ 25  17 105  38  15]
 [ 18   4  70  51  57]
 [  9   1  57  37  96]]
Accuracy:  34.9  %
Precision:  37.58469488626557  %
Recall:  34.9  %
F1:  34.055911754981814  %


### Decision Tree

In [23]:
prediction = predict(DecisionTreeClassifier(), train, test)
print("DecisionTreeClassifier Results:")
analyze(prediction, test)

DecisionTreeClassifier Results:
Confusion Matrix:
[[ 41  25  27  86  21]
 [ 24  27  31 101  17]
 [ 22  18  38 109  13]
 [ 13   8  36 100  43]
 [  7   9  23  86  75]]
Accuracy:  28.1  %
Precision:  31.79879099837632  %
Recall:  28.1  %
F1:  27.381959827018797  %


### Perceptron

In [27]:
prediction = predict(Perceptron(), train, test)
print("Perceptron Results:")
analyze(prediction, test)

Perceptron Results:
Confusion Matrix:
[[ 85  56  19  21  19]
 [ 62  44  38  29  27]
 [ 42  51  48  32  27]
 [ 30  24  20  62  64]
 [ 25  14  18  41 102]]
Accuracy:  34.1  %
Precision:  33.574852040384656  %
Recall:  34.1  %
F1:  33.51515498984475  %


### Logistic Regression

In [28]:
prediction = predict(LogisticRegression(), train, test)
print("LogisticRegression Results:")
analyze(prediction, test)

LogisticRegression Results:
Confusion Matrix:
[[ 96  48  16  12  28]
 [ 72  42  35  21  30]
 [ 40  44  53  32  31]
 [ 35  13  18  65  69]
 [ 28   9  10  35 118]]
Accuracy:  37.4  %
Precision:  36.92930178009594  %
Recall:  37.4  %
F1:  36.29676359209592  %


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Linear SVC

In [29]:
prediction = predict(LinearSVC(), train, test)
print("LinearSVC Results:")
analyze(prediction, test)

LinearSVC Results:
Confusion Matrix:
[[ 92  56  20  15  17]
 [ 70  41  40  28  21]
 [ 39  53  48  35  25]
 [ 33  19  18  65  65]
 [ 26  11  16  35 112]]
Accuracy:  35.8  %
Precision:  35.029746132610505  %
Recall:  35.8  %
F1:  34.989949621528574  %
