# Natural Language Processing with Amazon Reviews

Developed for the second project of the Artificial Intelligence course

## Setup and Environment

After installing Python and the necessary packages, run the following code to import them:

In [47]:
%matplotlib inline

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

MAX_FEATURES = 10000

## Implementation work

### Generating a data set
#### Obtaining features from the Bag-of-Words model

In [14]:
train = pd.read_csv('./data/train_preprocessed.csv')
test = pd.read_csv('./data/test_preprocessed.csv')

Please choose one of the following vectorizers:
#### Creating regular CountVectorizers (default)

In [15]:
titleVectorizer = CountVectorizer(max_features=MAX_FEATURES)
textVectorizer = CountVectorizer(max_features=MAX_FEATURES)

#### Creating 1-hot vectors

In [None]:
titleVectorizer = CountVectorizer(binary=True, max_features=MAX_FEATURES)
textVectorizer = CountVectorizer(binary=True, max_features=MAX_FEATURES)

#### Creating TF-IDF vectors

In [None]:
titleVectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
textVectorizer = TfidfVectorizer(max_features=MAX_FEATURES)

#### Creating bi-gram CountVectorizors

In [75]:
titleVectorizer = CountVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)
textVectorizer = CountVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)

#### Creating bi-gram TF-IDF vectors

In [16]:
titleVectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)
textVectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)

#### Creating tri-gram TF-IDF vectors

In [17]:
titleVectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=MAX_FEATURES)
textVectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=MAX_FEATURES)

#### Transforming the dataset

In [18]:
trainTitleArr = list(map(str, train['review_title']))
trainTextArr = list(map(str, train['review_text']))

titleWords = titleVectorizer.fit_transform(trainTitleArr).toarray()
textWords = textVectorizer.fit_transform(trainTextArr).toarray()

allWords = np.array([np.concatenate((titleWords[i], textWords[i])) for i in range(len(titleWords))])

print(titleWords.shape)
print(textWords.shape)
print(allWords.shape)

(5000, 10000)
(5000, 10000)
(5000, 20000)


### Training Classifiers and Prediction

In [37]:
def predict(clf, train, test, **arg):
    y = train['class_index']
    # Train with allWords
    clf.fit(allWords, y, **arg)
    
    testTitleArr = list(map(str, test['review_title']))
    testTextArr = list(map(str, test['review_text']))

    testTitleVector = titleVectorizer.transform(testTitleArr).toarray()
    testTextVector = textVectorizer.transform(testTextArr).toarray()

    testAllWords = np.array([np.concatenate((testTitleVector[i], testTextVector[i])) for i in range(len(testTitleVector))])

    return clf.predict(testAllWords)

# MultinomialNB()

### Analyse Results

In [35]:
def analyze(prediction, test):
    y_test = test['class_index']

    # confusion matrix
    confusionMatrix = confusion_matrix(y_test, prediction)
    print("Confusion Matrix:")
    print(confusionMatrix)

    accuracy = accuracy_score(y_test, prediction)

    print("Accuracy: ", accuracy * 100, " %")
    print("Precision: ", precision_score(y_test, prediction, average='weighted') * 100, " %")
    print('Recall: ', recall_score(y_test, prediction, average='weighted') * 100, " %")
    print('F1: ', f1_score(y_test, prediction, average='weighted') * 100, " %")

### K-Nearest Neighbor (K = 5)

In [38]:
prediction = predict(KNeighborsClassifier(n_neighbors=5), train, test)
print("K-Nearest Neighbor Results:")
analyze(prediction, test)

MultinomialNB Results:
Confusion Matrix:
[[96 35 29 24 16]
 [66 46 41 26 21]
 [48 30 69 32 21]
 [27 26 47 56 44]
 [30 22 39 50 59]]
Accuracy:  32.6  %
Precision:  32.39714744639546  %
Recall:  32.6  %
F1:  32.15275590745132  %


### Multinomial Naive Bays

In [39]:
prediction = predict(MultinomialNB(), train, test, sample_weight=1.0)
print("MultinomialNB Results:")
analyze(prediction, test)

MultinomialNB Results:
Confusion Matrix:
[[100  57  24   8  11]
 [ 45  73  56  19   7]
 [ 20  48  87  27  18]
 [  5  13  51  78  53]
 [  8   7  15  42 128]]
Accuracy:  46.6  %
Precision:  46.84025585313622  %
Recall:  46.6  %
F1:  46.57606894758912  %


### Random Forest

In [40]:
prediction = predict(RandomForestClassifier(), train, test, sample_weight=1.0)
print("RandomForestClassifier Results:")
analyze(prediction, test)

RandomForestClassifier Results:
Confusion Matrix:
[[116  29  28  10  17]
 [ 63  51  40  20  26]
 [ 41  43  60  27  29]
 [ 20  15  39  53  73]
 [ 12   2  17  40 129]]
Accuracy:  40.9  %
Precision:  39.49652768332553  %
Recall:  40.9  %
F1:  39.45870553217365  %


### Decision Tree

In [41]:
prediction = predict(DecisionTreeClassifier(), train, test, sample_weight=1.0)
print("DecisionTreeClassifier Results:")
analyze(prediction, test)

DecisionTreeClassifier Results:
Confusion Matrix:
[[70 48 40 21 21]
 [59 50 35 33 23]
 [38 37 54 41 30]
 [16 32 50 56 46]
 [19 26 30 48 77]]
Accuracy:  30.7  %
Precision:  30.724903921164852  %
Recall:  30.7  %
F1:  30.707627733362475  %


### Perceptron

In [43]:
prediction = predict(Perceptron(), train, test, sample_weight=1.0)
print("Perceptron Results:")
analyze(prediction, test)

Perceptron Results:
Confusion Matrix:
[[113  32  15  23  17]
 [ 60  40  36  45  19]
 [ 19  34  61  53  33]
 [ 10   6  29  94  61]
 [  9   4   8  67 112]]
Accuracy:  42.0  %
Precision:  41.71823667493293  %
Recall:  42.0  %
F1:  40.988838542531006  %


### Logistic Regression

In [42]:
prediction = predict(LogisticRegression(), train, test, sample_weight=1.0)
print("LogisticRegression Results:")
analyze(prediction, test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression Results:
Confusion Matrix:
[[116  46  18  10  10]
 [ 53  68  47  21  11]
 [ 19  51  78  34  18]
 [  5  17  33  80  65]
 [ 10   6  14  42 128]]
Accuracy:  47.0  %
Precision:  46.463772788793406  %
Recall:  47.0  %
F1:  46.64454262301147  %


### Linear SVC

In [45]:
prediction = predict(LinearSVC(), train, test, sample_weight=1.0)
print("LinearSVC Results:")
analyze(prediction, test)

LinearSVC Results:
Confusion Matrix:
[[111  48  19  13   9]
 [ 60  65  39  26  10]
 [ 19  48  82  32  19]
 [ 10  16  40  81  53]
 [  8   6  18  41 127]]
Accuracy:  46.6  %
Precision:  46.10488887272591  %
Recall:  46.6  %
F1:  46.30945557768168  %


### Neural Network (Multi-Layer Perceptron)

In [49]:
prediction = predict(MLPClassifier(), train, test)
print("NeuralNetwork Results:")
analyze(prediction, test)

NeuralNetwork Results:
Confusion Matrix:
[[106  48  24  15   7]
 [ 57  64  45  27   7]
 [ 18  49  83  37  13]
 [  6  17  47  73  57]
 [  8   7  17  59 109]]
Accuracy:  43.5  %
Precision:  43.69066704308459  %
Recall:  43.5  %
F1:  43.56306754905318  %
