# Natural Language Processing with Amazon Reviews

Developed for the second project of the Artificial Intelligence course

## Setup and Environment

After installing Python and the necessary packages, run the following code to import them:

In [1]:
%matplotlib inline

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

MAX_FEATURES = 10000

## Implementation work

### Generating a data set
#### Obtaining features from the Bag-of-Words model

In [24]:
train = pd.read_csv('./data/train_preprocessed.csv')
test = pd.read_csv('./data/test_preprocessed.csv')

Please choose one of the following vectorizers:
#### Creating regular CountVectorizers (default)

In [25]:
titleVectorizer = CountVectorizer(max_features=MAX_FEATURES)
textVectorizer = CountVectorizer(max_features=MAX_FEATURES)

#### Creating 1-hot vectors

In [None]:
titleVectorizer = CountVectorizer(binary=True, max_features=MAX_FEATURES)
textVectorizer = CountVectorizer(binary=True, max_features=MAX_FEATURES)

#### Creating TF-IDF vectors

In [None]:
titleVectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
textVectorizer = TfidfVectorizer(max_features=MAX_FEATURES)

#### Creating bi-gram CountVectorizors

In [3]:
titleVectorizer = CountVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)
textVectorizer = CountVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)

#### Creating bi-gram TF-IDF vectors

In [None]:
titleVectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)
textVectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)

#### Creating tri-gram CountVectorizors

In [None]:
titleVectorizer = CountVectorizer(ngram_range=(1, 3), max_features=MAX_FEATURES)
textVectorizer = CountVectorizer(ngram_range=(1, 3), max_features=MAX_FEATURES)

#### Creating tri-gram TF-IDF vectors

In [None]:
titleVectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=MAX_FEATURES)
textVectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=MAX_FEATURES)

#### Transforming the dataset

In [4]:
trainTitleArr = list(map(str, train['review_title']))
trainTextArr = list(map(str, train['review_text']))

titleWords = titleVectorizer.fit_transform(trainTitleArr).toarray()
textWords = textVectorizer.fit_transform(trainTextArr).toarray()

allWords = np.array([np.concatenate((titleWords[i], textWords[i])) for i in range(len(titleWords))])

print(titleWords.shape)
print(textWords.shape)
print(allWords.shape)

y = train['class_index']

# Test variables
testTitleArr = list(map(str, test['review_title']))
testTextArr = list(map(str, test['review_text']))

testTitleVector = titleVectorizer.transform(testTitleArr).toarray()
testTextVector = textVectorizer.transform(testTextArr).toarray()

testAllWords = np.array([np.concatenate((testTitleVector[i], testTextVector[i])) for i in range(len(testTitleVector))])

(5000, 10000)
(5000, 10000)
(5000, 20000)


### Prediction after searching parameters in grid

In [31]:
#param_grid = {"alpha": [0.005, 0.01, 0.05], "penalty": ['l2', 'l1', 'elasticnet', None]}
def gridPredict(clf, param_grid, **arg):
    grid = GridSearchCV(clf, param_grid, cv=2) # If we want Cross Validation: cv=pdfsplt

    grid.fit(allWords, y, **arg)

    print("Best grid params:", grid.best_params_)

    return grid.predict(testAllWords)

### Training Classifiers and Prediction

In [27]:
def predict(clf, **arg):
    # Train with allWords
    clf.fit(allWords, y, **arg)

    return clf.predict(testAllWords)

# MultinomialNB()

### Analyse Results

In [28]:
def analyze(prediction, test):
    y_test = test['class_index']

    # confusion matrix
    confusionMatrix = confusion_matrix(y_test, prediction)
    print("Confusion Matrix:")
    print(confusionMatrix)

    accuracy = accuracy_score(y_test, prediction)

    print("Accuracy: ", accuracy * 100, " %")
    print("Precision: ", precision_score(y_test, prediction, average='weighted') * 100, " %")
    print('Recall: ', recall_score(y_test, prediction, average='weighted') * 100, " %")
    print('F1: ', f1_score(y_test, prediction, average='weighted') * 100, " %")

### K-Nearest Neighbor (K = 5)

In [None]:
prediction = predict(KNeighborsClassifier(n_neighbors=5))
print("K-Nearest Neighbor Results:")
analyze(prediction, test)

### Multinomial Naive Bays

In [18]:
# prediction = predict(MultinomialNB(), sample_weight=1.0)
prediction = gridPredict(MultinomialNB(), {"alpha": [0.0001, 0.5, 1.0]}, sample_weight=1.0)
print("MultinomialNB Results:")
analyze(prediction, test)

len train indices:  3500
len test indices:  1500
1
Best grid params: {'alpha': 0.5}
MultinomialNB Results:
Confusion Matrix:
[[105  60  19   8   8]
 [ 40  89  46  18   7]
 [ 28  56  57  38  21]
 [ 16  20  38  72  54]
 [  4  16  15  54 111]]
Accuracy:  43.4  %
Precision:  43.40473033383637  %
Recall:  43.4  %
F1:  43.29651984565041  %


### Random Forest

In [None]:
# prediction = predict(RandomForestClassifier(n_estimators=100), sample_weight=1.0) 
# "max_features": ["int", "sqrt", "log2"]
prediction = gridPredict(RandomForestClassifier(), {"n_estimators": [50, 100]} , sample_weight=1.0)
# TODO: TEST SINGLE PARAMETER (n_estimators = 200 ...)
print("RandomForestClassifier Results:")
analyze(prediction, test)

### Decision Tree

In [None]:
prediction = predict(DecisionTreeClassifier(), sample_weight=1.0)
# prediction = gridPredict(DecisionTreeClassifier(), {"criterion": ["gini", "entropy", "log_loss"]} , sample_weight=1.0)
print("DecisionTreeClassifier Results:")
analyze(prediction, test)

### Perceptron

In [40]:
# prediction = predict(Perceptron(), sample_weight=1.0)
prediction = gridPredict(Perceptron(), {"alpha": [0.0001, 0.001]} , sample_weight=1.0)
print("Perceptron Results:")
analyze(prediction, test)

Best grid params: {'alpha': 0.0001}
Perceptron Results:
Confusion Matrix:
[[112  50  26   7   5]
 [ 48  79  37  19  17]
 [ 23  50  57  41  29]
 [ 18  13  44  73  52]
 [  8  14  17  54 107]]
Accuracy:  42.8  %
Precision:  42.402198189692605  %
Recall:  42.8  %
F1:  42.571240335239125  %


### Logistic Regression

In [None]:
prediction = predict(LogisticRegression(), sample_weight=1.0)
print("LogisticRegression Results:")
analyze(prediction, test)

### Linear SVC

In [None]:
prediction = predict(LinearSVC(), sample_weight=1.0)
print("LinearSVC Results:")
analyze(prediction, test)

### Neural Network (Multi-Layer Perceptron)

In [None]:
prediction = predict(MLPClassifier())
print("NeuralNetwork Results:")
analyze(prediction, test)