In [1]:
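# One-time dataset download (uncomment to run). The `kaggle ...` line is a shell
# command, so it needs the leading `!` when executed from a notebook cell.
# import kaggle
# import zipfile
#
# !kaggle datasets download -d bittlingmayer/amazonreviews
# with zipfile.ZipFile("amazonreviews.zip", "r") as zip_ref:
#      zip_ref.extractall("amazonreviews")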

In [48]:
import sklearn                      # Machine Learning
import torch                        # PyTorch
import tensorflow                   # ditto

import pandas as pd                 # Data manipulation
import numpy as np                  # Number operations
import math                         # ditto

import matplotlib.pyplot as plt     # Plotting
import matplotlib.image as mpimg    # Image
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import seaborn as sns

import bz2
import csv
import time

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [27]:
# Results table keyed by model name. Every entry starts as the placeholder 0 and
# is later overwritten with [accuracy, train_time, test_time, confusion_matrix].
plot_data = dict.fromkeys(
    (
        "fastText",
        "BERT",
        "gpt-4o-mini",
        "LSTM",
        "textCNN",
        "TF-IDF + NaiveBayes",
        "TF-IDF + LogisticRegression",
    ),
    0,
)

In [None]:
# Display the current contents of the results dict.
plot_data

In [64]:
# Save data: persist the results dict so the plots can be rebuilt without
# re-running every model.
import pickle
with open('session.pkl', 'wb') as session_file:
    session_file.write(pickle.dumps(plot_data))

In [1]:
# Load data: restore the results dict written by the save cell.
# NOTE: unpickling executes arbitrary code, so only load a session.pkl that
# this notebook produced itself.
import pickle
with open('session.pkl', 'rb') as session_file:
    plot_data = pickle.loads(session_file.read())

Creating Dataframe

In [29]:
# Number of lines read from the train / test archives.
trainsize, testsize = 180_000, 20_000

# Smaller row counts used to slice the BERT frames.
bert_trainsize, bert_testsize = 1_800, 200

def sepcontent(line):
    """Split one fastText-formatted line into ``(rating, title, comment)``.

    The expected layout is ``__label__<N> <title>: <comment>``.

    Args:
        line: A single review line, already stripped of its trailing newline.

    Returns:
        A tuple ``(rating, title, comment)`` where ``rating`` is the integer
        following ``__label__``. When the content has no ``": "`` separator the
        title is the empty string and the whole content becomes the comment,
        so one title-less review does not abort the whole load.

    Raises:
        ValueError: If the line has no space after the label, or the label is
            not an integer.
    """
    label, content = line.split(" ", 1)
    rating = int(label.replace("__label__", ""))

    # partition() never raises; an empty separator means ": " was not found.
    title, separator, comment = content.partition(": ")
    if not separator:
        title, comment = "", content
    return rating, title, comment

def bz2todf(originfile, limit=1000):
    """Read up to ``limit`` lines of a bz2-compressed text file into three DataFrames.

    Each line is stripped and parsed with ``sepcontent``.

    Args:
        originfile: Path of the ``.bz2`` file, opened as UTF-8 text.
        limit: Maximum number of lines to read from the start of the file.

    Returns:
        ``(raw_df, bert_df, normal_df)``:
        * ``raw_df``    - column "raw": the stripped original lines.
        * ``bert_df``   - columns "rating" and "bert" (``"<title> [SEP] <comment>"``).
        * ``normal_df`` - columns "rating" and "text" (``"<title>: <comment>"``).
    """
    rows = []  # one (raw, rating, text, bert) tuple per parsed line
    with bz2.open(originfile, mode='rt', encoding='utf-8') as source:
        for line_number, raw_line in enumerate(source):
            if line_number >= limit:
                break
            stripped = raw_line.strip()
            rating, title, comment = sepcontent(stripped)
            rows.append((
                stripped,
                rating,
                f"{title}: {comment}",
                f"{title} [SEP] {comment}",
            ))

    raws = [row[0] for row in rows]
    ratings = [row[1] for row in rows]
    content = [row[2] for row in rows]
    berts = [row[3] for row in rows]

    raw_df = pd.DataFrame(raws, columns=["raw"])
    bert_df = pd.DataFrame({"rating": ratings, "bert": berts})
    normal_df = pd.DataFrame({"rating": ratings, "text": content})
    return raw_df, bert_df, normal_df

# Read the first `trainsize` / `testsize` reviews of each archive in the three
# layouts produced by bz2todf (raw fastText line, BERT input, plain text).
ft_traindf, bert_traindf, normal_traindf = bz2todf("amazonreviews/train.ft.txt.bz2", trainsize)
ft_testdf, bert_testdf, normal_testdf = bz2todf("amazonreviews/test.ft.txt.bz2", testsize)

# Keep only the first rows of each BERT frame.
bert_traindf = bert_traindf.iloc[:bert_trainsize]
# Slice the *test* frame here. Slicing bert_traindf instead would make the BERT
# test set a subset of its own training data and inflate every score computed
# on bert_testdf.
bert_testdf = bert_testdf.iloc[:bert_testsize]

1. fastText

In [7]:
import fasttext

In [8]:
# Write one labelled review per line, exactly as it was read from the dataset.
# Writing the lines directly (rather than through DataFrame.to_csv) keeps the
# column header "raw" out of the training file and avoids the CSV
# escape-character workaround for spaces.
with open("train.txt", "w", encoding="utf-8") as train_file:
    train_file.writelines(f"{raw_line}\n" for raw_line in ft_traindf["raw"])

train_start = time.time()
ft_model = fasttext.train_supervised("train.txt", label_prefix="__label__", thread=4, epoch=10)
train_time = time.time() - train_start

In [9]:
# Remove the label markers from every test line so the model only sees review text.
clean = [
    x.replace("__label__1", "").replace("__label__2", "")
    for x in ft_testdf["raw"]
]

In [None]:
# Predict one review at a time; the top label has its "__label__" prefix
# removed and is converted to an integer.
predict = []
test_start = time.time()
for review in clean:
    top_label = ft_model.predict(review)[0][0]
    predict.append(int(top_label.replace("__label__", "")))
test_time = time.time() - test_start

# Ground truth is the label token at the start of each raw line.
truevalue = [int(x.split(" ", 1)[0].replace("__label__", "")) for x in ft_testdf["raw"]]
accuracy = accuracy_score(truevalue, predict)
confusion = confusion_matrix(truevalue, predict)

plot_data["fastText"] = [accuracy, train_time, test_time, confusion]

for caption, figure in (
    ("Accuracy:", accuracy),
    ("Train Time:", train_time),
    ("Test Time:", test_time),
    ("Confusion Matrix:\n", confusion),
):
    print(caption, figure)

2. BERT

In [None]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import classification_report

In [17]:
# Inputs are the "[SEP]"-joined strings from the "bert" column; ratings are
# shifted down by one to give the label values used below.
X_train = bert_traindf.bert
X_test = bert_testdf.bert
y_train = [rating - 1 for rating in bert_traindf.rating]
y_test = [rating - 1 for rating in bert_testdf.rating]

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Both splits are truncated to at most 128 tokens.
train_encoding = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=128)
test_encoding = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=128)

In [None]:
# Wrap the tokenizer output and labels in Dataset objects for the Trainer.
train_dataset = Dataset.from_dict(dict(
    input_ids=train_encoding["input_ids"],
    attention_mask=train_encoding["attention_mask"],
    label=list(y_train),
))

test_dataset = Dataset.from_dict(dict(
    input_ids=test_encoding["input_ids"],
    attention_mask=test_encoding["attention_mask"],
    label=list(y_test),
))

# Two output classes (labels 0 and 1).
bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

In [None]:
# Two epochs, evaluating on the test dataset at the end of each epoch.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    num_train_epochs=2,
    eval_strategy="epoch",
)

trainer = Trainer(
    model=bert_model, args=training_args,
    train_dataset=train_dataset, eval_dataset=test_dataset,
)

# Wall-clock time of the fine-tuning run only.
train_start = time.time()
trainer.train()
train_time = time.time() - train_start

In [None]:
# Time the forward pass over the test dataset.
test_start = time.time()
prediction = trainer.predict(test_dataset)
test_time = time.time() - test_start

# Predicted class = index of the largest value along axis 1.
y_pred = np.argmax(prediction.predictions, axis=1)
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

plot_data["BERT"] = [accuracy, train_time, test_time, confusion]

for caption, figure in (
    ("Accuracy:", accuracy),
    ("Train Time:", train_time),
    ("Test Time:", test_time),
    ("Confusion Matrix:\n", confusion),
):
    print(caption, figure)

3. OpenAI GPT

In [5]:
import openai

In [10]:
import os

# Number of reviews sent to the model in a single prompt.
textbatch = 10

# The API key is read from the environment so it never has to be pasted into
# (and committed with) the notebook. Falls back to the previous blank value
# when the variable is unset.
# NOTE(review): GITHUB_TOKEN is assumed from the endpoint below - confirm the
# variable name your deployment uses.
client = openai.OpenAI(
    base_url="https://models.inference.ai.azure.com",
    api_key=os.environ.get("GITHUB_TOKEN", ""),
)
aimodel = "gpt-4o-mini"

def classify_with_gpt(i):
    """Ask the chat model to classify the ``i``-th batch of test reviews.

    A batch holds up to ``textbatch`` rows of ``bert_testdf``. The prompt contains
    three worked examples followed by the numbered batch texts.

    Args:
        i: Zero-based batch index.

    Returns:
        ``(start, currentbatch, reply)``: the first row index of the batch, the
        number of rows in it, and the model's reply text with surrounding
        whitespace stripped.
    """
    start = i*textbatch
    end = min(start+textbatch, len(bert_testdf))
    currentbatch = end - start

    # Number the texts from 1 within the batch.
    pieces = []
    for x in range(start, end):
        text = bert_testdf.bert[x]
        pieces.append(f"Text {x-start+1}: {text}\n\n")
    inputtexts = "".join(pieces)

    prompt = f"""You are a sentiment classifier that returns multiple either 1 or 2 with separated commas for each text. Return 1 when the text is negative review, and return 2 when the text is positive review. Here is an example:

Text 1: Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^

Text 2: Awful beyond belief!: I feel I have to write to keep others from wasting their money. This book seems to have been written by a 7th grader with poor grammatical skills for her age! As another reviewer points out, there is a misspelling on the cover, and I believe there is at least one per chapter. For example, it was mentioned twice that she had a "lean" on her house. I was so distracted by the poor writing and weak plot, that I decided to read with a pencil in hand to mark all of the horrible grammar and spelling. Please don't waste your money. I too, believe that the good reviews must have been written by the author's relatives. I will not put much faith in the reviews from now on!

Text 3: Disappointed: I read the reviews,made my purchase and was very disappointed. The charger is convenient by charging all four batteries at once but the charge only lasts a very short time. I now have to go and find batteries that will give me longer life than the kodak NiMH AA batteries.

Because the first text is a positive review, the second text is a negative review, and the third text is also a negative review, you would return "2,1,1"

Now, here is {currentbatch} texts I want you to return the sentiment values of:

{inputtexts}Since there are {currentbatch} texts, you have to return in the folowing format: {"X"+",X"*(currentbatch-1)}

with each "X" is 1 or 2 depending on the sentiment of the review text

Return: """

    # max_tokens is sized from the batch length plus a fixed allowance of 10.
    user_message = {"role": "user", "content": prompt}
    response = client.chat.completions.create(
        model=aimodel,
        messages=[user_message],
        temperature=0,
        max_tokens=currentbatch*2 + 10,
    )

    reply = response.choices[0].message.content.strip()
    return start, currentbatch, reply

In [None]:
# Classify the BERT test subset batch by batch and collect the integer labels.
result = []
test_start = time.time()
for x in range(0, math.ceil(len(bert_testdf)/textbatch)):
    start, currentbatch, raw = classify_with_gpt(x)
    print(f"Start: {start}. Currentbatch: {currentbatch}. Result: {raw}")

    batch_labels = [int(token) for token in raw.split(",")]
    # Fail on the offending batch instead of letting a short or long reply
    # shift every later prediction against its true label.
    if len(batch_labels) != currentbatch:
        raise ValueError(
            f"Batch starting at {start}: expected {currentbatch} labels, "
            f"got {len(batch_labels)} from reply {raw!r}"
        )
    result.extend(batch_labels)
test_time = time.time() - test_start

accuracy = accuracy_score(bert_testdf['rating'], result)
confusion = confusion_matrix(bert_testdf['rating'], result)

# No training step for this model, so the train time is stored as NaN.
plot_data["gpt-4o-mini"] = [accuracy, np.nan, test_time, confusion]

print("Accuracy:", accuracy)
print("Train Time: 0")
print("Test Time:", test_time)
print("Confusion Matrix:\n", confusion)

4. LSTM

In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, SpatialDropout1D, Dense
from keras.callbacks import ModelCheckpoint

In [43]:
voc_size = 20000   # passed to the tokenizer as num_words
max_length = 128   # every sequence is padded/truncated to this length

# The vocabulary is fitted on the training text only.
tokenizer = Tokenizer(num_words=voc_size)
tokenizer.fit_on_texts(normal_traindf.text)
word_index = tokenizer.word_index

train = pad_sequences(tokenizer.texts_to_sequences(normal_traindf.text), maxlen=max_length)
test = pad_sequences(tokenizer.texts_to_sequences(normal_testdf.text), maxlen=max_length)

# Ratings shifted down by one and stored as float32 training targets.
keras_trainrating = np.asarray(normal_traindf.rating - 1, dtype="float32")

In [None]:
# Embedding -> LSTM (full sequence) -> spatial dropout -> LSTM -> sigmoid unit.
lstm_model = Sequential([
    Embedding(input_dim=voc_size, output_dim=64),
    LSTM(units=32, return_sequences=True),
    SpatialDropout1D(rate=0.2),
    LSTM(units=32),
    Dense(1, activation="sigmoid"),
])
lstm_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# NOTE(review): the textCNN cell writes its checkpoint to this same
# "amazon_model.h5" path, so the later run overwrites this one.
checkpoint_cb = ModelCheckpoint("amazon_model.h5", save_best_only=True)

# 10% of the training rows are held out for validation during fitting.
train_start = time.time()
lstm_model.fit(
    train,
    keras_trainrating,
    epochs=2,
    validation_split=0.1,
    callbacks=[checkpoint_cb],
)
train_time = time.time() - train_start

In [None]:
test_start = time.time()
y_prob = lstm_model.predict(test)
test_time = time.time() - test_start

# Threshold the sigmoid output at 0.5 and shift 0/1 back to the 1/2 rating
# scale (the training targets were rating - 1).
y_pred = ((y_prob > 0.5).astype(int) + 1).flatten()

print(y_pred)

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped the
# confusion matrix is transposed, which mislabels the "True"/"Pred" axes of the
# heatmap and normalises it over the wrong axis.
accuracy = accuracy_score(normal_testdf.rating, y_pred)
confusion = confusion_matrix(normal_testdf.rating, y_pred)

plot_data["LSTM"] = [accuracy, train_time, test_time, confusion]

print("Accuracy:", accuracy)
print("Train Time:", train_time)
print("Test Time:", test_time)
print("Confusion Matrix:\n", confusion)

5. textCNN

In [48]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Concatenate, Dense, Dropout
from keras.callbacks import ModelCheckpoint

In [50]:
voc_size = 20000
max_length = 128

# Shared embedding feeding three parallel convolutions with kernel widths 3, 4
# and 5, each followed by global max pooling.
input_layer = Input(shape=(max_length,))
embedding = Embedding(input_dim=voc_size, output_dim=128)(input_layer)

conv_3, conv_4, conv_5 = (
    Conv1D(128, kernel_width, activation="relu")(embedding)
    for kernel_width in (3, 4, 5)
)
pool_3, pool_4, pool_5 = (
    GlobalMaxPooling1D()(branch)
    for branch in (conv_3, conv_4, conv_5)
)

concat = Concatenate()([pool_3, pool_4, pool_5])

dropout = Dropout(0.5)(concat)
# Single sigmoid unit for the binary task (>2 classes would need softmax).
output = Dense(1, activation="sigmoid")(dropout)

cnn_model = Model(inputs=input_layer, outputs=output)
cnn_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Ratings shifted down by one and stored as float32 training targets.
keras_trainrating = (normal_traindf.rating - 1).to_numpy().astype("float32")

In [None]:
# NOTE(review): same "amazon_model.h5" path as the LSTM cell, so this run
# overwrites that checkpoint.
checkpoint_cb = ModelCheckpoint("amazon_model.h5", save_best_only=True)

# 10% of the training rows are held out for validation during fitting.
train_start = time.time()
cnn_model.fit(
    train,
    keras_trainrating,
    epochs=2,
    validation_split=0.1,
    callbacks=[checkpoint_cb],
)
train_time = time.time() - train_start

In [None]:
test_start = time.time()
y_prob = cnn_model.predict(test)
test_time = time.time() - test_start

# Threshold the sigmoid output at 0.5 and shift 0/1 back to the 1/2 rating scale.
y_pred = ((y_prob > 0.5).astype(int) + 1).flatten()

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped the
# confusion matrix is transposed, which mislabels the "True"/"Pred" axes of the
# heatmap and normalises it over the wrong axis.
accuracy = accuracy_score(normal_testdf.rating, y_pred)
confusion = confusion_matrix(normal_testdf.rating, y_pred)

plot_data["textCNN"] = [accuracy, train_time, test_time, confusion]

print("Accuracy:", accuracy)
print("Train Time:", train_time)
print("Test Time:", test_time)
print("Confusion Matrix:\n", confusion)

6. TF-IDF + Naive Bayes

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

In [None]:
# TF-IDF features fed into a multinomial Naive Bayes classifier.
nb_model = make_pipeline(TfidfVectorizer(), MultinomialNB())

train_start = time.time()
nb_fit = nb_model.fit(normal_traindf.text, normal_traindf.rating)
train_time = time.time() - train_start

test_start = time.time()
predict = nb_fit.predict(normal_testdf.text)
test_time = time.time() - test_start

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped the
# confusion matrix is transposed, which mislabels the "True"/"Pred" axes of the
# heatmap and normalises it over the wrong axis.
accuracy = accuracy_score(normal_testdf.rating, predict)
confusion = confusion_matrix(normal_testdf.rating, predict)

plot_data["TF-IDF + NaiveBayes"] = [accuracy, train_time, test_time, confusion]

print("Accuracy:", accuracy)
print("Train Time:", train_time)
print("Test Time:", test_time)
print("Confusion Matrix:\n", confusion)

7. TF-IDF + Logistic Regression

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

In [None]:
# TF-IDF features fed into a logistic-regression classifier.
lr_model = make_pipeline(TfidfVectorizer(), LogisticRegression())

train_start = time.time()
lr_fit = lr_model.fit(normal_traindf.text, normal_traindf.rating)
train_time = time.time() - train_start

test_start = time.time()
predict = lr_fit.predict(normal_testdf.text)
test_time = time.time() - test_start

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped the
# confusion matrix is transposed, which mislabels the "True"/"Pred" axes of the
# heatmap and normalises it over the wrong axis.
accuracy = accuracy_score(normal_testdf.rating, predict)
confusion = confusion_matrix(normal_testdf.rating, predict)

plot_data["TF-IDF + LogisticRegression"] = [accuracy, train_time, test_time, confusion]

print("Accuracy:", accuracy)
print("Train Time:", train_time)
print("Test Time:", test_time)
print("Confusion Matrix:\n", confusion)

Data Viz

In [None]:
# Use the ggplot style for the figures below; backgroundcolor is passed as the
# figure facecolor in each plotting cell.
plt.style.use("ggplot")
backgroundcolor = "#F2E9E4"

# Display the collected results before plotting.
plot_data

Plot: Accuracy

In [20]:
imgsize = 16  # numerator of the icon zoom factor (imgsize / image height)
def getImageDict():
    """Load one icon per model and return ``{model name: OffsetImage}``.

    Icons are read from ``Images/Script_4/<model>.png`` for every key of
    ``plot_data``. A fresh set of OffsetImage objects is built on every call.
    """
    def imagedir(x):
        return f"Images/Script_4/{x}.png"

    def load_icon(model):
        image = mpimg.imread(imagedir(model))
        return OffsetImage(image, zoom= imgsize/(image.shape[0]))

    return {model: load_icon(model) for model in plot_data.keys()}

In [None]:
model_list = plot_data.keys()

# Bar colour for each model.
colordict = {
    "fastText": "#c3506e",
    "BERT": "#d4af37",
    "gpt-4o-mini": "#20856a",
    "LSTM": "#3b6ca3",
    "textCNN": "#e3822d",
    "TF-IDF + NaiveBayes": "#4c9141",
    "TF-IDF + LogisticRegression": "#7e5ca3"
}
curImageDict = getImageDict()

# Accuracy is element 0 of every results entry; labels show it as a percentage.
raw_values = [x[0] for x in plot_data.values()]
label_values = [f"{x:.1%}" for x in raw_values]

# Order the bars from most to least accurate.
zipped = sorted(zip(raw_values, label_values, model_list), reverse=True)
raw_values, label_values, model_list = zip(*zipped)

plt.figure(figsize=(12, 8), facecolor=backgroundcolor)
bars = plt.bar(model_list, raw_values, width=0.4)

# Percentage label just above each bar.
for v, (val, label) in enumerate(zip(raw_values, label_values)):
    plt.text(v, val + 0.005, label, ha='center', va='bottom', fontsize=12, color="black")

# Model icon above the label, and the bar recoloured per model.
for bar, model in zip(bars, model_list):
    x = bar.get_x() + bar.get_width()/2
    y = bar.get_height() + 0.02
    ab = AnnotationBbox(curImageDict[model], (x, y), xycoords='data', box_alignment=(0.5,0.5), frameon=False)
    plt.gca().add_artist(ab)
    bar.set_color(colordict[model])

plt.title("Sentiment Analysis Models - Accuracy", weight="bold", y=1.05)
plt.suptitle("Train Size: 180k (exc. BERT: 1.8k, gpt-4o-mini: -)", y=0.91, x=0.52)
plt.xlabel("Model")
plt.xticks(rotation=20)
plt.ylabel("Accuracy")
plt.ylim(0.8, 1.01)
plt.tight_layout()
plt.show()

In [None]:
model_list = plot_data.keys()

# Models listed in scaledmodel have their times multiplied by these ratios, so
# they are shown at the BERT sample counts instead of the full ones.
train_scalemult = bert_trainsize / trainsize
test_scalemult = bert_testsize / testsize
scaledmodel = ["textCNN", "LSTM", "TF-IDF + LogisticRegression", "fastText", "TF-IDF + NaiveBayes"]

# Conditional expressions are used instead of the `a and b or c` idiom, which
# falls through to `c` whenever `b` is falsy (e.g. math.log10(1.0) == 0.0).
nanlambda = lambda x: 0 if np.isnan(x) else x
labellambda = lambda x: f"{x:.0f}s" if x > 1e2 else f"{x:.3g}s"

trainlambda = lambda x, y: y*train_scalemult if x in scaledmodel else y
testlambda = lambda x, y: y*test_scalemult if x in scaledmodel else y

curImageDict = getImageDict()

plt.figure(figsize=(12, 8), facecolor=backgroundcolor)
def plotdata(raw_values, ylambda, sort, label, addimage):
    """Draw one time series over ``model_list`` on the current figure.

    Args:
        raw_values: Times in seconds, in the same order as ``model_list``.
        ylambda: Maps a raw time to its y position on the plot.
        sort: Unused; kept so existing calls keep working.
        label: Legend label of the line.
        addimage: When true, also draw each model's icon above its point.
    """
    y_values = [ylambda(x) for x in raw_values]
    label_values = [labellambda(x) for x in raw_values]

    plt.plot(model_list, y_values, marker="o", label=label)
    # point_label is kept distinct from the `label` parameter.
    for v, (val, point_label, model) in enumerate(zip(y_values, label_values, model_list)):
        plt.text(v, val + 0.1, point_label, ha='center', va='bottom', fontsize=12, color="black")
        if addimage:
            ab = AnnotationBbox(curImageDict[model], (v, val + 0.7), xycoords='data', box_alignment=(0.5,0.5), frameon=False)
            plt.gca().add_artist(ab)

# Elements 1 and 2 of every results entry are the train and test times.
traintime_values = [trainlambda(x, y[1]) for x, y in plot_data.items()]
testtime_values = [testlambda(x, y[2]) for x, y in plot_data.items()]
# A NaN train time counts as 0 in the total.
totaltime_values = [nanlambda(traintime_values[x]) + testtime_values[x] for x in range(len(traintime_values))]

# Order every series (and model_list, which plotdata reads) by total time, slowest first.
zipped = list(zip(totaltime_values, traintime_values, testtime_values, model_list))
zipped.sort(reverse=True)
totaltime_values, traintime_values, testtime_values, model_list = zip(*zipped)

# All series are drawn at log10(seconds); the total line is shifted up by 2.
plotdata(
    totaltime_values,
    lambda x: math.log10(x) + 2,
    True,
    "Total Time",
    True
)

plotdata(
    traintime_values,
    # NaN is passed through unchanged.
    lambda x: x if np.isnan(x) else math.log10(x),
    False,
    "Training Time",
    False
)

plotdata(
    testtime_values,
    lambda x: math.log10(x),
    False,
    "Testing Time",
    False
)

plt.title("Sentiment Analysis Models - Time Performance", weight="bold", y=1.05)
plt.suptitle("Scaled to 1800 Training / 200 Test Samples", y=0.91)
plt.xlabel("Model")
plt.xticks(rotation=20)
plt.legend()
plt.ylim(-2.5, 6.5)
plt.tight_layout()
plt.show()

In [None]:
# Shorter display names for the two TF-IDF pipelines; other names are used as-is.
shortendict = {
    "TF-IDF + NaiveBayes": "NaiveBayes",
    "TF-IDF + LogisticRegression": "LogRegress"
}
shortenlambda = lambda x: shortendict.get(x, x)
# Cell annotation: row-normalised share on the first line, raw count below it.
grouplambda = lambda x, y: f"{x:.1%}\n({y})"

for model, value in plot_data.items():
    matrix = value[3]
    # Normalise each row so it sums to 1.
    norm = matrix / matrix.sum(axis=1, keepdims=True)
    group = [
        [grouplambda(share, count) for share, count in zip(norm_row, count_row)]
        for norm_row, count_row in zip(norm, matrix)
    ]
    print(group)

    plt.figure(figsize=(5, 4), facecolor=backgroundcolor)
    sns.heatmap(
        norm,
        annot=group,
        fmt="",
        cmap="viridis",
        xticklabels=["Pred 0", "Pred 1"],
        yticklabels=["True 0", "True 1"],
        vmin=0,
        vmax=1,
    )
    plt.title(f"{shortenlambda(model)} - Confusion Matrix", weight="bold")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()