In [None]:
# Import required libraries
import numpy as np
import pandas as pd

In [None]:
# Download fastai library
! pip install fastbook


In [None]:
import fastbook

In [None]:
# Import required libraries
from fastbook import *
from IPython.display import display,HTML

In [None]:
# Import NLP module from fastai library
from fastai.text.all import *

In [None]:
# Load the training reviews from CSV and preview the first rows.
# NOTE(review): the whole file is read here; presumably it already holds the 1/6th
# subsample meant to reduce training time — confirm.
train = pd.read_csv('train_unbalanced_final.csv')
train.head()

In [None]:
# Load the test reviews from CSV and preview the first rows.
# NOTE(review): the whole file is read here; presumably it already holds the 1/6th
# subsample meant to reduce testing time — confirm.
test = pd.read_csv('test.csv')
test.head()

In [None]:
# Summary statistics (pandas describe) for the train data frame
train.describe()

In [None]:
# Summary statistics (pandas describe) for the test data frame
test.describe()

In [None]:
# Bar chart of how many training rows fall into each value of the 'classes' column
train['classes'].value_counts().plot(kind='bar')

In [None]:
# Exact number of training rows per class.
# NOTE(review): the original note called this distribution fairly equal, yet the file is
# named 'train_unbalanced_final.csv' — confirm against the counts shown below.
train['classes'].value_counts()

In [None]:
# Gather every review text of the train data frame into a fastcore L collection
txts = L(train['content'].tolist())

In [None]:
# Tokenize all the rows.
# Tokenizer.from_df takes the name(s) of the text column(s) as its first argument,
# not the DataFrame itself, so the 'content' column is named explicitly here.
tok = Tokenizer.from_df('content')
tok.setup(train)

# Apply the tokenizer to each raw review string and peek at the first token list
toks = txts.map(tok)
toks[0]

In [None]:
# Numericalize all the tokens from the previous step:
# set up Numericalize on the token lists, then map every token list through it.
num = Numericalize()
num.setup(toks)
nums = toks.map(num)
# Show the first 10 ids of the first review
nums[0][:10]

In [None]:
# Show the vocabulary indexes that the tokens of the first row are encoded to
num.encodes(toks[0])

In [None]:
# Decode the indexes of the first row back into tokens
num.decode(nums[0])

In [None]:
# Stack the train and test frames row-wise and keep only the review text column;
# this combined frame is what the language model is built from.
language_model = pd.concat([train, test], axis=0).loc[:, ['content']]
language_model.head()

In [None]:
# Summary statistics (pandas describe) of the combined review-text frame
language_model.describe()

In [None]:
# Language-model DataBlock: text comes from the 'content' column (read back from the
# tokenized 'text' column), with a random 90% train / 10% validation split.
data_lm = DataBlock(
    blocks=TextBlock.from_df('content', is_lm=True),
    splitter=RandomSplitter(valid_pct=0.1),
    get_x=ColReader('text'),
)

In [None]:
# Build the language-model DataLoaders (batch size 64, sequence length 72); tokenizing and
# numericalizing happen inside this call. Per the original author, 72 is the default sequence
# length used for training the WikiText-103 language model.
# Note: this rebinds data_lm from the DataBlock to its DataLoaders, so re-running this cell
# alone requires re-running the previous cell first.
data_lm = data_lm.dataloaders(language_model, bs=64, seq_len=72)

In [None]:
# Show up to 5 rows of a batch from the language-model DataLoaders
data_lm.show_batch(max_n=5)

In [None]:
# Create the AWD-LSTM language-model learner, tracking accuracy and perplexity
# as the metrics used to ascertain its quality, then switch it to fp16 training.
learn_model = language_model_learner(
    data_lm,
    AWD_LSTM,
    metrics=[accuracy, Perplexity()],
    drop_mult=0.3,
)
learn_model = learn_model.to_fp16()

In [None]:
# Display the layers of the language model
learn_model.model

In [None]:
# Run the learning-rate finder to help pick a learning rate for the language model
learn_model.lr_find()

In [None]:
# Fine-tune the language model (4 epochs, base learning rate 1e-1) on the review text so it
# learns to predict the next word for this dataset specifically.
# NOTE(review): 1e-1 is hard-coded rather than taken from the lr_find() result above — confirm
# it matches the finder's suggestion.
learn_model.fine_tune(4, 1e-1)

In [None]:
# Save the fine-tuned encoder under the name 'finetuned_encoder' so the classifier can reuse it later
learn_model.save_encoder('finetuned_encoder')

In [None]:
# Reload the encoder that was saved as 'finetuned_encoder' in the previous cell
# (lets a later session resume from the saved encoder instead of re-training)
learn_model.load_encoder('finetuned_encoder')

In [None]:
# Let the language model continue a negative prompt by 30 words, sampling with temperature 0.80
learn_model.predict("This laptop was horrible because", 30, temperature=0.80)

In [None]:
# Let the language model continue a positive prompt by 20 words, sampling with temperature 0.80
learn_model.predict("I absolutely loved this dress because", 20, temperature=0.80)

In [None]:
# Download necessary libraries for Easy Data Augmentation (EDA)
!pip install -U nltk

In [None]:
# Download the WordNet corpus through nltk; needed for Easy Data Augmentation (EDA)
import nltk
nltk.download('wordnet')

In [None]:
# Load the data to be augmented and keep only the label and text columns, label first.
# NOTE(review): per the original author 'train.csv' holds 10x less data than the train
# dataset, since EDA generates new rows from fewer existing ones — confirm.
eda = pd.read_csv('train.csv').loc[:, ["classes", "content"]]
eda.head()

In [None]:
# Write the labelled reviews as tab-separated "label<TAB>text" lines, the txt format EDA needs,
# at the path the augmentation command reads its --input from.
# The default write mode (overwrite) is used on purpose: appending (mode='a') duplicated
# every row each time this cell was re-run.
eda.to_csv('eda_nlp/data/train2.txt', header=False, index=False, sep='\t')

In [None]:
# Run the python command for EDA with 10 new augmentations per existing tweet and perform all 4 augmentations with its default alpha values as stated in the research paper
%run eda_nlp/code/augment.py --input=eda_nlp/data/train2.txt --num_aug=10 --alpha_sr=0.05 --alpha_rd=0.00 --alpha_ri=0.05 --alpha_rs=0.05

In [None]:
# Read the tab-separated EDA output (no header row) back into a data frame with the columns
# "classes" and "content"; this replaces the train data frame with the augmented data.
train = pd.read_csv(
    'eda_train2.txt',
    sep="\t",
    names=["classes", "content"],
    header=None,
)

In [None]:
# train = train[["classes", "content"]]

In [None]:
# Preview the augmented data: each original data point should appear together with its augmentations.
# NOTE(review): the original note expected 3 rows per data point (1 original + 2 augmentations),
# but the augmentation command above uses --num_aug=10 — confirm the expected repetition count and order.
train.head()

In [None]:
# Classifier DataBlock: text input (reusing the language model's vocab and sequence length)
# and a category target from the 'classes' column, split randomly 80% train / 20% validation.
blocks = (
    TextBlock.from_df('content', seq_len=data_lm.seq_len, vocab=data_lm.vocab),
    CategoryBlock(),
)
data_classifier = DataBlock(
    blocks=blocks,
    splitter=RandomSplitter(valid_pct=0.2),
    get_x=ColReader('text'),
    get_y=ColReader('classes'),
)

In [None]:
# Build the classifier DataLoaders (batch size 64); tokenization and numericalization of the
# loaded data happen automatically inside this call.
# Note: this rebinds data_classifier from the DataBlock to its DataLoaders, so re-running this
# cell alone requires re-running the previous cell first.
data_classifier = data_classifier.dataloaders(train, bs=64)

In [None]:
# Show up to 5 rows of a batch from the classifier DataLoaders
data_classifier.show_batch(max_n=5)

In [None]:
# Create the AWD-LSTM text classifier, tracking accuracy and micro-averaged F1
# as the metrics used to ascertain its quality.
# Note: this rebinds learn_model, replacing the language-model learner defined earlier.
learn_model = text_classifier_learner(
    data_classifier,
    AWD_LSTM,
    drop_mult=0.1,
    metrics=[accuracy, F1Score(average="micro")],
)


In [None]:
# Load the encoder saved as 'finetuned_encoder' from the fine-tuned language model into the classifier
learn_model.load_encoder('finetuned_encoder')

In [None]:
# Run the learning-rate finder to help pick a learning rate for the classifier
learn_model.lr_find()

In [None]:
# Train the model for one epoch at learning rate 2e-2.
# NOTE(review): 2e-2 is hard-coded; per the original author it is the minimum learning rate
# suggested by the finder in the previous step — confirm against the lr_find() output.
learn_model.fit_one_cycle(1, 2e-2)

In [None]:
# Gradual unfreezing, step 1: freeze everything except the last two layer groups and train one
# more epoch with a smaller, discriminative learning rate. The slice spreads learning rates from
# 1e-2/(2.6**4) for the earliest group up to 1e-2 for the last, following Jeremy Howard's fastai
# course. The maximum was previously 2e-2, i.e. not smaller than the preceding stage.
learn_model.freeze_to(-2)
learn_model.fit_one_cycle(1, slice(1e-2/(2.6**4), 1e-2))

In [None]:
# Gradual unfreezing, step 2: freeze everything except the last three layer groups and train one
# more epoch with a smaller learning rate range, 5e-3/(2.6**4) up to 5e-3; per the original
# author the sliced values follow Jeremy Howard's fastai course.
learn_model.freeze_to(-3)
learn_model.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3))

In [None]:
# Gradual unfreezing, final step: unfreeze the whole model and train 5 epochs with a smaller
# learning rate range, 1e-3/(2.6**4) up to 1e-3; per the original author the sliced values
# follow Jeremy Howard's fastai course.
learn_model.unfreeze()
learn_model.fit_one_cycle(5, slice(1e-3/(2.6**4),1e-3))

In [None]:
# Save the final trained classifier under the name 'trained_model_eda'
learn_model.save('trained_model_eda')

In [None]:
# Export the final trained learner (default export file name)
learn_model.export()

In [None]:
# Reload the weights saved as 'trained_model_eda' into the learner
learn_model.load('trained_model_eda')

In [None]:
# Bar chart of how many test rows fall into each class. The 'classes' column is used because it
# is the label column everywhere else in this notebook (train plots, classifier target, test accuracy).
test['classes'].value_counts().plot(kind='bar')

In [None]:
# Show sample predictions of the trained classifier
learn_model.show_results()

In [None]:
# Build the interpretation object for the trained classifier (used for the plots below)
interp = ClassificationInterpretation.from_learner(learn_model)

In [None]:
# Plot the confusion matrix of the classifier
interp.plot_confusion_matrix()

In [None]:
# Show the 5 examples with the highest loss
interp.plot_top_losses(5, nrows=5)

In [None]:
# Evaluate on the test set: predict the class of every review, one review at a time,
# and report the accuracy against the true labels in the 'classes' column.
from sklearn.metrics import confusion_matrix, accuracy_score
test['preds'] = test['content'].apply(lambda row: learn_model.predict(row)[0])
# predict() returns the decoded label as a str-like Category, so numeric labels in
# test['classes'] would never compare equal to it; both sides are compared as strings.
print("Test Accuracy: ", accuracy_score(test['classes'].astype(str), test['preds'].astype(str)))

In [None]:
# Write the test frame, including the 'preds' column, to preds3.csv
test.to_csv("preds3.csv")

In [None]:
# Build a test DataLoader over the test review texts for batched prediction
dl = learn_model.dls.test_dl(test['content'])

In [None]:
# Run the classifier over the test DataLoader; preds[0] is indexed below as the per-row prediction scores
preds = learn_model.get_preds(dl=dl)

In [None]:
# Prediction scores of the first test row as a numpy array
preds[0][0].cpu().numpy()

In [None]:
# Index of the highest score for the first test row
preds[0][0].cpu().argmax(dim=-1)

In [None]:
# Store the predicted class label of every test row. argmax yields positions in the classifier's
# category vocab, not the labels themselves, so each position is mapped back to its label.
# For a text classifier dls.vocab holds two vocabs: [0] the token vocab, [1] the category vocab.
class_vocab = learn_model.dls.vocab[1]
test['target'] = [class_vocab[i] for i in preds[0].argmax(dim=-1).tolist()]

In [None]:
# Preview the test frame with the added prediction columns
test.head()

In [None]:
# Write the test frame, now including the 'target' column, to preds4.csv
test.to_csv("preds4.csv")