# Fastai Approach

In [1]:
'''
Created by Ruoxi Jia.
'''
import pandas as pd
import fastai.text.all as ft
from sklearn.model_selection import train_test_split
import pickle

# Load the raw reviews into a DataFrame and show how many rows carry each
# RatingValue label, to see the class balance before any resampling.
data = pd.read_csv('reviews.csv')
data.RatingValue.value_counts()

2    1465
1     297
0     158
Name: RatingValue, dtype: int64

## Preprocessing

In [2]:
# Random undersampling: keep every row of the two smaller classes (ratings 0
# and 1) and a random quarter of the dominant class (rating 2) to reduce the
# class imbalance.
# The sample is seeded so the undersampled frame -- and therefore the
# train/valid split and every model trained on it -- is identical after a
# kernel restart.
data_us = pd.concat([
    data.loc[data.RatingValue == 1],
    data.loc[data.RatingValue == 0],
    data.loc[data.RatingValue == 2].sample(frac=1/4, random_state=42),
]).reset_index(drop=True)  # discard the old row labels instead of add-then-drop

data_us['RatingValue'].value_counts()

2    366
1    297
0    158
Name: RatingValue, dtype: int64

In [3]:
# Hold out 20% of the undersampled reviews for the final evaluation; the
# remaining 80% is used to fine-tune the language model and the classifier.
training, valid = train_test_split(data_us, test_size=0.2, random_state=42)

# Persist both splits. index=False keeps the shuffled row labels out of the
# files; otherwise they come back as a spurious "Unnamed: 0" column when the
# CSVs are read again.
training.to_csv("training.csv", index=False)
valid.to_csv("valid.csv", index=False)

## Train language model

In [4]:
# DataLoaders for fine-tuning the language model on the review text.
# The seed fixes the internal 90/10 train/valid split so runs are repeatable.
dls_lm = ft.TextDataLoaders.from_df(
    training, valid_pct=0.1, seed=42, text_col='Review',
    label_col='RatingValue', is_lm=True)

# DataLoaders for the classifier.
# text_vocab MUST be the language model's vocabulary: the fine-tuned encoder
# is loaded into the classifier later, and its embedding rows are indexed by
# token id. A vocabulary built independently here can order or size tokens
# differently, which makes the loaded weights meaningless (or fail to load).
dls_cls = ft.TextDataLoaders.from_df(
    training, valid_pct=0.1, seed=42, text_col='Review',
    label_col='RatingValue', text_vocab=dls_lm.vocab, is_lm=False)

Due to IPython and Windows limitation, python multiprocessing isn't available now.
So `n_workers` has to be changed to 0 to avoid getting stuck


  return array(a, dtype, copy=False, order=order)


Due to IPython and Windows limitation, python multiprocessing isn't available now.
So `n_workers` has to be changed to 0 to avoid getting stuck


  return array(a, dtype, copy=False, order=order)


Due to IPython and Windows limitation, python multiprocessing isn't available now.
So `number_workers` is changed to 0 to avoid getting stuck


In [5]:
# Language-model learner on the AWD-LSTM architecture, reporting next-token
# accuracy and perplexity, with weight decay 0.1 and mixed-precision training.
learn_lm = ft.language_model_learner(
    dls_lm,
    ft.AWD_LSTM,
    metrics=[ft.accuracy, ft.Perplexity()],
    wd=0.1,
).to_fp16()

In [6]:
# First fine-tuning pass: fastai runs its default single frozen epoch and then
# the two requested epochs (see the two result tables below), at base LR 1e-2.
learn_lm.fine_tune(epochs=2, base_lr=1e-2)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.874808,4.192383,0.257423,66.180328,00:03


epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.262172,3.852232,0.274178,47.098057,00:02
1,4.048535,3.753575,0.286258,42.673351,00:02


In [7]:
# Make every layer group trainable and continue fine-tuning the whole language
# model for 10 one-cycle epochs with a lower peak learning rate (1e-3).
learn_lm.unfreeze()
learn_lm.fit_one_cycle(n_epoch=10, lr_max=1e-3)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.85277,3.726527,0.288256,41.534607,00:02
1,3.814104,3.679998,0.293008,39.646332,00:02
2,3.743138,3.642814,0.297782,38.199177,00:02
3,3.654687,3.622003,0.29638,37.412415,00:02
4,3.559993,3.599943,0.299401,36.596153,00:02
5,3.474667,3.591141,0.302729,36.275459,00:02
6,3.379064,3.596891,0.301199,36.484642,00:02
7,3.311077,3.595577,0.301939,36.436718,00:02
8,3.246224,3.598185,0.301199,36.531879,00:02
9,3.204222,3.598272,0.300509,36.535061,00:02


In [8]:
# Save only the encoder of the fine-tuned language model under the name
# 'finetuned_lm'; the classifier loads it by that same name further down.
learn_lm.save_encoder('finetuned_lm')

## Train text classifier

In [9]:
# Text classifier on the AWD-LSTM architecture with the dropout multiplier set
# to 0.5, tracking accuracy, trained in mixed precision.
learn_cls = ft.text_classifier_learner(
    dls_cls,
    ft.AWD_LSTM,
    drop_mult=0.5,
    metrics=ft.accuracy,
).to_fp16()

In [10]:
# Replace the classifier's encoder weights with the ones saved from the
# fine-tuned language model ('finetuned_lm').
learn_cls = learn_cls.load_encoder('finetuned_lm')

In [11]:
# Stage 1 of the classifier schedule: 5 one-cycle epochs at peak LR 2e-2,
# before any layer groups are explicitly unfrozen in the following cells.
learn_cls.fit_one_cycle(n_epoch=5, lr_max=2e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.971501,1.025177,0.430769,00:01
1,0.777691,1.066128,0.476923,00:01
2,0.675391,1.213193,0.584615,00:01
3,0.590992,1.091697,0.615385,00:01
4,0.52784,1.155475,0.646154,00:01


In [12]:
# Stage 2: train only the last two layer groups. The slice spreads learning
# rates across groups, from 1e-2 / 2.6**4 for the earliest up to 1e-2 for the
# last one (discriminative learning rates).
learn_cls.freeze_to(-2)
learn_cls.fit_one_cycle(
    n_epoch=5,
    lr_max=slice(1e-2/(2.6**4), 1e-2),
)

epoch,train_loss,valid_loss,accuracy,time
0,0.403745,1.493863,0.569231,00:01
1,0.423751,1.506846,0.630769,00:01
2,0.388541,1.488972,0.584615,00:01
3,0.325814,1.530447,0.615385,00:01
4,0.277782,1.48734,0.615385,00:01


In [13]:
# Stage 3: train the last three layer groups, with the peak learning rate
# halved to 5e-3 and the same 2.6**4 spread between earliest and last group.
learn_cls.freeze_to(-3)
learn_cls.fit_one_cycle(
    n_epoch=5,
    lr_max=slice(5e-3/(2.6**4), 5e-3),
)

epoch,train_loss,valid_loss,accuracy,time
0,0.147134,1.579533,0.630769,00:01
1,0.141701,1.757116,0.584615,00:01
2,0.147225,1.770997,0.569231,00:01
3,0.130054,2.012062,0.553846,00:01
4,0.110199,1.941315,0.569231,00:01


In [14]:
# Stage 4: make the whole model trainable and run 10 more one-cycle epochs at
# a small peak learning rate (1e-3), again spread by 2.6**4 across groups.
learn_cls.unfreeze()
learn_cls.fit_one_cycle(
    n_epoch=10,
    lr_max=slice(1e-3/(2.6**4), 1e-3),
)

epoch,train_loss,valid_loss,accuracy,time
0,0.053855,1.906933,0.584615,00:01
1,0.05373,1.87003,0.6,00:01
2,0.049566,1.855609,0.630769,00:01
3,0.047644,1.877927,0.630769,00:01
4,0.043732,1.875014,0.630769,00:01
5,0.037362,1.912004,0.630769,00:01
6,0.034953,1.93533,0.630769,00:01
7,0.032887,1.969751,0.630769,00:01
8,0.03031,1.953644,0.630769,00:01
9,0.027468,1.964167,0.630769,00:01


In [15]:
# Export the trained classifier to 'fine_tuned.pkl' so the deliverable cells
# below can reload it without repeating the training.
learn_cls.export(fname="fine_tuned.pkl")

## Deliverable

In [16]:
import pandas as pd
from sklearn import metrics
import numpy as np
from fastai.text.all import *

# Reload the exported classifier and the held-out validation split from disk.
# NOTE(review): load_learner unpickles the file -- only load exports you trust.
learn = load_learner('fine_tuned.pkl')
validation = pd.read_csv("valid.csv")

In [17]:
# Predict a rating for every review in the validation split.
#
# learn.predict returns three items per review; the first is the decoded class
# label as a string. `preds` keeps the flat concatenation of all returned
# items (three per review, as before) and `result` holds the predicted label
# of each review as an int, in row order.
#
# The label is appended as a whole value. Extending the list with the label
# string would split it into characters and silently break for any label
# longer than one character.
preds = []
result = []
for review in validation['Review']:
    pred = learn.predict(review)
    preds.extend(pred)
    result.append(int(pred[0]))

In [18]:
# Evaluate the predictions against the true ratings of the validation split.
# Rating codes in their sorted order and the display names used for them in the
# confusion matrix (presumably 0=negative, 1=neutral, 2=positive -- confirm
# against how reviews.csv was labelled).
class_ids = [0, 1, 2]
class_names = ['negative', 'neutral', 'positive']
y_true = validation['RatingValue']

# calculate the accuracy (fraction of exactly matching labels)
accuracy = round(metrics.accuracy_score(y_true, result), 2)
# calculate the F1 score, weighted by the number of true rows in each class
F1_score = round(metrics.f1_score(y_true, result, average='weighted'), 2)
# print out the result
print("accuracy:", accuracy, "\n")
print("F1_score:", F1_score, "\n")
print("Confusion_matrix:")
# generate the confusion matrix (rows = true class, columns = predicted class).
# Passing labels explicitly guarantees a 3x3 matrix even if some class never
# appears in the truth or the predictions; without it the matrix would shrink
# and no longer fit the three row/column names.
conf = pd.DataFrame(
    metrics.confusion_matrix(y_true, result, labels=class_ids),
    index=class_names,
    columns=class_names)
print(conf)

accuracy: 0.62 

F1_score: 0.62 

Confusion_matrix:
          negative  neutral  positive
negative        15       17         6
neutral          4       40        13
positive         1       21        48
