In [1]:
import pandas as pd

In [2]:
# Load the review corpus; later cells read its "reviews" and "ratings" columns.
df = pd.read_csv('data/small_corpus.csv')

In [3]:
# Drop rows that have no review text, then preview the result.
# The preview is the cell's last expression so the notebook actually renders it,
# and the reassignment (instead of inplace=True) keeps the cell safe to re-run.
df = df.dropna(subset=["reviews"])
df.head()

In [4]:
from nltk.tokenize import TweetTokenizer
from emoji import demojize
import re

# Shared NLTK TweetTokenizer instance; normalizeTweet uses it to split text into tokens.
tokenizer = TweetTokenizer()

def normalizeToken(token):
    """Map a single token to its normalized form.

    Rules, applied in order:
      1. a token starting with "@" becomes "@USER";
      2. a token starting with "http" or "www" (case-insensitive) becomes "HTTPURL";
      3. the typographic apostrophe "’" becomes "'" and the ellipsis "…" becomes "...";
      4. any other single-character token is passed through ``demojize``;
      5. everything else is returned unchanged.

    Args:
        token: one token string.

    Returns:
        The normalized token string.
    """
    if token.startswith("@"):
        return "@USER"

    lowered = token.lower()
    if lowered.startswith(("http", "www")):
        return "HTTPURL"

    # These two checks must come before the length-1 branch: "’" and "…" are
    # single characters, so testing them afterwards made them unreachable.
    if token == "’":
        return "'"
    if token == "…":
        return "..."

    if len(token) == 1:
        return demojize(token)

    return token

def normalizeTweet(tweet):
    """Tokenize ``tweet``, normalize every token, and tidy the joined result.

    Steps:
      1. replace the typographic apostrophe/ellipsis with ASCII equivalents;
      2. tokenize with the module-level ``tokenizer`` and map each token
         through ``normalizeToken``;
      3. apply a fixed, ordered list of literal rewrites for contractions
         and "a.m."/"p.m." fragments;
      4. re-join number fragments that ended up separated by " , ", " / "
         or "- ";
      5. collapse all runs of whitespace to single spaces.

    Args:
        tweet: the raw text (must support ``str.replace``).

    Returns:
        The normalized, single-spaced string.
    """
    ascii_text = tweet.replace("’", "'").replace("…", "...")
    normalized_tokens = [normalizeToken(tok) for tok in tokenizer.tokenize(ascii_text)]
    text = " ".join(normalized_tokens)

    # Order matters: later rewrites operate on the output of earlier ones.
    literal_rewrites = (
        ("cannot ", "can not "),
        ("n't ", " n't "),
        ("n 't ", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("'m ", " 'm "),
        ("'re ", " 're "),
        ("'s ", " 's "),
        ("'ll ", " 'll "),
        ("'d ", " 'd "),
        ("'ve ", " 've "),
        (" p . m .", "  p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )
    for old, new in literal_rewrites:
        text = text.replace(old, new)

    # Regex rewrites that glue split numeric fragments back together.
    numeric_rejoins = (
        (r",([0-9]{2,4}) , ([0-9]{2,4})", r",\1,\2"),
        (r"([0-9]{1,3}) / ([0-9]{2,4})", r"\1/\2"),
        (r"([0-9]{1,3})- ([0-9]{2,4})", r"\1-\2"),
    )
    for pattern, replacement in numeric_rejoins:
        text = re.sub(pattern, replacement, text)

    return " ".join(text.split())

In [5]:
import re
import string
def clean_text(text):
    '''Lowercase ``text`` and strip noise from it.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Then, in order:
      1. text inside square brackets is removed;
      2. ``http(s)://...`` and ``www....`` links are removed;
      3. ``<...>`` tags are removed;
      4. every character in ``string.punctuation`` is removed;
      5. newlines are replaced by a single space (so words on adjacent
         lines are not fused together);
      6. words containing a digit are removed.

    Whitespace is not otherwise collapsed, so removed spans can leave
    consecutive spaces behind.

    Args:
        text: value to clean; anything accepted by ``str()``.

    Returns:
        The cleaned, lowercased string.
    '''
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being treated
    # as invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [6]:
# Normalize each review with normalizeTweet, then scrub the result with clean_text.
df['text'] = df['reviews'].apply(normalizeTweet).apply(clean_text)

In [7]:
def score_to_Target(value):
    """Convert a numeric rating into a 3-way class label.

    Args:
        value: the rating (int or float).

    Returns:
        2 when ``value >= 5``, 1 when ``2 <= value < 5``, and 0 otherwise.
        Values for which every comparison is false (e.g. NaN) also yield 0.
    """
    if value >= 5:
        return 2
    # No upper bound is needed here: anything >= 5 has already returned, and
    # bounding at 4 would send fractional ratings such as 4.5 to class 0.
    if value >= 2:
        return 1
    return 0

In [8]:
# Derive the class label for every row from its rating.
df['labels'] = df['ratings'].apply(score_to_Target)

In [9]:
from sklearn.utils import shuffle

# Shuffle with a fixed seed: an unseeded shuffle changes the row order on every
# run, which would make the later seeded train/validation split differ too.
df = shuffle(df, random_state=42)

In [10]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging

In [11]:
# Only surface errors: raise the threshold on the "transformers" logger and
# configure the root logger at the same level.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)
logging.basicConfig(level=logging.ERROR)

In [12]:
from sklearn.model_selection import train_test_split

# Hold out a validation set, stratified on the class label and seeded so the
# split is repeatable for a given row order.
train_df, val_df = train_test_split(df, random_state=42, stratify=df["labels"])

In [13]:
# Training configuration for the classifier, grouped by concern.
model_args = ClassificationArgs(num_train_epochs=4)

# Optimisation and input sizing
model_args.learning_rate = 2e-5
model_args.train_batch_size = 32
model_args.eval_batch_size = 32
model_args.max_seq_length = 128
model_args.sliding_window = True

# Evaluation during training and early stopping on the validation loss
model_args.evaluate_during_training = True
model_args.use_cached_eval_features = True
model_args.use_early_stopping = True
model_args.early_stopping_metric = 'eval_loss'
model_args.early_stopping_metric_minimize = True

# Output locations and checkpointing
model_args.output_dir = 'output/'
model_args.best_model_dir = 'bestmodel/'
model_args.overwrite_output_dir = True
model_args.save_steps = -1  # presumably disables step-based checkpoints; confirm in the library docs
model_args.save_model_every_epoch = False

# Feature caching
model_args.reprocess_input_data = True

In [14]:
# Build a 3-class ALBERT classifier configured by model_args.
model = ClassificationModel(
    model_type="albert",
    model_name="albert-base-v2",
    num_labels=3,
    args=model_args,
)

In [15]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` is loaded (it only worked because an earlier cell
# happened to import another sklearn submodule). This still binds `sklearn`.
import sklearn.metrics

# Train with per-evaluation validation; the extra `accuracy` keyword is reported
# alongside the built-in metrics in the training results.
model.train_model(train_df, eval_df=val_df, accuracy=sklearn.metrics.accuracy_score)

HBox(children=(FloatProgress(value=0.0, max=3374.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 4', max=216.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 4', max=216.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 4', max=216.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 4', max=216.0, style=ProgressStyle(des…





(864,
 {'global_step': [216, 432, 648, 864],
  'mcc': [0.6195408121014373,
   0.7009285405789194,
   0.8040867269196295,
   0.8400567524176431],
  'train_loss': [0.9630818367004395,
   0.48918071389198303,
   0.6318419575691223,
   0.406562477350235],
  'eval_loss': [0.6813986502789162,
   0.5576134591891959,
   0.4232998631290487,
   0.36945050280239133],
  'accuracy': [0.7448888888888889,
   0.8,
   0.8693333333333333,
   0.8933333333333333]})

In [16]:
# Evaluate the trained model on the validation split. Element [0] of the
# returned value holds the metrics read later ('accuracy', 'mcc', 'eval_loss').
evalutionModel = model.eval_model(eval_df=val_df,accuracy = sklearn.metrics.accuracy_score)

HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=74.0, style=ProgressStyle(descri…




In [17]:
# One-row record describing this run: model identity, evaluation metrics, and
# the hyperparameters used. Key order defines the CSV column order.
data = [{
    'modeltype': model_args.model_type,
    'model_name': model_args.model_name,
    **{metric: evalutionModel[0][metric] for metric in ('accuracy', 'mcc', 'eval_loss')},
    'learning_rate': model_args.learning_rate,
    'num_train_epochs': model_args.num_train_epochs,
    'max_length': model_args.max_seq_length,
    'sliding_window': model_args.sliding_window,
    'batch_size': model_args.train_batch_size,
    'comment': "Changed the model to Albert and increasing the learning rate",
}]

In [18]:
import os

In [19]:
# Append this run's record to the running comparison file, creating the file
# on first use.
results_df = pd.DataFrame(data)
if os.path.exists('comparision.csv'):
    results_csv = pd.read_csv('comparision.csv')
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and likewise aligns columns by name.
    results_csv = pd.concat([results_csv, results_df], ignore_index=True)
    results_csv.to_csv('comparision.csv', index=False)
else:
    results_df.to_csv('comparision.csv', index=False)