In [1]:
import pandas as pd

In [2]:
# Load the review corpus from a project-relative CSV. Later cells read its
# 'reviews' and 'ratings' columns.
df = pd.read_csv('data/small_corpus.csv')

In [4]:
# Drop rows without review text: normalizeTweet calls str methods on every
# review, which would fail on a missing (NaN) value.
df.dropna(subset = ["reviews"], inplace=True)

# Keep the preview as the cell's last expression so that it is actually
# rendered, and so that it shows the frame after the rows were dropped.
df.head()

In [5]:
from nltk.tokenize import TweetTokenizer
from emoji import demojize
import re

# Single TweetTokenizer instance shared by every normalizeTweet call.
tokenizer = TweetTokenizer()

def normalizeToken(token):
    """Map a single token to its normalized form.

    Rules, checked in this order:
      * a token starting with "@" becomes "@USER";
      * a token starting (case-insensitively) with "http" or "www" becomes
        "HTTPURL";
      * the typographic apostrophe "’" becomes "'" and the ellipsis
        character "…" becomes "...";
      * any other single-character token is passed through ``demojize``;
      * everything else is returned unchanged.

    Args:
        token: one token string.

    Returns:
        The normalized token string.
    """
    if token.startswith("@"):
        return "@USER"

    lowered = token.lower()
    if lowered.startswith(("http", "www")):
        return "HTTPURL"

    # Both of these are one character long, so they have to be tested before
    # the length-1 branch; placed after it they can never be reached.
    if token == "’":
        return "'"
    if token == "…":
        return "..."

    if len(token) == 1:
        return demojize(token)
    return token

def normalizeTweet(tweet):
    """Tokenize ``tweet``, normalize each token and tidy the joined result.

    The text is first stripped of typographic apostrophes and ellipsis
    characters, split with the module-level ``tokenizer``, and each token is
    passed through ``normalizeToken``. The space-joined tokens then go through
    a fixed sequence of literal replacements (contractions, "a.m."/"p.m.")
    and three regex substitutions that re-join number groups the tokenizer
    split apart. Whitespace runs are finally collapsed to single spaces.

    Args:
        tweet: the raw text; must be a ``str``.

    Returns:
        The normalized text as a single-spaced string.
    """
    prepared = tweet.replace("’", "'").replace("…", "...")
    joined = " ".join(normalizeToken(piece) for piece in tokenizer.tokenize(prepared))

    # Order matters: later replacements operate on the output of earlier ones.
    literal_rewrites = (
        ("cannot ", "can not "),
        ("n't ", " n't "),
        ("n 't ", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("'m ", " 'm "),
        ("'re ", " 're "),
        ("'s ", " 's "),
        ("'ll ", " 'll "),
        ("'d ", " 'd "),
        ("'ve ", " 've "),
        (" p . m .", "  p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )
    for old, new in literal_rewrites:
        joined = joined.replace(old, new)

    # Glue back number groups separated by ",", "/" or "-".
    pattern_rewrites = (
        (r",([0-9]{2,4}) , ([0-9]{2,4})", r",\1,\2"),
        (r"([0-9]{1,3}) / ([0-9]{2,4})", r"\1/\2"),
        (r"([0-9]{1,3})- ([0-9]{2,4})", r"\1-\2"),
    )
    for pattern, replacement in pattern_rewrites:
        joined = re.sub(pattern, replacement, joined)

    return " ".join(joined.split())

In [6]:
import re
import string
def clean_text(text):
    '''Lowercase ``text`` and strip noise from it.

    Applied in order: convert to ``str`` and lowercase; remove text in square
    brackets; remove ``http(s)://`` and ``www.`` links; remove ``<...>`` tags;
    remove ASCII punctuation; remove newline characters; remove every word
    that contains a digit.

    Whitespace is not collapsed, so a removed span can leave consecutive
    spaces behind.

    Args:
        text: any value; it is converted with ``str()`` first.

    Returns:
        The cleaned, lowercased string.
    '''
    # Raw strings keep regex escapes such as \[ \S \w from being read as
    # (invalid) Python string escapes.
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = text.replace('\n', '')
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [7]:
# Two element-wise passes over the reviews: normalizeTweet first, then
# clean_text on its output. The result is stored in a new 'text' column.
df['text'] = df['reviews'].map(normalizeTweet).map(clean_text)

In [8]:
def score_to_Target(value):
    """Bucket a numeric rating into one of three class labels.

    Args:
        value: the rating (int or float).

    Returns:
        2 when ``value >= 5``; 1 when ``2 <= value < 5``; 0 otherwise.
        NaN compares False against everything and therefore yields 0.
    """
    if value >= 5:
        return 2
    # Only the lower bound is needed here. An explicit upper bound of 4 would
    # leave fractional ratings between 4 and 5 falling through to class 0.
    if value >= 2:
        return 1
    return 0

In [9]:
# Derive each row's class label from its rating.
df['labels'] = df['ratings'].apply(score_to_Target)

In [10]:
from sklearn.utils import shuffle
# Seed the shuffle so the row order, the preview below and the later
# train/validation split come out the same on every run.
df = shuffle(df, random_state=42)

In [11]:
# Preview the shuffled frame, which now carries the derived 'text' and
# 'labels' columns next to the original ones.
df.head()

Unnamed: 0.1,Unnamed: 0,ratings,reviews,text,labels
1434,485154,1,The code has been used.. wasted all my time an...,the code has been used wasted all my time and...,0
1670,111595,2,I guess if you have a young child that wants t...,i guess if you have a young child that wants t...,1
3568,273656,5,Sony did several things wrong with the Vita of...,sony did several things wrong with the vita of...,2
2705,41885,4,I'll skip the history lesson and get straight ...,i ll skip the history lesson and get straight ...,1
1833,266170,2,I hooked up the Turtle Beach Ear Force DSS2 to...,i hooked up the turtle beach ear force dss to...,1


In [12]:
from sklearn.model_selection import train_test_split
# Split into training and validation frames using train_test_split's default
# split size, stratified on the label column and with a fixed seed.
train_df, val_df = train_test_split(df, random_state=42, stratify=df["labels"])

In [22]:
# Cleaned training reviews as a plain Python list.
text = train_df['text'].tolist()

In [23]:
# Write the training reviews to disk, one per line.
with open('data/train.txt', mode='w', encoding='utf-8') as train_out:
    train_out.write('\n'.join(text))

In [24]:
# Cleaned validation reviews as a plain Python list.
val_text = val_df['text'].tolist()

In [26]:
# Write the validation reviews to disk, one per line.
with open('data/val.txt', mode='w', encoding='utf-8') as val_out:
    val_out.write('\n'.join(val_text))

In [29]:
from simpletransformers.language_modeling import (
    LanguageModelingModel,LanguageModelingArgs
)

In [31]:
# Configuration object for the language-modeling run; every option not set
# below keeps the library default.
model_args = LanguageModelingArgs()
model_args.sliding_window = True
# Rebuild the processed inputs and allow writing into an existing output
# directory. NOTE(review): exact semantics are defined by simpletransformers; confirm.
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.num_train_epochs = 1  # a single epoch
# NOTE(review): presumably "simple" treats each line of the input file as one
# example, which matches the one-review-per-line files written earlier; confirm.
model_args.dataset_type = "simple"

In [33]:
# Paths of the text files written by the earlier cells.
train_file = "data/train.txt"
# Holds the validation split; it is passed to train_model as eval_file.
test_file = "data/val.txt"

In [34]:
# Build the language model: model type "distilbert", model name
# "distilbert-base-uncased", configured with model_args.
model = LanguageModelingModel(
    "distilbert", "distilbert-base-uncased", args=model_args
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




In [None]:
# Train on the training file, passing the validation file as eval_file.
model.train_model(train_file, eval_file=test_file)

HBox(children=(FloatProgress(value=0.0, max=3372.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=840.0, style=ProgressStyle(des…




