In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import TweetTokenizer
import string
import contractions

### Step 1 - load the data and take a look

In [5]:
col_names = ['label', 'id', 'date', 'flag', 'user', 'text']
df = pd.read_csv('bert_model/sentiment140/training.csv', encoding='latin-1', names=col_names)
df = df[['text', 'label']]
print(df.head(2))
print("\nThere are {} tweets in total.\n".format(len(df)))
print("There are {} positve tweets".format(np.sum(df["label"] == 4)))
print("There are {} neural tweets".format(np.sum(df["label"] == 2)))
print("There are {} negative tweets".format(np.sum(df["label"] == 0)))

                                                text  label
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...      0
1  is upset that he can't update his Facebook by ...      0

There are 1600000 tweets in total.

There are 800000 positve tweets
There are 0 neural tweets
There are 800000 negative tweets


### Step 2 - Generate train, val, test data set

In [6]:
# test = pd.read_csv('data/test.csv', encoding='latin-1', names=col_names)
# X_test = test["text"].values.tolist()
# y_test = test["label"].values.tolist()

# train, val = train_test_split(df, test_size=0.1)
# X_train = train["text"].values.tolist()
# y_train = train["label"].values.tolist()

# X_val = val["text"].values.tolist()
# y_val = val["label"].values.tolist()

# print("There are {} train set".format(len(X_train)))
# print("There are {} validation set".format(len(X_val)))
# print("There are {} test set".format(len(X_test)))

In [7]:
def remove_mention(text):
    return re.sub('@[^\s]+','',text)

text = "@David hi"
remove_mention(text)

' hi'

In [8]:
def remove_html(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    return souped

text = "&amp; I like it"
remove_html(text)

'& I like it'

In [9]:
def remove_url(text):
    return re.sub(r'http\S+', '', text)

text = "Check this video http://bit.ly/IMXUM"
remove_url(text)

'Check this video '

In [10]:
def remove_hashtag(text):
    return re.sub(r'#\w*', '', text)

text = "#trumpisidiot hahah"
remove_hashtag(text)

' hahah'

In [11]:
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

text = 'string with "punctuation" inside of it! Does this work? I hope so.'
remove_punctuation(text)

'string with punctuation inside of it Does this work I hope so'

In [12]:
def fix_contractions(text):
    return contractions.fix(text)

text = "I can't help you!"
fix_contractions(text)

'I can not help you!'

In [13]:
def preprocess(text):
    text = remove_html(text)
    text = remove_mention(text)
    text = remove_url(text)
    text = remove_hashtag(text)
    text = fix_contractions(text)

    words = TweetTokenizer().tokenize(text)
    words = [word.lower() for word in words]
    # Remove punctuation
    words = [remove_punctuation(word) for word in words]
    # Remove words that contain numeric values
    words = [word for word in words if word.isalpha()]
    
    return " ".join(words)

text = """
boy's cars
I can't help you. #Fucker. 
Check this out http://shit_url
Lebron..sh*t was hilarious...LMAO!!!
"""
preprocess(text)


'boys cars i can not help you check this out lebron sh t was hilarious lmao'

In [14]:
index = 0
max_length = 0
for i, row in df.iterrows():
    if index % 1000 == 0:
        print("Finished processing {} training data".format(index))
    index += 1
    text = df.at[i,'text']
    text = preprocess(text)
    
    length = len(text.split())
    if length > max_length:
        max_length = length
        
    df.at[i,'text'] = text

Finished processing 0 training data
Finished processing 1000 training data
Finished processing 2000 training data
Finished processing 3000 training data
Finished processing 4000 training data
Finished processing 5000 training data
Finished processing 6000 training data
Finished processing 7000 training data
Finished processing 8000 training data
Finished processing 9000 training data
Finished processing 10000 training data
Finished processing 11000 training data
Finished processing 12000 training data
Finished processing 13000 training data
Finished processing 14000 training data
Finished processing 15000 training data
Finished processing 16000 training data
Finished processing 17000 training data
Finished processing 18000 training data
Finished processing 19000 training data
Finished processing 20000 training data
Finished processing 21000 training data
Finished processing 22000 training data
Finished processing 23000 training data
Finished processing 24000 training data
Finished proc

Finished processing 203000 training data
Finished processing 204000 training data
Finished processing 205000 training data
Finished processing 206000 training data
Finished processing 207000 training data
Finished processing 208000 training data
Finished processing 209000 training data
Finished processing 210000 training data
Finished processing 211000 training data
Finished processing 212000 training data
Finished processing 213000 training data
Finished processing 214000 training data
Finished processing 215000 training data
Finished processing 216000 training data
Finished processing 217000 training data
Finished processing 218000 training data
Finished processing 219000 training data
Finished processing 220000 training data
Finished processing 221000 training data
Finished processing 222000 training data
Finished processing 223000 training data
Finished processing 224000 training data
Finished processing 225000 training data
Finished processing 226000 training data
Finished process

Finished processing 403000 training data
Finished processing 404000 training data
Finished processing 405000 training data
Finished processing 406000 training data
Finished processing 407000 training data
Finished processing 408000 training data
Finished processing 409000 training data
Finished processing 410000 training data
Finished processing 411000 training data
Finished processing 412000 training data
Finished processing 413000 training data
Finished processing 414000 training data
Finished processing 415000 training data
Finished processing 416000 training data
Finished processing 417000 training data
Finished processing 418000 training data
Finished processing 419000 training data
Finished processing 420000 training data
Finished processing 421000 training data
Finished processing 422000 training data
Finished processing 423000 training data
Finished processing 424000 training data
Finished processing 425000 training data
Finished processing 426000 training data
Finished process

Finished processing 603000 training data
Finished processing 604000 training data
Finished processing 605000 training data
Finished processing 606000 training data
Finished processing 607000 training data
Finished processing 608000 training data
Finished processing 609000 training data
Finished processing 610000 training data
Finished processing 611000 training data
Finished processing 612000 training data
Finished processing 613000 training data
Finished processing 614000 training data
Finished processing 615000 training data
Finished processing 616000 training data
Finished processing 617000 training data
Finished processing 618000 training data
Finished processing 619000 training data
Finished processing 620000 training data
Finished processing 621000 training data
Finished processing 622000 training data
Finished processing 623000 training data
Finished processing 624000 training data
Finished processing 625000 training data
Finished processing 626000 training data
Finished process

Finished processing 803000 training data
Finished processing 804000 training data
Finished processing 805000 training data
Finished processing 806000 training data
Finished processing 807000 training data
Finished processing 808000 training data
Finished processing 809000 training data
Finished processing 810000 training data
Finished processing 811000 training data
Finished processing 812000 training data
Finished processing 813000 training data
Finished processing 814000 training data
Finished processing 815000 training data
Finished processing 816000 training data
Finished processing 817000 training data
Finished processing 818000 training data
Finished processing 819000 training data
Finished processing 820000 training data
Finished processing 821000 training data
Finished processing 822000 training data
Finished processing 823000 training data
Finished processing 824000 training data
Finished processing 825000 training data
Finished processing 826000 training data
Finished process

Finished processing 1003000 training data
Finished processing 1004000 training data
Finished processing 1005000 training data
Finished processing 1006000 training data
Finished processing 1007000 training data
Finished processing 1008000 training data
Finished processing 1009000 training data
Finished processing 1010000 training data
Finished processing 1011000 training data
Finished processing 1012000 training data
Finished processing 1013000 training data
Finished processing 1014000 training data
Finished processing 1015000 training data
Finished processing 1016000 training data
Finished processing 1017000 training data
Finished processing 1018000 training data
Finished processing 1019000 training data
Finished processing 1020000 training data
Finished processing 1021000 training data
Finished processing 1022000 training data
Finished processing 1023000 training data
Finished processing 1024000 training data
Finished processing 1025000 training data
Finished processing 1026000 traini

Finished processing 1199000 training data
Finished processing 1200000 training data
Finished processing 1201000 training data
Finished processing 1202000 training data
Finished processing 1203000 training data
Finished processing 1204000 training data
Finished processing 1205000 training data
Finished processing 1206000 training data
Finished processing 1207000 training data
Finished processing 1208000 training data
Finished processing 1209000 training data
Finished processing 1210000 training data
Finished processing 1211000 training data
Finished processing 1212000 training data
Finished processing 1213000 training data
Finished processing 1214000 training data
Finished processing 1215000 training data
Finished processing 1216000 training data
Finished processing 1217000 training data
Finished processing 1218000 training data
Finished processing 1219000 training data
Finished processing 1220000 training data
Finished processing 1221000 training data
Finished processing 1222000 traini

Finished processing 1395000 training data
Finished processing 1396000 training data
Finished processing 1397000 training data
Finished processing 1398000 training data
Finished processing 1399000 training data
Finished processing 1400000 training data
Finished processing 1401000 training data
Finished processing 1402000 training data
Finished processing 1403000 training data
Finished processing 1404000 training data
Finished processing 1405000 training data
Finished processing 1406000 training data
Finished processing 1407000 training data
Finished processing 1408000 training data
Finished processing 1409000 training data
Finished processing 1410000 training data
Finished processing 1411000 training data
Finished processing 1412000 training data
Finished processing 1413000 training data
Finished processing 1414000 training data
Finished processing 1415000 training data
Finished processing 1416000 training data
Finished processing 1417000 training data
Finished processing 1418000 traini

Finished processing 1591000 training data
Finished processing 1592000 training data
Finished processing 1593000 training data
Finished processing 1594000 training data
Finished processing 1595000 training data
Finished processing 1596000 training data
Finished processing 1597000 training data
Finished processing 1598000 training data
Finished processing 1599000 training data


In [15]:
df.to_csv('train.csv', index=False)
max_length

116

In [61]:
X_train = [preprocess(text) for text in X_train]
X_val = [preprocess(text) for text in X_val]
X_test = [preprocess(text) for text in X_test]

print("There are {} train set".format(len(X_train)))
print("There are {} validation set".format(len(X_val)))
print("There are {} test set".format(len(X_test)))

There are 1440000 train set
There are 160000 validation set
There are 498 test set


In [65]:
import torch
import apex
from pytorch_pretrained_bert.tokenization import BertTokenizer
from fast_bert.data import BertDataBunch
from fast_bert.learner import BertLearner
from fast_bert.metrics import accuracy

TypeError: Class advice impossible in Python3.  Use the @implementer class decorator instead.