In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import codecs
import os
from tabulate import tabulate
from typing import Literal, List
import re

sns.set_style("whitegrid")

In [2]:
def get_filenames(dataset: Literal["train","test"], path="data/downloaded/") -> List[str]:
    if dataset == "train":
        r = '[a-z]+-(\d{4})'
        onlyfiles = [f for f in os.listdir("data/downloaded") if os.path.isfile(os.path.join("data/downloaded", f)) and os.path.join("data/downloaded", f)[-5] == "A"]
        return [f for f in onlyfiles if f not in ["twitter-2016dev-A.tsv", "twitter-2016devtest-A.tsv", "twitter-2016train-A.tsv"]]
    return ["twitter-2016dev-A.tsv", "twitter-2016devtest-A.tsv", "twitter-2016train-A.tsv"]

In [4]:
def concatenate_data(dataset: Literal["train","test"], path="data/downloaded/") -> None:
    # write all data from previous years to new file, as in paper
    filenames = get_filenames(dataset, path=path)

    with codecs.open(f'{path}twitter-{dataset}.tsv','w', encoding="utf-8") as outfile:
        for fname in filenames:
            with codecs.open(path+fname, "r",encoding="utf-8") as infile:
                for line in infile:
                    outfile.write(line)

In [5]:
concatenate_data("train")
concatenate_data("test")

Load into dataframes

In [3]:
# train = pd.read_csv("data/downloaded/twitter-train.tsv", sep='\t', usecols=[1,2], names=["label", "text"],encoding="utf-8")
# test = pd.read_csv("data/downloaded/twitter-test.tsv", sep='\t', usecols=[1,2], names=["label", "text"],encoding="utf-8")
# # dev = pd.read_csv("data/downloaded/twitter-2016dev-A.tsv", sep='\t', usecols=[1,2], names=["label", "text"])

# print(f"train: {round(train.shape[0]/(train.shape[0]+test.shape[0])*100)}%, test: {round(test.shape[0]/(train.shape[0]+test.shape[0])*100)}%")
# print(f"train: {train.shape[0]}, test: {test.shape[0]}")

train: 80%, test: 20%
train: 40297, test: 9834


In [17]:
import pickle

with open("data/bert_tokenized/bert_train_data.pkl", "rb") as f:
    train = pickle.load(f)

with open("data/bert_tokenized/bert_test_data.pkl","rb") as f:
    test = pickle.load(f)

print(f"train: {round(train.shape[0]/(train.shape[0]+test.shape[0])*100)}%, test: {round(test.shape[0]/(train.shape[0]+test.shape[0])*100)}%")
print(f"train: {train.shape[0]}, test: {test.shape[0]}")

train: 80%, test: 20%
train: 40280, test: 9827


class imbalances

In [5]:
train_class_counts = train.groupby('label').count().rename({"text":"train"}, axis=1)
test_class_counts = test.groupby('label').count().rename({"text":"test"}, axis=1)
# dev_class_counts = dev.groupby('label').count().rename({"text":"dev"}, axis=1)

#TODO we moeten nog dev hebben: 80/20 split van tex


pd.concat([train_class_counts, test_class_counts], axis=1)

Unnamed: 0_level_0,train,test
label,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,6242,1566
neutral,19096,3425
positive,14959,4836


preprocessing:
remove links? 
remove @'s ?
punctuation etc? 


In [62]:
train.head()

Unnamed: 0,label,text
0,neutral,"Picturehouse's, Pink Floyd's, 'Roger Waters: T..."
1,neutral,Order Go Set a Watchman in store or through ou...
2,negative,If these runway renovations at the airport pre...
3,neutral,If you could ask an onstage interview question...
4,positive,A portion of book sales from our Harper Lee/Go...


In [38]:
# preprocessing:

def preprocess(data: pd.DataFrame, name: str) -> pd.DataFrame:
        
    # remove urls
    data_clean = data.copy()
    data_clean['text']=data_clean.text.apply(lambda x: re.sub(r'https?:\/\/t\.co\/\w+','',x))

    # remove emailadresses
    data_clean['text']=data_clean.text.apply(lambda x: re.sub(r'\w+@\w+\.[a-z]+','',x))

    # remove html ref
    data_clean['text']=data_clean.text.apply(lambda x: re.sub(r'&[a-z]+;', '', x))

    data_clean['text']=data_clean.text.apply(lambda x: re.sub(r'&amp', '&', x))

    # remove unicode
    # data_clean['text']=data_clean.text.apply(lambda x: re.sub(r'\\u[a-z0-9]{4}', '', x))
    data_clean['text']=data_clean.text.apply(lambda x: re.sub("RT : ", '',x))

    # remove all @ mention
    data_clean['text']=data_clean.text.apply(lambda x: re.sub(r'@\w+', '', x))

    # remove non-letter characters, numbers 
    data_clean['text']=data_clean.text.apply(lambda x: re.sub(r"[^a-zA-Z\s\:\];='\.\!\?\,]", '', x))
    data_clean['text']=data_clean.text.apply(lambda x: re.sub(r"([.,!?\s])st([.,!?\s])", r'\1\2', x))
    data_clean['text']=data_clean.text.apply(lambda x: re.sub(r"([.,!?\s])th([.,!?\s])", r'\1\2', x))
    data_clean['text']=data_clean.text.apply(lambda x: re.sub(r"([.,!?\s])nd([.,!?\s])", r'\1\2', x))
    data_clean['text']=data_clean.text.apply(lambda x: re.sub(r"([.,!?\s])rd([.,!?\s])", r'\1\2', x))


    # remove trailing whitespace
    # data_clean['text']=data_clean.text.apply(lambda x: re.sub(r'[\t \v]+', '', x).strip())

    data_clean.to_csv(f"data/downloaded/twitter-{name}-clean.tsv", sep="\t", index=False,header=False,encoding="utf-8")

    return data_clean
# overige dingen: afkortingen? getallen kunnen belangrijk zijn

In [39]:
train_cleaned = preprocess(train, "train")
test_cleaned = preprocess(test, "test")

In [18]:
train.head()

Unnamed: 0,label,text
0,positive,Gas by my house hit .!!!! Ium going to Chapel ...
1,negative,Theo Walcott is still shituc watch Rafa and Jo...
2,negative,its not that Ium a GSP fanuc i just hate Nick ...
3,negative,Iranian general says Israelus Iron Dome canut ...
4,neutral,Tehranuc Mon Amour: Obama Tried to Establish T...


In [19]:
# concat + flatten
import re
import string
all_data = pd.concat([train,test])
terms = []
regex = re.compile('[%s]' % re.escape(string.punctuation))


for index, row in all_data.iterrows():
    tweet = row.text.lower()
    terms.extend(regex.sub('',tweet).split(" "))

In [21]:
whitespace = ['', " ", "\t", "\n"]
terms_cleaned = [t for t in terms if t not in whitespace]
len(terms_cleaned)

906705

In [23]:
terms_cleaned

['gas',
 'by',
 'my',
 'house',
 'hit',
 'ium',
 'going',
 'to',
 'chapel',
 'hill',
 'on',
 'sat',
 'theo',
 'walcott',
 'is',
 'still',
 'shituc',
 'watch',
 'rafa',
 'and',
 'johnny',
 'deal',
 'with',
 'him',
 'on',
 'saturday',
 'its',
 'not',
 'that',
 'ium',
 'a',
 'gsp',
 'fanuc',
 'i',
 'just',
 'hate',
 'nick',
 'diaz',
 'canut',
 'wait',
 'for',
 'february',
 'iranian',
 'general',
 'says',
 'israelus',
 'iron',
 'dome',
 'canut',
 'deal',
 'with',
 'their',
 'missiles',
 'keep',
 'talking',
 'like',
 'that',
 'and',
 'we',
 'may',
 'end',
 'up',
 'finding',
 'out',
 'tehranuc',
 'mon',
 'amour',
 'obama',
 'tried',
 'to',
 'establish',
 'ties',
 'with',
 'the',
 'mullahs',
 'via',
 'no',
 'barack',
 'obama',
 'vote',
 'mitt',
 'romney',
 'i',
 'sat',
 'through',
 'this',
 'whole',
 'movie',
 'just',
 'for',
 'harry',
 'and',
 'ron',
 'at',
 'christmas',
 'ohlawd',
 'with',
 'j',
 'davlar',
 'main',
 'rivals',
 'are',
 'team',
 'poland',
 'hopefully',
 'we',
 'an',
 'make',


In [36]:
from collections import Counter
top10000_counts = Counter(terms_cleaned[:10000]).most_common()
words = [count_tuple[0] for count_tuple in top10000_counts]
most_common_terms = [t for t in terms if t in words]


KeyboardInterrupt: 

In [38]:
words[:10]

['the', 'to', 'i', 'on', 'a', 'in', 'and', 'for', 'of', 'is']