In [46]:
import pandas as pd
import numpy as np
import re 
import heapq 
import string


from nltk import ngrams
from nltk.tokenize.casual import casual_tokenize
from nltk.stem import WordNetLemmatizer 

In [7]:
# Read the tab-separated dataset. "~" is declared as the quote character,
# presumably so that ordinary quotation marks in tweets stay literal text
# (TODO confirm against the data file).
df = pd.read_csv('data/original.txt', sep="\t", quotechar="~")

# Extract the label and tweet-text columns as NumPy arrays.
labels, tweets = (np.asarray(df[column]) for column in ("Label", "Tweet text"))

In [8]:
# Split every tweet into tokens with NLTK's casual tokenizer.
tweets_tokenized = list(map(casual_tokenize, tweets))

# Lemmatize token by token; no part-of-speech tag is passed to the lemmatizer.
lemmatizer = WordNetLemmatizer()
tweets_lemmatized = [list(map(lemmatizer.lemmatize, tokens)) for tokens in tweets_tokenized]

In [None]:
###################################### BAGS OF WORDS ############################################

# Fraction of the vocabulary (ranked by corpus frequency) kept as features
VOCAB_FRACTION = 0.9

# Map for word frequencies: lemmatized token -> number of occurrences over all tweets
word2count = {}
for tweet in tweets_lemmatized:
    for word in tweet:
        word2count[word] = word2count.get(word, 0) + 1

# Get 90% most frequent words, for efficiency purposes.
# heapq.nlargest requires an integer n; the raw float product raises a
# TypeError, so the size is truncated to an int first.
n_freq_words = int(VOCAB_FRACTION * len(word2count))
freq_words = heapq.nlargest(n_freq_words, word2count, key=word2count.get)

# Array for bags of words: X[i][j] is 1 if freq_words[j] occurs in tweet i, else 0
X = []
for tweet in tweets_lemmatized:
    # Set lookup avoids rescanning the token list once per vocabulary word
    tweet_words = set(tweet)
    X.append([1 if word in tweet_words else 0 for word in freq_words])
X = np.asarray(X)

##################################################################################################

In [34]:
####################################### TOKEN BIGRAMS #############################################

# One list per tweet holding its consecutive token pairs, in tweet order.
token_bigrams = [list(ngrams(tokens, 2)) for tokens in tweets_lemmatized]

##################################################################################################

In [38]:
########################### CHARACTER TRIGRAMS AND FOURGRAMS (WITH SPACES) #######################

# Each tweet is rebuilt as one string with a single space between tokens,
# then sliced into overlapping character windows of length 3 and 4.
char_trigrams = [list(ngrams(" ".join(tokens), 3)) for tokens in tweets_lemmatized]
char_fourgrams = [list(ngrams(" ".join(tokens), 4)) for tokens in tweets_lemmatized]

##################################################################################################

In [39]:
######################## CHARACTER TRIGRAMS AND FOURGRAMS (WITHOUT SPACES) #######################

# Each tweet's tokens are concatenated with no separator, so character
# windows of length 3 and 4 may span token boundaries.
char_trigrams_nosp = [list(ngrams("".join(tokens), 3)) for tokens in tweets_lemmatized]
char_fourgrams_nosp = [list(ngrams("".join(tokens), 4)) for tokens in tweets_lemmatized]

##################################################################################################

In [43]:
############################### CHARACTER AND PUNCTUATION FLOODING ##############################

# Per tweet: [number of flooding positions, whether any were found].
flooding = []

for tokens in tweets_lemmatized:
    # A flooding position is a character immediately followed by two copies
    # of itself within the same token; overlapping runs are each counted.
    triple_runs = sum(
        1
        for token in tokens
        for first, second, third in zip(token, token[1:], token[2:])
        if first == second == third
    )
    flooding.append([triple_runs, triple_runs > 0])

##################################################################################################

In [66]:
############################### PUNCTUATION AND CAPITALIZATION ####################################

# Per tweet: [count, whether the count is non-zero] for each feature.
punctuation = []
capitalisation = []


def count(l1, l2):
    """Return how many items of l1 are contained in l2."""
    return sum(1 for c in l1 if c in l2)


for tokens in tweets_lemmatized:
    text = " ".join(tokens)

    # Characters found in string.punctuation
    n_punct = count(text, string.punctuation)
    # Upper-case letters in the range A-Z
    n_upper = len(re.findall(r'[A-Z]', text))

    punctuation.append([n_punct, n_punct > 0])
    capitalisation.append([n_upper, n_upper > 0])

##################################################################################################

[[11, True],
 [5, True],
 [7, True],
 [1, True],
 [3, True],
 [7, True],
 [2, True],
 [0, False],
 [3, True],
 [2, True],
 [9, True],
 [5, True],
 [5, True],
 [2, True],
 [1, True],
 [0, False],
 [4, True],
 [5, True],
 [12, True],
 [5, True],
 [0, False],
 [18, True],
 [2, True],
 [46, True],
 [1, True],
 [16, True],
 [8, True],
 [1, True],
 [4, True],
 [5, True],
 [2, True],
 [1, True],
 [0, False],
 [3, True],
 [7, True],
 [13, True],
 [7, True],
 [7, True],
 [1, True],
 [3, True],
 [4, True],
 [62, True],
 [14, True],
 [1, True],
 [7, True],
 [1, True],
 [5, True],
 [18, True],
 [9, True],
 [1, True],
 [2, True],
 [3, True],
 [0, False],
 [0, False],
 [3, True],
 [2, True],
 [4, True],
 [4, True],
 [4, True],
 [1, True],
 [10, True],
 [5, True],
 [6, True],
 [5, True],
 [6, True],
 [7, True],
 [2, True],
 [3, True],
 [3, True],
 [14, True],
 [2, True],
 [5, True],
 [5, True],
 [6, True],
 [11, True],
 [1, True],
 [0, False],
 [13, True],
 [5, True],
 [3, True],
 [2, True],
 [60, Tr