In [12]:
import pandas as pd
import numpy as np
import re 
import heapq 
import string

from tqdm import tqdm_notebook
from nltk import ngrams
from nltk.tokenize.casual import casual_tokenize
from nltk.stem import WordNetLemmatizer 

In [4]:
# Read the tab-separated corpus; "~" takes the place of the default
# double-quote as the CSV quoting character.
df = pd.read_csv(
    "data/original.txt",
    sep="\t",
    quotechar="~",
)
# Extract the label and text columns as NumPy arrays for the later cells.
labels, tweets = (np.asarray(df[column]) for column in ("Label", "Tweet text"))

In [5]:
lemmatizer = WordNetLemmatizer()
# One token list per tweet, produced by NLTK's casual tokenizer.
tweets_tokenized = list(map(casual_tokenize, tweets))
# Same nested layout, with every token passed through the WordNet lemmatizer.
tweets_lemmatized = [list(map(lemmatizer.lemmatize, tokens)) for tokens in tweets_tokenized]

In [40]:

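## Get array of all lexical features
# Pass flooding_numeric=True / punctuation_numeric=True to get counts instead of 0/1 flags for those features
# (punctuation_numeric also decides whether the capitalisation feature is a count or a flag).
# bow_length is the number of most frequent items kept in each bag-of-words feature (None keeps all of them).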
def get_lexical_features(corpus, flooding_numeric=False, punctuation_numeric=False, bow_length=None,
                         capitalisation_numeric=None):
    """Compute all lexical feature groups for a tokenized corpus.

    Parameters
    ----------
    corpus : sequence of sequences of str
        One token sequence per tweet. It is iterated more than once, so it
        must be re-iterable (e.g. a list of lists).
    flooding_numeric : bool
        Only when this is exactly ``True`` is the flooding feature a count
        (number of positions where a token has three identical consecutive
        characters); otherwise it is a 0/1 flag.
    punctuation_numeric : bool
        When truthy the punctuation feature is the number of
        ``string.punctuation`` characters in the tweet; otherwise a 0/1 flag.
    bow_length : int or None
        Forwarded unchanged to every ``bag_of_words`` call.
    capitalisation_numeric : bool or None
        When truthy the capitalisation feature is the number of ``[A-Z]``
        characters; when falsy a 0/1 flag. ``None`` (the default) follows
        ``punctuation_numeric``, which is how this function has always
        behaved, so existing callers are unaffected.

    Returns
    -------
    list
        Eleven entries, in this order: bag of token unigrams, bag of token
        bigrams, bag of character trigrams, bag of character fourgrams
        (both character bags are built from the tokens joined without
        spaces), punctuation, capitalisation, flooding, hashtag frequency
        (percentage of tokens), hashtag-to-other-token ratio, emoticon
        frequency (percentage of tokens) and tweet length in tokens.
        The first four are whatever ``bag_of_words`` returns; the rest are
        plain lists with one value per tweet.
    """
    # Historically the capitalisation output was switched by the punctuation
    # flag; keep that as the default while allowing independent control.
    if capitalisation_numeric is None:
        capitalisation_numeric = punctuation_numeric

    token_bigrams = []
    char_trigrams_nosp = []
    char_fourgrams_nosp = []

    punctuation = []
    capitalisation = []
    flooding = []

    hashtag_freq = []
    hashtag_to_word = []
    emoticon_freq = []
    tweet_length = []

    for tweet in corpus:
        n_tokens = len(tweet)
        tweet_length.append(n_tokens)
        sentence = " ".join(tweet)
        sentence_nospace = "".join(tweet)

        token_bigrams.append(list(ngrams(tweet, 2)))
        char_trigrams_nosp.append(list(ngrams(sentence_nospace, 3)))
        char_fourgrams_nosp.append(list(ngrams(sentence_nospace, 4)))

        amount_punct = sum(1 for char in sentence if char in string.punctuation)
        amount_cap = len(re.findall(r'[A-Z]', sentence))
        punctuation.append(amount_punct if punctuation_numeric else int(amount_punct > 0))
        capitalisation.append(amount_cap if capitalisation_numeric else int(amount_cap > 0))

        amount_flooding = 0
        amount_hashtags = 0
        amount_emoticons = 0
        for word in tweet:
            if word.startswith("#"):
                amount_hashtags += 1
            # NOTE(review): a bare ":" token also satisfies this test -- confirm
            # that counting it as an emoticon is intended.
            if word.startswith(":") and word.endswith(":"):
                amount_emoticons += 1
            # Every window of three identical characters counts once, so a run
            # of four identical characters contributes two.
            for i in range(len(word) - 2):
                if word[i] == word[i + 1] == word[i + 2]:
                    amount_flooding += 1
        flooding.append(amount_flooding if flooding_numeric is True else int(amount_flooding > 0))

        # division_nonzero yields 0 for an empty tweet instead of raising
        # ZeroDivisionError.
        hashtag_freq.append(division_nonzero(amount_hashtags, n_tokens) * 100)
        hashtag_to_word.append(division_nonzero(amount_hashtags, n_tokens - amount_hashtags))
        emoticon_freq.append(division_nonzero(amount_emoticons, n_tokens) * 100)

    return [
        bag_of_words(corpus, bow_length),
        bag_of_words(token_bigrams, bow_length),
        bag_of_words(char_trigrams_nosp, bow_length),
        bag_of_words(char_fourgrams_nosp, bow_length),
        punctuation,
        capitalisation,
        flooding,
        hashtag_freq,
        hashtag_to_word,
        emoticon_freq,
        tweet_length,
    ]

In [34]:
###################################### BAGS OF WORDS ############################################

def bag_of_words(array, bow_length=None):
    """Build a binary bag-of-words matrix for a collection of item sequences.

    Parameters
    ----------
    array : sequence of sequences of hashable items
        One inner sequence per tweet (tokens, n-gram tuples, ...). It is
        iterated twice, so it must be re-iterable.
    bow_length : int or None
        Number of most frequent items to keep as columns. ``None`` keeps
        every distinct item.

    Returns
    -------
    numpy.ndarray
        One row per tweet and one column per kept item, ordered from most to
        least frequent over the whole input. A cell is 1 when the item occurs
        in that tweet and 0 otherwise (presence, not a count).
    """
    # Corpus-wide occurrence count of every item.
    word2count = {}
    for tweet in tqdm_notebook(array):
        for word in tweet:
            word2count[word] = word2count.get(word, 0) + 1

    if bow_length is None:
        bow_length = len(word2count)
    freq_words = heapq.nlargest(bow_length, word2count, key=word2count.get)

    X = []
    for tweet in tqdm_notebook(array):
        # A set makes each membership test constant-time instead of rescanning
        # the tweet once per vocabulary entry.
        present = set(tweet)
        X.append([1 if word in present else 0 for word in freq_words])
    return np.asarray(X)

def division_nonzero(n, d):
    """Return ``n / d``, or 0 when the divisor ``d`` is falsy (e.g. zero)."""
    if not d:
        return 0
    return n / d

# Binary bag-of-words over the lemmatized tokens; bow_length is left at None,
# so every distinct token gets its own column.
bag_of_token_unigrams = bag_of_words(tweets_lemmatized)

##################################################################################################

HBox(children=(IntProgress(value=0, max=3834), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3834), HTML(value='')))




In [14]:
####################################### TOKEN BIGRAMS #############################################

# Pairs of adjacent lemmatized tokens, one list of 2-tuples per tweet.
token_bigrams = [list(ngrams(tweet, 2)) for tweet in tweets_lemmatized]

# Binary presence matrix over every distinct bigram.
bag_of_token_bigrams = bag_of_words(token_bigrams)
    
##################################################################################################

HBox(children=(IntProgress(value=0, max=3834), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3834), HTML(value='')))




In [None]:
########################### CHARACTER TRIGRAMS AND FOURGRAMS (WITH SPACES) #######################

# Character n-grams taken over each tweet's tokens joined by single spaces,
# so an n-gram may span a token boundary and contain the space.
char_trigrams = [list(ngrams(" ".join(tweet), 3)) for tweet in tweets_lemmatized]
char_fourgrams = [list(ngrams(" ".join(tweet), 4)) for tweet in tweets_lemmatized]

bag_of_char_trigrams = bag_of_words(char_trigrams)
bag_of_char_fourgrams = bag_of_words(char_fourgrams)
    
##################################################################################################

HBox(children=(IntProgress(value=0, max=3834), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3834), HTML(value='')))

In [39]:
######################## CHARACTER TRIGRAMS AND FOURGRAMS (WITHOUT SPACES) #######################

# Character n-grams taken over each tweet's tokens concatenated with no
# separator between them.
char_trigrams_nosp = [list(ngrams("".join(tweet), 3)) for tweet in tweets_lemmatized]
char_fourgrams_nosp = [list(ngrams("".join(tweet), 4)) for tweet in tweets_lemmatized]
    
    
##################################################################################################

In [43]:
############################### CHARACTER AND PUNCTUATION FLOODING ##############################

flooding = []

for tweet in tweets_lemmatized:
    # Count every position where a token has three identical characters in a
    # row; a run of four identical characters therefore counts twice.
    amount = sum(
        1
        for word in tweet
        for i in range(len(word) - 2)
        if word[i] == word[i + 1] == word[i + 2]
    )
    # Store the count together with a flag saying whether any flooding occurred.
    flooding.append([amount, amount > 0])
        
##################################################################################################

In [66]:
############################### PUNCTUATION AND CAPITALIZATION ####################################
punctuation = []
capitalisation = []

# Number of items of l1 that are contained in l2.
count = lambda l1, l2: sum(1 for c in l1 if c in l2)

for tweet in tweets_lemmatized:
    sentence = " ".join(tweet)
    amountPunct = count(sentence, string.punctuation)
    amountCap = len(re.findall(r'[A-Z]', sentence))

    # Each entry pairs the raw count with a flag saying whether it is non-zero.
    punctuation.append([amountPunct, amountPunct > 0])
    capitalisation.append([amountCap, amountCap > 0])
        
##################################################################################################

In [35]:
#################################### HASHTAG FEATURES ##########################################

hashtag_freq = []
hashtag_to_word = []
emoticon_freq = []
tweet_length = []

for tweet in tweets_lemmatized:
    n_tokens = len(tweet)
    tweet_length.append(n_tokens)

    # Tokens starting with "#" are counted as hashtags.
    amount_hashtags = sum(1 for word in tweet if word.startswith("#"))
    # Tokens that both start and end with ":" are counted as emoticons.
    # NOTE(review): a bare ":" token also matches -- confirm this is intended.
    amount_emoticons = sum(
        1 for word in tweet if word.startswith(":") and word.endswith(":")
    )

    # division_nonzero yields 0 for an empty tweet instead of raising
    # ZeroDivisionError; the two frequencies are percentages of all tokens.
    hashtag_freq.append(division_nonzero(amount_hashtags, n_tokens) * 100)
    hashtag_to_word.append(division_nonzero(amount_hashtags, n_tokens - amount_hashtags))
    emoticon_freq.append(division_nonzero(amount_emoticons, n_tokens) * 100)
    
    
            


        
##################################################################################################

In [41]:
# Numeric (count) variants of the flooding and punctuation features, with each
# bag-of-words limited to its 100 most frequent items.
result = get_lexical_features(
    tweets_lemmatized,
    flooding_numeric=True,
    punctuation_numeric=True,
    bow_length=100,
)

HBox(children=(IntProgress(value=0, max=3834), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3834), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3834), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3834), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3834), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3834), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3834), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3834), HTML(value='')))




3834