In [28]:
import pandas as pd
import numpy as np
import re 
import heapq 


from nltk import ngrams
from nltk.tokenize.casual import casual_tokenize
from nltk.stem import WordNetLemmatizer 

In [7]:
# Load the tab-separated dataset; "~" is declared as the quote character.
df = pd.read_csv(
    'data/original.txt',
    sep="\t",
    quotechar="~",
)

# Pull the label and tweet-text columns out as NumPy arrays.
labels, tweets = (np.asarray(df[column]) for column in ("Label", "Tweet text"))

In [8]:
# Tokenize every tweet with NLTK's casual tokenizer, then lemmatize each token
# with WordNetLemmatizer using its default arguments.
lemmatizer = WordNetLemmatizer()
tweets_tokenized = list(map(casual_tokenize, tweets))
tweets_lemmatized = [
    list(map(lemmatizer.lemmatize, tokens))
    for tokens in tweets_tokenized
]

In [None]:
###################################### BAGS OF WORDS ############################################

# Map each lemmatized token to the number of times it occurs across all tweets.
word2count = {}
for tweet in tweets_lemmatized:
    for word in tweet:
        word2count[word] = word2count.get(word, 0) + 1

# Keep the 90% most frequent words, for efficiency purposes.
# heapq.nlargest requires an integer count (a float such as 0.9 * len(...)
# raises TypeError), so the size is computed with integer arithmetic, which
# also avoids floating-point rounding surprises.
n_freq_words = len(word2count) * 9 // 10
freq_words = heapq.nlargest(n_freq_words, word2count, key=word2count.get)

# Binary bag-of-words matrix: one row per tweet, one column per entry of
# freq_words; a cell is 1 when that word occurs in that tweet, else 0.
# Each tweet is converted to a set once so every membership test is a hash
# lookup instead of a scan over the tweet's token list.
X = np.asarray([
    [1 if word in tweet_vocab else 0 for word in freq_words]
    for tweet_vocab in map(set, tweets_lemmatized)
])

##################################################################################################

In [34]:
####################################### TOKEN BIGRAMS #############################################

# For each tweet, the list of consecutive pairs of lemmatized tokens.
token_bigrams = [list(ngrams(tokens, 2)) for tokens in tweets_lemmatized]
    
##################################################################################################

In [38]:
########################### CHARACTER TRIGRAMS AND FOURGRAMS (WITH SPACES) #######################

# Rebuild each tweet as one string with tokens separated by a single space,
# then slide windows of 3 and 4 characters over that string.
sentences_spaced = [" ".join(tokens) for tokens in tweets_lemmatized]

char_trigrams = [list(ngrams(text, 3)) for text in sentences_spaced]
char_fourgrams = [list(ngrams(text, 4)) for text in sentences_spaced]
    
    
##################################################################################################

In [39]:
######################## CHARACTER TRIGRAMS AND FOURGRAMS (WITHOUT SPACES) #######################

# Concatenate each tweet's tokens with no separator, then slide windows of
# 3 and 4 characters over the resulting string (n-grams span token boundaries).
sentences_nosp = ["".join(tokens) for tokens in tweets_lemmatized]

char_trigrams_nosp = [list(ngrams(text, 3)) for text in sentences_nosp]
char_fourgrams_nosp = [list(ngrams(text, 4)) for text in sentences_nosp]
    
    
##################################################################################################

[[('S', 'w', 'e'),
  ('w', 'e', 'e'),
  ('e', 'e', 't'),
  ('e', 't', 'U'),
  ('t', 'U', 'n'),
  ('U', 'n', 'i'),
  ('n', 'i', 't'),
  ('i', 't', 'e'),
  ('t', 'e', 'd'),
  ('e', 'd', 'N'),
  ('d', 'N', 'a'),
  ('N', 'a', 't'),
  ('a', 't', 'i'),
  ('t', 'i', 'o'),
  ('i', 'o', 'n'),
  ('o', 'n', 's'),
  ('n', 's', 'v'),
  ('s', 'v', 'i'),
  ('v', 'i', 'd'),
  ('i', 'd', 'e'),
  ('d', 'e', 'o'),
  ('e', 'o', '.'),
  ('o', '.', 'J'),
  ('.', 'J', 'u'),
  ('J', 'u', 's'),
  ('u', 's', 't'),
  ('s', 't', 'i'),
  ('t', 'i', 'n'),
  ('i', 'n', 't'),
  ('n', 't', 'i'),
  ('t', 'i', 'm'),
  ('i', 'm', 'e'),
  ('m', 'e', 'f'),
  ('e', 'f', 'o'),
  ('f', 'o', 'r'),
  ('o', 'r', 'C'),
  ('r', 'C', 'h'),
  ('C', 'h', 'r'),
  ('h', 'r', 'i'),
  ('r', 'i', 's'),
  ('i', 's', 't'),
  ('s', 't', 'm'),
  ('t', 'm', 'a'),
  ('m', 'a', 's'),
  ('a', 's', '.'),
  ('s', '.', '#'),
  ('.', '#', 'i'),
  ('#', 'i', 'm'),
  ('i', 'm', 'a'),
  ('m', 'a', 'g'),
  ('a', 'g', 'i'),
  ('g', 'i', 'n'),
  ('i', 'n',