# Libraries

In [1]:
from data import c3d
import nltk
#nltk.download()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from ekphrasis.classes.tokenizer import SocialTokenizer

from nltk.tokenize import TweetTokenizer
from ekphrasis.classes.spellcorrect import SpellCorrector
from ekphrasis.classes.segmenter import Segmenter
import re
from ekphrasis.classes.preprocessor import TextPreProcessor
import emoji
from tqdm import trange 
import collections, numpy

## Loading  Data 

In [2]:
# Load the train / validation / test splits through the project-local `c3d` data module.
train, validation, test = c3d.load_data()

  0%|          | 9/5384 [00:00<01:01, 87.51it/s]

Data has apparently already been downloaded and unpacked.


100%|██████████| 5384/5384 [00:17<00:00, 301.35it/s]
100%|██████████| 6493/6493 [00:30<00:00, 215.99it/s]


In [3]:
# Separate every split into its raw tweets (features) and its labels.
train_tweets = train.features
train_labels = train.labels

val_tweets = validation.features
val_labels = validation.labels

test_tweets = test.features
test_labels = test.labels

In [4]:
# Label counts of the train, validation and test splits (in that order); the classes are roughly balanced.
tuple(collections.Counter(labels) for labels in (train_labels, val_labels, test_labels))

(Counter({'negative': 3051, 'positive': 3629}),
 Counter({'positive': 1228, 'negative': 999}),
 Counter({'positive': 1636, 'negative': 1334}))

The various text preprocessing steps are:

 * Tokenization
 * Lower casing
 * Stop words removal
 * Stemming
 * Lemmatization

# Data Cleaning:

## Tokenization 
Tokenization is the process of splitting the given text into smaller pieces called tokens

In [4]:
# Use the first training tweet as the running example and tokenize it with NLTK.
sentence = train_tweets[0]
words = word_tokenize(sentence)

# Raw text first, then the resulting list of tokens.
print(sentence)
print(words)

So since season is starting I'ma start coming to school looking like a homeless person 🙂👌🏼🏊🏻‍♀️
['So', 'since', 'season', 'is', 'starting', "I'ma", 'start', 'coming', 'to', 'school', 'looking', 'like', 'a', 'homeless', 'person', '🙂👌🏼🏊🏻\u200d♀️']


## Lower casing
Converting a word to lower case (DEPRESSED -> depressed)


In [5]:
# Lower-case the example sentence (this rebinds `sentence`) and show the result.
sentence = sentence.lower()
print(sentence)

so since season is starting i'ma start coming to school looking like a homeless person 🙂👌🏼🏊🏻‍♀️


## Stop words removal

In [6]:
# NLTK's English stop-word list, converted to a set and printed for inspection.
stop_words = set(stopwords.words('english')) 
print(stop_words)

{'my', 'being', 'of', 'a', 'for', 'most', 'our', 'these', 'up', 'an', 'ourselves', 'as', 'can', 'under', 'where', 'your', 'her', 'further', 'its', 'or', 'won', 'before', "you'll", "won't", "you're", 'no', 'theirs', 'his', 'such', 'ours', 'any', 'o', 'what', 'was', 'didn', 'be', 'on', 'm', 'only', 'doesn', 'just', 'yours', 'haven', 'through', "she's", 'ain', 'were', 'shouldn', 'in', "should've", 'because', 'doing', "it's", 'not', 'you', 'hadn', 'against', 'she', 's', 'been', 'but', 'has', 'this', "you'd", 'is', 'other', 'y', 'below', 'am', 'whom', 'then', 'so', "hadn't", 'more', 'with', 'having', 'too', 'hasn', 're', 'weren', "shouldn't", 'hers', 'from', 'few', 'until', 'their', 'why', 'off', 'will', 'at', 'down', 'are', "shan't", 'he', 'how', 'there', 'themselves', 'nor', 'herself', 'me', 'if', 'the', 'll', "hasn't", 'yourselves', 't', 'himself', 'by', 'above', 'aren', "haven't", 'now', 'mustn', 'do', 'them', 'some', 'had', 'who', 'when', 'each', "that'll", "don't", 'don', 'it', 'which

We can see that if the NLTK stop words are used, all the negative contractions (e.g. "don't", "won't") will be removed, even though negation plays a significant role in sentiment analysis.

In [7]:
# Compare NLTK's TweetTokenizer with ekphrasis' SocialTokenizer on the first training tweet.
sentence = train_tweets[0]
social_tokenizer = SocialTokenizer(lowercase=True).tokenize
tknzr = TweetTokenizer()

# `end` supplies the blank line that separates the two results.
print("Tweet Tokenizer:", tknzr.tokenize(sentence), end="\n\n")
print("Social tokenizer:", social_tokenizer(sentence))


Tweet Tokenizer: ['So', 'since', 'season', 'is', 'starting', "I'ma", 'start', 'coming', 'to', 'school', 'looking', 'like', 'a', 'homeless', 'person', '🙂', '👌', '🏼', '🏊', '🏻', '\u200d', '♀', '️']

Social tokenizer: ['so', 'since', 'season', 'is', 'starting', 'i', "'", 'ma', 'start', 'coming', 'to', 'school', 'looking', 'like', 'a', 'homeless', 'person', '🙂', '👌', '🏼', '🏊', '🏻', '\u200d', '♀️']


## Stemming: 
It is a process of transforming a word to its root form.


In [8]:
# Print the Porter stem of every whitespace-separated token of the example sentence.
ps = PorterStemmer()
for stemmed in map(ps.stem, sentence.split()):
    print(stemmed)

So
sinc
season
is
start
i'ma
start
come
to
school
look
like
a
homeless
person
🙂👌🏼🏊🏻‍♀️


## Lemmatization: 
Lemmatization reduces the words to a word existing in the language.

In [9]:
# Lemmatize every whitespace-separated token of the example sentence, treating it as a verb (pos='v').
lemmatizer = WordNetLemmatizer()
for token in sentence.split():
    lemma = lemmatizer.lemmatize(token, pos='v')
    print(lemma)

So
since
season
be
start
I'ma
start
come
to
school
look
like
a
homeless
person
🙂👌🏼🏊🏻‍♀️


### Remove Links

In [7]:
# Replace every URL-like substring (scheme://non-whitespace) with a space, then collapse the
# remaining whitespace. The pattern is a raw string so that \w, \/ and \S reach the regex
# engine unchanged instead of being parsed as (invalid) Python string escapes.
tweet = ' '.join(re.sub(r"(\w+:\/\/\S+)", " ", train_tweets[7]).split())

In [8]:
# Display the tweet after link removal.
tweet

'How can u be friends with ur mans ex — I LOVE MIRNA SHES FAB AF. Plus wtf were not like 7 ...'

## Preprocessor on Github: 
### ekphrasis <a href="https://github.com/cbaziotis/ekphrasis"> on github </a>

The Spell Corrector is based on Peter Norvig's spell-corrector. Just like the segmentation algorithm, we utilize word statistics in order to find the most probable candidate. Besides the provided statistics, you can use your own.

In [12]:
# Build an ekphrasis spell corrector on the "english" corpus and try it on a misspelled word.
sp = SpellCorrector(corpus="english")
print(sp.correct("korrectud"))

Reading english - 1grams ...
corrected


### Tweet preprocessor adapted pipeline

In [5]:
# Words stripped from every tweet by clean_tweet(). Negations are deliberately absent from
# this list.
# Every entry must be followed by a comma: two adjacent string literals are silently
# concatenated by Python (this previously produced the bogus entries 'havehow' and
# 'urwhile', so 'ur' and 'while' were never removed). Each word is listed once.
words_to_delete = [
    'theirs', 'she', 'of',
    'all', 'her', 'ourselves', 'that', 'some', 'your',
    'what', 'or', 'me', 'now', 'after',
    'until', 'them', 'through', 'who', 'herself', 'he',
    'y', 'each', 'under', 'hers', 'other', 'down',
    'this', 'their', 'as', 'on', 'few', 'which', 'further',
    'whom', 'its', 'so', 'yourselves', 'because', 'it', 'both', 'in', 'nor',
    'yours', 'yourself', 'before', 'since',
    'there', 'himself', 'then',
    'him', 'over', 'here', 'an', 'into', 'next', 'd', 'u', 'r', 'im', 'm', 'have',
    'the', 'again', 'such', 'myself', 'they',
    'we', 'those', 'between', 'once', 'even',
    'how', 'from', 'ours', 'during', 'be', 'ama', 'i', 'do', 'but',
    'his', 'against', 'below', 'to', 'about',
    'by', 'where', 'a', 'very', 'our', 'my', 'for', 'and', 'ur',
    'while', 'only', 'up', 'these', 'just', 'same',
    'you', 'themselves', 'above', 'with', 'than',
    'own', 'out', 'when', 'any', 'too', 'o', 'at',
]
def load_dict_contractions():
    """Return a fresh dict mapping contractions and slang terms to their expanded form.

    Keys are looked up verbatim (case-sensitive); a new dict is built on every call.
    """
    expansions = {
        "ain't": "is not", "amn't": "am not", "aren't": "are not", "can't": "cannot",
        "'cause": "because", "couldn't": "could not",
        "couldn't've": "could not have", "could've": "could have", "daren't": "dare not",
        "daresn't": "dare not", "dasn't": "dare not", "didn't": "did not", "doesn't": "does not",
        "don't": "do not", "e'er": "ever", "em": "them", "everyone's": "everyone is",
        "finna": "fixing to",
        "gimme": "give me", "gonna": "going to", "gon't": "go not", "gotta": "got to",
        "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
        "he'd": "he would", "he'll": "he will", "he's": "he is", "he've": "he have",
        "how'd": "how would",
        "how'll": "how will", "how're": "how are", "how's": "how is", "I'd": "I would",
        "I'll": "I will", "I'm": "I am",
        "I'm'a": "I am about to", "I'm'o": "I am going to", "isn't": "is not",
        "it'd": "it would",
        "it'll": "it will", "it's": "it is", "I've": "I have", "kinda": "kind of",
        "let's": "let us", "mayn't": "may not",
        "may've": "may have", "mightn't": "might not", "might've": "might have",
        "mustn't": "must not", "mustn't've": "must not have",
        "must've": "must have", "needn't": "need not", "ne'er": "never", "o'": "of",
        "o'er": "over", "ol'": "old",
        "oughtn't": "ought not", "shalln't": "shall not", "shan't": "shall not",
        "she'd": "she would", "she'll": "she will", "she's": "she is",
        "shouldn't": "should not", "shouldn't've": "should not have",
        "should've": "should have", "somebody's": "somebody is",
        "someone's": "someone is", "something's": "something is", "that'd": "that would",
        "that'll": "that will",
        "that're": "that are", "that's": "that is", "there'd": "there would",
        "there'll": "there will", "there're": "there are", "there's": "there is",
        "these're": "these are",
        "they'd": "they would", "they'll": "they will", "they're": "they are",
        "they've": "they have", "this's": "this is",
        "those're": "those are", "'tis": "it is", "'twas": "it was", "wanna": "want to",
        "wasn't": "was not", "we'd": "we would",
        "we'd've": "we would have", "we'll": "we will", "we're": "we are",
        "weren't": "were not", "we've": "we have",
        "what'd": "what did", "what'll": "what will", "what're": "what are",
        "what's": "what is",
        "what've": "what have", "when's": "when is", "where'd": "where did",
        "where're": "where are", "where's": "where is",
        "where've": "where have", "which's": "which is", "who'd": "who would",
        "who'd've": "who would have", "who'll": "who will",
        "who're": "who are", "who's": "who is", "who've": "who have", "why'd": "why did",
        "why're": "why are", "why's": "why is",
        "won't": "will not", "wouldn't": "would not", "would've": "would have",
        "y'all": "you all", "you'd": "you would",
        "you'll": "you will", "you're": "you are", "you've": "you have",
        "Whatcha": "What are you", "luv": "love", "sux": "sucks",
        "shes": "she is", "wtf": "what the fuck",
    }
    return expansions

In [6]:
# ekphrasis pre-processing pipeline; it is the default `processor` of clean_tweet().
text_processor = TextPreProcessor(
    # terms that will be normalized (each term is listed once; 'url' used to appear twice)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated",
              'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",
    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="english",
    unpack_hashtags=False,  # word segmentation on hashtags is switched off
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correction=True,
    # Tokenizer that keeps the casing it is given (lowercase=False).
    tokenizer=SocialTokenizer(lowercase=False).tokenize
)


Reading english - 1grams ...


In [7]:
def clean_tweet(tweet, processor=text_processor):
    """Normalize a raw tweet into a cleaned, whitespace-separated string.

    Steps, in order: emojis are replaced by their text alias, the text is
    lower-cased, contractions/slang are expanded, the text goes through
    ``processor``, punctuation is removed, every token is lemmatized as a verb,
    the words listed in ``words_to_delete`` are dropped and runs of whitespace
    are collapsed to a single space.

    Args:
        tweet: raw tweet text.
        processor: object exposing ``pre_process_doc(text)`` and returning an
            iterable of string tokens; defaults to the module-level
            ``text_processor``.

    Returns:
        The cleaned tweet. As before, a non-empty result starts with a single
        space; an empty input yields an empty string.
    """
    tweet = emoji.demojize(tweet).lower()
    # Normalize the typographic apostrophe so contractions match the lookup table.
    tweet = tweet.replace("’", "'")
    # The text is lower-cased at this point, so the lookup table has to be lower-cased as
    # well; otherwise mixed-case keys ("I'm", "I've", "Whatcha", ...) can never match.
    # Values are lower-cased too so the expansion does not re-introduce capitals.
    contractions = {key.lower(): value.lower()
                    for key, value in load_dict_contractions().items()}
    tweet = " ".join(contractions.get(word, word) for word in tweet.split())
    # Use the processor passed by the caller (the argument used to be ignored).
    tweet = " ".join(processor.pre_process_doc(tweet))

    # remove punctuation (the underscore is not in the list, so it is kept)
    tweet = re.sub(u'[{}]'.format('!"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~'), u'', tweet)
    # Lemmatizer: every token is preceded by one space, as in the original implementation.
    lemmatizer = WordNetLemmatizer()
    s = "".join(" " + lemmatizer.lemmatize(word, pos='v') for word in tweet.split())
    # Unhelpful-word removal in a single pass. Each entry is matched as a whole word, which
    # gives the same result as substituting the entries one after another.
    pattern = r'\b(?:' + '|'.join(words_to_delete) + r')\b'
    s = re.sub(pattern, '', s)
    # collapse every run of whitespace to a single space
    return re.sub(r'\s+', ' ', s)

In [8]:
def clean_tweets (tweets):
    """Run clean_tweet on every element of `tweets`, in order, behind a tqdm progress bar.

    `tweets` must support len() and integer indexing; the cleaned strings are
    returned as a new list.
    """
    return [clean_tweet(tweets[index]) for index in trange(len(tweets))]

In [9]:
# Clean every training tweet; clean_tweets shows a progress bar while it runs.
Clean_train_tweets=clean_tweets(train_tweets)

100%|██████████| 6680/6680 [00:11<00:00, 559.43it/s]


In [10]:
# Show the first ten tweets next to their cleaned version; every pair is framed by blank lines.
for i in range(10):
    print("", train_tweets[i], Clean_train_tweets[i], "", sep="\n")


So since season is starting I'ma start coming to school looking like a homeless person 🙂👌🏼🏊🏻‍♀️
 season start start come school look like homeless person slightly_smiling_face ok_hand_medium light_skin_tone woman_swimming_light_skin_tone


I never liked it anyway https://t.co/7bTrApaFjd
 never like anyway url


Retweeted Aries Spears (@AriesSpears):

I think its disgustin &amp; disrespectful some blk folks r so short sighted... https://t.co/RPicbgeLRl
 retweeted aries spear user think disgustin disrespectful blk folks short sight url


@anniecross00 depression is a mental disorder you cannot help... I've been diagnosed like don't even fucking come at me about mental illness
 user depression mental disorder cannot help diagnose like not fuck come mental illness


@EverythinOakley yikes I hate that stuff
 user yikes hate stuff


Bitch I've done meth and I wasn't tweaking as much as y'all are https://t.co/SV0r58R0d8
 bitch meth not tweak much url


I was diagnosed with severe depression 

# Data Representation: 
## Word Representation:
### One hot encoding:
Binary representation of words: 

&rArr; Depending on the vocabulary, each word gets a 1&times;n vector representation (n being the vocabulary size).

Negative aspect of one hot encoding :
 * The number of dimensions increases linearly as we add words (memory usage is large)
 * Embedding matrix is very sparse mainly made of zeroes 
 * No context of word because every word is treated on its own
 * No frequency information is present

### Bag-of-words: (BoW)
Creates a vocabulary of all the tokens occurring in all tweets; the frequency of each word in each tweet is then inserted. 
&rArr; The number of dimensions increases with the number of observations in the dataset
Negative aspect of BoW:
 * Ignores the meaning of words. The same word can be used in multiple places based on the context or nearby words.
 * Vectors can be huge, which can be costly in both time and computation

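&rArr; TF-IDF (term frequency-inverse document frequency): $\text{TF-IDF} = \text{TF} \times \text{IDF}$, with $\text{TF} = \dfrac{\text{number of times the term appears in a document}}{\text{total number of terms in the document}}$ and $\text{IDF} = \log\left(\dfrac{\text{total number of documents}}{\text{number of documents containing the term}}\right)$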
 * TF-IDF is based on the bag-of-words (BoW) model, therefore it does not capture position in text, semantics, co-occurrences in different documents.

## Word Embedding:
### Word2vec:
Mapping of words into vectors: words occurring in similar contexts will have similar word embeddings. 
To generate vectors from words there are two algorithms:
 * CBoW(Continuous Bag of Words): Predict the target word from a context &rarr; Small corpus
 * Skip Gram: Predict the context words from a word &rarr; Large corpus

Choosing number of dimensions: Extreme accuracy can be obtained with 300D

Pros:
   * Calculating the semantic similarity between words.
   * It can be fed raw text and outputs word vectors

Cons:
   * Words having multiple sense are represented in one vector 
   * Can’t handle out-of-vocabulary words, have to re-train to add new words.

### GloVe:
It is an unsupervised learning algorithm for obtaining vector representations for words. It puts emphasis on the importance of co-occurrences to extract meaning. The idea behind it is that a certain word generally co-occurs more often with one word than with another.
 
 &rArr; It proves to perform better than Word2vec in the word analogy tasks.

### FastText:
It treats each word as composed of character n-grams, so the vector for a word is the sum of the vectors of these character n-grams.
 Pros:
 * Generate better word embeddings for rare words.
 * Handle Out-of-Vocabulary words unlike Word2vec and GloVe.

### ELMo:
Instead of using a fixed embedding for each word, as models like GloVe do, ELMo looks at the entire sentence before assigning each word in it its embedding.
 * ELMo word representations are purely character-based, which allows it to treat out-of-vocabulary tokens unseen during training.
 * Unlike other word embeddings, it generates word vectors on run time.
 * It gives embeddings for anything you put in — characters, words, sentences, paragraphs — but it is built with sentence embeddings in mind.

## BERT (Bidirectional Encoder Representations from Transformers): 
It uses the transformer architecture in addition to a number of different techniques to train the model. 
It achieves state-of-the-art results. 
Based on this <a href="https://arxiv.org/abs/1810.04805"> article</a>, <a href="https://github.com/google-research/bert">BERT </a> is the word embedding algorithm I'm using.

For visualization (<a href='https://towardsdatascience.com/deconstructing-bert-part-2-visualizing-the-inner-workings-of-attention-60a16d86b5c1'>link</a>)

TensorFlow Hub <a href='https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'>Link</a>

BERT for embedding on <a href='https://medium.com/@aieeshashafique/feature-extraction-from-bert-25887ed2152a'> medium</a>

Evaluation of embeddings to check <a href='https://arxiv.org/pdf/1801.09536.pdf'> this article</a>

In [11]:
# Dependencies for the BERT section: TensorFlow, TensorFlow Hub and the `bert` package.
import tensorflow_hub as hub
import tensorflow as tf
import bert
# Short alias for the tokenizer class used below to build the BERT tokenizer.
FullTokenizer = bert.bert_tokenization.FullTokenizer
from tensorflow.keras.models import Model 
# NOTE(review): `math` does not appear to be used in the visible cells — confirm before removing.
import math
# Record the library versions this notebook was executed with.
print("TF version: ", tf.__version__)
print("Hub version: ", hub.__version__)

TF version:  2.0.0
Hub version:  0.8.0


### 1. Prepare inputs:

In [20]:
# See BERT paper: https://arxiv.org/pdf/1810.04805.pdf
# And BERT implementation convert_single_example() at https://github.com/google-research/bert/blob/master/run_classifier.py

def get_masks(tokens, max_seq_length):
    """Padding mask: 1 for each token, followed by 0 for every padding position.

    The result always has `max_seq_length` entries.

    Raises:
        IndexError: if there are more tokens than `max_seq_length`.
    """
    n_tokens = len(tokens)
    n_padding = max_seq_length - n_tokens
    if n_padding < 0:
        raise IndexError("Token length more than max seq length!")
    mask = [1] * n_tokens
    mask.extend([0] * n_padding)
    return mask


def get_segments(tokens, max_seq_length):
    """Segment ids: 0 up to and including the first "[SEP]", 1 for every later token.

    The list is right-padded with 0 up to `max_seq_length` entries.

    Raises:
        IndexError: if there are more tokens than `max_seq_length`.
    """
    if len(tokens) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    ids_per_token = []
    separator_seen = False
    for token in tokens:
        ids_per_token.append(1 if separator_seen else 0)
        # The separator itself still belongs to the segment it closes.
        if token == "[SEP]":
            separator_seen = True
    padding = [0] * (max_seq_length - len(tokens))
    return ids_per_token + padding


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from the tokenizer vocabulary, right-padded with 0 to `max_seq_length`.

    Args:
        tokens: sequence of token strings.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)`` and returning a list of ints.
        max_seq_length: length of the returned list.

    Raises:
        IndexError: if there are more ids than `max_seq_length`. This matches
            get_masks / get_segments; previously an over-long list was returned silently.
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    if len(token_ids) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return token_ids + [0] * (max_seq_length - len(token_ids))

In [13]:
max_seq_length = 128  # maximum length of a sequence after tokenizing
# Three int32 Keras inputs of length max_seq_length: token ids, padding mask and segment ids.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")
# Pre-trained BERT module loaded from TensorFlow Hub.
# NOTE(review): trainable=True only matters if the layer is fine-tuned; the visible cells
# only call predict — confirm whether it is needed.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
                            trainable=True) #pretrained 
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

#pooled_output: pooled output of the entire sequence with shape [batch_size, hidden_size].
#sequence_output: representations of every token in the input sequence with shape [batch_size, max_sequence_length, hidden_size].


In [14]:
# Keras model mapping the three BERT inputs to both the pooled and the per-token outputs.
model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[pooled_output,sequence_output])

In [21]:
#BERT tokenizer
# The vocabulary file and the lower-casing flag are read from the loaded hub layer, so the
# tokenizer is configured from the same BERT module as the model.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [22]:
# Second cleaned training tweet, used as the example to embed.
s = Clean_train_tweets[1]

In [23]:
# Split the example into tokens with the BERT tokenizer.
stokens=tokenizer.tokenize(s)

In [24]:
# Wrap the tokens with the [CLS] / [SEP] markers (single-sentence input).
stokens = ["[CLS]"] + stokens + ["[SEP]"]

In [25]:
# Build the three model inputs for the example, each padded to max_seq_length.
input_ids = get_ids(stokens, tokenizer, max_seq_length)
input_masks = get_masks(stokens, max_seq_length)
input_segments = get_segments(stokens, max_seq_length)

In [26]:
# Print the tokens followed by the three padded input lists, one per line.
for item in (stokens, input_ids, input_masks, input_segments):
    print(item)

['[CLS]', 'never', 'like', 'anyway', 'ur', '##l', '[SEP]']
[101, 2196, 2066, 4312, 24471, 2140, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [27]:
# Run the model on a batch holding the single example (each input is wrapped in a list).
pool_embs,seq_embs= model.predict([[input_ids],[input_masks],[input_segments]])
# The embeddings come from the pre-trained model loaded above.

In [32]:
# Shape of the per-token output for the example batch.
seq_embs.shape

(1, 128, 768)

In [33]:
# Display the per-token embeddings.
seq_embs

array([[[-0.04586147,  0.07677846,  0.15028717, ..., -0.3676868 ,
          0.25273815,  0.41104478],
        [ 0.50704026, -0.07457219,  0.7028382 , ...,  0.06672566,
          0.00297276,  0.2975085 ],
        [ 0.8509491 , -0.05657218,  1.025556  , ..., -0.27750093,
         -0.30000427, -0.07440143],
        ...,
        [ 0.12525357,  0.2656413 ,  0.7679463 , ...,  0.01556945,
          0.07984328,  0.2148341 ],
        [ 0.10444136,  0.21031368,  0.7588055 , ..., -0.02725608,
          0.08136114,  0.22190559],
        [ 0.13000613,  0.18980533,  0.7512319 , ..., -0.07465325,
          0.06056323,  0.15698689]]], dtype=float32)