In [205]:
from typing import List
import re
import string
import numpy as np
import pandas as pd
from collections import Counter
import random

import torch
import torch.nn as nn
from torch import optim

# Data Preprocessing

In [206]:
df_train = pd.read_csv("/Users/chemin/Data_Science/Now_21Spring/advanced_topics/final_project/nlp-getting-started/train.csv")
df_test =  pd.read_csv("/Users/chemin/Data_Science/Now_21Spring/advanced_topics/final_project/nlp-getting-started/test.csv")

In [207]:
df_train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [209]:
len(df_train)

7613

In [210]:
df_train.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

## Tweets Example

In [211]:
class TweetsExample:
    """
    Data wrapper for a single example for classification.

    Attributes:
        words (List[string]): list of words
        label (int): 0 or 1 (0 = negative, 1 = positive)
    """

    def __init__(self, words, label):
        self.words = words
        self.label = label

    def __repr__(self):
        return repr(self.words) + "; label=" + repr(self.label)

    def __str__(self):
        return self.__repr__()

In [212]:
def get_normalized_words(text: str) -> str:
    """
    Given a string, return a list of words normalized as follows.
    Split the string to make words first by using regex compile() function
    and string.punctuation + '0-9\\r\\t\\n]' to replace all those char with a space character.
    Split on space to get word list.
    Ignore words < 3 char long.
    Lowercase all words.
    """
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text)  # delete stuff but leave at least a space to avoid clumping together
    words = nopunct.split(" ")
    words = [w for w in words if len(w) > 2]  # ignore a, an, to, at, be, ...
    words = [w.lower() for w in words]
    return words

In [213]:
def read_tweets_examples(df) -> List[TweetsExample]:
    """
    Reads tweets examples in the df format; tokenizes and cleans the sentences and forms
    TweetsExample.

    NOTE: we need to lowercase the data. This is because the GloVe embeddings don't
    distinguish case and so can only be used with lowercasing.

    :param df: df to read from
    :return: a list of TweetsExamples parsed from the df
    """
    exs = []
    for i in range(len(df)):
        sent = df.iloc[i]['text']
        tokenized_cleaned_sent = get_normalized_words(sent)
        label  = df.iloc[i]['target']
        exs.append(TweetsExample(tokenized_cleaned_sent, label))
    return exs

In [216]:
sent = df_train.iloc[0]['text']
sent

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [217]:
target = df_train.iloc[0]['target']
target

1

In [218]:
get_normalized_words(sent)

['our',
 'deeds',
 'are',
 'the',
 'reason',
 'this',
 'earthquake',
 'may',
 'allah',
 'forgive',
 'all']

## Indexer

In [219]:
class Indexer(object):
    """
    Bijection between objects and integers starting at 0. Useful for mapping
    labels, features, etc. into coordinates of a vector space.

    Attributes:
        objs_to_ints
        ints_to_objs
    """
    def __init__(self):
        self.objs_to_ints = {}
        self.ints_to_objs = {}

    def __repr__(self):
        return str([str(self.get_object(i)) for i in range(0, len(self))])

    def __str__(self):
        return self.__repr__()

    def __len__(self):
        return len(self.objs_to_ints)

    def get_object(self, index):
        """
        :param index: integer index to look up
        :return: Returns the object corresponding to the particular index or None if not found
        """
        if (index not in self.ints_to_objs):
            return None
        else:
            return self.ints_to_objs[index]

    def contains(self, object):
        """
        :param object: object to look up
        :return: Returns True if it is in the Indexer, False otherwise
        """
        return self.index_of(object) != -1

    def index_of(self, object):
        """
        :param object: object to look up
        :return: Returns -1 if the object isn't present, index otherwise
        """
        if (object not in self.objs_to_ints):
            return -1
        else:
            return self.objs_to_ints[object]

    def add_and_get_index(self, object, add=True):
        """
        Adds the object to the index if it isn't present, always returns a nonnegative index
        :param object: object to look up or add
        :param add: True by default, False if we shouldn't add the object. If False, equivalent to index_of.
        :return: The index of the object
        """
        if not add:
            return self.index_of(object)
        if (object not in self.objs_to_ints):
            new_idx = len(self.objs_to_ints)
            self.objs_to_ints[object] = new_idx
            self.ints_to_objs[new_idx] = object
        return self.objs_to_ints[object]

## Pre-trained Embeddings

In [220]:
class WordEmbeddings:
    """
    Wraps an Indexer and a list of 1-D numpy arrays where each position in the list is the vector for the corresponding
    word in the indexer. The 0 vector is returned if an unknown word is queried.
    """
    def __init__(self, word_indexer, vectors):
        self.word_indexer = word_indexer
        self.vectors = vectors

    def get_embedding_length(self):
        return len(self.vectors[0])

    def get_embedding(self, word):
        """
        Returns the embedding for a given word
        :param word: The word to look up
        :return: The UNK vector if the word is not in the Indexer or the vector otherwise
        """
        word_idx = self.word_indexer.index_of(word)
        if word_idx != -1:
            return self.vectors[word_idx]
        else:
            return self.vectors[self.word_indexer.index_of("UNK")]

In [221]:
def read_word_embeddings(embeddings_file: str) -> WordEmbeddings:
    """
    Loads the given embeddings (ASCII-formatted) into a WordEmbeddings object. Augments this with an UNK embedding
    that is the 0 vector. 
    :param embeddings_file: path to the file containing embeddings
    :return: WordEmbeddings object reflecting the words and their embeddings
    """
    f = open(embeddings_file)
    word_indexer = Indexer()
    vectors = []
    # Make position 0 a PAD token, which can be useful in implementing batching.
    word_indexer.add_and_get_index("PAD")
    # Make position 1 the UNK token
    word_indexer.add_and_get_index("UNK")
    for line in f:
        if line.strip() != "":
            space_idx = line.find(' ')
            word = line[:space_idx]
            numbers = line[space_idx+1:]
            float_numbers = [float(number_str) for number_str in numbers.split()]
            vector = np.array(float_numbers)
            word_indexer.add_and_get_index(word)
            # Append the PAD and UNK vectors to start. Have to do this weirdly because we need to read the first line
            # of the file to see what the embedding dim is
            if len(vectors) == 0:
                vectors.append(np.zeros(vector.shape[0]))
                vectors.append(np.zeros(vector.shape[0]))
            vectors.append(vector)
    f.close()
    print("Read in " + repr(len(word_indexer)) + " vectors of size " + repr(vectors[0].shape[0]))
    # Turn vectors into a 2-D numpy array
    return WordEmbeddings(word_indexer, np.array(vectors))

In [222]:
embeddings_file = '/Users/chemin/Data_Science/Now_21Spring/advanced_topics/final_project/glove.6B.300d-relativized.txt'
word_embeddings = read_word_embeddings(embeddings_file)

Read in 14923 vectors of size 300


## Train/Valid split

In [223]:
train = df_train.sample(frac=.8)
valid = df_train[~df_train.index.isin(train.index)]
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

In [224]:
train_exs = read_tweets_examples(train)
val_exs = read_tweets_examples(valid)

In [225]:
print(repr(len(train_exs)) + "/" + repr(len(val_exs)) + "  " + " train/valid  examples")

6090/1523   train/valid  examples


# DANClassifier model

In [226]:
class DANClassifier(nn.Module):
    def __init__(self, emb_size, hid, out, word_embeddings=None):
        super(DANClassifier, self).__init__()
        self.word_embeddings = word_embeddings
        self.emb = nn.Embedding.from_pretrained(form_input(word_embeddings.vectors), padding_idx=0)
        self.V = nn.Linear(emb_size, hid)
        self.W = nn.Linear(hid, out)
        self.g = nn.Sigmoid()
        self.log_softmax = nn.LogSoftmax(dim=0)
        nn.init.xavier_uniform_(self.V.weight)
        nn.init.xavier_uniform_(self.W.weight)

    def forward(self, x):
        """
        Runs the neural network on the given data and returns log probabilities of the various classes.

        :param x: a [inp]-sized tensor of input data
        :return: an [out]-sized tensor of log probabilities. 
        """
        x = self.emb(x)
        x = torch.mean(x, dim=0)
        x = self.g(self.V(x))
        x = self.W(x)
        return self.log_softmax(x)

    def predict(self, ex_words: List[str]) -> int:
        word_indexer = self.word_embeddings.word_indexer
        word_idx_list = []
        for word in ex_words:
            word_idx = word_indexer.index_of(word)
            if word_idx == -1:
                word_idx = 0
            word_idx_list.append(word_idx)
        x = torch.LongTensor(np.array(word_idx_list))
        log_probs = self.forward(x)
        y_pred = torch.argmax(log_probs)
        return y_pred
    
    def predict_all(self, all_ex_words: List[List[str]]) -> List[int]:
        """      
        :param all_ex_words: A list of all exs to do prediction on
        :return:
        """
        return [self.predict(ex_words) for ex_words in all_ex_words]

# Model train

In [227]:
def form_input(x) -> torch.Tensor:
    """
    Form the input to the neural network.

    :param x: a [num_samples x inp] numpy array containing input data
    :return: a [num_samples x inp] Tensor
    """
    return torch.from_numpy(x).float()

In [253]:
def train_deep_averaging_network(train_exs: List[TweetsExample], val_exs: List[TweetsExample], word_embeddings: WordEmbeddings) -> DANClassifier:
    """
    :param train_exs: training examples
    :param val_exs: validation set, in case you wish to evaluate your model during training
    :param word_embeddings: set of loaded word embeddings
    :return: A trained DANClassifier model
    """
    # DEFINE elements for DANClassifier init
    emb_size = word_embeddings.get_embedding_length()
    word_indexer = word_embeddings.word_indexer

    hid_size = 500
    num_classes = 2

    # TRAINING
    # set hyperparameters
    num_epochs = 20
    learning_rate = 0.001

    # init model and optimizer
    DAN = DANClassifier(emb_size, hid_size, num_classes, word_embeddings)
    optimizer = optim.Adam(DAN.parameters(), lr=learning_rate)
    # run training
    for epoch in range(0, num_epochs):
        ex_indices = [i for i in range(0, len(train_exs))]
        random.shuffle(ex_indices)
        total_loss = 0.0
        for ex_idx in ex_indices:
            word_idx_list = []
            for word in train_exs[ex_idx].words:
                word_idx = word_indexer.index_of(word)
                if word_idx == -1:
                    word_idx = 0
                word_idx_list.append(word_idx)
            x = torch.LongTensor(np.array(word_idx_list))
            y = train_exs[ex_idx].label
            # Build one-hot representation of y. Instead of the label 0 or 1, y_onehot is either [0, 1] or [1, 0]. This
            # way we can take the dot product directly with a probability vector to get class probabilities.
            y_onehot = torch.zeros(num_classes)
            # scatter will write the value of 1 into the position of y_onehot given by y
            y_onehot.scatter_(0, torch.from_numpy(np.asarray(y,dtype=np.int64)), 1)
            # Zero out the gradients from the DAN object. *THIS IS VERY IMPORTANT TO DO BEFORE CALLING BACKWARD()*
            DAN.zero_grad()
            log_probs = DAN.forward(x)
            # Can also use built-in NLLLoss as a shortcut here but we're being explicit here
            loss = torch.neg(log_probs).dot(y_onehot)
            total_loss += loss
            # Computes the gradient and takes the optimizer step
            loss.backward()
            optimizer.step()
        print("Total loss on epoch %i: %f" % (epoch, total_loss))
    return DAN

In [254]:
model = train_deep_averaging_network(train_exs, val_exs, word_embeddings)

Total loss on epoch 0: 3475.391846
Total loss on epoch 1: 3201.024170
Total loss on epoch 2: 3082.034668
Total loss on epoch 3: 2991.865479
Total loss on epoch 4: 2932.710693
Total loss on epoch 5: 2868.629639
Total loss on epoch 6: 2817.174561
Total loss on epoch 7: 2765.109619
Total loss on epoch 8: 2726.741943
Total loss on epoch 9: 2692.804443
Total loss on epoch 10: 2612.700684
Total loss on epoch 11: 2590.556396
Total loss on epoch 12: 2544.613037
Total loss on epoch 13: 2492.034668
Total loss on epoch 14: 2444.452881
Total loss on epoch 15: 2394.654785
Total loss on epoch 16: 2334.840088
Total loss on epoch 17: 2294.222656
Total loss on epoch 18: 2232.053711
Total loss on epoch 19: 2178.365967


# Model evaluate

In [255]:
def evaluate(classifier, exs):
    """
    Evaluates a given classifier on the given examples
    :param classifier: classifier to evaluate
    :param exs: the list of TweetsExamples to evaluate on
    :return: None (but prints output)
    """
    return print_evaluation([ex.label for ex in exs], classifier.predict_all([ex.words for ex in exs]))

In [256]:
def print_evaluation(golds: List[int], predictions: List[int]):
    """
    Prints evaluation statistics comparing golds and predictions, each of which is a sequence of 0/1 labels.
    Prints accuracy as well as precision/recall/F1 of the positive class, which can sometimes be informative if either
    the golds or predictions are highly biased.

    :param golds: gold labels
    :param predictions: pred labels
    :return:
    """
    num_correct = 0
    num_pos_correct = 0
    num_pred = 0
    num_gold = 0
    num_total = 0
    if len(golds) != len(predictions):
        raise Exception("Mismatched gold/pred lengths: %i / %i" % (len(golds), len(predictions)))
    for idx in range(0, len(golds)):
        gold = golds[idx]
        prediction = predictions[idx]
        if prediction == gold:
            num_correct += 1
        if prediction == 1:
            num_pred += 1
        if gold == 1:
            num_gold += 1
        if prediction == 1 and gold == 1:
            num_pos_correct += 1
        num_total += 1
    acc = float(num_correct) / num_total
    output_str = "Accuracy: %i / %i = %f" % (num_correct, num_total, acc)
    prec = float(num_pos_correct) / num_pred if num_pred > 0 else 0.0
    rec = float(num_pos_correct) / num_gold if num_gold > 0 else 0.0
    f1 = 2 * prec * rec / (prec + rec) if prec > 0 and rec > 0 else 0.0
    output_str += ";\nPrecision (fraction of predicted positives that are correct): %i / %i = %f" % (num_pos_correct, num_pred, prec)
    output_str += ";\nRecall (fraction of true positives predicted correctly): %i / %i = %f" % (num_pos_correct, num_gold, rec)
    output_str += ";\nF1 (harmonic mean of precision and recall): %f;\n" % f1
    print(output_str)
    return acc, f1, output_str

In [257]:
print("=====Train Accuracy=====")
train_acc, train_f1, train_out = evaluate(model, train_exs)
print("=====Val Accuracy=====")
val_acc, val_f1, val_out = evaluate(model, val_exs)

=====Train Accuracy=====
Accuracy: 5202 / 6090 = 0.854187;
Precision (fraction of predicted positives that are correct): 2118 / 2515 = 0.842147;
Recall (fraction of true positives predicted correctly): 2118 / 2609 = 0.811805;
F1 (harmonic mean of precision and recall): 0.826698;

=====Val Accuracy=====
Accuracy: 1171 / 1523 = 0.768877;
Precision (fraction of predicted positives that are correct): 458 / 606 = 0.755776;
Recall (fraction of true positives predicted correctly): 458 / 662 = 0.691843;
F1 (harmonic mean of precision and recall): 0.722397;

