In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchtext.data import Field, BucketIterator
import torchtext

import spacy

import sys, os, io, time, random, math
import numpy as numpy
import pandas as pd


# Tokenizer Setup
I am using spaCy to tokenize since it's a little more robust than the default PyTorch tokenizer.
Seq2Seq reverses the input sentences, but since this is technically closer to semantic analysis
than to sequence-to-sequence, we're going to leave the tokens in forward order.

In [2]:
# spaCy v3 removed shortcut links such as 'en', so the model is loaded by its
# full package name first; the legacy shortcut is only tried when that package
# cannot be found (older spaCy v2 installs that only have the link).
try:
    spacy_en = spacy.load('en_core_web_sm')
except OSError:
    spacy_en = spacy.load('en')

In [3]:
def tokenize_input_lang(text):
    '''
    Split a string into a list of token strings with the spaCy tokenizer.

    The tokens are returned in their original (forward) order; nothing is
    reversed here.
    '''
    token_texts = []
    for token in spacy_en.tokenizer(text):
        token_texts.append(token.text)
    return token_texts


## Fields setup

In [4]:
# Tweet text: tokenized with spaCy, lower-cased and wrapped in <sos>/<eos> markers.
TEXT = Field(
    tokenize=tokenize_input_lang,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

# NOTE(review): LABEL is a plain Field, so every label goes through the same
# tokenize-and-build-vocab path as the text. torchtext's LabelField may be the
# better fit if plain class indices are wanted - TODO confirm against the
# training loop before changing it.
LABEL = Field(dtype=torch.int)

# `fields` is matched to the CSV columns by position. The first three columns
# (id, keyword, location) are skipped, the fourth is used as the text and the
# fifth (target) as the label.
# The keywords are not fed to the model yet; how to include them is undecided
# (probably by concatenation), so for now the model is trained without them.
fields = [
    (None, None),        # column 1: skipped
    (None, None),        # column 2: skipped
    (None, None),        # column 3: skipped
    ('text', TEXT),      # column 4
    ('label', LABEL),    # column 5
]



# Importing and Loading Data To Use In PyTorch

In [5]:
# Load the training CSV. The header row is skipped and every remaining row is
# parsed column-by-column according to `fields`.
twitter_dataset = torchtext.data.TabularDataset(
    path='train.csv',
    format='csv',
    fields=fields,
    skip_header=True,
)



In [6]:
# Dataset.split() defaults to a 70/30 split. Kaggle provides the test set
# separately, so the two parts are used as the training and validation sets.
#
# The shuffle behind the split is seeded explicitly; without it a different
# set of examples ends up in each part on every kernel restart, which makes
# validation numbers impossible to compare between runs.
SPLIT_SEED = 1234
random.seed(SPLIT_SEED)
train_data, valid_data = twitter_dataset.split(split_ratio=0.7,
                                               random_state=random.getstate())

## Double check we've loaded the right number and split correctly

In [7]:
# Report the size of the full dataset and of each part of the split.
for split_name, split_set in (("total", twitter_dataset),
                              ("training", train_data),
                              ("validation", valid_data)):
    print(f"Number of {split_name} examples: {len(split_set.examples)}")

Number of total examples: 7613
Number of training examples: 5329
Number of validation examples: 2284


## Example of one training example (tokenized and lower-cased, in forward order — not reversed)

In [8]:
# Show the attributes of one training example, then the Python type of its
# first label token.
sample_attrs = vars(train_data.examples[1])
print(sample_attrs)
print(type(sample_attrs['label'][0]))

{'text': ['so', 'i', 'pick', 'myself', 'off', 'the', 'ground', 'and', 'swam', 'before', 'i', 'drowned', '.', 'hit', 'the', 'bottom', 'so', 'hard', 'i', 'bounced', 'twice', 'suffice', 'this', 'time', 'around', 'is', 'different', '.'], 'label': ['1']}
<class 'str'>


## Slang Embeddings setup

In [14]:
# The slang embeddings are loaded through torchtext's Vectors class. An earlier
# hand-rolled loader (read the header line, then parse each remaining line of
# the .vec file into a dict of word -> list of floats) has been retired in
# favour of this call.
#
# unk_init is presumably applied to tokens that have no vector in the file -
# TODO confirm against the torchtext documentation.
slang_emb = torchtext.vocab.Vectors(
    name='ud_embeddings/ud_basic.vec',
    cache='ud_embeddings',
    unk_init=torch.Tensor.normal_,
)

  0%|          | 0/542320 [00:00<?, ?it/s]Skipping token b'542320' with 1-dimensional vector [b'300']; likely a header
100%|██████████| 542320/542320 [00:44<00:00, 12129.84it/s]


### Some helpful functions to verify things are working correctly

Next block from https://colab.research.google.com/github/bentrevett/pytorch-sentiment-analysis/blob/master/B%20-%20A%20Closer%20Look%20at%20Word%20Embeddings.ipynb#scrollTo=DMkoy7iFMeN3

In [23]:
def get_vector(embeddings, word):
    '''
    Return the embedding vector stored for `word`.

    `embeddings` must expose a `stoi` mapping (word -> row index) and a
    `vectors` container indexed by that row. An AssertionError is raised when
    the word is missing from `stoi`.
    '''
    word_to_row = embeddings.stoi
    assert word in word_to_row, f'*{word}* is not in the vocab!'
    row = word_to_row[word]
    return embeddings.vectors[row]

def closest_words(embeddings, vector, n = 10):
    '''
    Return the `n` vocabulary words whose vectors lie nearest to `vector`.

    The distance to every word in `embeddings.itos` is measured with
    torch.dist; the result is a list of (word, distance) tuples ordered from
    nearest to farthest.

    NOTE(review): this walks the whole vocabulary one word at a time in
    Python, which may be slow for large embedding files.
    '''
    scored = []
    for candidate in embeddings.itos:
        gap = torch.dist(vector, get_vector(embeddings, candidate)).item()
        scored.append((candidate, gap))

    # list.sort is stable, so equally distant words keep their vocabulary order.
    scored.sort(key = lambda pair: pair[1])
    return scored[:n]

def print_tuples(tuples):
    '''
    Print one line per (word, distance) pair, formatted as "(distance) word"
    with the distance shown to four decimal places.
    '''
    for word, dist in tuples:
        print('({:02.04f}) {}'.format(dist, word))
        

def analogy(embeddings, word1, word2, word3, n=5):
    '''
    Answer "word1 is to word2 as word3 is to ...?" using vector arithmetic.

    The query vector is word2 - word1 + word3. The n + 3 nearest words are
    fetched so that up to n candidates remain after the three input words are
    removed. The analogy is printed as a header line, and the surviving
    (word, distance) tuples are returned, nearest first.
    '''
    given_words = (word1, word2, word3)

    # Look the three words up in order.
    vec1, vec2, vec3 = (get_vector(embeddings, w) for w in given_words)

    query = vec2 - vec1 + vec3

    neighbours = closest_words(embeddings, query, n + 3)

    # Drop the words that are part of the question itself.
    answers = []
    for word, dist in neighbours:
        if word in given_words:
            continue
        answers.append((word, dist))
    answers = answers[:n]

    print(f'{word1} is to {word2} as {word3} is to...')

    return answers

In [24]:
# Sanity check on the embeddings: man : actor :: woman : ?
analogy_matches = analogy(slang_emb, 'man', 'actor', 'woman')
print_tuples(analogy_matches)

ma is to actor as woman is to...
(5.9133) actress
(6.1269) actors
(6.2390) actrivist
(6.2655) grimepp
(6.2856) mayim


## Build Vocab

In [25]:
# Both vocabularies are built from the training split only. The text
# vocabulary is also handed the slang embeddings.
TEXT.build_vocab(train_data, vectors=slang_emb)
LABEL.build_vocab(train_data)

In [26]:
# Report how many distinct entries each vocabulary holds.
for vocab_name, field in (("text", TEXT), ("label", LABEL)):
    print(f"Unique tokens in {vocab_name} vocabulary: {len(field.vocab)}")

Unique tokens in text vocabulary: 17668
Unique tokens in label vocabulary: 4


# Hyper-Parameters

In [27]:
# Number of examples per batch.
BATCH_SIZE = 128

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Iterator setup

In [28]:
# Build one iterator per split, both yielding batches of BATCH_SIZE examples
# placed on `device`.
#
# sort_key is required here: the dataset was created with TabularDataset, which
# does not define one, and the non-training (validation) iterator sorts its
# examples. Without a key that sort would compare Example objects directly and
# fail. Sorting by token count also groups examples of similar length.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data),
    batch_size = BATCH_SIZE,
    sort_key = lambda example: len(example.text),
    device = device)



# Model setup

Encoder from LSTM (Context vectors) + fully connected layer to predict sentiment

The Encoder is based on the Seq2Seq tutorial https://colab.research.google.com/github/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb#scrollTo=Ao4yzOdnyv8s. 

Much of this math and logic will come from there

## Encoder

We're going to use a 2-layer LSTM for our encoder. If I have time, I would like to use 4 layers, as Seq2Seq does.

### Starting with simple RNN structure

Consider the input sequence $X$ where $x_t \in X$ is the input token to the first encoder layer at time $t$, and the hidden states $H = \{h_1, h_2, ..., h_T\}$ are the outputs of a hidden layer. Also let $e(x)$ represent the embedding of the input token $x$. Then, with superscripts representing the layer, we can write our input functions as

$$h_t^1 = \text{EncoderRNN}^1(e(x_t),h^1_{t-1})$$
and
$$h_t^2 = \text{EncoderRNN}^2(h^1_t,h^2_{t-1})$$

Let the initial hidden state as input for each layer be $h_0^l$ and the final context vector per layer be $z^l = h_T^l$

### Transforming into LSTM

We can think of LSTMs as a simple extension of RNNs that adds a cell state, which acts as an extra 'hidden state', although the two have different functions. We denote the cell state $c_t^l$. Our simple input function:

$$h_t = \text{RNN}(e(x_t), h_{t-1})$$

can then be transformed into

$$ (h_t, c_t) = \text{LSTM}(e(x_t), h_{t-1}, c_{t-1})$$

We'll also need an initial cell state $c_0^l$, and we will transform our context-vector/final-hidden-state to be the tuple $z^l = (h_T^l, c_T^l)$.

By extension to two layers then we have 

$$\begin{align*}
(h_t^1, c_t^1) &= \text{EncoderLSTM}^1(e(x_t), (h_{t-1}^1, c_{t-1}^1))\\
(h_t^2, c_t^2) &= \text{EncoderLSTM}^2(h_t^1, (h_{t-1}^2, c_{t-1}^2))
\end{align*}$$


In [29]:
class Encoder(nn.Module):
    '''
    Embedding layer followed by a multi-layer LSTM.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        emb_dim: size of each token embedding.
        hid_dim: size of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, used on the embeddings and passed to
            the LSTM.
    '''

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.n_layers = n_layers

        # Sub-modules are registered in the order embedding -> rnn -> dropout.
        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim,
                           hidden_size=hid_dim,
                           num_layers=n_layers,
                           dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        '''
        Encode a batch of token-index sequences.

        Args:
            src: token indices, [src len, batch size].

        Returns:
            (hidden, cell): the final LSTM states, each
            [n layers * n directions, batch size, hid dim].
        '''
        # token_vectors = [src len, batch size, emb dim]
        token_vectors = self.embedding(src)
        token_vectors = self.dropout(token_vectors)

        # The per-step outputs (always from the top layer) are not needed;
        # only the final states are passed on.
        _, final_state = self.rnn(token_vectors)
        hidden, cell = final_state

        return hidden, cell

## Fully Connected

Small fully connected layer to help with encapsulation

In [30]:
class FullyConnected(nn.Module):
    '''
    A single linear layer mapping hidden features to output scores.

    Args:
        hid_dim: size of the incoming feature vectors.
        output_dim: number of output scores per example.
    '''

    def __init__(self, hid_dim, output_dim):
        super().__init__()

        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)

    def forward(self, input):
        '''
        Apply the linear layer to `input`, after removing its first dimension
        if (and only if) that dimension has size 1.
        '''
        features = input.squeeze(0)
        return self.fc_out(features)

## Model class

In [32]:
class CustomModel(nn.Module):
    '''
    Classifier made of an encoder followed by a fully connected head.

    Args:
        encoder: module whose forward(src) returns (hidden, cell), with
            hidden shaped [n layers * n directions, batch size, hid dim].
        fc: module that maps the hidden features to output scores.
        device: device the model is meant to run on; stored for reference
            only, it is not used inside forward().
    '''

    def __init__(self, encoder, fc, device):
        super().__init__()

        self.encoder = encoder
        self.fc = fc
        self.device = device

    def forward(self, src):
        '''
        Return one row of output scores per example in the batch.

        Args:
            src: batch of token indices, passed to the encoder unchanged.
        '''
        hidden, cell = self.encoder(src)

        # `hidden` stacks one final state per LSTM layer. Feeding the whole
        # stack to the head would produce a separate set of scores for every
        # layer, so only the top layer is used. The slice (rather than
        # hidden[-1]) keeps a leading dimension of size 1, which is the shape
        # the head's squeeze(0) is written for.
        top_hidden = hidden[-1:]

        output = self.fc(top_hidden)

        return output
     
        
        
        

#### Instantiate everything

In [34]:
# Sizes taken from the vocabularies built earlier.
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(LABEL.vocab)

# Encoder hyper-parameters.
# NOTE(review): ENC_EMB_DIM presumably mirrors the dimensionality of the slang
# embedding file - TODO confirm.
ENC_EMB_DIM = 300
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5

enc = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
)
fc = FullyConnected(hid_dim=HID_DIM, output_dim=OUTPUT_DIM)

# Assemble the full model and move it to the selected device.
model = CustomModel(encoder=enc, fc=fc, device=device).to(device)

## Initialize weights

We're going to initialize all weights from a uniform distribution between -0.08 and +0.08 (like in the seq2seq paper)

In [35]:
def init_weights(m):
    '''
    Overwrite every parameter of `m` (including those of its sub-modules)
    in place with values drawn uniformly from [-0.08, 0.08].
    '''
    low, high = -0.08, 0.08
    for weights in m.parameters():
        nn.init.uniform_(weights.data, low, high)
        
# Run init_weights over the model. This is deliberately the last expression of
# the cell, so the model returned by apply() is also displayed as its output.
model.apply(init_weights)

CustomModel(
  (encoder): Encoder(
    (embedding): Embedding(17668, 300)
    (rnn): LSTM(300, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (fc): FullyConnected(
    (fc_out): Linear(in_features=512, out_features=4, bias=True)
  )
)

Info about the number of trainable parameters

In [36]:
def count_parameters(model):
    '''
    Return the total number of elements across all parameters of `model`
    that have requires_grad set.
    '''
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Report the trainable parameter count with thousands separators.
trainable_total = count_parameters(model)
print(f'The model has {trainable_total:,} trainable parameters')

The model has 9,070,772 trainable parameters


Using Adam optimizer

In [38]:
# Adam over all model parameters. No hyper-parameters are overridden here, so
# the library defaults apply.
optimizer = optim.Adam(model.parameters())