# Dataset and Embedding

In [1]:
## Imports
import torch
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd
import numpy as np
import pickle
import string
import random

## Load data and embedding

In [2]:
# Load data
path_in = './data/'

train = pd.read_csv(path_in + 'train.csv')
#test = pd.read_csv(path_in + 'test.csv')

In [3]:
## Load the saved embedding GloVe with dimension 50
embedding = pickle.load(open("./data/embeddings/GloVe50d.p", "rb") )

## Define net

### Define embedding layer

Layer to allow feeding a sentence. The layer splits the sentence up and extracts the weights from the GloVe embedding. 
Note that not all words are in the dictionary. The layer truncates sentences with more than 45 words/punctuation.
It returns the weight matrix.

In [4]:
## In: sentence
## Out: tensor with dimension [seq_len, 1, dim]

def get_weights(sentence):
    ## Convert to lowercase and split sentence
    sentence = sentence.lower()

    ## Sepearate punctuation
    chs = string.punctuation
    for ch in chs:
        idx = sentence.find(ch)
        
        if idx != -1:
            sentence = sentence.replace(ch, " " + ch)
            
    sentence = sentence.split(' ') 
     
    ## Load embedding 
    glove = embedding
    emb_dim = 50
    
     ## Truncate
    sentence = sentence[0:45]

    ## Find weights
    matrix_len = len(sentence)
    weights_matrix = np.zeros((matrix_len, 50))
    words_found = 0

    for i, word in enumerate(sentence):
        try: 
            weights_matrix[i] = glove[word]
            words_found += 1
        except KeyError:
            weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))

    #print("{} / {} words found".format(words_found, len(sentence)))    
    
    ## Pad with zeros. Longest sentence in data has 44 words. Pad to 45.
    z = np.zeros((1, 50))
    
    for i in range(45-matrix_len):      
        weights_matrix = np.concatenate((weights_matrix, z), axis=0)
    
    ## Convert to torch tensor
    weights_matrix = torch.tensor(weights_matrix)
    weights_matrix = weights_matrix.view(45, -1, 50)    
    weights_matrix = weights_matrix.float()
    
    return weights_matrix

In [5]:
## Dataloader function to extract the sentence and the label from the dataset by index
def get_train_data(sample):  
    sentence = train['question_text'].iloc[sample]
    label = train['target'].iloc[sample]
    label = torch.tensor(label).float()
    label.unsqueeze_(-1)
    
    return sentence, label

### Net

Takes a sentence as input using the custom function defined earlier.

In [6]:
class Net(nn.Module):
    def __init__(self, hidden_layer_1):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(input_size=50, hidden_size=hidden_layer_1)       
        self.fc1 = nn.Linear(hidden_layer_1, 1)
        self.out = nn.Sigmoid()

    def forward(self, sentence):        
        ## Convert sentence into tensor:
        x = get_weights(sentence)
        
        ## Network
        output, (h_out, _) = self.lstm(x)        
        output = output[-1, :, :]
        output.squeeze_()     
        x = self.out(self.fc1(output))
        
        return x

## Train the net

In [7]:
### Function for training
epochs = 20
learning_rate = 0.001
momentum = 0.9
hidden_layers = 50
train_size = 200


net = Net(hidden_layers)
optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
criterion = nn.BCELoss()


## Iterate through all data
loss_list = []
acc_list = []



for epoch in range(epochs):
    print("Epoch {} / {}".format(epoch+1, epochs))
    loss_sub =[]
    acc_sub = []
      
    correct = 0
    
        
    for sample in range(len(train)): 

        sentence, label = get_train_data(sample)
        
        # Set gradients to zero
        optimizer.zero_grad()
        
        # Forward pass
        output = net(sentence)
        
        # Evaluate output / compute loss
        loss = criterion(output, label)       
        
        # Backward pass / optimize
        loss.backward()
        
        # Update weights
        optimizer.step()
                
        ## Evaluate
        output = np.where(output.detach().numpy() > 0.5, 1, 0)
        correct += (output == label.numpy()).sum()
        acc = correct / train_size
        acc_sub.append(acc)
        
        loss_sub.append(loss.item())
             
    acc_list.append(np.mean(acc_sub))   
    loss_list.append(np.mean(loss_sub)) 
    
    #print(loss_list[-1])
    print("Loss: {} | Training accuracy: {}".format(loss_list[-1], acc_list[-1]))

    
print("\nTraining completed!")

Epoch 1 / 20


KeyboardInterrupt: 

In [27]:
s = np.linspace(0, len(train), len(train)+1)
random.shuffle(s)

In [189]:
print(acc_list)
print(loss_list)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.4854402683675289, 0.31068941615521906, 0.25003040410578253, 0.2278475936129689, 0.21975346248596905, 0.21630197189748288, 0.2151196076348424, 0.21459924940019845, 0.21443354384973645, 0.21403224976733326, 0.21425330694764852, 0.2137593784928322, 0.2140881977789104, 0.2138363854587078, 0.21392127502709626, 0.21357641637325286, 0.21371773034334182, 0.2138974308781326, 0.2137533018179238, 0.21389392411336303]
