# Dataset and Embedding

In [1]:
## Imports
import torch
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd
import numpy as np
import pickle
import string
import random

## Load data and embedding

In [14]:
# Load data
path_in = './data/'

train = pd.read_csv(path_in + 'train.csv')
#test = pd.read_csv(path_in + 'test.csv')

In [4]:
embedding = {}

with open("./data/GloVe300d.txt") as f:
    for line in f:
        row = line.split()
        key = row[0]
        val = row[1:301]            
        embedding[key] = val 
        
#pickle.dump(embedding, "./data/GloVe50d.p", "w")

In [4]:
## Load the saved embedding GloVe with dimension 50
embedding = pickle.load(open("./data/GloVe50d.p", "rb") )

FileNotFoundError: [Errno 2] No such file or directory: './data/GloVe50d.p'

## Define net

### Define embedding layer

Layer to allow feeding a sentence. The layer splits the sentence up and extracts the weights from the GloVe embedding. 
Note that not all words are in the dictionary. The layer truncates sentences with more than 45 words/punctuation.
It returns the weight matrix.

In [27]:
## In: sentence
## Out: tensor with dimension [seq_len, 1, dim]

def get_weights(sentence, glove):
    ## Parameter
    embedding_dimension = 300
    sentence_max_length = 45
    
    ## Convert to lowercase and split sentence
    sentence = sentence.lower()

    ## Sepearate punctuation
    chs = string.punctuation
    for ch in chs:
        idx = sentence.find(ch)
        
        if idx != -1:
            sentence = sentence.replace(ch, " " + ch)
            
    sentence = sentence.split(' ') 
         
     ## Truncate
    sentence = sentence[0:sentence_max_length]

    ## Find weights
    matrix_len = len(sentence)
    weights_matrix = np.zeros((matrix_len, embedding_dimension))
    words_found = 0

    for i, word in enumerate(sentence):
        try: 
            weights_matrix[i] = glove[word]
            words_found += 1
        except KeyError:
            weights_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dimension, ))

    #print("{} / {} words found".format(words_found, len(sentence)))    
    
    ## Pad with zeros. Longest sentence in data has 44 words. Pad to 45.
    z = np.zeros((1, embedding_dimension))
    
    for i in range(sentence_max_length-matrix_len):      
        weights_matrix = np.concatenate((weights_matrix, z), axis=0)
    
    ## Convert to torch tensor
    weights_matrix = torch.tensor(weights_matrix)
    weights_matrix = weights_matrix.view(sentence_max_length, -1, embedding_dimension)    
    weights_matrix = weights_matrix.float()
    
    return weights_matrix

In [26]:
## Dataloader function to extract the sentence and the label from the dataset by index
def get_train_data(sample):  
    sentence = train['question_text'].iloc[sample]
    label = train['target'].iloc[sample]
    label = torch.tensor(label).float()
    label.unsqueeze_(-1)
    
    return sentence, label

In [None]:
#### Test: custom dataset
class QuestionsDataset(Dataset):
    '''Questions dataset'''
    
    def __init__(self, csv_file, transform=None):
        """
        Args:
            csv_file (string): Path to file with questions, question ids and labels
        """

        self.traindata = pd.read_csv(csv_file)
        self.embedding = embedding = pickle.load(open("./data/GloVe300d.p", "rb") )
            
    def __len__(self):
        return len(self.traindata)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
    
            ## Parameter
            embedding_dimension = 300
            sentence_max_length = 45
            
            sentence = traindata['question_text'].iloc[idx]
            label = traindata['target'].iloc[idx]
            label = torch.tensor(label).float()
            label.unsqueeze_(-1)
    
            ## Convert to lowercase and split sentence
            sentence = sentence.lower()

            ## Sepearate punctuation
            chs = string.punctuation
            for ch in chs:
                idx = sentence.find(ch)

                if idx != -1:
                    sentence = sentence.replace(ch, " " + ch)

            sentence = sentence.split(' ') 

             ## Truncate
            sentence = sentence[0:sentence_max_length]

            ## Find weights
            matrix_len = len(sentence)
            weights_matrix = np.zeros((matrix_len, embedding_dimension))
            words_found = 0

            for i, word in enumerate(sentence):
                try: 
                    weights_matrix[i] = glove[word]
                    words_found += 1
                except KeyError:
                    weights_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dimension, ))

            #print("{} / {} words found".format(words_found, len(sentence)))    

            ## Pad with zeros. Longest sentence in data has 44 words. Pad to 45.
            z = np.zeros((1, embedding_dimension))

            for i in range(sentence_max_length-matrix_len):      
                weights_matrix = np.concatenate((weights_matrix, z), axis=0)

            ## Convert to torch tensor
            weights_matrix = torch.tensor(weights_matrix)
            weights_matrix = weights_matrix.view(sentence_max_length, -1, embedding_dimension)    
            weights_matrix = weights_matrix.float() 
        
        
        
        return weights_matrix, label

### Net

Takes a sentence as input using the custom function defined earlier.

In [25]:
class Net(nn.Module):
    def __init__(self, num_features, hidden_layer_1):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(input_size=num_features, hidden_size=hidden_layer_1)       
        self.fc1 = nn.Linear(hidden_layer_1, 1)
        self.out = nn.Sigmoid()

    def forward(self, sentence, embedding):        
        ## Convert sentence into tensor:
        x = get_weights(sentence, embedding)
        
        ## Network
        output, (h_out, _) = self.lstm(x)        
        output = output[-1, :, :]
        output.squeeze_()     
        x = self.out(self.fc1(output))
        
        return x

## Train the net

In [None]:
### Function for training
epochs = 20
learning_rate = 0.01
momentum = 0.9

hidden_layers = 150
num_features = 300
train_size = 10000

#net = Net(num_features, hidden_layers)
optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
criterion = nn.BCELoss()


## Iterate through all data
loss_list = []
acc_list = []


for epoch in range(epochs):
    print("Epoch {} / {}".format(epoch+1, epochs))
    loss_sub =[]
    acc_sub = []    
    correct = 0
           
    for sample in range(len(train[0:train_size])): 

        sentence, label = get_train_data(sample)
        
        # Set gradients to zero
        optimizer.zero_grad()
        
        # Forward pass
        output = net(sentence, embedding)
        
        # Evaluate output / compute loss
        loss = criterion(output, label)       
        
        # Backward pass / optimize
        loss.backward()
        
        # Update weights
        optimizer.step()
                
        ## Evaluate
        output = np.where(output.detach().numpy() > 0.5, 1, 0)
        correct += (output == label.numpy()).sum()
        acc = correct / train_size
        acc_sub.append(acc)
        
        loss_sub.append(loss.item())
             
    acc_list.append(np.mean(acc_sub))   
    loss_list.append(np.mean(loss_sub)) 
    
    #print(loss_list[-1])
    print("Loss: {} | Training accuracy: {}".format(loss_list[-1], acc_list[-1]))

    
print("\nTraining completed!")

Epoch 1 / 20
Loss: 0.238948038039729 | Training accuracy: 0.4679262699999999
Epoch 2 / 20
Loss: 0.2373404067995958 | Training accuracy: 0.46795828999999994
Epoch 3 / 20


In [27]:
s = np.linspace(0, len(train), len(train)+1)
random.shuffle(s)