# Dataset and Embedding

TODOs:
- Split training data into train, val, test
- Shuffle training data / Train in batches -> Use dataloader


- Train on more samples


- Schedule learning rates (optional)


- Comments and documentation

In [1]:
## Imports
import torch
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd
import numpy as np
import pickle
import string
import random

import matplotlib.pyplot as plt
%matplotlib inline
## Global plot styling: large figures and a large font for every plot below
plt.rcParams.update({'figure.figsize': (20, 10), 'font.size': 25})

In [None]:
## When a CUDA device is present, keep a handle to it and allocate small
## uninitialized probe tensors on both the CPU and the GPU.
if torch.cuda.is_available():
    cuda = torch.device('cuda')
    x_cpu, x_cpu_long = torch.empty(2), torch.empty(2, dtype=torch.int64)
    x_gpu = torch.empty(2, device=cuda)
    print("Cuda available")

## 1. Load data and embedding

In [2]:
# Read the training split from the data directory
path_in = './data/'
train = pd.read_csv(f"{path_in}train.csv")

# The test split is not loaded yet:
#test = pd.read_csv(path_in + 'test.csv')

In [6]:
## Load embedding and save to pickle
def _read_glove(path, dim=300):
    """Parse a GloVe text file into a ``{token: vector}`` dictionary.

    Every line is expected to hold a token followed by ``dim``
    space-separated numbers.

    Args:
        path: Location of the GloVe text file.
        dim: Number of vector components per token.

    Returns:
        dict mapping each token to a ``float32`` numpy array of length ``dim``.
    """
    vectors = {}
    # Explicit encoding so the result does not depend on the platform default
    # (assumes the file is UTF-8 -- TODO confirm for this GloVe release).
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Split on single spaces and take the vector from the END of the
            # row: a token that itself contains whitespace would otherwise
            # shift the numeric columns and corrupt both key and vector.
            row = line.rstrip().split(" ")
            if len(row) <= dim:
                # Blank or truncated line: no complete vector to store.
                continue
            # Parse once here instead of storing 300 strings per token.
            vectors[" ".join(row[:-dim])] = np.asarray(row[-dim:], dtype=np.float32)
    return vectors


embedding = _read_glove("./data/GloVe300d.txt")

with open('./data/GloVe300d.p', 'wb') as handle:
    pickle.dump(embedding, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
## Load the saved 300-dimensional GloVe embedding
## NOTE: unpickling can execute arbitrary code -- only load a file that was
## written by this notebook itself.
with open("./data/GloVe300d.p", "rb") as handle:
    # The context manager closes the file handle once loading is done
    embedding = pickle.load(handle)

### 1.1 Data exploration

There are over 1,300,000 sentences in the dataset. Let's see how long the sentences are.

In [None]:
def getSentenceLength(sentence):
    """Return the number of whitespace-separated tokens in ``sentence``."""
    return len(sentence.split())

## Token count of every question in the training set, in row order
sentence_lengths = [
    getSentenceLength(train["question_text"][row]) for row in range(len(train))
]

In [None]:
## Histogram of the sentence lengths; only the bin range 0-30 tokens is shown
plt.hist(sentence_lengths, range=(0, 30), color="b")
plt.title("")  # the title is deliberately left as an empty string
plt.show()

### How many words per sentence can be found?

Sample 10,000 sentences at random and compute statistics on the number of tokens per sentence.

In [None]:
def getNumberofWordsFound():
    """Return the token counts of 10000 randomly drawn training questions.

    Each drawn question is lowercased, a space is inserted in front of every
    punctuation character, and the text is split on single spaces; the length
    of the resulting list is recorded. Reads the module-level ``train`` frame.

    Returns:
        list of int, one token count per sampled question.
    """
    ## Insert a space before each punctuation mark in one pass
    spacing = str.maketrans({mark: " " + mark for mark in string.punctuation})

    ## Draw 10000 distinct row positions
    chosen_rows = random.sample(range(len(train)), 10000)

    questions = train['question_text']
    return [
        len(questions.iloc[row].lower().translate(spacing).split(' '))
        for row in chosen_rows
    ]


num_of_words = getNumberofWordsFound()

In [None]:
## Histogram of the per-sentence token counts collected in num_of_words
plt.hist(num_of_words, color="green")
plt.show()

In [None]:
## To do: fit curve
## Print the mean, then the standard deviation, of the sampled token counts
for statistic in (np.mean, np.std):
    print(statistic(num_of_words))

### 1.2 Generate weight matrix
Target shape: [sequence length, number of samples, embedding dimension]

In [None]:
seq_len = 45
dimension = 300
number_samples = 50000

### Translation table that inserts a space in front of every punctuation mark
punctuation_spacing = str.maketrans({ch: " " + ch for ch in string.punctuation})

### Initialize matrix
### Allocated as float32 right away: the data ends up as a float32 tensor
### anyway, and a float64 buffer of this shape would need twice the memory.
weights = np.zeros((seq_len, number_samples, dimension), dtype=np.float32)
labels = []

### Extract weight matrix for each sample
for sample in range(number_samples):
    sentence = train['question_text'].iloc[sample]
    labels.append(train['target'].iloc[sample])

    ## Lowercase, separate punctuation, split and truncate to seq_len tokens.
    ## split() without an argument drops the empty strings that consecutive
    ## spaces would otherwise produce (they would be embedded as random noise).
    tokens = sentence.lower().translate(punctuation_spacing).split()[:seq_len]

    ## Write each word vector straight into the preallocated buffer.
    ## Positions past the end of the sentence keep their initial zeros,
    ## which is the zero padding up to seq_len.
    for i, word in enumerate(tokens):
        try:
            weights[i, sample, :] = embedding[word]
        except KeyError:
            ## Unknown word: use a random vector instead
            weights[i, sample, :] = np.random.normal(scale=0.6, size=(dimension, ))

print("Dimension of weight matrix: {}".format(weights.shape))
print("Ratio positives/total: {}".format(labels.count(1)/number_samples))


## Convert to torch tensors
## from_numpy shares the float32 buffer instead of copying it
weights = torch.from_numpy(weights)

labels = torch.tensor(labels).float()


## Convert to GPU if cuda is available
# if torch.cuda.is_available():
#     weights = weights.to(cuda)
#     labels = labels.to(cuda)
#     print("Tensors converted to cuda")

In [None]:
## Define dataloader
from torch.utils.data import Dataset

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that embeds one question at a time.

    Sentences are embedded lazily in ``__getitem__`` rather than all at once
    in the constructor, so memory use does not grow with
    ``len(data) * seq_len * dimension``.

    Args:
        data: Mapping or DataFrame with a ``'question_text'`` and a
            ``'target'`` column. Defaults to the module-level ``train``.
        embeddings: Mapping from token to a vector of length ``dimension``.
            Defaults to the module-level ``embedding``.
        seq_len: Number of token positions per sample; longer sentences are
            truncated, shorter ones are zero-padded at the end.
        dimension: Length of every word vector.
    """

    def __init__(self, data=None, embeddings=None, seq_len=45, dimension=300):
        super().__init__()
        # Fall back to the notebook-level globals so ``Dataset()`` keeps working
        if data is None:
            data = train
        if embeddings is None:
            embeddings = embedding

        self.seq_len = seq_len
        self.dimension = dimension
        self.embeddings = embeddings
        self.sentences = list(data['question_text'])
        self.labels = torch.tensor(
            [float(target) for target in data['target']], dtype=torch.float32
        )

        # Inserts a space in front of every punctuation mark in one pass
        self._punctuation_spacing = str.maketrans(
            {ch: " " + ch for ch in string.punctuation}
        )
        # Random vectors already drawn for out-of-vocabulary words, so that a
        # given unknown word maps to the same vector on every access
        self._unknown = {}

    def __len__(self):
        """Return the number of samples (sentences)."""
        return len(self.sentences)

    def _tokenize(self, sentence):
        """Lowercase, separate punctuation, split and truncate to seq_len."""
        spaced = sentence.lower().translate(self._punctuation_spacing)
        return spaced.split()[:self.seq_len]

    def _unknown_vector(self, word):
        """Return the cached random vector for an out-of-vocabulary word."""
        if word not in self._unknown:
            self._unknown[word] = np.random.normal(
                scale=0.6, size=(self.dimension, )
            ).astype(np.float32)
        return self._unknown[word]

    def __getitem__(self, idx):
        """Return ``(weights, label)`` for the sample at position ``idx``.

        ``weights`` is a float32 tensor of shape ``(seq_len, dimension)``;
        ``label`` is a 0-dim float32 tensor.
        """
        matrix = np.zeros((self.seq_len, self.dimension), dtype=np.float32)

        for i, word in enumerate(self._tokenize(self.sentences[idx])):
            try:
                matrix[i] = self.embeddings[word]
            except KeyError:
                matrix[i] = self._unknown_vector(word)

        return torch.from_numpy(matrix), self.labels[idx]

In [None]:
## Instantiate the dataset with its default arguments (exercises the constructor)
Dataset()

## 2. Define network

In [None]:
class Net(nn.Module):
    """Single-layer LSTM followed by a linear layer and a sigmoid.

    Args:
        hidden_layer_1: Size of the LSTM hidden state.
        input_size: Number of features per time step (word-vector length).
    """

    def __init__(self, hidden_layer_1, input_size=300):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_layer_1)
        self.fc1 = nn.Linear(hidden_layer_1, 1)
        self.out = nn.Sigmoid()

    def forward(self, x):
        """Return one value in (0, 1) per sequence in the batch.

        Args:
            x: Tensor of shape ``(seq_len, batch, input_size)``.

        Returns:
            Tensor of shape ``(batch,)``.
        """
        output, _ = self.lstm(x)
        # Only the LSTM output at the last time step feeds the classifier
        last_step = output[-1]
        # Drop just the trailing feature axis of size 1. Squeezing every
        # singleton axis would make the result shape depend on the batch size
        # (and on the hidden size being 1).
        return self.out(self.fc1(last_step)).squeeze(-1)

## 3. Train the net

In [None]:
### Training: hyperparameters, model setup and the per-sample training loop
epochs = 20
learning_rate = 0.001
momentum = 0.9
hidden_layers = 300
train_size = 50000


net = Net(hidden_layers)
optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
criterion = nn.BCELoss()


## One entry per epoch: mean training loss and training accuracy
loss_list = []
acc_list = []


for epoch in range(epochs):
    print("Epoch {} / {}".format(epoch+1, epochs))

    running_loss = 0.0
    correct = 0

    ## Iterate through all data, one sample per optimizer step
    for sample in range(train_size):
        ## Get training data: x gets a batch axis of size 1, y becomes shape (1,)
        x = weights[:, sample, :].unsqueeze(1)
        y = labels[sample].view(1)

        # Set gradients to zero
        optimizer.zero_grad()

        # Forward pass
        output = net(x)

        # Evaluate output / compute loss
        loss = criterion(output, y)

        # Backward pass / optimize
        loss.backward()

        # Update weights
        optimizer.step()

        ## Evaluate
        ## Accumulate the loss of EVERY sample so the value reported for the
        ## epoch is the epoch mean, not just the loss of the last sample.
        running_loss += loss.item()
        prediction = (output.detach() > 0.5).float()
        correct += int((prediction == y).sum())

    acc_list.append(correct / train_size)
    loss_list.append(running_loss / train_size)

    print("Loss: {} | Training accuracy: {}".format(loss_list[-1], acc_list[-1]))


print("\nTraining completed!")

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = 20, 10
plt.rcParams.update({'font.size': 25})


## Use a ready-made `loss_accuracy` history when one exists; otherwise build
## it from the per-epoch lists, so this cell does not fail with a NameError.
try:
    history = loss_accuracy
except NameError:
    history = {'training_accuracy': acc_list, 'training_loss': loss_list}


## Figure 1: accuracy curves
fig = plt.figure(1)
## The validation curve is drawn only when the history actually contains one
if 'validation_accuracy' in history:
    plt.plot(history['validation_accuracy'], color="blue", label="validation")
plt.plot(history['training_accuracy'], color="gray", label="training")
plt.legend()
plt.title("Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy [%]")


## Figure 2: training loss curve
fig = plt.figure(2)
plt.plot(history['training_loss'], color="red")
plt.title("Training Loss")
plt.xlabel("Iteration")
plt.ylabel("Loss [-]")
