# Exploration

In [1]:
## Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split


In [2]:
# Load data
path_in = './data/'

train = pd.read_csv(path_in + 'train.csv')
test = pd.read_csv(path_in + 'test.csv')

Data structure: Question ID (qid), question text, target.

Target 0 - Negative

Target 1 - Positive

Training data:

Around 1,300,000 samples. Will have to be split up into training, validation and testing.

In [3]:
## Train data
print('Number of samples for training: {}\nRatio positives to total: {:.2f}'.format(len(train), 
                                                                                len(train.loc[train['target'] == 1])/len(train)))

Number of samples for training: 1306122
Ratio positives to total: 0.06


In [4]:
## Example sentence
train['question_text'][3]

'How did Otto von Guericke used the Magdeburg hemispheres?'

In [5]:
print(train['target'][3])
print(train['qid'][3])

0
000042bf85aa498cd78e


# Preprocessing

Currently, the questions are all in words that a computer will have problems handling. We have to first process the questions by tokenizing them. For example "One fish, two fish, red fish blue fish" -> [1 2 3 2 4 2 5 2].

In [6]:
# Build the tokenizer dictionary in the tokenizer class
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['question_text'])
tokenizer.fit_on_texts(test['question_text'])

# Split train set into train and validation sets
train, validation = train_test_split(train, test_size=0.1)

# Tokenize the questions
train_sequences = tokenizer.texts_to_sequences(train['question_text'])
validation_sequences = tokenizer.texts_to_sequences(validation['question_text'])
test_sequences = tokenizer.texts_to_sequences(test['question_text'])

# Save the tokenizer dictionary and the number of words in it
word_index = tokenizer.word_index
vocab_size = len(word_index)

In [7]:
training_dataset = torch.utils.data.TensorDataset(torch.tensor(np.array(train_sequences)).int(), torch.tensor(np.array(train['target'])).float32())
validation_dataset = torch.utils.data.TensorDataset(torch.tensor(np.array(validation_sequences)).int(), torch.tensor(np.array(validation['target'])).float32())

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, int64, int32, int16, int8, uint8, and bool.

In [25]:
train_sequences = torch.nn.utils.rnn.pad_sequence(train_sequences)

torch.IntTensor(train_sequences)

AttributeError: 'list' object has no attribute 'size'

## Embeddings
As a start, the 'glove.6B.50d' embedding is used.

Also, you can use 'torchtext.vocab.GloVe(name='840B', dim=300)' (see [torchtext package](https://torchtext.readthedocs.io/en/latest/vocab.html#vocab) )

In [None]:
embeddings_File = './data/embeddings/glove.6B.50d/glove.6B.50d.txt'
embeddings_dim = 50
embeddings_index = {}

print('Loading word embeddings...')

with open(embeddings_File) as f:
    for line in f:
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32');
        embeddings_index[word] = coefs;

embeddings_matrix = np.zeros((vocab_size+1, embeddings_dim));
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word);
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector;  

print('Finished loading word embeddings. {:.0f} words loaded.'.format(len(embeddings_matrix)))

Todo:

Explore the data. Find for example the longest quora question.

Pad the data with empty or null words

Embed the data

Create an RNN network to train the data

Look at the results and iterate

In [None]:
def weights_init(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight.data)
        nn.init.constant_(m.bias,0)
        
class Net(nn.Module):
    def __init__(self, embedding_dim, n_hidden_1, D_out):
        super().__init__()
        
        # Initialize word embeddings layer using weights
        weight = torch.FloatTensor(embeddings_matrix)
        embeds = nn.Embedding.from_pretrained(weight)
        
        self.lstm1 = nn.LSTM(embedding_dim, n_hidden_1)
        
        self.fc1 = nn.Linear(n_hidden_1, n_hidden_2)
        self.relu1 = nn.ReLU()
        
        self.fc2 = nn.Linear(n_hidden_2, D_out)
        self.out_act = nn.Sigmoid()
        
    def forward(self, x):
        # Embedding layer
        embeds_out = self.embeds(x)
        
        # LSTM layer
        hidden = None
        lstm_out, _ = self.lstm1(embeds_out, hidden)
        
        # Fully connected layer
        fc1_out = self.relu1(self.fc1(lstm_out))
        
        # Output layer
        y = self.out_act(self.fc2(fc1_out))
        
        return y
    


In [None]:
def train(data_set, model, criterion, train_loader, validation_loader, optimizer, epochs=100):
    model.train()
    loss_accuracy = {'training_loss':[], 'validation_accuracy':[], 'validation_precision':[], 'validation_recall':[]}
    
    for epoch in range(epochs):
        clear_output(wait=True)
        print("Epoch {} / {}\n=============".format(epoch+1, epochs))
            
        if epoch > 0:
            print("Training loss: {}\nValidation accuracy: {:.2f}%".format(loss.item(), accuracy))
            pass
        
        for x, y in train_loader:

            optimizer.zero_grad()
            ## Forward pass
            yhat = model(x)
            ## Compute loss
            loss = criterion(yhat, y)
            ## Compute gradient in backward pass
            loss.backward()
            ## Update weights
            optimizer.step() 
            
            loss_accuracy['training_loss'].append(loss.item())
         
        ## Compute validation accuracy
        model.eval()
        correct = 0
        for x, y in validation_loader:
            yhat = net(x)           
            yhat = np.where(yhat.detach().numpy() > 0.5, 1, 0)
            correct = (yhat == y.detach().numpy()).sum()                        
            accuracy = 100 * (correct / validation_loader.batch_size)
            
        loss_accuracy['validation_accuracy'].append(accuracy)       
        model.train()
        
        ## Add precision and recall
        
    print("Training complete!")
                
    return loss_accuracy

In [None]:
epochs = 50
learning_rate = 0.01

## Network dimensions
D_in = embeddings_dim
n_hidden_1 = 16
n_hidden_2 = 16
D_out = 1

## Load data
train_loader = torch.utils.data.DataLoader(dataset=training_dataset, batch_size=8000, shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset, batch_size=len(validation_dataset), shuffle=False)

## Initialize model
net = Net(D_in, n_hidden_1, n_hidden_2, D_out)
net.apply(weights_init)


optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)
criterion = nn.BCELoss()

## Train the model
loss_accuracy = train(training_dataset, net, criterion, train_loader, validation_loader, optimizer, epochs=epochs)


## Plots
fig = plt.figure(1)
plt.plot(loss_accuracy['training_loss'], color="red")
plt.title("Training Loss")
plt.xlabel("Iteration")
plt.ylabel("Loss [-]")

fig = plt.figure(2)
plt.plot(loss_accuracy['validation_accuracy'], color="blue")
plt.title("Validation Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy [%]")