# Fetch and Process Data 

In [113]:
import pandas as pd
from utils.data_processing import *
from utils.preprocessing import *
from tqdm.auto import tqdm

In [241]:
training_x, training_y = data_processing('./data/train.csv')

  0%|          | 0/159571 [00:00<?, ?it/s]

  0%|          | 0/159571 [00:00<?, ?it/s]

Num Tox: 16225
Num Not Tox: 16224


In [242]:
preprocessed_x = preprocess(training_x)

  0%|          | 0/32449 [00:00<?, ?it/s]

## Encode Data

In [243]:
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence

In [263]:
def buildVocab(training_data, tokenised=None, feedforward=None):
    
    vocab={}
    word_counts = {}
    vocab['<sos>'] = 1
    vocab['<eos>'] = 2
    vocab['<oov>'] = 3
    
    processed_lines = []
    
    for line in training_data:
        
        tokens = line.split(' ')
        
        for token in tokens:
            
            if token not in vocab:
                vocab[token] = len(vocab)
                word_counts[token] = 1
                
            else:
                word_counts[token] += 1
        
        if not feedforward:
            tokens.insert(0, '<sos>')
            tokens.append('<eos>')
        processed_lines.append(tokens)
    
    return vocab, word_counts, processed_lines

def encodeData(data, vocab, test=None):
    
    encoded_data = []
    
    for line in data:
            
        encoded_line = []
        
        for token in line:
            
            if test==True:
                if token not in line:
                    encoded_line.append(vocab['<oov'])
                    continue
                    
            encoded_line.append(vocab[token])
        
        encoded_data.append(torch.LongTensor(encoded_line))
        
    return pad_sequence(encoded_data, batch_first=True)

def encodeMultiHot(line, vocab, test=None):
    

    empty_vec = np.zeros(len(vocab))

    for token in line:

        if token in vocab:
            empty_vec[vocab[token]] += 1


    return np.array(empty_vec)

In [245]:
from torch.utils.data import Dataset, DataLoader

In [284]:
def hottyY(int):
    
    if int == 0:
        return torch.Tensor([1, 0])
    elif int == 1:
        return torch.Tensor([0,1])

class netDataset(Dataset):
    
    def __init__(self, x,y=None, test=None):
        self.x = x
        self.test = test
        if not self.test:
            self.y = [hottyY(i) for i in y]
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        
        x = encodeMultiHot(self.x[idx], vocab)
                    
        if not self.test:
            y = self.y[idx]
        
            return x, y
        
        else:
            return x

## Feed Forward Network 

In [251]:
vocab, word_counts, post_processed_lines = buildVocab(preprocessed_x, feedforward=True)


In [265]:
training_dataset = netDataset(post_processed_lines, training_y)

In [269]:
import torch.nn as nn

In [270]:
class FeedForward(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(FeedForward, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.fc1 = nn.Linear(self.input_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, 2)
        self.activation = nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, x):
        
        out1 = self.activation(self.fc1(x))
        out2 = self.fc2(out1)
        return self.sigmoid(out2)
        
        

### Network Parameters

In [271]:
input_size = len(vocab)
hidden_size = 128
num_epochs = 10

In [272]:
model = FeedForward(input_size, hidden_size)
print(model)

FeedForward(
  (fc1): Linear(in_features=113173, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=2, bias=True)
  (activation): ReLU()
  (sigmoid): Sigmoid()
)


### Training

In [274]:
#training_dataloader = DataLoader(training_dataset, batch_size=1, shuffle=True)

In [278]:
avg_loss = 0
model.train()
for i in tqdm(range(num_epochs), total=num_epochs, desc='Training'):
    
    optimizer.zero_grad()
    losses = []
    
    idx = 0
    for x,y in tqdm(training_dataset, desc=f'Epoch {i}'):
        
        x = torch.Tensor(x)
        out = model(x)
        
        loss = criterion(out, y)
        loss.backward()
        
        optimizer.step()
    
        losses.append(loss.item())
        
        if idx % 400  == 0:
            tqdm.write(f'Avg Loss: {sum(losses) / len(losses)}')
        
        idx += 1
    print(f'Average Loss: {sum(losses) / len(losses)}')

Training:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0:   0%|          | 0/32449 [00:00<?, ?it/s]

Avg Loss: 0.03970193862915039
Avg Loss: 0.7896829358772514
Avg Loss: 1.2191842275552984
Avg Loss: 1.0847577477416863
Avg Loss: 1.2292711912322443
Avg Loss: 1.071494076507832
Avg Loss: 1.1650097880417245
Avg Loss: 1.0856247607517355
Avg Loss: 1.1210545919660524
Avg Loss: 1.1650303669955213
Avg Loss: 1.1377810368517718
Avg Loss: 1.1365979783680347
Avg Loss: 1.117551064866927
Avg Loss: 1.0787048085911808
Avg Loss: 1.0776626692941258
Avg Loss: 1.0799770955404426
Avg Loss: 1.035887675213235
Avg Loss: 1.0840133428201866
Avg Loss: 1.086046906533517
Avg Loss: 1.058770263394971
Avg Loss: 1.0465514266616178
Avg Loss: 1.0233041491314303
Avg Loss: 1.0138812563383681
Avg Loss: 1.02582770429888
Avg Loss: 1.0269136585128602
Avg Loss: 1.0059384535667337
Avg Loss: 1.017490840225509
Avg Loss: 1.0084980741953062
Avg Loss: 1.0107755950780821
Avg Loss: 0.9925380884009708
Avg Loss: 0.9948563874104577
Avg Loss: 1.0158533328191515
Avg Loss: 1.0178937728838446
Avg Loss: 1.0201689890973835
Avg Loss: 1.031041238

Epoch 1:   0%|          | 0/32449 [00:00<?, ?it/s]

Avg Loss: 100.0
Avg Loss: 89.40149625935162
Avg Loss: 89.01373283395755
Avg Loss: 89.25895087427143
Avg Loss: 89.1630231105559
Avg Loss: 89.28035982008996
Avg Loss: 89.067055393586


KeyboardInterrupt: 

### Evaluate Model

In [280]:
testing_raw = pd.read_csv('./data/test.csv')
transformed_test, _ = processRawDataFromCSV(testing_raw, test=True)
preprocessed_x = preprocess(transformed_test)
#encoded_test = encodeMultiHot(post_processed_lines, vocab, test=True)


  0%|          | 0/153164 [00:00<?, ?it/s]

  0%|          | 0/153164 [00:00<?, ?it/s]

In [285]:
testing_dataset = netDataset(encoded_train, test=True)

In [286]:
model.eval()



FeedForward(
  (fc1): Linear(in_features=113173, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=2, bias=True)
  (activation): ReLU()
  (sigmoid): Sigmoid()
)

In [287]:
preds = []
for i in tqdm(testing_dataset):
    
    pred = model(torch.Tensor(i)).argmax().item()
    
    preds.append(pred)

  0%|          | 0/3283 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Recurrant Neural Network

In [117]:
vocab, word_counts, test_post_processed = buildVocab(preprocessed_x, test=True)


TypeError: buildVocab() got an unexpected keyword argument 'test'

In [91]:
encoded_train = encodeMultiHot(post_processed_lines, vocab)


tensor([0.5164, 0.5299], grad_fn=<SigmoidBackward0>)

In [43]:
training_dataset[0]

(tensor([1, 3, 4,  ..., 0, 0, 0]), 0)

In [98]:
loss

tensor(0.7079, grad_fn=<BinaryCrossEntropyBackward0>)