In [121]:
import pandas as pd
import string
from tqdm.notebook import tqdm
import torch
from torch.nn.utils.rnn import pad_sequence

In [69]:
train_csv = pd.read_csv('data/train.csv')

In [70]:
toy_train = train_csv.iloc[:200]

In [140]:
toy_train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
195,007bbfa4da2bc32d,"Oh, also wouldn't films that are named Three o...",0,0,0,0,0,0
196,007bc29766a43e3c,"Review Request \n\nHi,\n\nI'd like to request ...",0,0,0,0,0,0
197,007db1f1477ea977,I don't at all propose that it should be trans...,0,0,0,0,0,0
198,007e1e47cd0e2fec,Homosexuals are intent on legitimizing their b...,0,0,0,0,0,0


## Pre Process Data

In [72]:
comments = toy_train['comment_text'].to_list()

In [73]:
def flat_text(text):
    return strip_punctuation(' '.join([x.lower() for x in text.split('\n')]))

In [74]:
def flatten_text(list_texts):
    processed_texts = []
    for i in list_texts:
        flattened = ' '.join([x.lower() for x in i.split('\n')])
        
        processed_texts.append(strip_punctuation(flattened))
        
    return processed_texts

In [75]:
def strip_punctuation(text):
    return ''.join([i for i in text if i not in string.punctuation])

In [109]:
train_comments = [flat_text(i) for i in comments]
vocab = []


In [128]:
def gen_encryptor_decryptor(comments):
    for i in comments:
        vocab = update_vocab(vocab, i)

encryptor, decryptor = gen_vocab(vocab)

In [129]:
def update_vocab(vocab, text):
    for token in text.split(' '):
        if token not in vocab:
            vocab.append(token)
            
    return vocab

In [130]:
def gen_vocab(vocab):
    vocab =  dict(enumerate(vocab))
    new_vocab = {num : tok for tok, num in vocab.items()}
    return new_vocab, vocab

In [131]:
def encode_text(encryptor, text):
    encoded_text = [encryptor[i] for i in text.split(' ')]
    return encoded_text

In [278]:
encoded = []
lengths = []
for i in train_comments:
    encoded.append(torch.LongTensor(encode_text(encryptor, i)))
    lengths.append(len(i))

In [391]:
encoded = pad_sequence(encoded, batch_first=False)

In [392]:
lengths = torch.LongTensor(lengths)

## Create label vectors

In [416]:
labels = []
for i in range(len(toy_train)):
    labels.append(toy_train.iloc[i, 2:].to_list())

In [411]:
#labels = pad_sequence(labels, batch_first=False)

In [417]:
labels = torch.LongTensor(labels)

In [418]:
class dataset(Dataset):
    def __init__(self, input_data, input_labels, input_lengths):
        self.input_data = input_data
        self.input_labels = input_labels
        self.input_lengths = input_lengths
    def __getitem__(self, index):
        x = self.input_data[index]
        y = self.input_labels[index]
        z = self.input_lengths[index]
        return (x,y, z)
    
    def __len__(self):
        return len(self.input_data)
        

In [465]:
import torch.nn as nn

class classifier(nn.Module):
    
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        #Constructor
        super().__init__()          
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
      
        #packed sequence
        
        packed_output, (hidden, cell) = self.lstm(embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.act(dense_outputs)
        
        return outputs

In [466]:
from torch.utils.data import Dataset, DataLoader

In [467]:
training_data = dataset(encoded, labels, lengths)

In [468]:
training_dataloader = DataLoader(training_data, batch_size=50)

## Create Model

In [469]:
model = classifier(len(vocab), embedding_dim = 256, hidden_dim = 128, output_dim=6,
                  n_layers=2, bidirectional=False, dropout=0.2)

In [470]:
from torch import nn

In [471]:
for i in training_dataloader:
    x = i[0]
    y = i[1]
    z = i[2]
    out = model(x,z)
    print(x.shape)
    print(y.shape)
    print(z.shape)
    
    break

torch.Size([50, 1243])
torch.Size([50, 6])
torch.Size([50])


In [473]:
out

tensor([[0.4437, 0.4766, 0.4901, 0.5092, 0.5367, 0.4871],
        [0.4485, 0.4763, 0.4910, 0.5074, 0.5369, 0.4841],
        [0.4482, 0.4773, 0.4904, 0.5084, 0.5357, 0.4877],
        [0.4497, 0.4722, 0.4919, 0.5135, 0.5357, 0.4883],
        [0.4479, 0.4814, 0.4915, 0.5034, 0.5377, 0.4845],
        [0.4542, 0.4801, 0.4891, 0.5058, 0.5353, 0.4850],
        [0.4453, 0.4780, 0.4908, 0.5073, 0.5368, 0.4853],
        [0.4493, 0.4769, 0.4873, 0.5055, 0.5385, 0.4864],
        [0.4477, 0.4760, 0.4913, 0.5046, 0.5332, 0.4875],
        [0.4495, 0.4765, 0.4854, 0.5034, 0.5360, 0.4874],
        [0.4489, 0.4730, 0.4884, 0.5054, 0.5371, 0.4869],
        [0.4473, 0.4759, 0.4872, 0.5063, 0.5335, 0.4858],
        [0.4491, 0.4796, 0.4868, 0.5082, 0.5380, 0.4878],
        [0.4500, 0.4795, 0.4884, 0.5061, 0.5353, 0.4903],
        [0.4497, 0.4773, 0.4861, 0.5013, 0.5327, 0.4829],
        [0.4486, 0.4770, 0.4877, 0.5034, 0.5329, 0.4868],
        [0.4479, 0.4821, 0.4889, 0.5060, 0.5335, 0.4889],
        [0.450