In [16]:
import torchtext
from torchtext import data

import torch
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.autograd import Variable

from collections import OrderedDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from pathlib import Path


import nltk

from tqdm import tqdm_notebook, tqdm


In [2]:
print = tqdm.write
tqdm = tqdm_notebook

In [3]:
data_path = Path('..', 'data')
train_path = Path(data_path, 'normalized_train.csv')

train_df = pd.read_csv(train_path)
train_df['comment_text'] = train_df['comment_text'].fillna('')

In [4]:
train_df, val_df = train_test_split(train_df, test_size=0.1,
                                    random_state=31)

In [6]:
pd.set_option('max_colwidth', -1)
train_df.sample(10, random_state=31)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
12579,21590fdb69ac401f,"""\n\n Image:Taiwan straits.jpg listed for deletion \n\n An image or media file that you uploaded, Image:Taiwan straits.jpg, has been listed at Wikipedia:Images and media for deletion. Please look there to see why this is (you may have to search for the title of the image to find its entry), if you are interested in it not being deleted. Thank you. """,0,0,0,0,0,0
95239,fea00ca779850150,"""\n\n Talkback Section \nResponded at my talk. (talk) """,0,0,0,0,0,0
18336,30662d209bf28e5a,Thanks... this is a vast improvement ) The main problem is that I just haven't had time to complete all the states... I'm working away at plugging the gaps bit by bit but it's going to take a while. Not exactly sure what to do in the meantime.,0,0,0,0,0,0
5418,0e70afb9cc4b0ac2,"Thanks for taking care of that. Another sockpuppet,",0,0,0,0,0,0
43579,7455193dec3fcde6,"Kathleen Andrews Submission \n\nHi Ritchie###, \n\nI would like to thank you for helping me submit Kathleen Andrews legacy to Wikipedia. Wow!!!!! You Made my day as I've been trying for a very long time!! THANK YOU!!!\n\nIm very new to Wikipedia, so I am not sure how to do this but in Kathleen Andrews story, it listed her as First Female Dispatcher, and thats true, but she was also the First Female ETS Bus Operator, First Female Dispatcher and First Female in ETS Management. The emphasis on the First female Bus Operator should be first, if that's possible.\n\nI look forward to hearing from you and a BIG THANK YOU for all of your help!!!\n\nTake care\n\nLisa Andrews",0,0,0,0,0,0
141819,f6b8aaa4bf61346c,I am not doing any disruptive changes! The sources you have used are not valid!!,0,0,0,0,0,0
117134,71f2b4b004e2ee9e,So how long will this one last and who am i going ot have ot degrade myself by begging to this time?,0,0,0,0,0,0
147217,38c9a5938084661f,"""\nDo you mean """"which was sufficient""""? I'm confused. """,0,0,0,0,0,0
44413,769cae54627f1fbe,"""\n Removing is probably fine. As for the , this is used in and would break the only method for adding a reference to the coordinates, so I would leave it. With regards to the loading times, I didn't think it was any worse than anything else. It could be due to the recent slowness with the servers. ―Œ(talk) """,0,0,0,0,0,0
47996,80399abe839de1d7,"""\n\nI've changed it to the """"neutrality"""" tag as we all seem to agree that this is the problem with the article. """,0,0,0,0,0,0


In [7]:
train_df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [11]:
def make_train_examples(df: pd.DataFrame, fields):
    fields = {field_name: (field_name, field)
                       for field_name, field in fields.items()}
    for _, row in tqdm(df.iterrows()):
        example = data.Example.fromdict(row, fields)
        yield example

In [12]:
text_field = data.Field(init_token='<START>', eos_token='<EOS>',
                        lower=True, tokenize=nltk.word_tokenize,
                        include_lengths=True, batch_first=True)
label_field = data.Field(sequential=False, use_vocab=False,
                         tensor_type=torch.FloatTensor)

# fields = [('id', None), ('comment_text', text_field),
#           ('toxic', label_field), ('severe_toxic', label_field),
#           ('obscene', label_field), ('threat', label_field),
#           ('insult', label_field), ('identity_hate', label_field)]

# train_set = data.TabularDataset(str(train_path), fields=fields,
#                                 skip_header=True, format='csv')

fields = OrderedDict({'comment_text': text_field, 'toxic': label_field,
                      'severe_toxic': label_field, 'obscene': label_field,
                      'threat': label_field, 'insult': label_field,
                      'identity_hate': label_field})


train_examples = list(make_train_examples(train_df, fields))
val_examples = list(make_train_examples(val_df, fields))

train_set = data.Dataset(train_examples, fields)
val_set = data.Dataset(val_examples, fields)

143613it [03:32, 675.19it/s]
15958it [00:26, 612.60it/s]


In [13]:
text_field.build_vocab(train_set, max_size=50000)

In [14]:
class TextClassifier(nn.Module):
    def __init__(self, *, n_classes, vocab_size,
                 num_hidden=256, padding_idx=None,
                 embedding_dim=100, embedding_weights=None,
                 bidirectional=True):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, 
                                  embedding_dim=embedding_dim,
                                  padding_idx=padding_idx)
        if embedding_weights is not None:
            self.embed.weight.data.copy_(embedding_weights)
            
        self.lstm = nn.LSTM(embedding_dim, num_hidden,
                            bidirectional=True, batch_first=True)
        self.out = nn.Linear(num_hidden * 2, n_classes)
    
    def forward(self, X, lengths):
        X = self.embed(X)
        X = nn.utils.rnn.pack_padded_sequence(X, lengths,
                                              batch_first=True)
        X, _ = self.lstm(X)
        X, lengths = nn.utils.rnn.pad_packed_sequence(X, batch_first=True)
        # take the last time step
        X = X[:, -1, :]
        X = self.out(X)
        return X

In [15]:
label_fields = ['toxic', 'severe_toxic', 'obscene',
                'threat', 'insult', 'identity_hate']
padding_idx = text_field.vocab.stoi[text_field.pad_token]



net = TextClassifier(n_classes=len(label_fields), 
                     vocab_size=len(text_field.vocab),
                     padding_idx=padding_idx)
net.cuda()
print(net)

    Found GPU0 GeForce GPU which is of cuda capability 5.0.
    PyTorch no longer supports this GPU because it is too old.
    


TypeError: expected string or bytes-like object

In [10]:

n_epochs = 5
batch_size = 4
input_name = 'comment_text'
label_names = ['toxic', 'severe_toxic', 'obscene', 'threat',
               'insult', 'identity_hate']

batches_train, batches_val = data.BucketIterator.splits(
    (train_set, val_set), batch_size=batch_size, repeat=False,
    sort_key=lambda x: len(getattr(x, input_name)),
    sort_within_batch=True)

optimizer = optim.Adam(net.parameters())
criterion = nn.BCEWithLogitsLoss()

for epoch in range(n_epochs):  
    
    running_loss = 0.0
    for n, batch in enumerate(tqdm(batches_train)):
        # get the inputs
        inputs, lengths = getattr(batch, input_name)
        lengths = lengths.cpu().numpy()
        
        # zero the parameter gradients
        optimizer.zero_grad()
        
        outputs = net(inputs, lengths)
        losses = []
    
        # get all output fields and their losses
        # and backprop for each of them
        for i, label_name in enumerate(label_names):
            targets = getattr(batch, label_name)
            label_outputs = outputs[:, i]
            loss = criterion(label_outputs, targets)
            losses.append(loss)
        
        total_loss = sum(losses)
        total_loss.backward()

        optimizer.step()

        # print statistics
        running_loss += total_loss.data[0]
        if n % 100 == 99:    # print every 1000 mini-batches
            print('[{}, {}] loss: {:.3f}'.format(epoch + 1,
                                                n + 1, running_loss / 100))
            running_loss = 0.0
            

print('Finished Training')

[1, 100] loss: 2.598
[1, 200] loss: 2.163
[1, 300] loss: 2.149
[1, 400] loss: 1.928
[1, 500] loss: 1.747
[1, 600] loss: 1.816
[1, 700] loss: 1.811
[1, 800] loss: 1.722
[1, 900] loss: 1.670
[1, 1000] loss: 1.620
[1, 1100] loss: 1.753
[1, 1200] loss: 1.439
[1, 1300] loss: 1.439
[1, 1400] loss: 1.424
[1, 1500] loss: 1.281
[1, 1600] loss: 1.306
[1, 1700] loss: 1.399
[1, 1800] loss: 1.345
[1, 1900] loss: 1.191
[1, 2000] loss: 1.224
[1, 2100] loss: 1.235
[1, 2200] loss: 1.191
[1, 2300] loss: 1.176
[1, 2400] loss: 1.128
[1, 2500] loss: 1.192
[1, 2600] loss: 0.926
[1, 2700] loss: 1.033
[1, 2800] loss: 0.915
[1, 2900] loss: 0.988
[1, 3000] loss: 0.871
[1, 3100] loss: 0.932
[1, 3200] loss: 0.930
[1, 3300] loss: 1.084
[1, 3400] loss: 0.853
[1, 3500] loss: 0.839
[1, 3600] loss: 0.895
[1, 3700] loss: 0.834
[1, 3800] loss: 0.876
[1, 3900] loss: 0.777
[1, 4000] loss: 0.880
[1, 4100] loss: 0.819
[1, 4200] loss: 0.844
[1, 4300] loss: 0.743
[1, 4400] loss: 0.821
[1, 4500] loss: 0.873
[1, 4600] loss: 0.7

[2, 100] loss: 0.556
[2, 200] loss: 0.444
[2, 300] loss: 0.431
[2, 400] loss: 0.445
[2, 500] loss: 0.635
[2, 600] loss: 0.558
[2, 700] loss: 0.410
[2, 800] loss: 0.489
[2, 900] loss: 0.479
[2, 1000] loss: 0.589
[2, 1100] loss: 0.554
[2, 1200] loss: 0.546
[2, 1300] loss: 0.439
[2, 1400] loss: 0.590
[2, 1500] loss: 0.546
[2, 1600] loss: 0.453
[2, 1700] loss: 0.546
[2, 1800] loss: 0.608
[2, 1900] loss: 0.480
[2, 2000] loss: 0.456
[2, 2100] loss: 0.575
[2, 2200] loss: 0.404
[2, 2300] loss: 0.616
[2, 2400] loss: 0.490
[2, 2500] loss: 0.365
[2, 2600] loss: 0.618
[2, 2700] loss: 0.405
[2, 2800] loss: 0.605
[2, 2900] loss: 0.645
[2, 3000] loss: 0.545
[2, 3100] loss: 0.413
[2, 3200] loss: 0.576
[2, 3300] loss: 0.684
[2, 3400] loss: 0.559
[2, 3500] loss: 0.610
[2, 3600] loss: 0.413
[2, 3700] loss: 0.364
[2, 3800] loss: 0.489
[2, 3900] loss: 0.455
[2, 4000] loss: 0.502
[2, 4100] loss: 0.484
[2, 4200] loss: 0.251
[2, 4300] loss: 0.494
[2, 4400] loss: 0.463
[2, 4500] loss: 0.437
[2, 4600] loss: 0.3

[3, 100] loss: 0.494
[3, 200] loss: 0.457
[3, 300] loss: 0.610
[3, 400] loss: 0.586
[3, 500] loss: 0.535
[3, 600] loss: 0.417
[3, 700] loss: 0.369
[3, 800] loss: 0.496
[3, 900] loss: 0.457
[3, 1000] loss: 0.428
[3, 1100] loss: 0.582
[3, 1200] loss: 0.571
[3, 1300] loss: 0.542
[3, 1400] loss: 0.561
[3, 1500] loss: 0.520
[3, 1600] loss: 0.418
[3, 1700] loss: 0.447
[3, 1800] loss: 0.524
[3, 1900] loss: 0.420
[3, 2000] loss: 0.458
[3, 2100] loss: 0.522
[3, 2200] loss: 0.475
[3, 2300] loss: 0.342
[3, 2400] loss: 0.583
[3, 2500] loss: 0.514
[3, 2600] loss: 0.523
[3, 2700] loss: 0.483
[3, 2800] loss: 0.516
[3, 2900] loss: 0.460
[3, 3000] loss: 0.469
[3, 3100] loss: 0.496
[3, 3200] loss: 0.442
[3, 3300] loss: 0.493
[3, 3400] loss: 0.572
[3, 3500] loss: 0.542
[3, 3600] loss: 0.491
[3, 3700] loss: 0.506
[3, 3800] loss: 0.527
[3, 3900] loss: 0.556
[3, 4000] loss: 0.418
[3, 4100] loss: 0.483
[3, 4200] loss: 0.389
[3, 4300] loss: 0.595
[3, 4400] loss: 0.524
[3, 4500] loss: 0.498
[3, 4600] loss: 0.4

[4, 100] loss: 0.521
[4, 200] loss: 0.414
[4, 300] loss: 0.314
[4, 400] loss: 0.470
[4, 500] loss: 0.370
[4, 600] loss: 0.507
[4, 700] loss: 0.385
[4, 800] loss: 0.459
[4, 900] loss: 0.389
[4, 1000] loss: 0.454
[4, 1100] loss: 0.506
[4, 1200] loss: 0.480
[4, 1300] loss: 0.497
[4, 1400] loss: 0.418
[4, 1500] loss: 0.343
[4, 1600] loss: 0.365
[4, 1700] loss: 0.454
[4, 1800] loss: 0.473
[4, 1900] loss: 0.721
[4, 2000] loss: 0.481
[4, 2100] loss: 0.413
[4, 2200] loss: 0.480
[4, 2300] loss: 0.558
[4, 2400] loss: 0.423
[4, 2500] loss: 0.339
[4, 2600] loss: 0.412
[4, 2700] loss: 0.527
[4, 2800] loss: 0.576
[4, 2900] loss: 0.481
[4, 3000] loss: 0.417
[4, 3100] loss: 0.523
[4, 3200] loss: 0.449
[4, 3300] loss: 0.432
[4, 3400] loss: 0.399
[4, 3500] loss: 0.418
[4, 3600] loss: 0.498
[4, 3700] loss: 0.488
[4, 3800] loss: 0.461
[4, 3900] loss: 0.402
[4, 4000] loss: 0.626
[4, 4100] loss: 0.408
[4, 4200] loss: 0.384
[4, 4300] loss: 0.435
[4, 4400] loss: 0.354
[4, 4500] loss: 0.383
[4, 4600] loss: 0.4

[5, 100] loss: 0.466
[5, 200] loss: 0.405
[5, 300] loss: 0.553
[5, 400] loss: 0.418
[5, 500] loss: 0.459
[5, 600] loss: 0.462
[5, 700] loss: 0.423
[5, 800] loss: 0.417
[5, 900] loss: 0.374
[5, 1000] loss: 0.488
[5, 1100] loss: 0.436
[5, 1200] loss: 0.469
[5, 1300] loss: 0.402
[5, 1400] loss: 0.559
[5, 1500] loss: 0.617
[5, 1600] loss: 0.384
[5, 1700] loss: 0.423
[5, 1800] loss: 0.424
[5, 1900] loss: 0.415
[5, 2000] loss: 0.356
[5, 2100] loss: 0.515
[5, 2200] loss: 0.412
[5, 2300] loss: 0.605
[5, 2400] loss: 0.455
[5, 2500] loss: 0.418
[5, 2600] loss: 0.618
[5, 2700] loss: 0.448
[5, 2800] loss: 0.433
[5, 2900] loss: 0.392
[5, 3000] loss: 0.579
[5, 3100] loss: 0.515
[5, 3200] loss: 0.331
[5, 3300] loss: 0.414
[5, 3400] loss: 0.449
[5, 3500] loss: 0.377
[5, 3600] loss: 0.543
[5, 3700] loss: 0.500
[5, 3800] loss: 0.565
[5, 3900] loss: 0.469
[5, 4000] loss: 0.409
[5, 4100] loss: 0.347
[5, 4200] loss: 0.390
[5, 4300] loss: 0.637
[5, 4400] loss: 0.426
[5, 4500] loss: 0.481
[5, 4600] loss: 0.4

In [18]:
def get_preds_val(net, val_iter, *, input_name, label_names):
    Y_true = []
    Y_hat = []
    net.eval()
    for batch in tqdm(val_iter):
        X, lengths = getattr(batch, input_name)
        lengths = lengths.cpu().numpy()
        out = net(X, lengths)
        out_probas = F.sigmoid(out)
        
        y_true_all = np.zeros(len(label_names), dtype='float32')
        y_hat_all = np.zeros(len(label_names), dtype='float32')
        for n, (label_name, label_proba) in enumerate(zip(label_names, out_probas)):
            y_true = getattr(batch, label_name)
            y_true_all[n] = y_true.cpu().tonumpy()
            
            y_hat_all[n] = label_proba
    Y_true = np.vstack(Y_true)
    Y_hat = np.vstack(Y_hat)
    return Y_hat, Y_true