In [None]:
%%bash 
pip3 install torchtext
pip3 install http://download.pytorch.org/whl/cu90/torch-0.3.1-cp36-cp36m-linux_x86_64.whl

In [1]:
import torchtext
from torchtext import data

import torch
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.autograd import Variable

from collections import OrderedDict, defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report, accuracy_score,
                             roc_auc_score)
import numpy as np
import pandas as pd
from pathlib import Path

from tqdm import tqdm_notebook, tqdm

In [None]:
# https://arxiv.org/abs/1509.01626

In [11]:
label_names = ['toxic', 'severe_toxic', 'obscene', 'threat',
               'insult', 'identity_hate']

In [2]:
tqdm = tqdm_notebook

In [101]:
data_path = Path('..', 'data')
train_path = Path(data_path, 'train.csv')

train_df = pd.read_csv(train_path)
train_df['comment_text'] = train_df['comment_text'].fillna('')

In [102]:
test_path = Path(data_path, 'test.csv')

test_df = pd.read_csv(test_path)
submission = pd.DataFrame({'id': test_df['id']})

In [103]:
for label in label_names:
    test_df[label] = np.zeros(test_df.id.shape[0])

In [104]:
train_df, val_df = train_test_split(train_df, test_size=0.1,
                                    random_state=31)

In [105]:
train_df['comment_text'].map(len).describe()

count      90.000000
mean      453.311111
std       735.739140
min        24.000000
25%        94.000000
50%       188.500000
75%       405.250000
max      3800.000000
Name: comment_text, dtype: float64

In [106]:
def make_train_examples(df: pd.DataFrame, fields):
    fields = {field_name: (field_name, field)
                       for field_name, field in fields.items()}
    for _, row in tqdm(df.iterrows()):
        example = data.Example.fromdict(row, fields)
        yield example

In [107]:
max_len = 5000
text_field = data.Field(tokenize=(lambda s: list(s)), batch_first=True,
                       fix_length=max_len)
label_field = data.Field(sequential=False, use_vocab=False,
                         tensor_type=torch.FloatTensor)

fields = OrderedDict({'comment_text': text_field, 'toxic': label_field,
                      'severe_toxic': label_field, 'obscene': label_field,
                      'threat': label_field, 'insult': label_field,
                      'identity_hate': label_field})


train_examples = list(make_train_examples(train_df, fields))
val_examples = list(make_train_examples(val_df, fields))
test_examples = list(make_train_examples(test_df, fields))

train_set = data.Dataset(train_examples, fields)
val_set = data.Dataset(val_examples, fields)
test_set = data.Dataset(test_examples, fields)

In [108]:
text_field.build_vocab(train_set)

In [109]:
class CharCNN(nn.Module):
    def __init__(self, vocab_size, n_classes=6, dropout=0.1):
        super(CharCNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv1d(vocab_size, 256, kernel_size=7, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=3)
        )

        self.conv2 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=7, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=3)
        )            

        self.conv3 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, stride=1),
            nn.ReLU()
        )

        self.conv4 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, stride=1),
            nn.ReLU()    
        )

        self.conv5 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, stride=1),
            nn.ReLU()
        )

        self.conv6 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=3)
        )


        self.fc1 = nn.Sequential(
            nn.Linear(46336, 1024),
            nn.ReLU(),
            nn.Dropout(p=dropout)
        )

        self.fc2 = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Dropout(p=dropout)
        )

        self.fc3 = nn.Linear(1024, n_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)

        # collapse
        x = x.view(x.size(0), -1)
        # linear layer
        x = self.fc1(x)
        # linear layer
        x = self.fc2(x)
        # linear layer
        x = self.fc3(x)

        return x

In [116]:
label_fields = ['toxic', 'severe_toxic', 'obscene',
                'threat', 'insult', 'identity_hate']
char_cnn = CharCNN(len(text_field.vocab), n_classes=len(label_fields))
char_cnn.cuda()

In [117]:
len(text_field.vocab)

102

In [118]:
def to_one_hot(y, n_dims=None):
    """ Take integer y (tensor or variable) with n dims and convert it to 1-hot representation with n+1 dims. """
    y_tensor = y.data if isinstance(y, Variable) else y
    y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1)
    n_dims = n_dims if n_dims is not None else int(torch.max(y_tensor)) + 1
    y_one_hot = torch.zeros(y_tensor.size()[0], n_dims).scatter_(1, y_tensor, 1)
    y_one_hot = y_one_hot.view(y.shape[0], n_dims, -1)
    return Variable(y_one_hot) if isinstance(y, Variable) else y_one_hot
    

In [119]:
n_epochs = 5
batch_size = 128
input_name = 'comment_text'

batches_train, batches_val = data.BucketIterator.splits(
    (train_set, val_set), batch_size=batch_size, repeat=False,
    sort_key=lambda x: len(getattr(x, input_name)))


optimizer = optim.Adam(char_cnn.parameters())
criterion = nn.BCEWithLogitsLoss()

for epoch in range(n_epochs):  
    
    running_loss = 0.0
    for n, batch in enumerate(tqdm(batches_train)):
        # get the inputs
        inputs = getattr(batch, input_name)
        inputs = to_one_hot(inputs, n_dims=len(text_field.vocab))
        
        # zero the parameter gradients
        optimizer.zero_grad()
        
        outputs = char_cnn(inputs)
        losses = []
    
        # get all output fields and their losses
        # and backprop for each of them
        for i, label_name in enumerate(label_names):
            targets = getattr(batch, label_name)
            label_outputs = outputs[:, i]
            loss = criterion(label_outputs, targets)
            losses.append(loss)
        
        total_loss = sum(losses)
        total_loss.backward()

        optimizer.step()

        # print statistics
        running_loss += total_loss.data[0]
        if n % 100 == 99:    # print every 1000 mini-batches
            print('[{}, {}] loss: {:.3f}'.format(epoch + 1,
                                                n + 1, running_loss / 100))
            running_loss = 0.0
            

print('Finished Training')

KeyboardInterrupt: 

In [139]:
def get_preds_val(net, val_iter, *, input_name, label_names, evaluate=True):
    net.eval()
    
    if evaluate:
        y_labels = defaultdict(list)
    scores_labels = defaultdict(list)
    
    for batch in tqdm(val_iter):
        X = getattr(batch, input_name)
        X = to_one_hot(X, n_dims=len(text_field.vocab))
        out = net(X)
        out_probas = F.sigmoid(out)
        
        # put label dimension first
        # so we can iterate over each label's predictions
        out_probas = out_probas.permute(1, 0)
        
        for n, (label_name, label_probas) in enumerate(zip(label_names, out_probas)):            
            label_probas = label_probas.data.cpu().numpy()
            if evaluate:
                y_true = getattr(batch, label_name).data.cpu().numpy()
                assert len(y_true) == len(label_probas)
                y_labels[label_name].append(y_true)
            
            scores_labels[label_name].append(label_probas)
    
    for label_name in label_names:
        if evaluate:
            y_labels[label_name] = np.hstack(y_labels[label_name])
        scores_labels[label_name] = np.hstack(scores_labels[label_name])
    
    if evaluate:
        return y_labels, scores_labels
    else:
        return scores_labels

In [127]:
y_labels, scores_labels = get_preds_val(char_cnn, batches_val,
                                        input_name=input_name,
                                        label_names=label_names)

In [128]:
roc_scores = []
accuracy_scores = []

for label_name in label_names:
    true_labels = y_labels[label_name]
    scores = scores_labels[label_name]
    score = roc_auc_score(true_labels, scores)
    roc_scores.append(score)
    print(label_name, score)

print()
mean_roc = np.mean(roc_scores)
print('Mean ROC', mean_roc)

toxic 1.0


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [135]:
batches_test = data.BucketIterator(dataset=test_set, batch_size=batch_size, repeat=False,
                                   sort_key=lambda x: len(getattr(x, input_name)))

In [140]:
scores_test_labels = get_preds_val(char_cnn, batches_test,
                                        input_name=input_name,
                                        label_names=label_names,evaluate=False)

In [141]:
for label_name in label_names:
    submission[label_name] = scores_test_labels[label_name]

In [142]:
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
79093,8405a1e1acf92f8a,0.502648,0.499437,0.506241,0.507524,0.497374,0.503701
89817,95c4c58ccccdfabf,0.502647,0.499439,0.506242,0.507526,0.497372,0.503701
123007,cd5ff97e82bed5cc,0.502648,0.499436,0.506242,0.507527,0.49737,0.503701
29731,31780de1ec340b37,0.502648,0.499435,0.506242,0.507522,0.49737,0.503698
25022,29b466936c8c11b2,0.502646,0.499434,0.506241,0.507527,0.497373,0.503701


In [None]:
submission.to_csv('char_cnn_submission.csv', index=False)