In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from torch.utils.data import Dataset
from torch.utils.data.sampler import BatchSampler
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import lr_scheduler
import torch.optim as optim
from torch.autograd import Variable

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
data = pd.read_csv('data/wiki_train.csv')
data

Unnamed: 0,rev_id,toxicity,comment,year,logged_in,ns,sample,split,is_toxic
0,2232.0,0.1,This: :One can make an analogy in mathematical...,2002,True,article,random,train,False
1,4216.0,0.0,` :Clarification for you (and Zundark's righ...,2002,True,user,random,train,False
2,26547.0,0.0,`This is such a fun entry. Devotchka I once...,2002,True,article,random,train,False
3,37330.0,0.3,` I fixed the link; I also removed ``homeopa...,2002,True,article,random,train,False
4,37346.0,0.1,`If they are ``indisputable`` then why does th...,2002,True,article,random,train,False
...,...,...,...,...,...,...,...,...,...
95687,699822249.0,0.2,"` :``Comment````. Gentlemen, this article pro...",2016,True,article,blocked,train,False
95688,699826615.0,0.0,*Support and recommend moving this (and my re...,2016,True,article,random,train,False
95689,699843603.0,0.0,` == File:Romantic Warriors cover.jpg == You...,2016,True,user,random,train,False
95690,699848324.0,0.0,` These sources don't exactly exude a sense ...,2016,True,article,blocked,train,False


In [3]:
def preprocess(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    text = " ".join(re.findall("[a-zA-Z]+", str(text)))
    word_tokens = word_tokenize(text)
    filtered_sentence = [lemmatizer.lemmatize(w) for w in word_tokens if not w.lower() in stop_words]
    return filtered_sentence

In [4]:
def get_tokens(dataframe, column):
    tokens = []
    for i in tqdm_notebook(dataframe[column][:]):
        _tokens = word_tokenize(str(preprocess(i)))
        tokens.append(_tokens)
        
    return tokens

In [5]:
train_data = pd.read_csv('data/wiki_train.csv')
train_data = train_data.dropna(axis = 0)

val_data = pd.read_csv('data/wiki_dev.csv')
val_data = val_data.dropna(axis = 0)

df_test = pd.read_csv('test_data.csv')
df_test = df_test.dropna(axis = 0)
df_test.loc[df_test['Label'] == 'BAD', 'Label'] = 1
df_test.loc[df_test['Label'] == 'NOT_BAD', 'Label'] = 0


train_feature = get_tokens(train_data, 'comment')
train_label = train_data['toxicity']



val_feature = get_tokens(val_data, 'comment')
val_label = val_data['toxicity']

test_feature = get_tokens(df_test, 'Text')
test_label = df_test['Label']

identity_terms = []
for i in tqdm_notebook(range(len(df_test['Text']))):
    _comment = df_test.loc[i,  'Text'].split(" ")
    if len(_comment) < 3:
        _term = _comment[1]
        identity_terms.append(_term)
identity_terms = list(set(identity_terms))


terms = []
for i in range(len(df_test['Text'])):
    _text = df_test.loc[i, 'Text'].split(' ')
    _term = list(set(_text).intersection(set(identity_terms)))
    if len(_term) > 0:
        terms.append(_term[0])
    else:
        terms.append(np.nan)
        
df_test['Identity_Terms'] = terms


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=95692.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=32128.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76564.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=76564.0), HTML(value='')))




In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_feature)
def vectorize(sent):
    vector = tokenizer.texts_to_sequences([sent])
    vector= pad_sequences(vector, maxlen = 50, dtype="int32")
    return vector

In [7]:
class TripletText(Dataset):
    """
    Train: For each sample (anchor) randomly chooses a positive and negative samples
    Test: Creates fixed triplets for testing
    """

    def __init__(self, dataset):
        self.text = dataset.comment
        self.labels = dataset.toxicity.round()

        self.train_labels = self.labels
        self.train_data = self.text
        self.labels_set = set(self.labels.round())
        self.label_to_indices = {label: np.where(self.labels.round() == label)[0]
                                 for label in self.labels_set}

    def __getitem__(self, index):
        text1, label1 = self.text[index], self.labels[index].round().item()
        positive_index = index
        while positive_index == index:
            positive_index = np.random.choice(self.label_to_indices[label1])
        negative_label = np.random.choice(list(self.labels_set - set([label1])))
        negative_index = np.random.choice(self.label_to_indices[negative_label])
        text2 = self.train_data[positive_index]
        text3 = self.train_data[negative_index]

        return (text1, text2, text3), []

    def __len__(self):
        return len(self.text)

In [8]:
triplet_train_dataset = TripletText(train_data.loc[:, :])
triplet_train_loader = torch.utils.data.DataLoader(triplet_train_dataset, batch_size=16, shuffle=True)

triplet_val_dataset = TripletText(val_data.loc[:, :])
triplet_val_loader = torch.utils.data.DataLoader(triplet_train_dataset, batch_size=16, shuffle=True)

In [9]:
class EmbeddingNet(nn.Module):
    def __init__(self):
        super(EmbeddingNet, self).__init__()

        self.fc = nn.Sequential(nn.Linear(50, 256),
                                nn.PReLU(),
                                nn.Linear(256, 512),
                                nn.PReLU(),
                                nn.Linear(512, 256)
                                )

    def forward(self, x):
        output = self.fc(x)
        return output

    def get_embedding(self, x):
        return self.forward(x)
    
    
class TripletNet(nn.Module):
    def __init__(self, embedding_net):
        super(TripletNet, self).__init__()
        self.embedding_net = embedding_net

    def forward(self, x1, x2, x3):
        output1 = self.embedding_net(x1)
        output2 = self.embedding_net(x2)
        output3 = self.embedding_net(x3)
        return output1, output2, output3

    def get_embedding(self, x):
        return self.embedding_net(x)

In [10]:
class TripletLoss(nn.Module):
    """
    Triplet loss
    Takes embeddings of an anchor sample, a positive sample and a negative sample
    """

    def __init__(self, margin):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative, size_average=True):
        distance_positive = (anchor - positive).pow(2).sum(0)  # .pow(.5)
        distance_negative = (anchor - negative).pow(2).sum(0)  # .pow(.5)
        losses = F.relu(distance_positive - distance_negative + self.margin)
        return losses.mean() if size_average else losses.sum()

In [11]:
def fit(train_loader, val_loader, model, loss_fn, optimizer, scheduler, n_epochs, log_interval, metrics=[],
        start_epoch=0):
    """
    Loaders, model, loss function and metrics should work together for a given task,
    i.e. The model should be able to process data output of loaders,
    loss function should process target output of loaders and outputs from the model
    Examples: Classification: batch loader, classification model, NLL loss, accuracy metric
    Siamese network: Siamese loader, siamese model, contrastive loss
    Online triplet learning: batch loader, embedding model, online triplet loss
    """
    for epoch in range(0, start_epoch):
        scheduler.step()

    for epoch in range(start_epoch, n_epochs):
        scheduler.step()

        # Train stage
        train_loss, metrics = train_epoch(train_loader, model, loss_fn, optimizer, log_interval, metrics)

        message = 'Epoch: {}/{}. Train set: Average loss: {:.4f}'.format(epoch + 1, n_epochs, train_loss)
        for metric in metrics:
            message += '\t{}: {}'.format(metric.name(), metric.value())

        val_loss, metrics = test_epoch(val_loader, model, loss_fn, metrics)
        val_loss /= len(val_loader)

        message += '\nEpoch: {}/{}. Validation set: Average loss: {:.4f}'.format(epoch + 1, n_epochs,
                                                                                 val_loss)
        
        for metric in metrics:
            message += '\t{}: {}'.format(metric.name(), metric.value())

        print(message)
        print('---------------------------------------------------------------------------------------------------')

def train_epoch(train_loader, model, loss_fn, optimizer, log_interval, metrics):
    for metric in metrics:
        metric.reset()

    model.train()
    losses = []
    total_loss = 0
    optimizer.zero_grad()

    for batch_idx, (data, target) in enumerate(train_loader):
        text1 = torch.FloatTensor(vectorize(preprocess(list(data[0])))[0])
        text2 = torch.FloatTensor(vectorize(preprocess(list(data[1])))[0])
        text3 = torch.FloatTensor(vectorize(preprocess(list(data[2])))[0])
        target = target if len(target) > 0 else None


        optimizer.zero_grad()
        outputs = model(text1, text2, text3)

        if type(outputs) not in (tuple, list):
            outputs = (outputs,)

        loss_inputs = outputs
        if target is not None:
            target = (target,)
            loss_inputs += target

        loss_outputs = loss_fn(*loss_inputs)
        loss = loss_outputs[0] if type(loss_outputs) in (tuple, list) else loss_outputs
        losses.append(loss.item())
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        for metric in metrics:
            metric(outputs, target, loss_outputs)

        if batch_idx % log_interval == 0:
            message = 'Train: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                batch_idx * len(data[0]), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), np.mean(losses))
            for metric in metrics:
                message += '\t{}: {}'.format(metric.name(), metric.value())

            print(message)
            losses = []

    total_loss /= (batch_idx + 1)
    return total_loss, metrics

def test_epoch(val_loader, model, loss_fn, metrics):
    with torch.no_grad():
        for metric in metrics:
            metric.reset()
        model.eval()
        val_loss = 0
        for batch_idx, (data, target) in enumerate(val_loader):
            text1 = torch.FloatTensor(vectorize(preprocess(list(data[0])))[0])
            text2 = torch.FloatTensor(vectorize(preprocess(list(data[1])))[0])
            text3 = torch.FloatTensor(vectorize(preprocess(list(data[2])))[0])

            outputs = model(text1, text2, text3)

            if type(outputs) not in (tuple, list):
                outputs = (outputs,)
            loss_inputs = outputs
            if target is not None:
                target = (target,)
                loss_inputs += target

            loss_outputs = loss_fn(*loss_inputs)
            loss = loss_outputs[0] if type(loss_outputs) in (tuple, list) else loss_outputs
            val_loss += loss.item()

            for metric in metrics:
                metric(outputs, target, loss_outputs)

    return val_loss, metrics


In [12]:
margin = 0.3
embedding_net = EmbeddingNet()
model = TripletNet(embedding_net)
loss_fn = TripletLoss(margin)
lr = 0.00001
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = lr_scheduler.StepLR(optimizer, 8, gamma=0.1, last_epoch=-1)
n_epochs = 20
log_interval = 1000

In [None]:
fit(triplet_train_loader, triplet_val_loader, model, loss_fn, optimizer, scheduler, n_epochs, log_interval)



Epoch: 1/20. Train set: Average loss: 47212.7443
Epoch: 1/20. Validation set: Average loss: 27599.8315
---------------------------------------------------------------------------------------------------
Epoch: 2/20. Train set: Average loss: 17337.5426
Epoch: 2/20. Validation set: Average loss: 8625.7889
---------------------------------------------------------------------------------------------------
Epoch: 3/20. Train set: Average loss: 10844.9372
Epoch: 3/20. Validation set: Average loss: 7466.8436
---------------------------------------------------------------------------------------------------
Epoch: 4/20. Train set: Average loss: 7751.9823
Epoch: 4/20. Validation set: Average loss: 6220.1928
---------------------------------------------------------------------------------------------------
Epoch: 5/20. Train set: Average loss: 8197.6968
Epoch: 5/20. Validation set: Average loss: 3804.9762
-------------------------------------------------------------------------------------------

