In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import torchtext
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create the dataset
class ToxicDataset(Dataset):
    """Multi-label text dataset read from a CSV file.

    The CSV must contain a ``Text`` column and the label columns at header
    positions 3..14 (``columns[3:15]``). Texts are tokenised on whitespace
    and every distinct token receives an integer id in order of first
    appearance.

    Attributes:
        data: the raw ``pandas.DataFrame`` loaded from ``data_path``.
        Posibilities: ``pandas.Index`` with the names of the label columns.
        word2idx / idx2word: token -> id and id -> token dictionaries.
        vocab_size: number of distinct tokens.
        X: ``pandas.Series`` holding one list of token ids per row.
        X_data: list of 1-D ``torch.long`` tensors, one per row.
        Y: float32 tensor of shape (rows, labels) containing only 0.0 / 1.0.

    Args:
        data_path: path (or any source accepted by ``pandas.read_csv``).
    """

    def __init__(self, data_path):
        self.data = pd.read_csv(data_path)
        self.Posibilities = self.data.columns[3:15]

        # pandas reads an empty cell as NaN (a float), which has no .split();
        # treat missing comments as empty text instead of crashing.
        texts = self.data['Text'].fillna('').astype(str)

        # make a dictionary of the words in X (ids follow first appearance)
        self.word2idx = {}
        self.idx2word = {}
        for sentence in texts:
            for word in sentence.split():
                if word not in self.word2idx:
                    index = len(self.word2idx)
                    self.word2idx[word] = index
                    self.idx2word[index] = word
        self.vocab_size = len(self.word2idx)

        # Apply the dictionary to X
        self.X = texts.apply(lambda x: [self.word2idx[word] for word in x.split()])
        # dtype is forced to long: torch.tensor([]) would otherwise be float32,
        # which nn.Embedding rejects as an index tensor.
        self.X_data = [torch.tensor(sentence, dtype=torch.long) for sentence in self.X]

        # make tensor of Y
        labels = torch.tensor(self.data[self.Posibilities].values, dtype=torch.float32)
        # convert the Y to True/False to 1/0 (vectorised; values > 0.5 become 1.0)
        self.Y = (labels > 0.5).to(torch.float32)

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_id_tensor, label_tensor)`` for row ``idx``."""
        return self.X_data[idx], self.Y[idx]

In [3]:
# read the csv file
# The constructor loads the CSV at this relative path, builds the word -> id
# vocabulary from the 'Text' column and converts texts and labels to tensors.
dataset = ToxicDataset('archive/youtoxic_english_1000.csv')

In [4]:
# Peek at the first 11 samples (or fewer if the dataset is smaller):
# print the token count of each text followed by its label vector.
preview_count = min(11, len(dataset))
for position in range(preview_count):
    tokens, labels = dataset[position]
    print(len(tokens))
    print(labels)

287
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
25
tensor([1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
77
tensor([1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])
107
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
47
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
20
tensor([1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.])
37
tensor([1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0.])
57
tensor([1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0.])
5
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
9
tensor([1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])
43
tensor([1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])


In [5]:
# Create the model
class ToxicModel(nn.Module):
    """Embedding -> single-layer LSTM -> two linear layers -> sigmoid.

    Produces, for every time step, one probability per output label.

    Args:
        vocab_size: number of rows of the embedding table.
        hidden_size: width of the embedding, the LSTM state and ``linear1``.
        output_size: number of labels predicted per time step.
    """

    def __init__(self, vocab_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.linear1 = nn.Linear(hidden_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input, hidden=None):
        """Run a batch of token-id sequences through the network.

        Args:
            input: integer tensor of token ids shaped (seq_len, batch); the
                LSTM is built with the default ``batch_first=False``.
            hidden: ``(h_0, c_0)`` tuple, each (1, batch, hidden_size), e.g.
                from :meth:`initHidden`. ``None`` lets ``nn.LSTM`` start from
                zeros.

        Returns:
            ``(output, hidden)`` where ``output`` has shape
            (seq_len, batch, output_size) with values in (0, 1) and ``hidden``
            is the final ``(h_n, c_n)`` state of the LSTM.
        """
        embedded = self.embedding(input)
        lstm_out, hidden = self.lstm(embedded, hidden)
        # NOTE(review): there is no non-linearity between the two linear
        # layers; left as is so that existing checkpoints keep their behavior.
        scores = self.linear2(self.linear1(lstm_out))
        return self.sigmoid(scores), hidden

    def initHidden(self, batch_size=1):
        """Return a zero ``(h_0, c_0)`` pair for ``batch_size`` sequences.

        The tensors are allocated with the dtype and device of the model's
        parameters, so the state stays valid after ``.to(device)`` or
        ``.double()``.
        """
        reference = self.embedding.weight
        shape = (1, batch_size, self.hidden_size)
        return (reference.new_zeros(shape), reference.new_zeros(shape))

In [6]:
# Create the model
# The embedding table has one row per word in the dataset vocabulary, the
# embedding/LSTM width is 128, and there is one output per label column.
model = ToxicModel(dataset.vocab_size, 128, len(dataset.Posibilities))


In [7]:
# load the model
# Restores the weights from 'model.pt' (the file written by the torch.save cell
# at the end of this notebook), so a checkpoint must already exist on disk.
# NOTE(review): embedding rows are indexed by dataset.word2idx, so the checkpoint
# presumably only makes sense with the same CSV/vocabulary it was trained on — confirm.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model.load_state_dict(torch.load('model.pt'))

<All keys matched successfully>

In [12]:
# Create the loss function
# Binary cross-entropy on probabilities: ToxicModel.forward already ends with a
# sigmoid, and each label is scored as an independent yes/no target.
criterion = nn.BCELoss()
# Create the optimizer
# Adam over all model parameters with a learning rate of 0.001.
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [13]:
from tqdm import tqdm

In [14]:
# Train the model
def train(model, data, criterion, optimizer, epochs):
    """Train ``model`` one sample at a time and print the mean loss per epoch.

    Args:
        model: network exposing ``initHidden()`` and ``model(x, hidden)`` that
            returns ``(output, hidden)`` with the time dimension first.
        data: indexable collection with ``len()``; ``data[i]`` yields at least
            ``(token_ids, labels)``.
        criterion: loss called as ``criterion(prediction, labels)``.
        optimizer: optimizer over the parameters of ``model``.
        epochs: number of full passes over ``data``.

    Returns:
        None. Progress is reported through ``tqdm`` and ``print``.
    """
    num_samples = len(data)
    # Make sure layers with train/eval specific behavior are in training mode.
    model.train()
    for epoch in range(epochs):
        tot_loss = 0.0
        for i in tqdm(range(num_samples)):
            # Fetch the sample once; indexing twice would build it twice.
            sample = data[i]
            x, y = sample[0], sample[1]
            # (seq_len,) -> (seq_len, 1): add the batch dimension
            x = x.unsqueeze(1)
            # every sentence starts from a fresh hidden state
            hidden = model.initHidden()
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            output, hidden = model(x, hidden)
            # take the last output of the lstm and drop the batch dimension
            output = output[-1].squeeze(0)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()
            tot_loss += loss.item()

        # An empty dataset has no mean loss; report nan instead of dividing by zero.
        mean_loss = tot_loss / num_samples if num_samples else float('nan')
        print('Epoch: {} - Loss: {:.6f}'.format(epoch, mean_loss))
            

In [15]:
# Run 5 passes over the whole dataset; train() prints the mean loss after each epoch.
epochs = 5
train(model, dataset, criterion, optimizer, epochs)

100%|██████████| 1000/1000 [00:22<00:00, 45.21it/s]


Epoch: 0 - Loss: 0.004008


100%|██████████| 1000/1000 [00:27<00:00, 35.96it/s]


Epoch: 1 - Loss: 0.002135


  6%|▋         | 63/1000 [00:01<00:28, 32.31it/s]


KeyboardInterrupt: 

In [21]:
# make a sample text for the model
sample_text = "I though you was a good person, but i realize that you are Black, a fucking idiot"
sample_words = sample_text.split()
# The vocabulary only contains words seen in the dataset; report every unknown
# word at once (still a KeyError) instead of failing on the first lookup.
unknown_words = [word for word in sample_words if word not in dataset.word2idx]
if unknown_words:
    raise KeyError('words not in the dataset vocabulary: {}'.format(unknown_words))
# convert the sample to a tensor and add the batch dimension: (seq_len, 1)
sample = torch.tensor([dataset.word2idx[word] for word in sample_words]).unsqueeze(1)
# inference only: skip building the autograd graph
with torch.no_grad():
    # initialize the hidden state
    hidden = model.initHidden()
    # get the output of the model
    output, hidden = model(sample, hidden)
# get the last output of the lstm and drop the batch dimension
output = output[-1].squeeze(0)
# get the prediction
prediction = output.numpy()
# get the posibilities
posibilities = dataset.Posibilities
# print the results as Posibility: probability
for label_name, probability in zip(posibilities, prediction):
    print (label_name, probability)
    

IsToxic 0.9999912
IsAbusive 0.9999945
IsThreat 8.0006345e-08
IsProvocative 5.613571e-08
IsObscene 0.016929762
IsHatespeech 8.516501e-08
IsRacist 2.4437388e-09
IsNationalist 1.0034225e-10
IsSexist 1.2508373e-09
IsHomophobic 2.2771544e-13
IsReligiousHate 3.0460438e-08
IsRadicalism 3.6108947e-13


In [104]:
# save the model
# Only the parameters (state_dict) are written, not the model class or the
# vocabulary; reloading needs a ToxicModel constructed with the same sizes.
torch.save(model.state_dict(), 'model.pt')