In [1]:
import pandas as pd
import numpy as np

import nltk
#nltk.download('averaged_perceptron_tagger')
from nltk.tag.hmm import HiddenMarkovModelTrainer
from nltk.tag import PerceptronTagger
from utils import leer_csv, starts_ends_tokens, separar_spans_toxicos

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Datos y funciones

In [2]:
todo = leer_csv("../data/completo_cleaned.csv")
train, test = train_test_split(todo, test_size=0.25, random_state=42)
print(train.shape, test.shape)

(6471, 2) (2158, 2)


In [3]:
def get_vocab(df):
    tokens = df["text"].apply(str.split)
    vocab = set()
    for i,row in tokens.iteritems():
        vocab |= set(row)
    return vocab

vocab = get_vocab(train)

In [4]:
def hacer_submission(model):
    tsd_test = pd.read_csv("../data/tsd_test.csv")

    tsd_test.text = tsd_test.text.str.lower()
    tokenized = tsd_test.text.apply(str.split)

    submission = tokenized.apply(model.tag)

    submission = submission.apply(tags2offsets)

    submission.to_csv("../data/spans-pred.txt", sep="\t", header=False)


In [5]:
def tags2offsets(tagged_comment):
    words = [word for word,tag in tagged_comment]
    tags = [tag for word, tag in tagged_comment]
    
    text = " ".join(words)
    starts, ends = starts_ends_tokens(text)
    offsets = []
    for i, (word, tag) in enumerate(tagged_comment):
        if "-" in tag:
            offsets.extend(list( range(starts[i], ends[i]+1)) )
    return offsets

In [6]:
def f1_score(system_offsets, ground_truth):
    if ground_truth:
        system_offsets = set(system_offsets)
        ground_truth = set(ground_truth)

        interseccion = system_offsets & ground_truth
        precision = len(interseccion)/len(system_offsets) if system_offsets else 0
        recall = len(interseccion)/len(ground_truth)
        f1 = (2*precision*recall)/(precision+recall) if (precision,recall) != (0,0) else 0
        
    elif system_offsets:
        f1 = 0 # no hay verdaderas, pero hay en la predicción, se define como 0
    else:
        f1 = 1
    return f1

[LSTM](#LSTM)

# Con etiquetado Tóxico - No Tóxico

## Modelo oculto de Markov

In [6]:
def etiquetar_secuencia(row):
    word_starts, word_ends = starts_ends_tokens(row.text)
    toxic_spans = separar_spans_toxicos(row)
    toxic_starts = [span[0] for span in toxic_spans]
    toxic_ends = [span[-1] for span in toxic_spans]
    
    secuencia = []
    
    for word_start, word_end in zip(word_starts, word_ends):
        tag = "-" if word_start in toxic_starts and word_end in toxic_ends else "+"
        secuencia.append(tag)
        
    secuencia = list(zip(row.text.split(), secuencia))
            
    return secuencia

In [5]:
train_sentences = [etiquetar_secuencia(row) for row in train.itertuples()]

### Modelado

In [6]:
# Tóxico y NO tóxico
trainer = HiddenMarkovModelTrainer(states=["-", "+"], symbols=vocab)

hmm1 = trainer.train(train_sentences)

In [9]:
def obtener_f1(df, model):
    tokenized_text = df.text.apply(str.split)
    tagged_text = tokenized_text.apply(model.tag)

    system_offsets = tagged_text.apply(tags2offsets)
    scores = [f1_score(offsets, ground_truth) 
                for offsets, ground_truth in zip(system_offsets, df.spans)]
    
    return np.mean(scores)

In [10]:
obtener_f1(train, hmm1)

0.5854434660739216

In [11]:
obtener_f1(test, hmm1)

0.28440539271538906

## Perceptron

In [23]:
perceptron = PerceptronTagger(load=False)
perceptron.train(train_sentences, nr_iter=15)

In [24]:
obtener_f1(train, perceptron)

0.9328094579702002

In [25]:
obtener_f1(test, perceptron)

0.49023810456719485

In [22]:
hacer_submission(perceptron)

# Con etiquetado POS

In [26]:
def etiquetar_secuencia_POS(row):
    word_starts, word_ends = starts_ends_tokens(row.text)
    toxic_spans = separar_spans_toxicos(row)
    toxic_starts = [span[0] for span in toxic_spans]
    toxic_ends = [span[-1] for span in toxic_spans]
    
    tokens = row.text.split()
    pos_tags = nltk.pos_tag(tokens)
    secuencia = [tag for text, tag in pos_tags]
    
    for idx, (word_start, word_end) in enumerate(zip(word_starts, word_ends)):
        tag = "-" if word_start in toxic_starts and word_end in toxic_ends else "+"
        secuencia[idx] = secuencia[idx]+f"{tag}"
        
    secuencia = list(zip(row.text.split(), secuencia))
            
    return secuencia

In [27]:
# POS + Tóxico y NO tóxico
train_sentences_pos = [etiquetar_secuencia_POS(row) for row in train.itertuples()]

## Modelo de Markov oculto

In [17]:
tags = ['DT+', 'NN-', 'CC-', 'JJ-', 'VBG+', 'NN+', 'CC+', 'JJ+', 'NNP+', 'PRP+', 'VBP+', 'CD+', 'NNS+', 'RB+', 
        'PRP$+', 'VBG-', 'WP+', 'VBZ+', 'VB+', 'RP+', 'IN+', 'VBN+', 'TO+', 'JJR+', 'WDT+', 'NNP-', 'WRB+', 'MD+',
        'VB-', 'VBD+', 'VBP-', 'RBS+', 'NNS-', 'DT-', 'VBZ-', ':+', 'PDT+', 'IN-', 'EX+', '.+', 'PRP-', 'JJS+', 
        'RBR+', 'VBN-', 'NNPS+', 'CD-', 'RB-', 'WRB-', 'RP-', ',+', 'MD-', 'JJS-', '$+', 'POS+', 'NNPS-', 'SYM+', 
        'TO-', '``+', "''-", 'WDT-', 'VBD-', 'JJR-', 'PRP$-', 'PDT-', 'WP-', '(+', ')+', 'UH+', 'WP$+', 'FW+', 
        '#+', 'RBS-', "''+", 'EX-', 'RBR-', 'FW-', ':-', ',-', 'UH-', 'SYM-', '.-', 'POS-']

In [18]:
trainer = HiddenMarkovModelTrainer(states = tags, symbols=vocab)

hmm2 = trainer.train(train_sentences)

In [19]:
obtener_f1(train, hmm2)

0.6603716306391332

In [20]:
obtener_f1(test, hmm2)

0.30981471634686236

## Perceptron

In [40]:
perceptron2 = PerceptronTagger(load=False)
perceptron2.train(train_sentences_pos, nr_iter=10)

In [41]:
obtener_f1(train, perceptron2)

0.890275118761953

In [42]:
obtener_f1(test, perceptron2)

0.4908126887094825

In [44]:
hacer_submission(perceptron2)

---------------
# LSTM

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [8]:
def etiquetar_secuencia(row):
    word_starts, word_ends = starts_ends_tokens(row.text)
    toxic_spans = separar_spans_toxicos(row)
    toxic_starts = [span[0] for span in toxic_spans]
    toxic_ends = [span[-1] for span in toxic_spans]
    
    secuencia = []
    
    for word_start, word_end in zip(word_starts, word_ends):
        tag = "-" if word_start in toxic_starts and word_end in toxic_ends else "+"
        secuencia.append(tag)
    
    return row.text.split(), secuencia

In [9]:
training_data = train.apply(etiquetar_secuencia, axis=1).tolist()
word_to_ix = {}

# For each words-list (sentence) and tags-list in each tuple of training_data
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:  # word has not been assigned an index yet
            word_to_ix[word] = len(word_to_ix) #Asign each word with a unique index

tag_to_ix = {"-":0, "+":1}  # Assign each tag with a unique index


In [10]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix.get(w,0) for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [40]:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, 
                            num_layers=4, bidirectional=False)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [41]:
%%time
EMBEDDING_DIM = 512
HIDDEN_DIM = 256

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
model.to(device)
print("Modelo creado")
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
print("loss y optimizer creados")

for epoch in range(40):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix).to(device)
        targets = prepare_sequence(tags, tag_to_ix).to(device)
        
        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in).to(device)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
    
    if (epoch+1) % 5 == 0:
        print(epoch+1, loss)
        
print("Terminó entrenamiento")

Modelo creado
loss y optimizer creados
5 tensor(0.2259, device='cuda:0', grad_fn=<NllLossBackward>)
10 tensor(0.0523, device='cuda:0', grad_fn=<NllLossBackward>)
15 tensor(0.0057, device='cuda:0', grad_fn=<NllLossBackward>)
20 tensor(0.0035, device='cuda:0', grad_fn=<NllLossBackward>)
25 tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)
30 tensor(0.0081, device='cuda:0', grad_fn=<NllLossBackward>)
35 tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)
40 tensor(9.7109e-05, device='cuda:0', grad_fn=<NllLossBackward>)
Terminó entrenamiento
CPU times: user 23min 47s, sys: 18.5 s, total: 24min 5s
Wall time: 24min 6s


In [43]:
f1s = []
ix_to_tag = ["-", "+"]
with torch.no_grad():
    for row in test.itertuples():
        sentence, true_offsets = row.text.split(), row.spans
        sentence_in = prepare_sequence(sentence, word_to_ix).to(device)
        tags = model(sentence_in).cpu().numpy().argmax(axis=1)
        tags = [ix_to_tag[i] for i in tags] # {0,1} -> {-,+}
        tagged_comment = list(zip(sentence,tags))
        offsets = tags2offsets(tagged_comment)
        f1 = f1_score(offsets, true_offsets)
        f1s.append(f1)
        
np.mean(f1s)

0.3705882155326365

* 0.3830407100809408
    * EMBEDDING_DIM = 512
    * HIDDEN_DIM = 128
    * loss_function = nn.NLLLoss()
    * optimizer = optim.SGD(model.parameters(), lr=0.1)
    * 40 epochs
    * LSTM con 4 capas

* 0.3705882155326365
    * EMBEDDING_DIM = 512
    * HIDDEN_DIM = 256
    * loss_function = nn.NLLLoss()
    * optimizer = optim.SGD(model.parameters(), lr=0.1)
    * 40 epochs
    * LSTM con 4 capas

* 0.36030145365772975
    * EMBEDDING_DIM = 256
    * HIDDEN_DIM = 64
    * loss_function = nn.NLLLoss()
    * optimizer = optim.SGD(model.parameters(), lr=0.1)
    * 40 epochs
    * LSTM con dos capas

* 0.35609794475051537
    * EMBEDDING_DIM = 256
    * HIDDEN_DIM = 64
    * loss_function = nn.NLLLoss()
    * optimizer = optim.SGD(model.parameters(), lr=0.1)
    * 40 epochs
    
* 0.08526413345690455
    * EMBEDDING_DIM = 256
    * HIDDEN_DIM = 64
    * loss_function = nn.CrossEntropyLoss()
    * optimizer = optim.SGD(model.parameters(), lr=0.1)
    * 40 epochs
    
* 0.08526413345690455
    * EMBEDDING_DIM = 512
    * HIDDEN_DIM = 128
    * loss_function = nn.NLLLoss()
    * optimizer = optim.SGD(model.parameters(), lr=0.1)
    * 40 epochs
    * LSTM con 8 capas


[Inicio](#Datos-y-funciones)