In [1]:
import pandas as pd
import numpy as np

import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.tag.hmm import HiddenMarkovModelTrainer
from nltk.tag import PerceptronTagger
from utils import leer_csv, starts_ends_tokens, separar_spans_toxicos

from sklearn.model_selection import train_test_split

import nltk

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/dante/miniconda3/envs/pln/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Datos y funciones

In [2]:
todo = leer_csv("../data/completo_cleaned.csv")
train, test = train_test_split(todo, test_size=0.25, random_state=42)
print(train.shape, test.shape)

(6471, 2) (2158, 2)


In [3]:
def get_vocab(df):
    tokens = df["text"].apply(str.split)
    vocab = set()
    for i,row in tokens.iteritems():
        vocab |= set(row)
    return vocab

vocab = get_vocab(train)

In [21]:
def hacer_submission(model):
    tsd_test = pd.read_csv("../data/tsd_test.csv")

    tsd_test.text = tsd_test.text.str.lower()
    tokenized = tsd_test.text.apply(str.split)

    submission = tokenized.apply(model.tag)

    submission = submission.apply(tags2offsets)

    submission.to_csv("../data/spans-pred.txt", sep="\t", header=False)


# Con etiquetado Tóxico - No Tóxico

## Modelo oculto de Markov

In [4]:
def etiquetar_secuencia(row):
    word_starts, word_ends = starts_ends_tokens(row.text)
    toxic_spans = separar_spans_toxicos(row)
    toxic_starts = [span[0] for span in toxic_spans]
    toxic_ends = [span[-1] for span in toxic_spans]
    
    secuencia = []
    
    for word_start, word_end in zip(word_starts, word_ends):
        tag = "-" if word_start in toxic_starts and word_end in toxic_ends else "+"
        secuencia.append(tag)
        
    secuencia = list(zip(row.text.split(), secuencia))
            
    return secuencia

In [5]:
train_sentences = [etiquetar_secuencia(row) for row in train.itertuples()]

### Modelado

In [6]:
# Tóxico y NO tóxico
trainer = HiddenMarkovModelTrainer(states=["-", "+"], symbols=vocab)

hmm1 = trainer.train(train_sentences)

In [7]:
def tags2offsets(tagged_comment):
    words = [word for word,tag in tagged_comment]
    tags = [tag for word, tag in tagged_comment]
    
    text = " ".join(words)
    starts, ends = starts_ends_tokens(text)
    offsets = []
    for i, (word, tag) in enumerate(tagged_comment):
        if "-" in tag:
            offsets.extend(list( range(starts[i], ends[i]+1)) )
    return offsets

In [8]:
def f1_score(system_offsets, ground_truth):
    if ground_truth:
        system_offsets = set(system_offsets)
        ground_truth = set(ground_truth)

        interseccion = system_offsets & ground_truth
        precision = len(interseccion)/len(system_offsets) if system_offsets else 0
        recall = len(interseccion)/len(ground_truth)
        f1 = (2*precision*recall)/(precision+recall) if (precision,recall) != (0,0) else 0
        
    elif system_offsets:
        f1 = 0 # no hay verdaderas, pero hay en la predicción, se define como 0
    else:
        f1 = 1
    return f1

In [9]:
def obtener_f1(df, model):
    tokenized_text = df.text.apply(str.split)
    tagged_text = tokenized_text.apply(model.tag)

    system_offsets = tagged_text.apply(tags2offsets)
    scores = [f1_score(offsets, ground_truth) 
                for offsets, ground_truth in zip(system_offsets, df.spans)]
    
    return np.mean(scores)

In [10]:
obtener_f1(train, hmm1)

0.5854434660739216

In [11]:
obtener_f1(test, hmm1)

0.28440539271538906

## Perceptron

In [23]:
perceptron = PerceptronTagger(load=False)
perceptron.train(train_sentences, nr_iter=15)

In [24]:
obtener_f1(train, perceptron)

0.9328094579702002

In [25]:
obtener_f1(test, perceptron)

0.49023810456719485

In [22]:
hacer_submission(perceptron)

# Con etiquetado POS

In [26]:
def etiquetar_secuencia_POS(row):
    word_starts, word_ends = starts_ends_tokens(row.text)
    toxic_spans = separar_spans_toxicos(row)
    toxic_starts = [span[0] for span in toxic_spans]
    toxic_ends = [span[-1] for span in toxic_spans]
    
    tokens = row.text.split()
    pos_tags = nltk.pos_tag(tokens)
    secuencia = [tag for text, tag in pos_tags]
    
    for idx, (word_start, word_end) in enumerate(zip(word_starts, word_ends)):
        tag = "-" if word_start in toxic_starts and word_end in toxic_ends else "+"
        secuencia[idx] = secuencia[idx]+f"{tag}"
        
    secuencia = list(zip(row.text.split(), secuencia))
            
    return secuencia

In [27]:
# POS + Tóxico y NO tóxico
train_sentences_pos = [etiquetar_secuencia_POS(row) for row in train.itertuples()]

## Modelo de Markov oculto

In [17]:
tags = ['DT+', 'NN-', 'CC-', 'JJ-', 'VBG+', 'NN+', 'CC+', 'JJ+', 'NNP+', 'PRP+', 'VBP+', 'CD+', 'NNS+', 'RB+', 
        'PRP$+', 'VBG-', 'WP+', 'VBZ+', 'VB+', 'RP+', 'IN+', 'VBN+', 'TO+', 'JJR+', 'WDT+', 'NNP-', 'WRB+', 'MD+',
        'VB-', 'VBD+', 'VBP-', 'RBS+', 'NNS-', 'DT-', 'VBZ-', ':+', 'PDT+', 'IN-', 'EX+', '.+', 'PRP-', 'JJS+', 
        'RBR+', 'VBN-', 'NNPS+', 'CD-', 'RB-', 'WRB-', 'RP-', ',+', 'MD-', 'JJS-', '$+', 'POS+', 'NNPS-', 'SYM+', 
        'TO-', '``+', "''-", 'WDT-', 'VBD-', 'JJR-', 'PRP$-', 'PDT-', 'WP-', '(+', ')+', 'UH+', 'WP$+', 'FW+', 
        '#+', 'RBS-', "''+", 'EX-', 'RBR-', 'FW-', ':-', ',-', 'UH-', 'SYM-', '.-', 'POS-']

In [18]:
trainer = HiddenMarkovModelTrainer(states = tags, symbols=vocab)

hmm2 = trainer.train(train_sentences)

In [19]:
obtener_f1(train, hmm2)

0.6603716306391332

In [20]:
obtener_f1(test, hmm2)

0.30981471634686236

## Perceptron

In [None]:
perceptron2 = PerceptronTagger(load=False)
perceptron2.train(train_sentences_pos, nr_iter=5)

In [None]:
obtener_f1(train, perceptron2)

In [None]:
obtener_f1(test, perceptron2)