In [84]:
import pandas as pd
import numpy as np

from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
from utils import leer_csv, starts_ends_tokens, separar_spans_toxicos

from sklearn.model_selection import train_test_split

In [54]:
# Load the cleaned training split; the preview below shows `spans` and `text` columns.
train_data = leer_csv("../data/train_cleaned.csv")
train_data

Unnamed: 0,spans,text
0,"[8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21,...",another violent and aggressive immigrant killi...
1,"[33, 34, 35, 36, 37, 38, 39]","i am 56 years old, i am not your fucking junio..."
2,"[0, 1, 2, 3, 4]","damn, a whole family. sad indeed."
3,"[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]",what a knucklehead. how can anyone not know th...
4,"[32, 33, 34, 35, 36, 37, 38, 39, 40]","""who do you think should do the killing?""\n\na..."
...,...,...
7934,"[8, 9, 10, 11]",another fool pipes in.
7935,"[51, 52, 53, 54, 55, 56, 58, 59, 60, 61, 62, 6...",so if a restaurant owner puts up a sign saying...
7936,"[0, 1, 2, 4, 5, 6, 7, 8, 10, 11, 12, 13, 15, 1...",any faith that can't stand up to logic and rea...
7937,"[5, 6, 7, 8, 9, 10, 11, 12]",this idiotic. use the surplus to pay down the ...


# Obtener secuencias etiquetadas

(palabra1,tag1), (palabra2,tag2),...

In [55]:
def etiquetar_secuencia(row):
    """Tag every whitespace token of a comment as toxic ("T") or not ("N").

    A token is tagged "T" when its start offset equals the first offset of
    some toxic span and its end offset equals the last offset of some toxic
    span; every other token is tagged "N".

    Parameters
    ----------
    row : object exposing a ``text`` attribute
        One comment of the dataset (a DataFrame row or an ``itertuples()``
        record). It is also handed as-is to ``separar_spans_toxicos``.

    Returns
    -------
    list of (str, str)
        ``(token, tag)`` pairs, with tokens taken from ``row.text.split()``.
    """
    word_starts, word_ends = starts_ends_tokens(row.text)
    toxic_spans = separar_spans_toxicos(row)

    # Sets give constant-time membership tests; the previous lists were
    # scanned linearly once per token.
    toxic_starts = {span[0] for span in toxic_spans}
    toxic_ends = {span[-1] for span in toxic_spans}

    tags = [
        "T" if start in toxic_starts and end in toxic_ends else "N"
        for start, end in zip(word_starts, word_ends)
    ]

    # NOTE(review): this pairing assumes starts_ends_tokens yields exactly one
    # (start, end) per token of row.text.split(); zip silently truncates
    # otherwise -- confirm against utils.
    return list(zip(row.text.split(), tags))

In [56]:
# Sanity check: gold (token, tag) pairs for the first training comment.
etiquetar_secuencia(train_data.loc[0])

[('another', 'N'),
 ('violent', 'T'),
 ('and', 'T'),
 ('aggressive', 'T'),
 ('immigrant', 'T'),
 ('killing', 'N'),
 ('a', 'N'),
 ('innocent', 'N'),
 ('and', 'N'),
 ('intelligent', 'N'),
 ('us', 'N'),
 ('citizen....', 'N'),
 ('sarcasm', 'N')]

In [57]:
def get_vocab(df):
    """Collect the distinct whitespace-separated tokens found in ``df["text"]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column of strings.

    Returns
    -------
    set of str
        Every token produced by ``str.split`` over all rows of the column.
    """
    vocab = set()
    # Iterate the column values directly: ``Series.iteritems`` (used here
    # before) was removed in pandas 2.0 and the index was never needed.
    for text in df["text"]:
        vocab.update(str.split(text))
    return vocab

# Vocabulary of the training split; passed as the HMM symbol set in the next cell.
vocab = get_vocab(train_data)

In [58]:
# One gold-tagged (token, tag) sequence per training comment.
sentences = [etiquetar_secuencia(row) for row in train_data.itertuples()]

# Two hidden states: "T" (toxic) and "N" (non-toxic); the observable symbols are the tokens.
trainer = HiddenMarkovModelTrainer(states=["T", "N"], symbols=vocab)

# Fit the tagger on the labelled sequences.
# NOTE(review): no `estimator` is passed, so NLTK's default probability
# estimate is used -- confirm how it treats tokens never seen in training,
# since the trial score further down is far below the train score.
model = trainer.train(sentences)

In [59]:
# Whitespace tokens of the first training comment, used as a tagging example.
comment = train_data.loc[0,"text"].split()
comment

['another',
 'violent',
 'and',
 'aggressive',
 'immigrant',
 'killing',
 'a',
 'innocent',
 'and',
 'intelligent',
 'us',
 'citizen....',
 'sarcasm']

In [60]:
# Gold tags of the same comment, to compare with the model prediction in the next cell.
sentences[0]

[('another', 'N'),
 ('violent', 'T'),
 ('and', 'T'),
 ('aggressive', 'T'),
 ('immigrant', 'T'),
 ('killing', 'N'),
 ('a', 'N'),
 ('innocent', 'N'),
 ('and', 'N'),
 ('intelligent', 'N'),
 ('us', 'N'),
 ('citizen....', 'N'),
 ('sarcasm', 'N')]

In [61]:
# Tag the example comment with the trained HMM.
tagged = model.tag(comment)
tagged

[('another', 'T'),
 ('violent', 'T'),
 ('and', 'T'),
 ('aggressive', 'T'),
 ('immigrant', 'T'),
 ('killing', 'T'),
 ('a', 'N'),
 ('innocent', 'N'),
 ('and', 'N'),
 ('intelligent', 'N'),
 ('us', 'N'),
 ('citizen....', 'N'),
 ('sarcasm', 'N')]

In [62]:
def tags2offsets(tagged_comment, text=None):
    """Convert a tagged comment into the character offsets of its toxic tokens.

    Parameters
    ----------
    tagged_comment : sequence of (str, str)
        ``(token, tag)`` pairs; tokens tagged ``"T"`` are treated as toxic.
    text : str, optional
        The original comment the tokens came from. When given, token
        boundaries are computed on it, so the offsets index into the real
        text. When omitted (the default, and the previous behaviour), the
        text is rebuilt by joining the tokens with single spaces; offsets then
        drift from the original wherever it contains repeated spaces, tabs or
        newlines.

    Returns
    -------
    list of int
        Every character offset from the start to the end (inclusive) of each
        token tagged ``"T"``, in token order.
    """
    if text is None:
        text = " ".join(word for word, _ in tagged_comment)

    # NOTE(review): assumes starts_ends_tokens returns one inclusive
    # (start, end) pair per whitespace token, aligned with tagged_comment --
    # confirm against utils.
    starts, ends = starts_ends_tokens(text)

    offsets = []
    for i, (_, tag) in enumerate(tagged_comment):
        if tag == "T":
            offsets.extend(range(starts[i], ends[i] + 1))
    return offsets

In [63]:
# Character offsets covered by the tokens the model tagged as toxic in the example.
system_offsets = tags2offsets(tagged)
system_offsets

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 16,
 17,
 18,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 41,
 42,
 43,
 44,
 45,
 46,
 47]

In [64]:
def f1_score(system_offsets, ground_truth):
    """Character-offset F1 between a predicted and a gold set of offsets.

    Parameters
    ----------
    system_offsets : iterable of int
        Offsets predicted as toxic.
    ground_truth : iterable of int
        Gold toxic offsets.

    Returns
    -------
    int or float
        * gold empty and prediction empty -> 1
        * gold empty and prediction non-empty -> 0 (by definition)
        * otherwise the harmonic mean of precision and recall over the two
          offset sets, or 0 when they do not overlap.
    """
    # No gold offsets: the score is defined by whether anything was predicted.
    if not ground_truth:
        return 0 if system_offsets else 1

    predicted = set(system_offsets)
    gold = set(ground_truth)
    n_common = len(predicted & gold)

    precision = n_common / len(predicted) if predicted else 0
    recall = n_common / len(gold)

    # Avoid 0/0 when prediction and gold share nothing.
    if precision == 0 and recall == 0:
        return 0
    return (2 * precision * recall) / (precision + recall)

In [65]:
# F1 of the example prediction against the gold spans of the same comment.
f1_score(system_offsets, train_data.loc[0,"spans"])

0.8055555555555556

In [66]:
# Tokenise, tag and convert to character offsets every training comment.
tokenized_train_text = train_data.text.apply(str.split)
tagged_train_text = tokenized_train_text.apply(model.tag)

train_offsets = tagged_train_text.apply(tags2offsets)


In [67]:
# Mean per-comment F1 on the same data the model was fitted on (optimistic).
# The comprehension's `system_offsets` is local to it and does not overwrite
# the notebook-level variable of the same name.
train_scores = [f1_score(system_offsets, ground_truth) 
                for system_offsets, ground_truth in zip(train_offsets, train_data.spans)]
np.mean(train_scores)

0.5665808284065257

Ahora con trial

In [68]:
# Load the cleaned trial split, used here as held-out data.
trial_data = leer_csv("../data/trial_cleaned.csv")

In [69]:
# Same pipeline as for train, applied to the trial split with the train-fitted model.
# NOTE(review): the trial preview printed later in this notebook contains
# capitalised text while the train preview is all lowercase -- confirm both
# files went through the same cleaning, otherwise many trial tokens will be
# unknown to the model.
tokenized_test_test = trial_data.text.apply(str.split)
tagged_test_text = tokenized_test_test.apply(model.tag)

test_offsets = tagged_test_text.apply(tags2offsets)

# Mean per-comment F1 on the held-out trial split.
test_scores = [f1_score(system_offsets, ground_truth) 
                for system_offsets, ground_truth in zip(test_offsets, trial_data.spans)]
np.mean(test_scores)

0.23340335538403886

¿Error en la implementación de F1?

**Entrenando con todo, literalmente**

In [78]:
# Reload both splits and stack them into a single frame with a fresh index.
train = leer_csv("../data/train_cleaned.csv")
trial = leer_csv("../data/trial_cleaned.csv")
print(train.shape, trial.shape)
todo = pd.concat([train, trial], ignore_index = True)
# Side effect: the combined data is also written to disk.
todo.to_csv("../data/tsd_completo.csv", index = False)

(7939, 2) (690, 2)


In [79]:
# Retrain the HMM on train + trial together.
# Note: this rebinds `sentences`, `vocab`, `trainer` and `model`, so cells
# above that use them give different results if re-run after this one.
sentences = [etiquetar_secuencia(row) for row in todo.itertuples()]

vocab = get_vocab(todo)
trainer = HiddenMarkovModelTrainer(states=["T", "N"], symbols=vocab)

model = trainer.train(sentences)

In [80]:
# Tag every comment of the combined data with the model fitted on that same data.
tokenized_text = todo.text.apply(str.split)
tagged_text = tokenized_text.apply(model.tag)

# Note: `system_offsets` previously held the offsets of a single comment;
# from here on it holds one list of offsets per comment.
system_offsets = tagged_text.apply(tags2offsets)


In [81]:
# Mean per-comment F1 on the combined data (all of it was seen in training).
scores = [f1_score(offsets, ground_truth) 
                for offsets, ground_truth in zip(system_offsets, todo.spans)]
np.mean(scores)

0.4600794289527663

In [101]:
# Size of the combined train + trial frame.
todo.shape

(8629, 2)

In [102]:
# Preview of the trial split as loaded from disk.
trial

Unnamed: 0,spans,text
0,"[15, 16, 17, 18, 19, 27, 28, 29, 30, 31]",Because he's a moron and a bigot. It's not any...
1,"[29, 30, 31, 32, 33, 34]",How about we stop protecting idiots and let na...
2,"[166, 167, 168, 169, 170, 171]","If people were smart, they would Boycott th..."
3,"[87, 88, 89, 90, 91, 92]",Trump Claimed that Russia will never invade th...
4,[],As long as your willing to pay a lot more for ...
...,...,...
685,"[129, 130, 131, 132, 133, 134]",But ... Trump's not bluffing. He's prepared to...
686,"[126, 127, 128, 129, 130, 131]",Can't believe the limited knowledge of this Ar...
687,"[24, 25, 26, 27, 28, 29]",I think it conservative idiots who cannot reac...
688,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",You're an id*ot...Go away.


In [104]:
# Fraction of the combined data that does not come from trial (690 of 8629 rows are trial).
1 - 690/8629

0.9200370842507822

---------

In [129]:
# Random 75/25 split of the combined data, with a fixed seed for reproducibility.
# Note: this rebinds `train`, which until now held the full training CSV.
train, test = train_test_split(todo, test_size=0.25, random_state=42)
print(train.shape, test.shape)

(6471, 2) (2158, 2)


In [130]:
# Refit the HMM using only the training part of the random split
# (vocabulary and tagged sequences both come from `train`).
vocab = get_vocab(train)
train_sentences = [etiquetar_secuencia(row) for row in train.itertuples()]
trainer = HiddenMarkovModelTrainer(states=["T", "N"], symbols=vocab)
model = trainer.train(train_sentences)

In [131]:
def obtener_f1(df, model):
    """Mean per-comment F1 of ``model`` over the comments in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column (comments) and a ``spans`` column (gold
        toxic offsets).
    model : object with a ``tag(tokens)`` method
        Tagger applied to the whitespace tokens of each comment.

    Returns
    -------
    float
        Average of ``f1_score`` over all rows. The shapes of ``df`` and of the
        predicted-offsets series are printed as a sanity check.
    """
    # split -> tag -> character offsets, one list of offsets per comment.
    predicted = (
        df.text
        .apply(str.split)
        .apply(model.tag)
        .apply(tags2offsets)
    )
    print(df.shape, predicted.shape)

    per_comment = []
    for pred_offsets, gold_offsets in zip(predicted, df.spans):
        per_comment.append(f1_score(pred_offsets, gold_offsets))

    return np.mean(per_comment)

In [132]:
# Score on the portion the model was fitted on.
obtener_f1(train, model)

(6471, 2) (6471,)


0.516266133717116

In [133]:
# Score on the held-out 25% of the random split.
obtener_f1(test, model)

(2158, 2) (2158,)


0.2635414212492578

In [137]:
# Load the file to generate predictions for.
# NOTE(review): this uses pd.read_csv while every other file goes through
# leer_csv -- confirm no parsing/cleaning done by leer_csv is needed here.
tsd_test = pd.read_csv("../data/tsd_test.csv")

In [146]:
# Lowercase the comments, then split them on whitespace.
# Note: the `text` column is overwritten in place, so the original casing is
# no longer available from `tsd_test` after this cell.
tsd_test.text = tsd_test.text.str.lower()
tokenized = tsd_test.text.apply(str.split)

In [152]:
# Tag every test comment with the current `model`.
submission = tokenized.apply(model.tag)

In [154]:
# Replace the tagged tokens by the character offsets predicted as toxic.
# Not idempotent: re-running this cell alone would feed offsets, not
# (token, tag) pairs, to tags2offsets.
submission = submission.apply(tags2offsets)

In [160]:
# Write one line per comment: index, a tab, then the list of predicted offsets (no header row).
submission.to_csv("../data/spans-pred.txt", sep="\t", header=False)