In [7]:
import pandas as pd
import numpy as np

from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
from utils import leer_csv, starts_ends_tokens, separar_spans_toxicos

from sklearn.model_selection import train_test_split

import nltk

In [22]:
train_data = leer_csv("../data/train_cleaned.csv")
train_data

Unnamed: 0,spans,text
0,"[8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21,...",Another violent and aggressive immigrant killi...
1,"[33, 34, 35, 36, 37, 38, 39]","I am 56 years old, I am not your fucking junio..."
2,"[0, 1, 2, 3, 4]","Damn, a whole family. Sad indeed."
3,"[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]",What a knucklehead. How can anyone not know th...
4,"[32, 33, 34, 35, 36, 37, 38, 39, 40]","""who do you think should do the killing?""\n\nA..."
...,...,...
7934,"[8, 9, 10, 11]",Another fool pipes in.
7935,"[51, 52, 53, 54, 55, 56, 58, 59, 60, 61, 62, 6...",So if a restaurant owner puts up a sign saying...
7936,"[0, 1, 2, 4, 5, 6, 7, 8, 10, 11, 12, 13, 15, 1...",Any faith that can't stand up to logic and rea...
7937,"[5, 6, 7, 8, 9, 10, 11, 12]",This idiotic. Use the surplus to pay down the ...


In [31]:
def get_vocab(df):
    """Collect the distinct whitespace-separated tokens found in ``df["text"]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``"text"`` column holding strings.

    Returns
    -------
    set of str
        Union of the tokens produced by ``str.split`` on every row.
    """
    vocab = set()
    # Iterate the column values directly: ``Series.iteritems`` was removed in
    # pandas 2.0, and the index labels it yielded were never used here.
    for text in df["text"]:
        vocab.update(str.split(text))
    return vocab

vocab = get_vocab(train_data)

# Obtener secuencias etiquetadas

(palabra1,tag1), (palabra2,tag2),...

## Con etiquetado Tóxico - No Tóxico

In [55]:
def etiquetar_secuencia(row):
    """Tag every whitespace token of a comment as toxic ("T") or non-toxic ("N").

    Parameters
    ----------
    row : row-like object
        Must expose ``row.text`` (the comment string) plus whatever
        ``separar_spans_toxicos`` reads from the row.

    Returns
    -------
    list of (str, str)
        ``(token, tag)`` pairs, one per token of ``row.text.split()``.
    """
    word_starts, word_ends = starts_ends_tokens(row.text)
    toxic_spans = separar_spans_toxicos(row)
    toxic_starts = [span[0] for span in toxic_spans]
    toxic_ends = [span[-1] for span in toxic_spans]

    # The tag alphabet must be "T"/"N": those are the states the HMM trainer
    # is created with and "T" is the tag tags2offsets looks for. Emitting
    # "-"/"+" here would leave the toxic class invisible to the evaluation.
    # A token counts as toxic when its first offset opens a toxic span and
    # its last offset closes one.
    secuencia = [
        "T" if word_start in toxic_starts and word_end in toxic_ends else "N"
        for word_start, word_end in zip(word_starts, word_ends)
    ]

    return list(zip(row.text.split(), secuencia))

In [56]:
etiquetar_secuencia(train_data.loc[0])

[('another', 'N'),
 ('violent', 'T'),
 ('and', 'T'),
 ('aggressive', 'T'),
 ('immigrant', 'T'),
 ('killing', 'N'),
 ('a', 'N'),
 ('innocent', 'N'),
 ('and', 'N'),
 ('intelligent', 'N'),
 ('us', 'N'),
 ('citizen....', 'N'),
 ('sarcasm', 'N')]

### Modelado

In [58]:
# Toxic ("T") vs. non-toxic ("N") tagging
sentences = [etiquetar_secuencia(row) for row in train_data.itertuples()]

# Pass the symbols as a list: NLTK's supervised trainer extends the symbol
# collection in place (with .append) when it meets a token that is not in it,
# which a set does not support. Sorting just makes the order deterministic.
trainer = HiddenMarkovModelTrainer(states=["T", "N"], symbols=sorted(vocab))

model = trainer.train(sentences)

In [59]:
comment = train_data.loc[0,"text"].split()
comment

['another',
 'violent',
 'and',
 'aggressive',
 'immigrant',
 'killing',
 'a',
 'innocent',
 'and',
 'intelligent',
 'us',
 'citizen....',
 'sarcasm']

In [60]:
sentences[0]

[('another', 'N'),
 ('violent', 'T'),
 ('and', 'T'),
 ('aggressive', 'T'),
 ('immigrant', 'T'),
 ('killing', 'N'),
 ('a', 'N'),
 ('innocent', 'N'),
 ('and', 'N'),
 ('intelligent', 'N'),
 ('us', 'N'),
 ('citizen....', 'N'),
 ('sarcasm', 'N')]

In [61]:
tagged = model.tag(comment)
tagged

[('another', 'T'),
 ('violent', 'T'),
 ('and', 'T'),
 ('aggressive', 'T'),
 ('immigrant', 'T'),
 ('killing', 'T'),
 ('a', 'N'),
 ('innocent', 'N'),
 ('and', 'N'),
 ('intelligent', 'N'),
 ('us', 'N'),
 ('citizen....', 'N'),
 ('sarcasm', 'N')]

---

## Probando modelos

In [42]:
def tags2offsets(tagged_comment):
    """Turn a tagged comment into the character offsets of its toxic tokens.

    Parameters
    ----------
    tagged_comment : list of (str, str)
        ``(token, tag)`` pairs; tokens tagged ``"T"`` are treated as toxic.

    Returns
    -------
    list of int
        For every toxic token ``i``, all offsets from ``starts[i]`` through
        ``ends[i]`` inclusive, in token order.

    Notes
    -----
    Offsets are computed on the tokens re-joined with single spaces, so they
    index into that joined string rather than into the text the tokens were
    originally split from.
    """
    text = " ".join(word for word, _ in tagged_comment)
    starts, ends = starts_ends_tokens(text)

    offsets = []
    for i, (_, tag) in enumerate(tagged_comment):
        if tag == "T":
            # ends[i] is inclusive, hence the +1.
            offsets.extend(range(starts[i], ends[i] + 1))
    return offsets

In [43]:
system_offsets = tags2offsets(tagged)
system_offsets

[8,
 9,
 10,
 11,
 12,
 13,
 14,
 16,
 17,
 18,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 41,
 42,
 43,
 44,
 45,
 46,
 47]

In [44]:
def f1_score(system_offsets, ground_truth):
    """Character-offset F1 between predicted and gold toxic offsets of one text.

    Parameters
    ----------
    system_offsets : iterable of int or None
        Predicted toxic character offsets.
    ground_truth : iterable of int or None
        Gold toxic character offsets.

    Returns
    -------
    int or float
        * ``1`` when both collections are empty,
        * ``0`` when exactly one is empty or they do not overlap,
        * otherwise the harmonic mean of precision and recall.
    """
    # Normalise to sets first so any iterable is accepted (list, tuple, set,
    # NumPy array, pandas Series). Truth-testing an array directly raises,
    # and None keeps meaning "no offsets".
    predicted = set(system_offsets) if system_offsets is not None else set()
    gold = set(ground_truth) if ground_truth is not None else set()

    if not gold:
        # No gold offsets: the score is 1 only if nothing was predicted either.
        return 0 if predicted else 1

    overlap = len(predicted & gold)
    if overlap == 0:
        # Covers both "nothing predicted" and "nothing in common".
        return 0

    precision = overlap / len(predicted)
    recall = overlap / len(gold)
    return (2 * precision * recall) / (precision + recall)

In [45]:
f1_score(system_offsets, train_data.loc[0,"spans"])

0.8923076923076924

In [46]:
tokenized_train_text = train_data.text.apply(str.split)
tagged_train_text = tokenized_train_text.apply(model.tag)

train_offsets = tagged_train_text.apply(tags2offsets)

In [47]:
train_scores = [f1_score(system_offsets, ground_truth) 
                for system_offsets, ground_truth in zip(train_offsets, train_data.spans)]
np.mean(train_scores)

0.6673465414319141

Ahora con trial

In [48]:
trial_data = leer_csv("../data/trial_cleaned.csv")

In [49]:
tokenized_test_text = trial_data.text.apply(str.split)
tagged_test_text = tokenized_test_text.apply(model.tag)

test_offsets = tagged_test_text.apply(tags2offsets)

# Per-comment F1 on the trial set; the cell displays their mean.
# The loop variable is named `offsets` so it cannot be confused with the
# notebook-level `system_offsets` defined in an earlier cell.
test_scores = [f1_score(offsets, ground_truth)
               for offsets, ground_truth in zip(test_offsets, trial_data.spans)]
np.mean(test_scores)

0.28128686274031717

¿Error en la implementación de F1?

**Entrenando con todo, literalmente**

In [50]:
# Load the cleaned train and trial files and report their shapes.
train = leer_csv("../data/train_cleaned.csv")
trial = leer_csv("../data/trial_cleaned.csv")
print(train.shape, trial.shape)
# Stack both frames; ignore_index=True renumbers the rows so the two
# original indexes do not collide.
todo = pd.concat([train, trial], ignore_index = True)
# Write the combined frame to disk without the index column.
todo.to_csv("../data/tsd_completo.csv", index = False)

(7939, 2) (690, 2)


In [51]:
sentences = [etiquetar_secuencia(row) for row in todo.itertuples()]

vocab = get_vocab(todo)
# Pass the symbols as a list: NLTK's supervised trainer extends the symbol
# collection in place (with .append) when it meets an unknown token, which a
# set does not support. Sorting just makes the order deterministic.
trainer = HiddenMarkovModelTrainer(states=["T", "N"], symbols=sorted(vocab))

model = trainer.train(sentences)

In [52]:
tokenized_text = todo.text.apply(str.split)
tagged_text = tokenized_text.apply(model.tag)

system_offsets = tagged_text.apply(tags2offsets)


In [53]:
scores = [f1_score(offsets, ground_truth) 
                for offsets, ground_truth in zip(system_offsets, todo.spans)]
np.mean(scores)

0.6595644474657535

---------

In [129]:
train, test = train_test_split(todo, test_size=0.25, random_state=42)
print(train.shape, test.shape)

(6471, 2) (2158, 2)


In [130]:
vocab = get_vocab(train)
train_sentences = [etiquetar_secuencia(row) for row in train.itertuples()]
# Pass the symbols as a list: NLTK's supervised trainer extends the symbol
# collection in place (with .append) when it meets an unknown token, which a
# set does not support. Sorting just makes the order deterministic.
trainer = HiddenMarkovModelTrainer(states=["T", "N"], symbols=sorted(vocab))
model = trainer.train(train_sentences)

In [131]:
def obtener_f1(df, model):
    """Mean per-comment F1 of ``model`` over the comments in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column (comment strings) and a ``spans`` column
        (gold offsets, one collection per comment).
    model : object
        Tagger exposing ``tag(tokens)`` that returns ``(token, tag)`` pairs.

    Returns
    -------
    float
        Average of ``f1_score`` across all rows.
    """
    predicted_tags = df.text.apply(str.split).apply(model.tag)
    predicted_offsets = predicted_tags.apply(tags2offsets)

    # Shape printout: lets the reader compare row counts of input and predictions.
    print(df.shape, predicted_offsets.shape)

    per_comment = []
    for predicted, gold in zip(predicted_offsets, df.spans):
        per_comment.append(f1_score(predicted, gold))

    return np.mean(per_comment)

In [132]:
obtener_f1(train, model)

(6471, 2) (6471,)


0.516266133717116

In [133]:
obtener_f1(test, model)

(2158, 2) (2158,)


0.2635414212492578

In [137]:
tsd_test = pd.read_csv("../data/tsd_test.csv")

In [146]:
tsd_test.text = tsd_test.text.str.lower()
tokenized = tsd_test.text.apply(str.split)

In [152]:
submission = tokenized.apply(model.tag)

In [154]:
submission = submission.apply(tags2offsets)

In [160]:
submission.to_csv("../data/spans-pred.txt", sep="\t", header=False)

# Cagadero con POS tagger

## Con etiquetado POS

In [None]:
def tags2offsets(tagged_comment):
    """Turn a POS+toxicity tagged comment into the offsets of its toxic tokens.

    This redefines the ``tags2offsets`` declared earlier in the notebook
    (which matches the tag ``"T"``); once this cell runs, every later call
    uses the suffix rule below.

    Parameters
    ----------
    tagged_comment : list of (str, str)
        ``(token, tag)`` pairs; a tag ending in ``"-"`` marks a toxic token.

    Returns
    -------
    list of int
        For every toxic token ``i``, all offsets from ``starts[i]`` through
        ``ends[i]`` inclusive, in token order.

    Notes
    -----
    Offsets are computed on the tokens re-joined with single spaces, so they
    index into that joined string rather than into the text the tokens were
    originally split from.
    """
    text = " ".join(word for word, _ in tagged_comment)
    starts, ends = starts_ends_tokens(text)

    offsets = []
    for i, (_, tag) in enumerate(tagged_comment):
        # endswith() is safe on an empty tag, unlike tag[-1].
        if tag.endswith("-"):
            # ends[i] is inclusive, hence the +1.
            offsets.extend(range(starts[i], ends[i] + 1))
    return offsets

In [12]:
def etiquetar_secuencia_POS(row):
    """Tag each token of a comment with its POS tag plus a toxicity suffix.

    The suffix is ``"-"`` when the token's first offset opens a toxic span
    and its last offset closes one, and ``"+"`` otherwise.

    Parameters
    ----------
    row : row-like object
        Must expose ``row.text`` (the comment string) plus whatever
        ``separar_spans_toxicos`` reads from the row.

    Returns
    -------
    list of (str, str)
        ``(token, pos_tag + suffix)`` pairs for ``row.text.split()``.
    """
    word_starts, word_ends = starts_ends_tokens(row.text)
    toxic_spans = separar_spans_toxicos(row)
    toxic_starts = [span[0] for span in toxic_spans]
    toxic_ends = [span[-1] for span in toxic_spans]

    tokens = row.text.split()
    labels = [pos for _, pos in nltk.pos_tag(tokens)]

    # Append the toxicity marker to the POS tag of each token, position by position.
    for idx, (first, last) in enumerate(zip(word_starts, word_ends)):
        if first not in toxic_starts or last not in toxic_ends:
            marker = "+"
        else:
            marker = "-"
        labels[idx] = labels[idx] + marker

    return list(zip(tokens, labels))

In [13]:
etiquetar_secuencia_POS(train_data.loc[0])

[('Another', 'DT+'),
 ('violent', 'NN-'),
 ('and', 'CC-'),
 ('aggressive', 'JJ-'),
 ('immigrant', 'NN-'),
 ('killing', 'VBG+'),
 ('a', 'DT+'),
 ('innocent', 'NN+'),
 ('and', 'CC+'),
 ('intelligent', 'JJ+'),
 ('US', 'NNP+'),
 ('Citizen....', 'NNP+'),
 ('Sarcasm', 'NNP+')]

In [26]:
# POS + Tóxico y NO tóxico
sentences = [etiquetar_secuencia_POS(row) for row in train_data.itertuples()]
sentences

[[('Another', 'DT+'),
  ('violent', 'NN-'),
  ('and', 'CC-'),
  ('aggressive', 'JJ-'),
  ('immigrant', 'NN-'),
  ('killing', 'VBG+'),
  ('a', 'DT+'),
  ('innocent', 'NN+'),
  ('and', 'CC+'),
  ('intelligent', 'JJ+'),
  ('US', 'NNP+'),
  ('Citizen....', 'NNP+'),
  ('Sarcasm', 'NNP+')],
 [('I', 'PRP+'),
  ('am', 'VBP+'),
  ('56', 'CD+'),
  ('years', 'NNS+'),
  ('old,', 'VBP+'),
  ('I', 'PRP+'),
  ('am', 'VBP+'),
  ('not', 'RB+'),
  ('your', 'PRP$+'),
  ('fucking', 'VBG-'),
  ('junior', 'JJ+'),
  ('pal.', 'NN+'),
  ('What', 'WP+'),
  ('you', 'PRP+'),
  ('are', 'VBP+'),
  ('saying', 'VBG+'),
  ('makes', 'VBZ+'),
  ('no', 'DT+'),
  ('sense.', 'NN+'),
  ('I', 'PRP+'),
  ("don't", 'VBP+'),
  ('know', 'VB+'),
  ('what', 'WP+'),
  ('you', 'PRP+'),
  ('are', 'VBP+'),
  ('basing', 'VBG+'),
  ('this', 'DT+'),
  ('on.', 'VBZ+'),
  ('The', 'DT+'),
  ('cheap', 'JJ+'),
  ('black', 'JJ+'),
  ('market', 'NN+'),
  ('crap', 'NN+'),
  ('is', 'VBZ+'),
  ('still', 'RB+'),
  ('coming', 'VBG+'),
  ('up', 'RP+'

In [29]:
# State inventory for the POS-based HMM. Every entry is a POS tag followed by
# a "-" or "+" suffix, i.e. the label format etiquetar_secuencia_POS emits
# ("-" when the token matched a toxic span, "+" otherwise).
tags = ['DT+', 'NN-', 'CC-', 'JJ-', 'VBG+', 'NN+', 'CC+', 'JJ+', 'NNP+', 'PRP+', 'VBP+', 'CD+', 'NNS+', 'RB+', 
        'PRP$+', 'VBG-', 'WP+', 'VBZ+', 'VB+', 'RP+', 'IN+', 'VBN+', 'TO+', 'JJR+', 'WDT+', 'NNP-', 'WRB+', 'MD+',
        'VB-', 'VBD+', 'VBP-', 'RBS+', 'NNS-', 'DT-', 'VBZ-', ':+', 'PDT+', 'IN-', 'EX+', '.+', 'PRP-', 'JJS+', 
        'RBR+', 'VBN-', 'NNPS+', 'CD-', 'RB-', 'WRB-', 'RP-', ',+', 'MD-', 'JJS-', '$+', 'POS+', 'NNPS-', 'SYM+', 
        'TO-', '``+', "''-", 'WDT-', 'VBD-', 'JJR-', 'PRP$-', 'PDT-', 'WP-', '(+', ')+', 'UH+', 'WP$+', 'FW+', 
        '#+', 'RBS-', "''+", 'EX-', 'RBR-', 'FW-', ':-', ',-', 'UH-', 'SYM-', '.-', 'POS-']

In [32]:
# Rebuild the vocabulary from train_data, the frame `sentences` was built
# from. On a top-to-bottom run the last `vocab` assigned above comes from a
# 75% split of the combined data, so it would not cover every token here.
vocab = get_vocab(train_data)
# Pass the symbols as a list: NLTK's supervised trainer extends the symbol
# collection in place (with .append) when it meets an unknown token, which a
# set does not support. Sorting just makes the order deterministic.
trainer = HiddenMarkovModelTrainer(states = tags, symbols=sorted(vocab))

model = trainer.train(sentences)

In [33]:
comment = train_data.loc[0,"text"].split()
comment

['Another',
 'violent',
 'and',
 'aggressive',
 'immigrant',
 'killing',
 'a',
 'innocent',
 'and',
 'intelligent',
 'US',
 'Citizen....',
 'Sarcasm']

In [34]:
sentences[0]

[('Another', 'DT+'),
 ('violent', 'NN-'),
 ('and', 'CC-'),
 ('aggressive', 'JJ-'),
 ('immigrant', 'NN-'),
 ('killing', 'VBG+'),
 ('a', 'DT+'),
 ('innocent', 'NN+'),
 ('and', 'CC+'),
 ('intelligent', 'JJ+'),
 ('US', 'NNP+'),
 ('Citizen....', 'NNP+'),
 ('Sarcasm', 'NNP+')]

In [35]:
tagged = model.tag(comment)
tagged

[('Another', 'DT+'),
 ('violent', 'NN-'),
 ('and', 'CC-'),
 ('aggressive', 'JJ-'),
 ('immigrant', 'NN-'),
 ('killing', 'VBG-'),
 ('a', 'DT+'),
 ('innocent', 'JJ+'),
 ('and', 'CC+'),
 ('intelligent', 'JJ+'),
 ('US', 'NNP+'),
 ('Citizen....', 'NNP+'),
 ('Sarcasm', 'NNP+')]

**Entrenando con todo, literalmente**

In [50]:
# Load the cleaned train and trial files and report their shapes
# (same steps as the earlier cell that builds `todo`).
train = leer_csv("../data/train_cleaned.csv")
trial = leer_csv("../data/trial_cleaned.csv")
print(train.shape, trial.shape)
# Stack both frames; ignore_index=True renumbers the rows so the two
# original indexes do not collide.
todo = pd.concat([train, trial], ignore_index = True)
# Write the combined frame to disk without the index column.
todo.to_csv("../data/tsd_completo.csv", index = False)

(7939, 2) (690, 2)


In [55]:
sentences = [etiquetar_secuencia_POS(row) for row in todo.itertuples()]

vocab = get_vocab(todo)
# Pass the symbols as a list: NLTK's supervised trainer extends the symbol
# collection in place (with .append) when it meets an unknown token, which a
# set does not support. Sorting just makes the order deterministic.
trainer = HiddenMarkovModelTrainer(states=tags, symbols=sorted(vocab))

model = trainer.train(sentences)

In [56]:
tokenized_text = todo.text.apply(str.split)
tagged_text = tokenized_text.apply(model.tag)

system_offsets = tagged_text.apply(tags2offsets)

In [57]:
scores = [f1_score(offsets, ground_truth) 
                for offsets, ground_truth in zip(system_offsets, todo.spans)]
np.mean(scores)

0.6595644474657535

---------

In [58]:
train, test = train_test_split(todo, test_size=0.25, random_state=42)
print(train.shape, test.shape)

(6471, 2) (2158, 2)


In [59]:
vocab = get_vocab(train)
train_sentences = [etiquetar_secuencia_POS(row) for row in train.itertuples()]
# Pass the symbols as a list: NLTK's supervised trainer extends the symbol
# collection in place (with .append) when it meets an unknown token, which a
# set does not support. Sorting just makes the order deterministic.
trainer = HiddenMarkovModelTrainer(states=tags, symbols=sorted(vocab))
model = trainer.train(train_sentences)

In [60]:
def obtener_f1(df, model):
    """Average the per-comment F1 obtained by ``model`` on ``df``.

    Same definition as the ``obtener_f1`` declared earlier in the notebook;
    it calls whichever ``tags2offsets`` is bound when it runs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column (comment strings) and a ``spans`` column
        (gold offsets, one collection per comment).
    model : object
        Tagger exposing ``tag(tokens)`` that returns ``(token, tag)`` pairs.

    Returns
    -------
    float
        Mean of ``f1_score`` across all rows.
    """
    tokens_per_comment = df.text.apply(str.split)
    system_offsets = tokens_per_comment.apply(model.tag).apply(tags2offsets)

    # Shape printout: lets the reader compare row counts of input and predictions.
    print(df.shape, system_offsets.shape)

    gold_spans = df.spans
    scores = list(map(f1_score, system_offsets, gold_spans))

    return np.mean(scores)

In [61]:
obtener_f1(train, model)

(6471, 2) (6471,)


0.6752940533189026

In [62]:
obtener_f1(test, model)

(2158, 2) (2158,)


0.28980778415456826