# Avaliação do Hidden Markov Model 

In [47]:
import operator
import os
import random
import functools
import collections
import nltk
import numpy as np
from nltk.tag.hmm import HiddenMarkovModelTrainer
from sklearn.model_selection import KFold

# Seed Python's global RNG so the random.shuffle calls used for the
# train/test split give the same ordering on every fresh run.
random.seed(1999)

In [48]:
# Dataset directory used by this run: leave exactly one assignment active and
# the other candidates commented out.
# Keep the exact spelling, including the trailing slash: the train/test split
# cell compares DIR against these literal strings to choose its strategy.
# DIR = "./dados-categorias/PLs/"  
# DIR = "./dados-tipos/PLs/"  
# DIR = "./dados-categorias/STs/"
# DIR = "./dados-tipos/STs/"
# DIR = "./dados-categorias/Comentarios/"
DIR = "./dados-tipos/Comentarios/"

In [49]:
# Paths of the data files found directly under DIR.
# os.listdir returns entries in arbitrary, platform-dependent order, so the
# listing is sorted: otherwise the seeded shuffle applied later would yield a
# different split on another machine or filesystem.
# Entries that are not regular files (e.g. sub-directories) are skipped
# because they cannot be opened and parsed as data files.
all_files = [
    os.path.join(DIR, f)
    for f in sorted(os.listdir(DIR))
    if os.path.isfile(os.path.join(DIR, f))
]

In [50]:
def process_conll_file(location:str)->list:
    """Parse a CoNLL-style file into a list of tagged sentences.

    Sentences are separated by blank lines. Every non-blank line is split on
    single spaces and only its first and last columns are kept.

    The final sentence is kept whether or not the file ends with a blank
    line, and empty lines or empty chunks never produce tokens or sentences.

    Args:
        location: Path of the file to read.

    Returns:
        A list of sentences, each one a list of ``(first_column, last_column)``
        tuples in file order.
    """
    with open(location, "r") as f:
        raw = f.read()

    pick_columns = operator.itemgetter(0, -1)
    sentences = []
    for chunk in raw.split("\n\n"):
        # Drop empty/whitespace-only lines (trailing newline, extra blank
        # separators) instead of turning them into ("", "") tokens.
        rows = [line for line in chunk.split("\n") if line.strip()]
        if not rows:
            # Nothing but separators in this chunk: not a sentence.
            continue
        sentences.append([pick_columns(row.split(" ")) for row in rows])
    return sentences

def combine_files(locations:list)->list:
    """Concatenate the sentences of several files into one flat list.

    Args:
        locations: Paths of the files to parse, in the order their sentences
            should appear in the result.

    Returns:
        A single list with every sentence returned by ``process_conll_file``
        for each path, file after file.
    """
    return [
        sentence
        for path in locations
        for sentence in process_conll_file(path)
    ]

In [51]:
# Train/test split (75% / 25%).
# For the PL datasets the split is done on files, so every sentence of a file
# lands on the same side; for all other datasets the sentences of all files
# are pooled first and the split is done on sentences.
if DIR in ("./dados-categorias/PLs/", "./dados-tipos/PLs/"):
    random.shuffle(all_files)
    train_size = int(0.75*len(all_files))
    train_files, test_files = all_files[:train_size], all_files[train_size:]

    train = combine_files(train_files)
    test = combine_files(test_files)
else:
    all_data = combine_files(all_files)
    train_size = int(0.75*len(all_data))
    random.shuffle(all_data)
    train, test = all_data[:train_size], all_data[train_size:]
print(f"Número de Sentenças no Conjunto de Treinamento: {len(train)}")
print(f"Número de Sentenças no Conjunto de Teste: {len(test)}")

Número de Sentenças no Conjunto de Treinamento: 725
Número de Sentenças no Conjunto de Teste: 242


In [52]:
def retrieve_sents(data:list)->list:
    """Strip the tags from tagged sentences.

    Args:
        data: List of sentences, each a sequence of ``(word, tag)`` pairs.

    Returns:
        A list with one list of words per input sentence, in the same order.
    """
    return [[word for word, _tag in sentence] for sentence in data]

In [53]:
to_store = "HMM-CV"
# exist_ok=True keeps the cell re-runnable: a plain os.mkdir raises
# FileExistsError as soon as the directory is left over from a previous run.
os.makedirs(to_store, exist_ok=True)

# 5-fold cross-validation over the sentences of the training set.
kfold = KFold(n_splits=5)
# Object array so the index arrays produced by KFold can select sentences.
train = np.array(train, dtype=object)
for i, (t, tt) in enumerate(kfold.split(train), start=1):
    to_train = train[t].tolist()
    to_val = train[tt].tolist()
    # The tagger only receives the words of the validation sentences.
    unlab_test = retrieve_sents(to_val)
    hmm = HiddenMarkovModelTrainer().train_supervised(to_train)
    yhmm = hmm.tag_sents(unlab_test)
    # Output format: one "word gold_tag predicted_tag" line per token and a
    # blank line after every sentence. Pieces are collected in a list and
    # joined once instead of growing a string with repeated +=.
    pieces = []
    for preds, true in zip(yhmm, to_val):
        for (word, gold), (_, predicted) in zip(true, preds):
            pieces.append(word + " " + gold + " " + predicted + "\n")
        pieces.append("\n")
    hmm_file = "".join(pieces)
    # One predictions file per fold, numbered from 1.
    with open(f"./{to_store}/predictions_file_{i}", "w") as f:
        f.write(hmm_file)

In [54]:
# # Treina e armazena as predições do modelo no conjunto de teste
# unlab_test = retrieve_sents(test)
# train = train.tolist()
# hmm = HiddenMarkovModelTrainer().train_supervised(train)
# yhmm = hmm.tag_sents(unlab_test)

# hmm_file = ""
# for preds, true in zip(yhmm, test):
#     for j in range(len(preds)):
#         hmm_file += true[j][0] + " " + true[j][1] + " " + preds[j][1] + "\n"
#     hmm_file += "\n"
# with open("predictions_file_final", "w") as f:
#     f.write(hmm_file)