# Modelo de Transformer para predicción de idioma

Usa el módulo `transformer.py`.

In [1]:
import pickle as pk
import numpy as np
from collections import defaultdict
import torch
from transformer import *

Los datos de entrenamiento y evaluación vienen de un pickle generado de los datos de las lenguas:

* Entrenamiento: Datos de la biblia
* Test: Datos de jw300

In [2]:
# Load the train/test splits stored in the pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open a 'train_test_sets.pk' that comes from a trusted source.
# The context manager closes the file even if unpickling raises.
with open('train_test_sets.pk', 'rb') as fname:
    train, lab, test, lab_y = pk.load(fname)

Por el tiempo de entrenamiento, sólo estoy tomando un subconjunto del entrenamiento, pero con los recursos necesarios se podría entrenar todo.

In [3]:
# Keep only the first 1000 training examples and their labels so that
# training finishes in a reasonable amount of time.
train = train[:1000]
lab = lab[:1000]

In [4]:
# Report how many examples each split holds.
size_report = 'Train size: {}\nTest size: {}'
print(size_report.format(len(train), len(test)))

Train size: 1000
Test size: 7234


### Funciones auxiliares

Funciones para generar e indexar las sentencias en los documentos.

In [7]:
def vocab():
    """Create a vocabulary that assigns a numeric index to each new key.

    Returns:
        A ``defaultdict`` whose missing keys receive the current size of
        the dictionary as their value, so unseen keys get consecutive
        integers in order of first lookup.
    """
    # The lambda closes over ``index`` and is only called on a missing key,
    # by which time ``index`` is already bound.
    index = defaultdict(lambda: len(index))
    return index

def text2numba(corpus, vocab):
    """Lazily convert each document into a list of character indices.

    Args:
        corpus: Iterable of strings.
        vocab: Mapping from character to index; it is looked up with
            ``vocab[char]`` for every character.

    Yields:
        One list of indices per document, computed over the document
        stripped of surrounding whitespace and lower-cased.
    """
    for doc in corpus:
        normalized = doc.strip().lower()
        yield [vocab[char] for char in normalized]

def label2numba(labels, vocab):
    """Lazily convert each label into a one-element list with its index.

    Args:
        labels: Iterable of labels (used as keys of ``vocab``).
        vocab: Mapping from label to index.

    Yields:
        ``[vocab[label]]`` for every label, in order.
    """
    for label in labels:
        yield [vocab[label]]
        
        
# Reserved indices for the special symbols: padding, beginning of
# sequence and end of sequence.
PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2

def insert_pad(lista, PAD_IDX=PAD_IDX):
    """Right-pad every sequence so that all of them share the same length.

    Args:
        lista: Re-iterable collection of lists of indices.
        PAD_IDX: Value appended to the sequences shorter than the longest.

    Returns:
        A new list with one padded copy per input sequence; an empty
        input yields an empty list.
    """
    # max() raises ValueError on an empty iterable; default=0 lets an empty
    # batch come back as an empty batch instead of crashing.
    longest = max((len(cad) for cad in lista), default=0)
    return [cad + [PAD_IDX] * (longest - len(cad)) for cad in lista]

def process_seqs(seqs, BOS_IDX=BOS_IDX, EOS_IDX=EOS_IDX):
    """Wrap each sequence with the BOS/EOS symbols and pad the batch.

    Args:
        seqs: Iterable of lists of indices.
        BOS_IDX: Index placed at the start of every sequence.
        EOS_IDX: Index placed at the end of every sequence.

    Returns:
        The result of ``insert_pad`` applied to the wrapped sequences.
    """
    wrapped = []
    for cad in seqs:
        wrapped.append([BOS_IDX] + cad + [EOS_IDX])
    return insert_pad(wrapped)

### Creación de los datos de entrenamiento

Se generan los datos con los que se entrenará el transformer. Se usan los símbolos especiales típicos (`<PAD>`, `<BOS>`, `<EOS>`).

In [8]:
# Source side: character vocabulary and indexed training sentences.
src_voc = vocab()
# Reserve the special-symbol indices before any character is indexed.
src_voc.update({'<PAD>': PAD_IDX, '<BOS>': BOS_IDX, '<EOS>': EOS_IDX})
src_cads = process_seqs(list(text2numba(train, src_voc)))

# Target side: label vocabulary and indexed training labels.
tgt_voc = vocab()
tgt_voc.update({'<PAD>': PAD_IDX, '<BOS>': BOS_IDX, '<EOS>': EOS_IDX})
tgt_cads = process_seqs(list(label2numba(lab, tgt_voc)))

# Vocabulary sizes; read only after indexing, because every lookup of an
# unseen key grows the vocabulary.
len_src = len(src_voc)
len_tgt = len(tgt_voc)

X = src_cads
Y = tgt_cads

print(X[0], Y[0])

[1, 3, 4, 5, 3, 6, 7, 8, 9, 4, 10, 7, 11, 12, 5, 11, 13, 13, 7, 5, 11, 14, 15, 11, 12, 5, 10, 7, 16, 5, 17, 12, 18, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Generación del modelo

Se usa un modelo sencillo:

* N: Número de copias de las capas.
* d_model: Dimensión del modelo (embeddings, etc.)
* d_ff: Dimensión del FeedForward layer.
* h: Cabezales de atención.

In [9]:
# Instantiate the model: one copy of the layers (N), model dimension 128,
# feed-forward dimension 256 and a single attention head (h).
model = transformer_model(
    len_src,
    len_tgt,
    N=1,
    d_model=128,
    d_ff=256,
    h=1,
)

Se entrena el modelo

In [10]:
# Train on the indexed data; the recorded progress bar shows 100 steps
# for its=100.
n_its = 100
model.train(X, Y, its=n_its)

100%|██████████| 100/100 [1:04:19<00:00, 38.60s/it]


Se guarda el modelo para no tener que entrenar otra vez.

In [11]:
# Save the trained model so it does not have to be trained again.
model_path = 'model.tr'
model.save(path=model_path)

### Evaluación del modelo

Primero cargamos el modelo guardado.

In [12]:
# Build a model with the same hyperparameters used for training, then
# load the saved model from 'model.tr' into it.
load_model = transformer_model(
    len_src,
    len_tgt,
    N=1,
    d_model=128,
    d_ff=256,
    h=1,
)
load_model.load('model.tr')

Funciones para transformar las sentencias de evaluación en cadenas que lea el Transformer.

In [13]:
def get_x_input(sents):
    """Convert raw sentences into index sequences for the model.

    Each sentence is stripped, lower-cased and mapped character by
    character through ``src_voc``. Characters missing from ``src_voc``
    are dropped. Every sequence is wrapped with ``BOS_IDX`` and
    ``EOS_IDX``; no padding is applied.

    Args:
        sents: Iterable of strings.

    Returns:
        A list with one list of indices per sentence.
    """
    x_sents = []
    for sent in sents:
        # A membership test does not trigger the vocabulary's default
        # factory, so unknown characters never get added to src_voc.
        known = [src_voc[ch] for ch in sent.strip().lower() if ch in src_voc]
        x_sents.append([BOS_IDX] + known + [EOS_IDX])
    return x_sents

# Reverse lookup: target index -> label (language name or special symbol).
tgt_voc_rev = {index: label for label, index in tgt_voc.items()}

Se prueba el modelo en las lenguas a evaluar:

In [19]:
# Predict a language for the first 1000 test sentences and collect
# (predicted label, real label) pairs for the evaluation report.
# NOTE(review): this uses `model` (the in-memory trained model), not the
# reloaded `load_model`; confirm which one is intended.
evals = []
skipped = []  # (sentence index, error) for every sentence that failed
eval_sents = test[:1000]
for i, s in enumerate(eval_sents):
    # Convert the sentence into index sequences
    x = get_x_input([s])
    try:
        # NOTE(review): get_x_input already returns a list of sequences, so
        # [x] adds one more nesting level; this may be related to the errors
        # seen on some inputs. Confirm against transformer.predict.
        # Position 1 of the output is presumably the first symbol generated
        # after BOS. TODO confirm.
        pred = model.predict([x],BOS=BOS_IDX, max_len=3).tolist()[1]
        pred_label = tgt_voc_rev[pred]
    except Exception as err:
        # Best effort: keep evaluating, but record the failure instead of
        # hiding it. Unlike a bare except, this lets KeyboardInterrupt and
        # SystemExit propagate.
        skipped.append((i, repr(err)))
        continue
    print("Prediction: {} - Real label: {}".format(pred_label, lab_y[i]))
    evals.append((pred_label, lab_y[i]))

# Skipped sentences are left out of `evals`, so they are not scored.
if skipped:
    print("Skipped {} of {} sentences because prediction failed; first error: {}".format(
        len(skipped), len(eval_sents), skipped[0][1]))

Prediction: <EOS> - Real label: Otomi
Prediction: Mixtec - Real label: Mixtec
Prediction: Mixtec - Real label: Mixtec
Prediction: Zapotec - Real label: Chol
Prediction: Zapotec - Real label: Chinantec
Prediction: Zapotec - Real label: Tojolabal
Prediction: Mixtec - Real label: Chinantec
Prediction: Zapotec - Real label: Mam
Prediction: Zapotec - Real label: Tabasco Chontal
Prediction: Zapotec - Real label: Mam
Prediction: Zapotec - Real label: Mam
Prediction: Zapotec - Real label: Huichol
Prediction: Mixtec - Real label: Mixtec
Prediction: Zapotec - Real label: Zapotec
Prediction: Zapotec - Real label: Tojolabal
Prediction: Zapotec - Real label: Mam
Prediction: Zapotec - Real label: Zapotec
Prediction: Mixtec - Real label: Amuzgo
Prediction: Zapotec - Real label: Chol
Prediction: Zapotec - Real label: Mazatec
Prediction: Zapotec - Real label: Otomi
Prediction: Zapotec - Real label: Totonac
Prediction: Zapotec - Real label: Mam
Prediction: Zapotec - Real label: Mazahua
Prediction: Zapot

Prediction: Zapotec - Real label: Tojolabal
Prediction: Zapotec - Real label: Mazahua
Prediction: Zapotec - Real label: Chol
Prediction: Zapotec - Real label: Amuzgo
Prediction: Mixtec - Real label: Totonac
Prediction: Zapotec - Real label: Zapotec
Prediction: Zapotec - Real label: Mazatec
Prediction: Zapotec - Real label: Zapotec
Prediction: Zapotec - Real label: Amuzgo
Prediction: Zapotec - Real label: Otomi
Prediction: Zapotec - Real label: Tabasco Chontal
Prediction: Zapotec - Real label: Huichol
Prediction: Zapotec - Real label: Tojolabal
Prediction: Zapotec - Real label: Mam
Prediction: Zapotec - Real label: Mixe
Prediction: Zapotec - Real label: Amuzgo
Prediction: Zapotec - Real label: Totonac
Prediction: Mixtec - Real label: Chol
Prediction: Zapotec - Real label: Otomi
Prediction: Zapotec - Real label: Chol
Prediction: Zapotec - Real label: Totonac
Prediction: Zapotec - Real label: Mixe
Prediction: Zapotec - Real label: Mam
Prediction: Mixtec - Real label: Mazatec
Prediction: Z

Prediction: Zapotec - Real label: Mam
Prediction: Zapotec - Real label: Otomi
Prediction: Zapotec - Real label: Huichol
Prediction: Zapotec - Real label: Zapotec
Prediction: Zapotec - Real label: Huichol
Prediction: Zapotec - Real label: Mixtec
Prediction: Zapotec - Real label: Mazatec
Prediction: Mixtec - Real label: Mixtec
Prediction: Zapotec - Real label: Chol
Prediction: Zapotec - Real label: Tabasco Chontal
Prediction: Zapotec - Real label: Mazatec
Prediction: Mixtec - Real label: Amuzgo
Prediction: Zapotec - Real label: Mixe
Prediction: Zapotec - Real label: Mazatec
Prediction: Zapotec - Real label: Mam
Prediction: Mixtec - Real label: Mazatec
Prediction: Mixtec - Real label: Nahuatl
Prediction: Mixtec - Real label: Chinantec
Prediction: Zapotec - Real label: Amuzgo
Prediction: Zapotec - Real label: Otomi
Prediction: Zapotec - Real label: Mazahua
Prediction: Mixtec - Real label: Otomi
Prediction: Zapotec - Real label: Zapotec
Prediction: Zapotec - Real label: Mam
Prediction: Zapo

Prediction: Zapotec - Real label: Tojolabal
Prediction: Zapotec - Real label: Tabasco Chontal
Prediction: Zapotec - Real label: Tabasco Chontal
Prediction: Zapotec - Real label: Huichol
Prediction: Zapotec - Real label: Mixe
Prediction: Zapotec - Real label: Mixe
Prediction: Zapotec - Real label: Mazahua
Prediction: Zapotec - Real label: Mazahua
Prediction: Zapotec - Real label: Tabasco Chontal
Prediction: Zapotec - Real label: Tojolabal
Prediction: Zapotec - Real label: Chol
Prediction: Zapotec - Real label: Amuzgo
Prediction: Mixtec - Real label: Nahuatl
Prediction: Zapotec - Real label: Mixtec
Prediction: Mixtec - Real label: Nahuatl
Prediction: Zapotec - Real label: Totonac
Prediction: Zapotec - Real label: Mixtec
Prediction: <EOS> - Real label: Amuzgo
Prediction: Zapotec - Real label: Amuzgo
Prediction: Zapotec - Real label: Mixtec
Prediction: Mixtec - Real label: Nahuatl
Prediction: Zapotec - Real label: Chinantec
Prediction: Zapotec - Real label: Zapotec
Prediction: Mixtec - Rea

Prediction: Zapotec - Real label: Huichol
Prediction: Mixtec - Real label: Totonac
Prediction: Zapotec - Real label: Amuzgo
Prediction: Zapotec - Real label: Zapotec
Prediction: Zapotec - Real label: Tojolabal
Prediction: Mixtec - Real label: Nahuatl
Prediction: Zapotec - Real label: Amuzgo
Prediction: Mixtec - Real label: Nahuatl
Prediction: Zapotec - Real label: Mixe
Prediction: Zapotec - Real label: Otomi
Prediction: Zapotec - Real label: Tojolabal
Prediction: Zapotec - Real label: Mam
Prediction: Zapotec - Real label: Nahuatl
Prediction: Zapotec - Real label: Zapotec
Prediction: Mixtec - Real label: Mam
Prediction: Zapotec - Real label: Mazatec
Prediction: Zapotec - Real label: Tojolabal
Prediction: Mixtec - Real label: Amuzgo
Prediction: Zapotec - Real label: Zapotec
Prediction: Zapotec - Real label: Mixtec
Prediction: Zapotec - Real label: Chol
Prediction: Zapotec - Real label: Mazahua
Prediction: Zapotec - Real label: Huichol
Prediction: Zapotec - Real label: Chol
Prediction: Za

Reporte de evaluación

In [27]:
from sklearn.metrics import classification_report

# Each entry of evals is (predicted label, real label);
# classification_report takes the real labels first.
y_true = [real for _, real in evals]
y_pred = [predicted for predicted, _ in evals]
print(classification_report(y_true, y_pred))

                 precision    recall  f1-score   support

          <EOS>       0.00      0.00      0.00         0
         Amuzgo       0.00      0.00      0.00        81
      Chinantec       0.00      0.00      0.00        36
           Chol       0.00      0.00      0.00        66
        Huichol       0.00      0.00      0.00        68
            Mam       0.00      0.00      0.00        78
        Mazahua       0.00      0.00      0.00        62
        Mazatec       0.00      0.00      0.00        61
           Mixe       0.00      0.00      0.00        63
         Mixtec       0.18      0.82      0.29        56
        Nahuatl       0.00      0.00      0.00        76
          Otomi       0.00      0.00      0.00        65
Tabasco Chontal       0.00      0.00      0.00        72
      Tojolabal       0.00      0.00      0.00        70
        Totonac       0.00      0.00      0.00        68
        Zapotec       0.10      0.92      0.18        78

       accuracy              