# Ernesto Antonio Reyes Ramírez 

# Procesamiento de Lenguaje Natural 

# Tarea 5

# Modelo de Lenguaje Neuronales

In [114]:
import os
import time
import shutil
import random
from typing import Tuple
from argparse import Namespace
import matplotlib.pyplot as plt
from itertools import permutations
from random import shuffle

#Preprocesing
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist
import pandas as pd
import numpy as np

#Pytorch
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F


#Scikit-learn
from sklearn.metrics import accuracy_score

In [2]:
seed = 1111
random.seed(seed) 
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False

### Procesamos los datos

In [3]:
X_train = pd.read_csv("mex_train.txt", sep = "\r\n",engine = "python",header = None).loc[:,0].values.tolist()
X_val = pd.read_csv("mex_val.txt", sep = "\r\n",engine = "python",header = None).loc[:,0].values.tolist()

### Clases y funciones

In [189]:
class NgramData():
    
    def __init__(self,N: int,vocab_max: int = 5000, tokenizer = None, embeddings_model = None, nivel = "palabra"):
        self.tokenizer = tokenizer if tokenizer else self.default_tokenizer
        self.punct = set(['.',',',';',':','-','^','!','¡','¿','?','"','...','<url>','*','@usuario','»','\''])
        self.N = N
        self.vocab_max = vocab_max
        self.UNK = "<unk>"
        self.SOS = "<s>"
        self.EOS = "</s>"
        self.embeddings_model = embeddings_model
        self.nivel = nivel
        
    def get_vocab_size(self) -> int:
        return len(self.vocab)
        
    def default_tokenizer(self,doc:str) -> list:
        return doc.split(" ")
    
    def remove_word(self,word:str) ->bool:
        word = word.lower()
        is_punct = True if word in self.punct else False
        is_digit = word.isnumeric()
        return is_punct or is_digit
    
    def sortFreqDict(self,freq_dist) -> list:
        freq_dict = dict(freq_dist)
        return sorted(freq_dict,key = freq_dict.get,reverse = True)
    
    def get_vocab(self,corpus:list) -> set:
        if self.nivel == "palabra":
            freq_dist = FreqDist([w.lower() for sentence in corpus for w in self.tokenizer(sentence) if not self.remove_word(w)])
            sorted_words = self.sortFreqDict(freq_dist)[:self.vocab_max-3]
        else:
            freq_dist = FreqDist([w.lower() for sentence in corpus for w in list(sentence) if not self.remove_word(w)])
            sorted_words = self.sortFreqDict(freq_dist)
        return set(sorted_words)
    
    def fit(self,corpus: list) -> None:
        self.vocab = self.get_vocab(corpus)
        self.vocab.add(self.UNK)
        self.vocab.add(self.SOS)
        self.vocab.add(self.EOS)
        
        self.w2id = {}
        self.id2w = {}
        
        if self.embeddings_model is not None:
            primer_valor = list(self.embeddings_model.values())[0]
            longitud_primer_valor = len(primer_valor)
            self.embedding_matrix = np.empty([len(self.vocab),longitud_primer_valor])
            
        id = 0 
        for doc in corpus:
            if self.nivel == "palabra":
                new_doc = self.tokenizer(doc)
            else:
                new_doc = list(doc)
            for word in new_doc:
                word_ = word.lower()
                if word_ in self.vocab and not word_ in self.w2id:
                    self.w2id[word_] = id
                    self.id2w[id] = word_
                    
                    if self.embeddings_model is not None:
                        if word_ in self.embeddings_model:
                            self.embedding_matrix[id] = self.embeddings_model[word_]
                        else:
                            primer_valor = list(self.embeddings_model.values())[0]
                            longitud_primer_valor = len(primer_valor)
                            self.embedding_matrix[id] = np.random.rand(longitud_primer_valor)
                    
                    id += 1
        
        
        #tokens especiales
        
        self.w2id.update(
            {
                self.UNK: id,
                self.SOS: id+1,
                self.EOS: id+2
            }
        )
        
        
        self.id2w.update(
            {
                id: self.UNK,
                id+1: self.SOS,
                id+2: self.EOS
            }
        )
    
    
    def transform(self,corpus:list) -> Tuple[np.ndarray,np.ndarray]:
        
        X_ngrams = []
        y = []         
            
        for doc in corpus:
            doc_ngram = self.get_ngram_doc(doc)
            for words_window in doc_ngram:
                words_window_ids = [self.w2id[w] for w in words_window]
                X_ngrams.append(list(words_window_ids[:-1]))
                y.append(words_window_ids[-1])
                
        
        return np.array(X_ngrams),np.array(y)
    
    def get_ngram_doc(self,doc:str) -> list:
        if self.nivel == "palabra":
            doc_tokens = self.tokenizer(doc)
        else:
            doc_tokens = list(doc)
        doc_tokens = self.replace_unk(doc_tokens)
        doc_tokens = [w.lower() for w in doc_tokens]
        doc_tokens = [self.SOS]*(self.N-1)+ doc_tokens + [self.EOS]
        return list(ngrams(doc_tokens,self.N))
    
    def replace_unk(self,doc_tokens:list) -> list:
        for i,token in enumerate(doc_tokens):
            if token.lower() not in self.vocab:
                doc_tokens[i] = self.UNK
        
        return doc_tokens

In [142]:
class NeuralLM( nn.Module ):
    
    def __init__(self,args, embeddings = None):
        super(NeuralLM,self).__init__()
        
        self.window_size = args.N-1
        self.embedding_dim = args.d
        if embeddings is not None:
            self.emb = nn.Embedding(args.vocab_size, args.d)
            for i in range(embeddings.shape[0]):
                for j in range(embeddings.shape[1]):
                    self.emb.weight.data[i][j] = embeddings[i][j]
                    
        else :
            self.emb   = nn.Embedding(args.vocab_size, args.d)
        
        self.fc1 = nn.Linear(args.d*(args.N-1),args.d_h)
        self.drop1 = nn.Dropout(p=args.dropout)
        self.fc2 = nn.Linear(args.d_h,args.vocab_size,bias=False)
    
    def forward(self,x):
        x = self.emb(x)
        x = x.view(-1,self.window_size*self.embedding_dim)
        h = F.relu(self.fc1(x))
        h = self.drop1(h)
        return self.fc2(h)

In [22]:
def get_preds(raw_logits):
    probs = F.softmax(raw_logits.detach(),dim=1)
    y_pred = torch.argmax(probs,dim=1).cpu().numpy()
    
    return y_pred

In [9]:
def model_eval(data,model,gpu=False):
    with torch.no_grad():
        preds,tgts = [],[]
        for window_words, labels in data:
            if gpu:
                window_words = window_words.cuda()
                
            outputs = model(window_words)
            
            y_pred = get_preds(outputs)
            tgt = labels.numpy()
            tgts.append(tgt)
            preds.append(y_pred)
            
    tgts = [e for l in tgts for e in l]
    preds = [e for l in preds for e in l]
    
    return accuracy_score(tgts,preds)

In [10]:
def save_checkpoint(state,is_best,checkpoint_path, filename="checkpoint.pt"):
    filename = os.path.join(checkpoint_path,filename)
    torch.save(state,filename)
    if is_best:
        shutil.copyfile(filename,os.path.join(checkpoint_path, "model_best.pt"))

In [11]:
def print_closest_words(embeddings,ngram_data,word,n):
    word_id = torch.LongTensor([ngram_data.w2id[word]])
    word_embed = embeddings(word_id)
    dists = torch.norm(embeddings.weight - word_embed,dim=1).detach()
    lst = sorted(enumerate(dists.numpy()), key = lambda x:x[1])
    for idx, difference in lst[1:n+1]:
        print(ngram_data.id2w[idx],difference)

In [49]:
def parse_text(text,tokenizer,nivel = "palabra"):
    if nivel == "palabra":
        all_tokens = [w.lower() if w in ngram_data.w2id else "<unk>" for w in tokenizer.tokenize(text)]
        token_ids = [ngram_data.w2id[word.lower()] for word in all_tokens]
    else:
        all_tokens = [w.lower() if w in ngram_data.w2id else "<unk>" for w in list(text)]
        token_ids = [ngram_data.w2id[word.lower()] for word in all_tokens]
    return all_tokens, token_ids

In [25]:
def sample_next_word(logits, temperature = 1.0):
    logits = np.asarray(logits).astype("float64")
    preds = logits/temperature
    exp_preds = np.exp(preds)
    preds = exp_preds/np.sum(exp_preds)
    probas = np.random.multinomial(1,preds)
    return np.argmax(probas)

In [14]:
def predict_next_token(model, token_ids):
    word_ids_tensor = torch.LongTensor(token_ids).unsqueeze(0)
    y_raw_pred = model(word_ids_tensor).squeeze(0).detach().numpy()
    
    y_pred = sample_next_word(y_raw_pred,1.0)
    
    return y_pred

In [27]:
def generate_sentence(model, initial_text, tokenizer,nivel = "palabra"):
    all_tokens, window_word_ids = parse_text(initial_text,tokenizer,nivel)
    
    for i in range(300):
        y_pred = predict_next_token(model, window_word_ids)
        next_word = ngram_data.id2w[y_pred]
        all_tokens.append(next_word)
        
        if next_word == "</s>":
            break
        else:
            window_word_ids.pop(0)
            window_word_ids.append(y_pred)
    
    return " ".join(all_tokens)

In [28]:
def log_likelihood(model, text, ngram_model):
    
    X,y = ngram_data.transform([text])
    
    X,y = X[2:],y[2:]
    X = torch.LongTensor(X).unsqueeze(0)
    
    logits = model(X).detach()
    probs = F.softmax(logits, dim=1).numpy()
    
    return np.sum([np.log(probs[i][w]) for i, w in enumerate(y)])

In [119]:
def perplexity(model, text, ngram_model):
    X, y   = ngram_data.transform(text)
    X, y   = X[2:], y[2:]
    X      = torch.LongTensor(X).unsqueeze(0)
  
    logits = model(X).detach()
    probs  = F.softmax(logits, dim = 1).numpy()
  
    return - 1.0 / len(text) * np.sum([np.log(probs[i][w]) for i, w in enumerate(y)])

# 1. Modelo basado en caracteres

In [169]:
args = Namespace()
args.N = 6

In [170]:
tk = TweetTokenizer()
ngram_data = NgramData(args.N, 5000, tk.tokenize, nivel = "caracter")
ngram_data.fit(X_train)

In [171]:
print(f'Vocab size: {ngram_data.get_vocab_size()}')

Vocab size: 345


In [172]:
X_ngram_train, y_ngram_train = ngram_data.transform(X_train)
X_ngram_val, y_ngram_val     = ngram_data.transform(X_val)

In [173]:
print(f'Training observations: X:{X_ngram_train.shape}, y: {y_ngram_train.shape} ')
print(f'Validation observations: X:{X_ngram_val.shape}, y: {y_ngram_val.shape} ')

Training observations: X:(498949, 5), y: (498949,) 
Validation observations: X:(54110, 5), y: (54110,) 


In [174]:
[[ngram_data.id2w[w] for w in tw] for tw in X_ngram_train[:24]]

[['<s>', '<s>', '<s>', '<s>', '<s>'],
 ['<s>', '<s>', '<s>', '<s>', 'l'],
 ['<s>', '<s>', '<s>', 'l', 'o'],
 ['<s>', '<s>', 'l', 'o', ' '],
 ['<s>', 'l', 'o', ' ', 'p'],
 ['l', 'o', ' ', 'p', 'e'],
 ['o', ' ', 'p', 'e', 'o'],
 [' ', 'p', 'e', 'o', 'r'],
 ['p', 'e', 'o', 'r', ' '],
 ['e', 'o', 'r', ' ', 'd'],
 ['o', 'r', ' ', 'd', 'e'],
 ['r', ' ', 'd', 'e', ' '],
 [' ', 'd', 'e', ' ', 't'],
 ['d', 'e', ' ', 't', 'o'],
 ['e', ' ', 't', 'o', 'd'],
 [' ', 't', 'o', 'd', 'o'],
 ['t', 'o', 'd', 'o', ' '],
 ['o', 'd', 'o', ' ', 'e'],
 ['d', 'o', ' ', 'e', 's'],
 ['o', ' ', 'e', 's', ' '],
 [' ', 'e', 's', ' ', 'q'],
 ['e', 's', ' ', 'q', 'u'],
 ['s', ' ', 'q', 'u', 'e'],
 [' ', 'q', 'u', 'e', ' ']]

In [175]:
[ngram_data.id2w[w] for w in y_ngram_train[:22]]

['l',
 'o',
 ' ',
 'p',
 'e',
 'o',
 'r',
 ' ',
 'd',
 'e',
 ' ',
 't',
 'o',
 'd',
 'o',
 ' ',
 'e',
 's',
 ' ',
 'q',
 'u',
 'e']

In [176]:
# Set Batch Size in args
args.batch_size = 64

# Num workers
args.num_workers = 2

# Train
train_dataset = TensorDataset(torch.tensor(X_ngram_train, dtype = torch.int64),
                              torch.tensor(y_ngram_train, dtype = torch.int64))
train_loader  = DataLoader(train_dataset, 
                           batch_size = args.batch_size,
                           num_workers = args.num_workers,
                           shuffle     = True)
# Val
val_dataset = TensorDataset(torch.tensor(X_ngram_val, dtype = torch.int64),
                              torch.tensor(y_ngram_val, dtype = torch.int64))
val_loader  = DataLoader(val_dataset, 
                           batch_size  = args.batch_size,
                           num_workers = args.num_workers,
                           shuffle     = False)

In [177]:
batch = next(iter(train_loader))
print(f'X shape : {batch[0].shape}')
print(f'y shape : {batch[1].shape}')

X shape : torch.Size([64, 5])
y shape : torch.Size([64])


In [178]:
#Hiperparametros de la red
args.vocab_size = ngram_data.get_vocab_size()
args.d = 100  #dimension of word embeddings
args.d_h = 200  #dimension for hidden layer
args.dropout = 0.1

#parametros de entrenamiento
args.lr = 2.3e-1
args.num_epochs = 100
args.patience = 20

args.lr_patience = 10
args.lr_factor = 0.5

#saving directory
args.savedir = "model"
os.makedirs(args.savedir,exist_ok=True)

#Create model
model = NeuralLM(args)

#send to GPU
args.use_gpu = torch.cuda.is_available()
if args.use_gpu:
    model.cuda()
    
    
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,"min",
                patience=args.lr_patience,
                verbose=True,
                factor=args.lr_factor
            )

In [46]:
start_time = time.time()
best_metric = 0
metric_history = []
train_metric_history = []


for epoch in range(args.num_epochs):
    epoch_start_time = time.time()
    loss_epoch = []
    training_metric = []
    model.train()
    
    for window_words,labels in train_loader:
        
        if args.use_gpu:
            window_words  = window_words.cuda()
            labels = labels.cuda()
            
        #forward
        outputs  = model(window_words)
        loss = criterion(outputs,labels)
        loss_epoch.append(loss.item())
        
        #get training metrics
        y_pred = get_preds(outputs)
        tgt = labels.cpu().numpy()
        training_metric.append(accuracy_score(tgt,y_pred))
        
        #backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    #get metric in training dataset
    mean_epoch_metric = np.mean(training_metric)
    train_metric_history.append(mean_epoch_metric)
    
    #get metric in validation dataset
    model.eval()
    tuning_metric = model_eval(val_loader,model,gpu=args.use_gpu)
    metric_history.append(mean_epoch_metric)
    
    #update scheduler
    scheduler.step(tuning_metric)
    
    #check for metric improvement
    is_improvement = tuning_metric > best_metric
    if is_improvement:
        best_metric = tuning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1
        
    #save best model if metric improved
    save_checkpoint(
        {
            "epoch":epoch+1,
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "best_metric": best_metric,
        },
        is_improvement,
        args.savedir,
    )
    
    #early stopping
    if n_no_improve >= args.patience:
        print("No improvement. Breaking out of the loop")
        break
        
    print("Train acc: {}".format(mean_epoch_metric))
    print("Epoch [{}/{}], Loss: {:.4f} - Val accuracy: {:.4f} - Epoch time: {:.2f}".format(epoch+1,args.num_epochs,np.mean(loss_epoch),tuning_metric, (time.time() - epoch_start_time)))
    
print("--- %s seconds --- " % (time.time() - start_time))

Train acc: 0.42481884057971014
Epoch [1/100], Loss: 1.9675 - Val accuracy: 0.4481 - Epoch time: 25.28
Train acc: 0.4612278761061947
Epoch [2/100], Loss: 1.8122 - Val accuracy: 0.4652 - Epoch time: 19.39
Train acc: 0.47452626010003846
Epoch [3/100], Loss: 1.7636 - Val accuracy: 0.4311 - Epoch time: 18.22
Train acc: 0.481227154674875
Epoch [4/100], Loss: 1.7342 - Val accuracy: 0.4771 - Epoch time: 17.99
Train acc: 0.4864807778632807
Epoch [5/100], Loss: 1.7149 - Val accuracy: 0.4856 - Epoch time: 18.06
Train acc: 0.48943944786456334
Epoch [6/100], Loss: 1.6984 - Val accuracy: 0.4891 - Epoch time: 18.00
Train acc: 0.4927291746825702
Epoch [7/100], Loss: 1.6874 - Val accuracy: 0.4812 - Epoch time: 18.06
Train acc: 0.4949367545209696
Epoch [8/100], Loss: 1.6783 - Val accuracy: 0.4906 - Epoch time: 18.00
Train acc: 0.49768220148775166
Epoch [9/100], Loss: 1.6686 - Val accuracy: 0.4924 - Epoch time: 17.87
Train acc: 0.49935512055918946
Epoch [10/100], Loss: 1.6624 - Val accuracy: 0.4898 - Epo

Train acc: 0.5315654258047967
Epoch [78/100], Loss: 1.5383 - Val accuracy: 0.5204 - Epoch time: 17.99
Train acc: 0.5311950910606643
Epoch [79/100], Loss: 1.5383 - Val accuracy: 0.5199 - Epoch time: 18.03
Epoch 00080: reducing learning rate of group 0 to 1.7969e-03.
Train acc: 0.5314051077337438
Epoch [80/100], Loss: 1.5381 - Val accuracy: 0.5197 - Epoch time: 18.02
Train acc: 0.5319642170065411
Epoch [81/100], Loss: 1.5366 - Val accuracy: 0.5195 - Epoch time: 18.06
Train acc: 0.5317261446710273
Epoch [82/100], Loss: 1.5370 - Val accuracy: 0.5202 - Epoch time: 17.91
Train acc: 0.5316439816596126
Epoch [83/100], Loss: 1.5375 - Val accuracy: 0.5198 - Epoch time: 17.93
No improvement. Breaking out of the loop
--- 1534.4202225208282 seconds --- 


In [53]:
best_model = NeuralLM(args)
best_model.load_state_dict(torch.load("model/model_best.pt")["state_dict"])
best_model.train(False)

NeuralLM(
  (emb): Embedding(345, 100)
  (fc1): Linear(in_features=500, out_features=200, bias=True)
  (drop1): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=200, out_features=345, bias=False)
)

### Generando texto 3 veces

In [69]:
initial_tokens = "hola "

print("-"*30)
print("Learned embeddings")
print("-"*30)
print(generate_sentence(best_model, initial_tokens, tk,"caracter"))

------------------------------
Learned embeddings
------------------------------
h o l a   v e r g a <unk> <unk> <unk> </s>


In [70]:
initial_tokens = "madre"

print("-"*30)
print("Learned embeddings")
print("-"*30)
print(generate_sentence(best_model, initial_tokens, tk,"caracter"))

------------------------------
Learned embeddings
------------------------------
m a d r e   e s a s   s o y <unk>   n o   q u i é n </s>


In [107]:
initial_tokens = "quier"

print("-"*30)
print("Learned embeddings")
print("-"*30)
print(generate_sentence(best_model, initial_tokens, tk,"caracter"))

------------------------------
Learned embeddings
------------------------------
q u i e r o   q u e   e r e s   g u s t o   q u e   a h o r a   m e   d e   t u   m a d r e s   s u p e m o   s u   m a d r e <unk> </s>


### Midiendo el likelihood de 5 oraciones

In [108]:
print("log_likelihood: ", log_likelihood(best_model, "Estamos en la clase de procesamiento de lenguaje", ngram_data))

log_likelihood:  -76.42219


In [109]:
print("log_likelihood: ", log_likelihood(best_model, "Estamos lenguaje clase en la de procesamiento de", ngram_data))

log_likelihood:  -80.721405


In [111]:
print("log_likelihood: ", log_likelihood(best_model, "Hola a todos", ngram_data))

log_likelihood:  -17.178316


In [112]:
print("log_likelihood: ", log_likelihood(best_model, "todos a Hola", ngram_data))

log_likelihood:  -19.343298


In [113]:
print("log_likelihood: ", log_likelihood(best_model, "Hola todos a", ngram_data))

log_likelihood:  -19.64844


Estructuras morfologicas

In [115]:
word_list = list('madre')
perms = [''.join(perm) for perm in permutations(word_list)]

print('-' * 50)
for p, t in sorted([(log_likelihood(best_model, text, ngram_data), text) for text in perms], reverse = True)[:5] :
    print(p, t)

print('-' * 50)
for p, t in sorted([(log_likelihood(best_model, text, ngram_data), text) for text in perms], reverse = True)[-5:] :
    print(p, t)

--------------------------------------------------
-4.218685 madre
-9.529079 rmeda
-9.905755 darme
-12.359339 demar
-13.120632 marde
--------------------------------------------------
-31.143446 amder
-31.28386 dmrae
-32.327843 erdma
-33.812 rmdae
-36.56422 aedmr


### Perplejidad

In [141]:
perplexity(best_model, X_val, ngram_data)

143.3429002637987

# 2. Modelo basado en palabras

In [167]:
#Cargamos el word2vec
word2vec_dir = "word2vec_col_nlp_class_spring_2023/word2vec_col.txt"

In [168]:
full_txt = pd.read_csv(word2vec_dir, sep = '\r\n', engine = 'python', header = None).loc[:,0].values.tolist()
dimensions = np.array(full_txt[0].split(), dtype = int)
full_txt = full_txt[1:]
word2vec = dict()
for i in range(dimensions[0]):
    embedding = full_txt[i].split(' ')
    word2vec[embedding[0]] = np.float_(embedding[1:])

In [179]:
args = Namespace()
args.N = 4

In [190]:
tk = TweetTokenizer()
ngram_data = NgramData(args.N,5000,tk.tokenize, embeddings_model = word2vec, nivel = "palabra")
ngram_data.fit(X_train)

In [191]:
X_ngram_train, y_ngram_train = ngram_data.transform(X_train)
X_ngram_val, y_ngram_val     = ngram_data.transform(X_val)

In [193]:
print(f'Training observations: X:{X_ngram_train.shape}, y: {y_ngram_train.shape} ')
print(f'Validation observations: X:{X_ngram_val.shape}, y: {y_ngram_val.shape} ')

Training observations: X:(106964, 3), y: (106964,) 
Validation observations: X:(11594, 3), y: (11594,) 


In [194]:
[[ngram_data.id2w[w] for w in tw] for tw in X_ngram_train[:20]]

[['<s>', '<s>', '<s>'],
 ['<s>', '<s>', 'lo'],
 ['<s>', 'lo', 'peor'],
 ['lo', 'peor', 'de'],
 ['peor', 'de', 'todo'],
 ['de', 'todo', 'es'],
 ['todo', 'es', 'que'],
 ['es', 'que', 'no'],
 ['que', 'no', 'me'],
 ['no', 'me', 'dan'],
 ['me', 'dan', 'por'],
 ['dan', 'por', 'un'],
 ['por', 'un', 'tiempo'],
 ['un', 'tiempo', 'y'],
 ['tiempo', 'y', 'luego'],
 ['y', 'luego', 'vuelven'],
 ['luego', 'vuelven', 'estoy'],
 ['vuelven', 'estoy', 'hasta'],
 ['estoy', 'hasta', 'la'],
 ['hasta', 'la', 'verga']]

In [195]:
[ngram_data.id2w[w] for w in y_ngram_train[:20]]

['lo',
 'peor',
 'de',
 'todo',
 'es',
 'que',
 'no',
 'me',
 'dan',
 'por',
 'un',
 'tiempo',
 'y',
 'luego',
 'vuelven',
 'estoy',
 'hasta',
 'la',
 'verga',
 'de']

In [196]:
# Set Batch Size in args
args.batch_size = 64

# Num workers
args.num_workers = 2

# Train
train_dataset = TensorDataset(torch.tensor(X_ngram_train, dtype = torch.int64),
                              torch.tensor(y_ngram_train, dtype = torch.int64))
train_loader  = DataLoader(train_dataset, 
                           batch_size = args.batch_size,
                           num_workers = args.num_workers,
                           shuffle     = True)
# Val
val_dataset = TensorDataset(torch.tensor(X_ngram_val, dtype = torch.int64),
                              torch.tensor(y_ngram_val, dtype = torch.int64))
val_loader  = DataLoader(val_dataset, 
                           batch_size  = args.batch_size,
                           num_workers = args.num_workers,
                           shuffle     = False)

In [197]:
batch = next(iter(train_loader))
print(f'X shape : {batch[0].shape}')
print(f'y shape : {batch[1].shape}')

X shape : torch.Size([64, 3])
y shape : torch.Size([64])


In [198]:
#Hiperparametros de la red
args.vocab_size = ngram_data.get_vocab_size()
args.d = 100  #dimension of word embeddings
args.d_h = 200  #dimension for hidden layer
args.dropout = 0.1

#parametros de entrenamiento
args.lr = 2.3e-1
args.num_epochs = 100
args.patience = 20

args.lr_patience = 20
args.lr_factor = 0.5

#saving directory
args.savedir = "model"
os.makedirs(args.savedir,exist_ok=True)

#Create model
model = NeuralLM(args,ngram_data.embedding_matrix)

#send to GPU
args.use_gpu = torch.cuda.is_available()
if args.use_gpu:
    model.cuda()
    
    
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,"min",
                patience=args.lr_patience,
                verbose=True,
                factor=args.lr_factor
            )

In [199]:
start_time = time.time()
best_metric = 0
metric_history = []
train_metric_history = []


for epoch in range(args.num_epochs):
    epoch_start_time = time.time()
    loss_epoch = []
    training_metric = []
    model.train()
    
    for window_words,labels in train_loader:
        
        if args.use_gpu:
            window_words  = window_words.cuda()
            labels = labels.cuda()
            
        #forward
        outputs  = model(window_words)
        loss = criterion(outputs,labels)
        loss_epoch.append(loss.item())
        
        #get training metrics
        y_pred = get_preds(outputs)
        tgt = labels.cpu().numpy()
        training_metric.append(accuracy_score(tgt,y_pred))
        
        #backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    #get metric in training dataset
    mean_epoch_metric = np.mean(training_metric)
    train_metric_history.append(mean_epoch_metric)
    
    #get metric in validation dataset
    model.eval()
    tuning_metric = model_eval(val_loader,model,gpu=args.use_gpu)
    metric_history.append(mean_epoch_metric)
    
    #update scheduler
    scheduler.step(tuning_metric)
    
    #check for metric improvement
    is_improvement = tuning_metric > best_metric
    if is_improvement:
        best_metric = tuning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1
        
    #save best model if metric improved
    save_checkpoint(
        {
            "epoch":epoch+1,
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "best_metric": best_metric,
        },
        is_improvement,
        args.savedir,
    )
    
    #early stopping
    if n_no_improve >= args.patience:
        print("No improvement. Breaking out of the loop")
        break
        
    print("Train acc: {}".format(mean_epoch_metric))
    print("Epoch [{}/{}], Loss: {:.4f} - Val accuracy: {:.4f} - Epoch time: {:.2f}".format(epoch+1,args.num_epochs,np.mean(loss_epoch),tuning_metric, (time.time() - epoch_start_time)))
    
print("--- %s seconds --- " % (time.time() - start_time))

Train acc: 0.1467516447368421
Epoch [1/100], Loss: 5.6772 - Val accuracy: 0.1906 - Epoch time: 7.84
Train acc: 0.16359337619617223
Epoch [2/100], Loss: 5.2004 - Val accuracy: 0.1704 - Epoch time: 7.94
Train acc: 0.1672174043062201
Epoch [3/100], Loss: 5.0000 - Val accuracy: 0.1615 - Epoch time: 7.70
Train acc: 0.17227497009569379
Epoch [4/100], Loss: 4.8459 - Val accuracy: 0.1901 - Epoch time: 7.76
Train acc: 0.17494206040669857
Epoch [5/100], Loss: 4.7167 - Val accuracy: 0.1671 - Epoch time: 7.53
Train acc: 0.1782147129186603
Epoch [6/100], Loss: 4.6066 - Val accuracy: 0.1810 - Epoch time: 7.74
Train acc: 0.18166492224880382
Epoch [7/100], Loss: 4.4959 - Val accuracy: 0.1963 - Epoch time: 7.60
Train acc: 0.18428154904306218
Epoch [8/100], Loss: 4.4010 - Val accuracy: 0.2037 - Epoch time: 8.06
Train acc: 0.18827003588516747
Epoch [9/100], Loss: 4.3168 - Val accuracy: 0.1829 - Epoch time: 8.56
Train acc: 0.19038576555023923
Epoch [10/100], Loss: 4.2416 - Val accuracy: 0.1641 - Epoch tim

In [200]:
best_model = NeuralLM(args)
best_model.load_state_dict(torch.load("model/model_best.pt")["state_dict"])
best_model.train(False)

NeuralLM(
  (emb): Embedding(5000, 100)
  (fc1): Linear(in_features=300, out_features=200, bias=True)
  (drop1): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=200, out_features=5000, bias=False)
)

### 10 palabras más cercanas a tres palabras  

In [201]:
print("-"*30)
print("Learned embeddings")
print("-"*30)
print_closest_words(best_model.emb, ngram_data, "madre",10)

------------------------------
Learned embeddings
------------------------------
mama 15.1532955
abuela 15.8665285
hermana 15.943254
mamá 17.129944
hija 17.444803
suegra 17.64279
vecina 18.390766
papá 18.414007
padre 19.15226
tía 19.548462


In [202]:
print("-"*30)
print("Learned embeddings")
print("-"*30)
print_closest_words(best_model.emb, ngram_data, "perro",10)

------------------------------
Learned embeddings
------------------------------
gato 9.573887
perrito 12.469641
gatito 15.573966
enano 17.193613
gordo 17.3278
vecino 17.501198
niño 17.517479
sapo 17.797817
macho 17.798012
mono 17.865211


In [203]:
print("-"*30)
print("Learned embeddings")
print("-"*30)
print_closest_words(best_model.emb, ngram_data, "puto",10)

------------------------------
Learned embeddings
------------------------------
maldito 10.946679
malparido 14.632573
hijodeputa 15.781651
estupido 16.454464
culero 16.602884
asqueroso 16.712624
bastardo 16.99627
desgraciado 17.056053
pinche 17.07916
reverendo 17.18088


### Generar texto a partir de tres secuencias

In [227]:
initial_tokens = "<s> <s> <s>"

print("-"*30)
print("Learned embeddings")
print("-"*30)
print(generate_sentence(best_model, initial_tokens, tk))

------------------------------
Learned embeddings
------------------------------
<s> <s> <s> ya lo pinches valió verga <unk> </s>


In [231]:
initial_tokens = "hola a todos"

print("-"*30)
print("Learned embeddings")
print("-"*30)
print(generate_sentence(best_model, initial_tokens, tk))

------------------------------
Learned embeddings
------------------------------
hola a todos los putos dias del año <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> </s>


In [234]:
initial_tokens = "te quiero dar"

print("-"*30)
print("Learned embeddings")
print("-"*30)
print(generate_sentence(best_model, initial_tokens, tk))

------------------------------
Learned embeddings
------------------------------
te quiero dar en la madre <unk> 😡 </s>


### 5 ejemplos y medir el likelihood

In [235]:
print("log_likelihood: ", log_likelihood(best_model, "Estamos en la clase de procesamiento de lenguaje", ngram_data))

log_likelihood:  -36.10342


In [237]:
print("log_likelihood: ", log_likelihood(best_model, "lenguaje de Estamos la en procesamiento clase de", ngram_data))

log_likelihood:  -66.18956


In [238]:
print("log_likelihood: ", log_likelihood(best_model, "Chinguen su madre todos", ngram_data))

log_likelihood:  -11.439241


In [239]:
print("log_likelihood: ", log_likelihood(best_model, "su Chinguen todos madre", ngram_data))

log_likelihood:  -22.895464


In [240]:
print("log_likelihood: ", log_likelihood(best_model, "Chinguen todos su madre", ngram_data))

log_likelihood:  -14.243498


### Estructuras sintácticas

In [242]:
from itertools import permutations
from random import shuffle

word_list = 'chinguen a su madre todos'.split(' ')
perms = [' '.join(perm) for perm in permutations(word_list)]

print('-' * 50)
for p, t in sorted([(log_likelihood(best_model, text, ngram_data), text) for text in perms], reverse = True)[:5] :
    print(p, t)

print('-' * 50)
for p, t in sorted([(log_likelihood(best_model, text, ngram_data), text) for text in perms], reverse = True)[-5:] :
    print(p, t)

--------------------------------------------------
-3.7887769 todos chinguen a su madre
-10.807098 chinguen a su madre todos
-13.72316 chinguen su madre a todos
-13.897089 todos chinguen su madre a
-14.32552 a chinguen su madre todos
--------------------------------------------------
-38.827095 su todos madre a chinguen
-40.850376 a madre chinguen su todos
-41.14011 a su todos madre chinguen
-42.283287 madre a chinguen su todos
-42.904263 chinguen su todos madre a


### Perplejidad

In [243]:
perplexity(best_model, X_val, ngram_data)

113.62265371347404

Modelo sin word2vec

In [244]:
model_dir = "C:/Users/ernes/OneDrive/Documentos/Maestría-CIMAT/Segundo/NLP/Practicas/Practica5/model"

best_model_practica = NeuralLM(args)
best_model_practica.load_state_dict(torch.load(model_dir + "/" + 'model_best.pt')['state_dict'])
best_model_practica.train(False) 

NeuralLM(
  (emb): Embedding(5000, 100)
  (fc1): Linear(in_features=300, out_features=200, bias=True)
  (drop1): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=200, out_features=5000, bias=False)
)

In [245]:
perplexity(best_model_practica, X_val, ngram_data)

112.5419541396104

# Modelo de lenguaje que integre una conexión directa de la capa de embeddings hacía la salida,

Creamos nuestro modelo de lenguaje con una conexión directa entre los embeddings y la capa de salida

In [246]:
class NeuralLM( nn.Module ):
    
    def __init__(self,args, embeddings = None):
        super(NeuralLM,self).__init__()
        
        self.window_size = args.N-1
        self.embedding_dim = args.d
        if embeddings is not None:
            self.emb = nn.Embedding(args.vocab_size, args.d)
            for i in range(embeddings.shape[0]):
                for j in range(embeddings.shape[1]):
                    self.emb.weight.data[i][j] = embeddings[i][j]
                    
        else :
            self.emb   = nn.Embedding(args.vocab_size, args.d)
        
        self.fc1 = nn.Linear(args.d*(args.N-1),args.d_h)
        self.drop1 = nn.Dropout(p=args.dropout)
        self.fc2 = nn.Linear(args.d_h,args.vocab_size,bias=False)
        self.fc3   = nn.Linear(args.d*(args.N-1), args.vocab_size, bias = False)
    
    def forward(self,x):
        x = self.emb(x)
        x = x.view(-1,self.window_size*self.embedding_dim)
        h = F.relu(self.fc1(x))
        h = self.drop1(h)
        return self.fc2(h) + self.fc3(x)

Como ya tenemos los datos guardados en el punto anterior los vamos a reutilizar y solo crear una nueva instancia del modelo modificado.

In [247]:
#Hiperparametros de la red
args.vocab_size = ngram_data.get_vocab_size()
args.d = 100  #dimension of word embeddings
args.d_h = 200  #dimension for hidden layer
args.dropout = 0.1

#parametros de entrenamiento
args.lr = 2.3e-1
args.num_epochs = 100
args.patience = 20

args.lr_patience = 20
args.lr_factor = 0.5

#saving directory
args.savedir = "model"
os.makedirs(args.savedir,exist_ok=True)

#Create model
model = NeuralLM(args,ngram_data.embedding_matrix)

#send to GPU
args.use_gpu = torch.cuda.is_available()
if args.use_gpu:
    model.cuda()
    
    
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,"min",
                patience=args.lr_patience,
                verbose=True,
                factor=args.lr_factor
            )

In [248]:
start_time = time.time()
best_metric = 0
metric_history = []
train_metric_history = []


for epoch in range(args.num_epochs):
    epoch_start_time = time.time()
    loss_epoch = []
    training_metric = []
    model.train()
    
    for window_words,labels in train_loader:
        
        if args.use_gpu:
            window_words  = window_words.cuda()
            labels = labels.cuda()
            
        #forward
        outputs  = model(window_words)
        loss = criterion(outputs,labels)
        loss_epoch.append(loss.item())
        
        #get training metrics
        y_pred = get_preds(outputs)
        tgt = labels.cpu().numpy()
        training_metric.append(accuracy_score(tgt,y_pred))
        
        #backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    #get metric in training dataset
    mean_epoch_metric = np.mean(training_metric)
    train_metric_history.append(mean_epoch_metric)
    
    #get metric in validation dataset
    model.eval()
    tuning_metric = model_eval(val_loader,model,gpu=args.use_gpu)
    metric_history.append(mean_epoch_metric)
    
    #update scheduler
    scheduler.step(tuning_metric)
    
    #check for metric improvement
    is_improvement = tuning_metric > best_metric
    if is_improvement:
        best_metric = tuning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1
        
    #save best model if metric improved
    save_checkpoint(
        {
            "epoch":epoch+1,
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "best_metric": best_metric,
        },
        is_improvement,
        args.savedir,
    )
    
    #early stopping
    if n_no_improve >= args.patience:
        print("No improvement. Breaking out of the loop")
        break
        
    print("Train acc: {}".format(mean_epoch_metric))
    print("Epoch [{}/{}], Loss: {:.4f} - Val accuracy: {:.4f} - Epoch time: {:.2f}".format(epoch+1,args.num_epochs,np.mean(loss_epoch),tuning_metric, (time.time() - epoch_start_time)))
    
print("--- %s seconds --- " % (time.time() - start_time))

Train acc: 0.111565490430622
Epoch [1/100], Loss: 8.6190 - Val accuracy: 0.1620 - Epoch time: 8.61
Train acc: 0.12478693181818183
Epoch [2/100], Loss: 7.2368 - Val accuracy: 0.1183 - Epoch time: 8.37
Train acc: 0.13623280502392343
Epoch [3/100], Loss: 6.5823 - Val accuracy: 0.1131 - Epoch time: 8.15
Train acc: 0.15503887559808613
Epoch [4/100], Loss: 6.0690 - Val accuracy: 0.1178 - Epoch time: 8.17
Train acc: 0.17028072667464117
Epoch [5/100], Loss: 5.7458 - Val accuracy: 0.1063 - Epoch time: 8.46
Train acc: 0.1856870514354067
Epoch [6/100], Loss: 5.4289 - Val accuracy: 0.1360 - Epoch time: 8.12
Train acc: 0.2003102571770335
Epoch [7/100], Loss: 5.1875 - Val accuracy: 0.1307 - Epoch time: 8.21
Train acc: 0.21177295155502393
Epoch [8/100], Loss: 5.0138 - Val accuracy: 0.1164 - Epoch time: 8.29
Train acc: 0.22142269736842105
Epoch [9/100], Loss: 4.8662 - Val accuracy: 0.1193 - Epoch time: 8.02
Train acc: 0.2347357206937799
Epoch [10/100], Loss: 4.7004 - Val accuracy: 0.1132 - Epoch time:

In [249]:
best_model = NeuralLM(args)
best_model.load_state_dict(torch.load("model/model_best.pt")["state_dict"])
best_model.train(False)

NeuralLM(
  (emb): Embedding(5000, 100)
  (fc1): Linear(in_features=300, out_features=200, bias=True)
  (drop1): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=200, out_features=5000, bias=False)
  (fc3): Linear(in_features=300, out_features=5000, bias=False)
)

In [250]:
perplexity(best_model, X_val, ngram_data)

164.3523234577922

Ahora realizamos el modelo sin embeddings preentrenados.

In [251]:
#Hiperparametros de la red
args.vocab_size = ngram_data.get_vocab_size()
args.d = 100  #dimension of word embeddings
args.d_h = 200  #dimension for hidden layer
args.dropout = 0.1

#parametros de entrenamiento
args.lr = 2.3e-1
args.num_epochs = 100
args.patience = 20

args.lr_patience = 20
args.lr_factor = 0.5

#saving directory
args.savedir = "model"
os.makedirs(args.savedir,exist_ok=True)

#Create model
model = NeuralLM(args)

#send to GPU
args.use_gpu = torch.cuda.is_available()
if args.use_gpu:
    model.cuda()
    
    
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,"min",
                patience=args.lr_patience,
                verbose=True,
                factor=args.lr_factor
            )

In [252]:
start_time = time.time()
best_metric = 0
metric_history = []
train_metric_history = []


for epoch in range(args.num_epochs):
    epoch_start_time = time.time()
    loss_epoch = []
    training_metric = []
    model.train()
    
    for window_words,labels in train_loader:
        
        if args.use_gpu:
            window_words  = window_words.cuda()
            labels = labels.cuda()
            
        #forward
        outputs  = model(window_words)
        loss = criterion(outputs,labels)
        loss_epoch.append(loss.item())
        
        #get training metrics
        y_pred = get_preds(outputs)
        tgt = labels.cpu().numpy()
        training_metric.append(accuracy_score(tgt,y_pred))
        
        #backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    #get metric in training dataset
    mean_epoch_metric = np.mean(training_metric)
    train_metric_history.append(mean_epoch_metric)
    
    #get metric in validation dataset
    model.eval()
    tuning_metric = model_eval(val_loader,model,gpu=args.use_gpu)
    metric_history.append(mean_epoch_metric)
    
    #update scheduler
    scheduler.step(tuning_metric)
    
    #check for metric improvement
    is_improvement = tuning_metric > best_metric
    if is_improvement:
        best_metric = tuning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1
        
    #save best model if metric improved
    save_checkpoint(
        {
            "epoch":epoch+1,
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "best_metric": best_metric,
        },
        is_improvement,
        args.savedir,
    )
    
    #early stopping
    if n_no_improve >= args.patience:
        print("No improvement. Breaking out of the loop")
        break
        
    print("Train acc: {}".format(mean_epoch_metric))
    print("Epoch [{}/{}], Loss: {:.4f} - Val accuracy: {:.4f} - Epoch time: {:.2f}".format(epoch+1,args.num_epochs,np.mean(loss_epoch),tuning_metric, (time.time() - epoch_start_time)))
    
print("--- %s seconds --- " % (time.time() - start_time))

Train acc: 0.15254373504784688
Epoch [1/100], Loss: 5.5821 - Val accuracy: 0.1674 - Epoch time: 8.37
Train acc: 0.17065453050239235
Epoch [2/100], Loss: 4.8795 - Val accuracy: 0.1902 - Epoch time: 8.81
Train acc: 0.18176584928229667
Epoch [3/100], Loss: 4.5314 - Val accuracy: 0.1885 - Epoch time: 8.48
Train acc: 0.19350515849282296
Epoch [4/100], Loss: 4.2593 - Val accuracy: 0.1629 - Epoch time: 8.36
Train acc: 0.2041193181818182
Epoch [5/100], Loss: 4.0312 - Val accuracy: 0.1994 - Epoch time: 8.56
Train acc: 0.21449237440191388
Epoch [6/100], Loss: 3.8340 - Val accuracy: 0.1976 - Epoch time: 8.13
Train acc: 0.22802968002392343
Epoch [7/100], Loss: 3.6570 - Val accuracy: 0.1871 - Epoch time: 8.22
Train acc: 0.24819452751196172
Epoch [8/100], Loss: 3.5011 - Val accuracy: 0.1734 - Epoch time: 8.59
Train acc: 0.2670305023923445
Epoch [9/100], Loss: 3.3667 - Val accuracy: 0.2090 - Epoch time: 8.51
Train acc: 0.2828517494019139
Epoch [10/100], Loss: 3.2539 - Val accuracy: 0.1954 - Epoch tim

A conitnuación presentamos en una tabla la recopilación de todos los datos de los 4 modelos a comparar.

In [253]:
perplexity(best_model, X_val, ngram_data)

164.3523234577922

In [254]:
training = {'Modelo 1 con Embeddings preentrenados': {'Train acc': 0.3371, "Loss":2.8713, 'Val acc': 0.1810,'Epochs': 43, 'Time': 341.98, 'Perplexity': 113.6226}, 
            'Modelo con conexión': {'Train acc': 0.4455, "Loss": 2.39, 'Val acc': 0.1735,'Epochs': 41, 'Time': 345.19, 'Perplexity': 164.3523},
            'Modelo con conexión y Embeddings preentrenados': {'Train acc': 0.4862122458133971, "Loss": 2.0838, 'Val acc': 0.1944,'Epochs': 34, 'Time': 289.5669, 'Perplexity' : 164.3523234577922}

            }
pd.DataFrame(training).T

Unnamed: 0,Train acc,Loss,Val acc,Epochs,Time,Perplexity
Modelo 1 con Embeddings preentrenados,0.3371,2.8713,0.181,43.0,341.98,113.6226
Modelo con conexión,0.4455,2.39,0.1735,41.0,345.19,164.3523
Modelo con conexión y Embeddings preentrenados,0.486212,2.0838,0.1944,34.0,289.5669,164.352323


# Conclusiones

Notemos que al modificar el modelo agregando la conexión directa entre los embeddings y la salida obtenemos una mejor considerable en el accuracy del conjunto de entrenamiento. Pero con respecto al accuracy del concjunto de validación si bien aumenta al añadir los embeddings preentrenados esto no es por mucho, nada significativo. Por otro lado la perplejidad de los modelos con la conexión que añadimos si presentamos un mayor valor de perplejidad, muy parecido entre usar embeddings preentrenados o no. 