# NLP. Tarea 5: Modelo del Lenguaje Neuronal.

**Diego Moreno**

### 1. Modelo Neuronal a nivel de caracter.

Importamos librerías

In [4]:
# Tools
import os
import time
import shutil
import random
from typing import Tuple
from argparse import Namespace
import matplotlib.pyplot as plt
from itertools import permutations
from random import shuffle
# Preprocesing
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
from nltk.tokenize import  TweetTokenizer
from nltk import FreqDist
import pandas as pd
import numpy as np
# Pytorch
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
# Scikitlearn
from sklearn.metrics import accuracy_score

Leemos los datos

In [5]:
seed = 1234
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False 

In [6]:
pth = ''
X_train = pd.read_csv(pth+'mex_train.txt', sep='\r\n',  engine='python', header=None).loc[:,0].values.tolist()
X_val = pd.read_csv(pth+'mex_val.txt', sep='\r\n',  engine='python', header=None).loc[:,0].values.tolist()

Para nivel de caracter, tenemos que fijarnos en una ventana de 6 o más:

In [7]:
args = Namespace()
args.N = 6

La clase de N-gramas se quedará igual pues la estrategia será solamente cambiar el tokenizador:

In [8]:
class NgramData():
    def __init__(self, N: int, vocab_max: int=5000, tokenizer=None, embedding_model=None):
        self.tokenizer = tokenizer if tokenizer else self.default_tokenizer
        self.punct = set(['.',',',';',':','-','^','«','»','"','!','¡','?','¿','\'','...','<url>','*','@usuario'])
        self.N = N
        self.vocab_max = vocab_max
        self.UNK = '<unk>'
        self.SOS = '<s>'
        self.EOS = '</s>'
        self.embedding_model = embedding_model

    def default_tokenizer(self, doc: str) -> list:
        return doc.split(' ')

    def get_vocab_size(self) -> int:
        return len(self.vocab)

    def remove_word(self, word: str) -> bool:
        word = word.lower()
        is_punct = True if word in self.punct else False
        is_digit = word.isnumeric()
        return is_punct or is_digit

    def get_vocab(self, corpus: list) -> set:
        freq_dist = FreqDist([w.lower() for sent in corpus \
                              for w in self.tokenizer(sent) \
                              if not self.remove_word(w)])
        sorted_words = self.sortFreqDict(freq_dist)[:self.vocab_max-3]
        return set(sorted_words)

    def sortFreqDict(self, freq_dist) -> list:
        freq_dist = dict(freq_dist)
        return sorted(freq_dist, key=freq_dist.get, reverse=True)

    def fit(self, corpus: list) -> None:
        self.vocab = self.get_vocab(corpus)
        self.vocab.add(self.UNK)
        self.vocab.add(self.SOS)
        self.vocab.add(self.EOS)

        self.w2id = {}
        self.id2w = {}

        if self.embedding_model is not None:
            self.embedding_matrix = np.empty([len(self.vocab), self.embedding_model.vector_size])

        ID = 0
        for doc in corpus:
            for word in self.tokenizer(doc):
                word_ = word.lower()
                if word_ in self.vocab and not word_ in self.w2id:
                    self.w2id[word_] = ID
                    self.id2w[ID] = word_
                    if self.embedding_model is not None:
                        if word_ in self.embedding_model:
                            self.embedding_matrix[ID] = self.embedding_model[word_]
                        else:
                            self.embedding_matrix[ID] = np.random.rand(self.embedding_model.vector_size) 
                    ID += 1
        #Special tokens  
        self.w2id.update({self.UNK: ID, 
                          self.SOS: ID+1,
                          self.EOS: ID+2})  
        self.id2w.update({ID  : self.UNK, 
                          ID+1: self.SOS,
                          ID+2: self.EOS})
    
    def replace_unk(self, doc_tokens: list) -> list: 
        for i, token in enumerate(doc_tokens):
            if token.lower() not in self.vocab:
                doc_tokens[i] = self.UNK
        return doc_tokens


    def get_ngram_doc(self, doc:str) -> list:
        doc_tokens = self.tokenizer(doc)
        doc_tokens = self.replace_unk(doc_tokens)
        doc_tokens = [w.lower() for w in doc_tokens]
        doc_tokens = [self.SOS]*(self.N - 1) + doc_tokens + [self.EOS]
        return list(ngrams(doc_tokens, self.N))
    
    def transform(self, corpus: list) -> Tuple[np.ndarray, np.ndarray]:
        X_ngrams = []
        y = []
        for doc in corpus:
            doc_ngram = self.get_ngram_doc(doc)
            for words_window in doc_ngram:
                words_window_ids = [self.w2id[w] for w in words_window]
                X_ngrams.append(list(words_window_ids[:-1]))
                y.append(words_window_ids[-1])
        return np.array(X_ngrams), np.array(y)

Definimos el nuevo tokenizador a nivel de caracter:

In [9]:
def CharTokenizer(doc: str) -> list:
    l = []
    for c in doc:
        l.append(c)
    return l

Usaremos nivel de caracteres:

In [10]:
char_level = True

In [11]:
if char_level:
    tk = CharTokenizer
else:
    tk = TweetTokenizer()
    tk = tk.tokenize
    
ngram_data = NgramData(args.N, 5000, tk)
ngram_data.fit(X_train)

In [12]:
print('Vocab Size:', ngram_data.get_vocab_size())

Vocab Size: 344


Creamos los datos transformados y los loader de los mismos para entrenamiento y validación

In [13]:
X_ngram_train, y_ngram_train = ngram_data.transform(X_train)
X_ngram_val, y_ngram_val = ngram_data.transform(X_val)

In [14]:
#Batch size
args.batch_size = 64
#Number workers
args.num_workers = 2

#Train
train_dataset = TensorDataset(torch.tensor(X_ngram_train, dtype=torch.int64),
                 torch.tensor(y_ngram_train, dtype=torch.int64))
train_loader = DataLoader(train_dataset,
                          batch_size = args.batch_size,
                          num_workers = args.num_workers,
                          shuffle = True)

#Validation
val_dataset = TensorDataset(torch.tensor(X_ngram_val, dtype=torch.int64),
                 torch.tensor(y_ngram_val, dtype=torch.int64))
val_loader = DataLoader(val_dataset,
                          batch_size = args.batch_size,
                          num_workers = args.num_workers,
                          shuffle = False)

In [15]:
batch = next(iter(train_loader))
print('X shape:', batch[0].shape)
print('y shape:', batch[1].shape)

X shape: torch.Size([64, 5])
y shape: torch.Size([64])


In [16]:
#[[ngram_data.id2w[w] for w in tw] for tw in batch[0].tolist()]

Clase del modelo neuronal:

In [168]:
class NeuralLM(nn.Module):
    def __init__(self, args):
        super(NeuralLM, self).__init__()

        self.window_size = args.N - 1
        self.embedding_size = args.d

        self.emb = nn.Embedding(args.vocab_size, args.d)
        self.fc1 = nn.Linear(args.d * (args.N - 1), args.d_h)
        self.drop1 = nn.Dropout(p = args.dropout)
        self.fc2 = nn.Linear(args.d_h, args.vocab_size, bias=False)

    def forward(self, x):
        x = self.emb(x)
        x = x.view(-1, self.window_size * self.embedding_size)
        h = F.relu(self.fc1(x))
        h = self.drop1(h)
        x = self.fc2(h)
        return x

Funciones para el entrenamiento:

In [19]:
def get_preds(raw_logits):
    probs = F.softmax(raw_logits.detach(), dim=1)
    y_pred = torch.argmax(probs, dim=1).cpu().numpy()
    return y_pred

def model_eval(data, model, gpu=False):
    with torch.no_grad():
        preds, tgts = [], []
        for window_words, labels in data:
            if gpu:
                window_words = window_words.cuda()
            outputs = model(window_words)

            #Predictions
            y_pred = get_preds(outputs)
            tgt = labels.numpy()
            tgts.append(tgt)
            preds.append(y_pred)
    tgts = [e for l in tgts for e in l]
    preds = [e for l in preds for e in l]
    return accuracy_score(tgts, preds)

def save_checkpoint(state, is_best, checkpoint_path, filename='checkpoint.pt'):
    filename = os.path.join(checkpoint_path, filename)
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, os.path.join(checkpoint_path, 'model_best.pt'))

Algunos hiperparámetros que tendrán que ser modificados posteriormente al usar un nuevo embedding

In [21]:
#Model hyperparameters
#Vocabulary size
args.vocab_size = ngram_data.get_vocab_size()
#Word embeddings dimension
args.d = 100
#Hidden layer dimension
args.d_h = 200
#Dropout
args.dropout = 0.1

#Training hyperparameters
args.lr = 2.3e-1
args.num_epochs = 100
args.patience = 20

#Scheduler hyperparameters
args.lr_patience = 10
args.lr_factor = 0.5

#Saving hyperparameters
args.savedir = pth + 'model'
os.makedirs(args.savedir, exist_ok=True)

#Create model
model = NeuralLM(args=args)

#Send to GPU
args.use_gpu = torch.cuda.is_available()
if args.use_gpu:
    model.cuda()

#Loss, Optimizer and Scheduler
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer = optimizer, 
                                                       mode = 'min',
                                                       factor = args.lr_factor,
                                                       patience = args.lr_patience,
                                                       verbose = True)

Etapa de entrenamiento del modelo:

In [30]:
start_time = time.time()
best_metric = 0
metric_history = []
train_metric_history = []

#Training
for epoch in range(args.num_epochs):
    epoch_start_time = time.time()
    loss_epoch = []
    training_metric = []
    model.train()
    for window_words, labels in train_loader:
        if args.use_gpu:
            window_words = window_words.cuda()
            labels = labels.cuda()

        #Forward pass
        outputs = model(window_words)
        loss = criterion(outputs, labels)
        loss_epoch.append(loss.item())

        #Get training metrics
        y_pred = get_preds(outputs)
        tgt = labels.cpu().numpy()
        training_metric.append(accuracy_score(tgt, y_pred))

        #Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    #Metrics in training dataset
    mean_epoch_metric = np.mean(training_metric)
    train_metric_history.append(mean_epoch_metric)

    #Metrics in validation dataset
    model.eval()
    tuning_metric = model_eval(val_loader, model, gpu=args.use_gpu)
    mean_epoch_metric = np.mean(tuning_metric)
    metric_history.append(mean_epoch_metric)

    #Update scheduler
    scheduler.step(tuning_metric)

    #Check metric improvement
    is_improve = tuning_metric > best_metric
    if is_improve:
        best_metric = tuning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1

    #Save best model
    save_checkpoint({'epoch'       : epoch+1,
                     'state_dict'  : model.state_dict(),
                     'optimizer'   : optimizer.state_dict(),
                     'scheduler'   : scheduler.state_dict(),
                     'best_metric' : best_metric}, is_improve, args.savedir)
    
    #Early stoping
    if n_no_improve >= args.patience:
        print('No improvement. Breaking out of loop.')
        break
    print('Train accuracy: ', mean_epoch_metric)
    print('Epoch [{}/{}]: Loss = {:.4f}, Val Acurracy = {:.4f}, Epoch time = {:.2f}'.
          format(epoch+1, args.num_epochs, np.mean(loss_epoch), tuning_metric, (time.time()-epoch_start_time)))
    
print('---------- %s seconds ---------' % time.time()-start_time)

Train accuracy:  0.4386065422287932
Epoch [1/100]: Loss = 1.9773, Val Acurracy = 0.4386, Epoch time = 31.76
Train accuracy:  0.46004435409351324
Epoch [2/100]: Loss = 1.8176, Val Acurracy = 0.4600, Epoch time = 30.50
Train accuracy:  0.4607835889854001
Epoch [3/100]: Loss = 1.7673, Val Acurracy = 0.4608, Epoch time = 30.55
Train accuracy:  0.4825540565514692
Epoch [4/100]: Loss = 1.7373, Val Acurracy = 0.4826, Epoch time = 30.89
Train accuracy:  0.4729624838292367
Epoch [5/100]: Loss = 1.7156, Val Acurracy = 0.4730, Epoch time = 31.02
Train accuracy:  0.4506006283496581
Epoch [6/100]: Loss = 1.7002, Val Acurracy = 0.4506, Epoch time = 30.81
Train accuracy:  0.47444095361301053
Epoch [7/100]: Loss = 1.6873, Val Acurracy = 0.4744, Epoch time = 30.63
Train accuracy:  0.4743300683792275
Epoch [8/100]: Loss = 1.6764, Val Acurracy = 0.4743, Epoch time = 30.30
Train accuracy:  0.4886712252818333
Epoch [9/100]: Loss = 1.6685, Val Acurracy = 0.4887, Epoch time = 32.59
Train accuracy:  0.4830900

KeyboardInterrupt: 

Mejor modelo:

In [32]:
#Model with learned embeddings
best_model = NeuralLM(args)
best_model.load_state_dict(torch.load(pth+'model/model_best.pt')['state_dict'])
best_model.train(False)

NeuralLM(
  (emb): Embedding(344, 100)
  (fc1): Linear(in_features=500, out_features=200, bias=True)
  (drop1): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=200, out_features=344, bias=False)
)

In [33]:
def print_closest_words(embeddings, ngram_data, word, n):
    word_id = torch.LongTensor([ngram_data.w2id[word]])
    word_embed = embeddings(word_id)
    dists = torch.norm(embeddings.weight - word_embed, dim=1).detach()
    lst = sorted(enumerate(dists.numpy()), key=lambda x : x[1])
    for idx, diff in lst[1:n+1]:
        print(ngram_data.id2w[idx], diff)

In [44]:
print('Learned embeddings')
print('¯'*20)
print_closest_words(best_model.emb, ngram_data, 'e', 10)

Learned embeddings
¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
a 3.8728173
o 4.1493793
  4.3050637
s 4.475555
n 4.615959
r 4.6823664
i 4.7688804
l 5.04975
t 5.1100945
<s> 5.1198354


### 1.1. Generamos texto 3 veces con máximo de 300 caracteres.

In [34]:
lenght_max = 300

def parse_text(text, tokenizer):
    all_tokens = [w.lower() if w in ngram_data.w2id else '<unk>' for w in tokenizer(text)]
    token_ids = [ngram_data.w2id[word.lower()] for word in all_tokens]
    return all_tokens, token_ids

def sample_next_word(logits, temperature=1.):
    logits = np.asarray(logits).astype('float64')
    preds = logits/temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probs = np.random.multinomial(1, preds)
    return np.argmax(probs)

def predict_next_token(model, token_ids):
    words_ids_tensor = torch.LongTensor(token_ids).unsqueeze(0)
    y_raw_pred = model(words_ids_tensor).squeeze(0).detach().numpy()
    y_pred = sample_next_word(y_raw_pred, 1.)
    return y_pred

def generate_sentence(model, initial_text, tokenizer):
    all_tokens, window_word_ids = parse_text(initial_text, tokenizer)
    for i in range(lenght_max):
        y_pred = predict_next_token(best_model, window_word_ids)
        next_word = ngram_data.id2w[y_pred]
        all_tokens.append(next_word)
        if next_word == '</s>':
            break
        else:
            window_word_ids.pop(0)
            window_word_ids.append(y_pred)
    if char_level:
        return ''.join(all_tokens)
    else:
        return ' '.join(all_tokens)

In [39]:
initial_tokens = '<s><s'
print('Learned embeddings')
print('¯'*20)
generate_sentence(best_model, initial_tokens, tk)

Learned embeddings
¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯


'<s><s putos perder a oierencanas anuncia un putos<unk> 😊🏻🖕🏻</s>'

In [40]:
initial_tokens = 'estoy'
print('Learned embeddings')
print('¯'*20)
generate_sentence(best_model, initial_tokens, tk)

Learned embeddings
¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯


'estoy que vale verga<unk></s>'

In [41]:
initial_tokens = 'yo op'
print('Learned embeddings')
print('¯'*20)
generate_sentence(best_model, initial_tokens, tk)

Learned embeddings
¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯


'yo opulidado el pubarmos cleto 😈#pasibron wey que esputo es elevar con tantar tienen llegué me dinerente nomar nada ti a habes <unk><unk> vienen como hdp sestrea</s>'

Notar que genera texto bueno, sin embargo, como es a nivel caracter, algunas palabras no tienen un significado sin embargo, el sentido se preserva. Puede que esto mejore al expandir la ventana a más de 6 caracteres.

## 1.2. Escribimos 5 ejemplos de oraciones y medimos su verosimilitud.

In [42]:
def log_likelihood(model, text, ngram_model):
    # Generate n-gram windows from input text and the respective label y
    X, y = ngram_data.transform([text])
    # Discard first two n-gram windows since they contain '<s>' tokens not necessary
    X, y = X[2:], y[2:]
    X = torch.LongTensor(X).unsqueeze(0)

    logits = model(X).detach()
    probs = F.softmax(logits, dim = 1).numpy()

    return np.sum([np.log(probs[i][w]) for i, w in enumerate(y)])

In [59]:
print('Log likelihood:', log_likelihood(best_model, 
                                        'La clase de lenguaje está muy padre', 
                                        ngram_data))

Log likelihood: -58.841915


In [60]:
print('Log likelihood:', log_likelihood(best_model,
                                        'La clase de lenguaje está muy chida',
                                        ngram_data))

Log likelihood: -61.371037


In [61]:
print('Log likelihood:', log_likelihood(best_model,
                                        'La clase de lenguaje está muy guay', 
                                        ngram_data))

Log likelihood: -69.30573


In [62]:
print('Log likelihood:', log_likelihood(best_model,
                                        'La clase de procesamiento del lenguaje está muy padre', 
                                        ngram_data))

Log likelihood: -96.00549


In [63]:
print('Log likelihood:', log_likelihood(best_model,
                                        'La clase de lenguaje está muy madre', 
                                        ngram_data))

Log likelihood: -56.34829


Comparamos que tienen sentido los resultados pues la palabra padre es más usada que chida y mucho más que guay., entre más palabras le pongamos menor será su verosimilitud como en el ejemplo 4. Sin embargo, logra fallar en el último ejemplo cuando decimos que está muy madre. En el español normal no se usa, no obstante, al haber sido entrenado con tuits groseros, se obtiene que es más probable decir que está muy madre. Otro dato curioso, es que al cambiar la palabra «está» por «esta», la verosimilitud disminuye, lo cual podría ser útil para correcciones ortográficas.

## 1.3. Estructuras morfológicas correctas.

In [68]:
if char_level:
    word_list = 'chingada'
    perms = [''.join(perm) for perm in permutations(word_list)]
else:
    word_list = 'sino gano me voy a la chingada'.split(' ')
    perms = [' '.join(perm) for perm in permutations(word_list)]
#print(len(perms))

for p, t in sorted([(log_likelihood(best_model, text, ngram_data), text) for text in perms], reverse=True)[:5]:
    print(p, t)
print('-'*50)
for p, t in sorted([(log_likelihood(best_model, text, ngram_data), text) for text in perms], reverse=True)[-5:]:
    print(p, t)

-5.246503 chingada
-5.246503 chingada
-14.173836 dachinga
-14.173836 dachinga
-15.5406885 dgachina
--------------------------------------------------
-66.48084 acihdnag
-67.964966 aacgdhni
-67.964966 aacgdhni
-68.05988 caagdhni
-68.05988 caagdhni


Notamos que hay dos chingadas debido a que se obtiene una de otra solo intercambiando las a's y similarmente para las demás palabras. Los resultados son los esperados pues la más probable es chingada, luego dachinga que contiene la palabra chinga y despues dgachina que contiene la palabra china. Las menos probables ni se pueden leer.

## 1.4. Perplejidad en validación

In [92]:
def perplexity(model, text, ngram_model):
    # Generate n-gram windows from input text and the respective label y
    X, y = ngram_data.transform([text])
    # Discard first two n-gram windows since they contain '<s>' tokens not necessary
    X, y = X[2:], y[2:]
    X = torch.LongTensor(X).unsqueeze(0)

    logits = model(X).detach()
    probs = F.softmax(logits, dim = 1).numpy()
    
    ans = 1.
    N = len(y)
    print('Validation set dimension:', N)
    probs = [(probs[i][w])**(1/N) for i, w in enumerate(y)]
    for p in probs:
        ans /= p
    return ans

In [91]:
print('Validation set perplexity:', perplexity(best_model,
                                        X_val, 
                                        ngram_data))

Validation set dimension: 615
Validation set perplexity: 1.4799053402082503


## 2. Modelo de lenguaje neuronal inicializado con embedding dado.
Leemos el embedding

In [113]:
pth = ''
emb_txt = pd.read_csv(pth+'word2vec_col.txt',
                        sep='\r\n', engine='python', 
                        header=None).loc[:,0].values.tolist()

In [116]:
emb_d = int(emb_txt[0].split()[1])
emb_N = int(emb_txt[0].split()[0])
emb_txt = emb_txt[1:]

Creamos el diccionario del embedding:

In [195]:
emb_dict = {}
for i in range(emb_N):
    row_list = emb_txt[i].split()
    emb_dict[row_list[0]] = torch.tensor(np.array(row_list[1:]).astype(np.float64))

In [197]:
def embedding(windows):
    emb = []
    for words in windows:
        w_emb = []
        for w in words:
            w_emb.append(emb_dict[ngram_data.id2w(w)])
        emb.append(torch.tensor(w_emb))
    return torch.tensor(emb)

Ahora usaremos nivel de palabras:

In [176]:
char_level = False
args.N = 4

In [177]:
if char_level:
    tk = CharTokenizer
else:
    tk = TweetTokenizer()
    tk = tk.tokenize
    
ngram_data = NgramData(args.N, 5000, tk)
ngram_data.fit(X_train)
print('Vocab Size:', ngram_data.get_vocab_size())

Vocab Size: 5000


In [178]:
X_ngram_train, y_ngram_train = ngram_data.transform(X_train)
X_ngram_val, y_ngram_val = ngram_data.transform(X_val)

In [179]:
#Batch size
args.batch_size = 64
#Number workers
args.num_workers = 2

#Train
train_dataset = TensorDataset(torch.tensor(X_ngram_train, dtype=torch.int64),
                 torch.tensor(y_ngram_train, dtype=torch.int64))
train_loader = DataLoader(train_dataset,
                          batch_size = args.batch_size,
                          num_workers = args.num_workers,
                          shuffle = True)

#Validation
val_dataset = TensorDataset(torch.tensor(X_ngram_val, dtype=torch.int64),
                 torch.tensor(y_ngram_val, dtype=torch.int64))
val_loader = DataLoader(val_dataset,
                          batch_size = args.batch_size,
                          num_workers = args.num_workers,
                          shuffle = False)

In [180]:
batch = next(iter(train_loader))
print('X shape:', batch[0].shape)
print('y shape:', batch[1].shape)

X shape: torch.Size([64, 3])
y shape: torch.Size([64])


Clase del modelo neuronal para el caso en que se preinicialice con un embedding dado:

In [181]:
class NeuralLM_embeding(nn.Module):
    def __init__(self, args, emb=None):
        super(NeuralLM_embeding, self).__init__()

        self.window_size = args.N - 1
        self.embedding_size = args.d

        #self.emb = nn.Embedding(args.vocab_size, args.d)
        self.emb = emb
        self.fc1 = nn.Linear(args.d * (args.N - 1), args.d_h)
        self.drop1 = nn.Dropout(p = args.dropout)
        self.fc2 = nn.Linear(args.d_h, args.vocab_size, bias=False)

    def forward(self, x):
        x = self.emb(x)
        x = x.view(-1, self.window_size * self.embedding_size)
        h = F.relu(self.fc1(x))
        h = self.drop1(h)
        x = self.fc2(h)
        return x

Cambiamos algunos hiperparámetros:

In [182]:
#Model hyperparameters
#Vocabulary size
args.vocab_size = ngram_data.get_vocab_size()
#Word embeddings dimension
args.d = emb_d
#Hidden layer dimension
args.d_h = 200 #Está bien puestp que emb_d es 100
#Dropout
args.dropout = 0.1

#Training hyperparameters
args.lr = 2.3e-1
args.num_epochs = 100
args.patience = 20

#Scheduler hyperparameters
args.lr_patience = 10
args.lr_factor = 0.5

#Saving hyperparameters
args.savedir = pth + 'model_emb'
os.makedirs(args.savedir, exist_ok=True)

#Create model
model_emb = NeuralLM_embeding(args=args, emb=embedding)

#Send to GPU
args.use_gpu = torch.cuda.is_available()
if args.use_gpu:
    model.cuda()

#Loss, Optimizer and Scheduler
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer = optimizer, 
                                                       mode = 'min',
                                                       factor = args.lr_factor,
                                                       patience = args.lr_patience,
                                                       verbose = True)

Etapa de entrenamiento del modelo:

In [192]:
start_time = time.time()
best_metric = 0
metric_history = []
train_metric_history = []

#Training
for epoch in range(args.num_epochs):
    epoch_start_time = time.time()
    loss_epoch = []
    training_metric = []
    model.train()
    for window_words, labels in train_loader:
        if args.use_gpu:
            window_words = window_words.cuda()
            labels = labels.cuda()

        #Forward pass
        outputs = model(window_words)
        loss = criterion(outputs, labels)
        loss_epoch.append(loss.item())

        #Get training metrics
        y_pred = get_preds(outputs)
        tgt = labels.cpu().numpy()
        training_metric.append(accuracy_score(tgt, y_pred))

        #Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    #Metrics in training dataset
    mean_epoch_metric = np.mean(training_metric)
    train_metric_history.append(mean_epoch_metric)

    #Metrics in validation dataset
    model.eval()
    tuning_metric = model_eval(val_loader, model, gpu=args.use_gpu)
    mean_epoch_metric = np.mean(tuning_metric)
    metric_history.append(mean_epoch_metric)

    #Update scheduler
    scheduler.step(tuning_metric)

    #Check metric improvement
    is_improve = tuning_metric > best_metric
    if is_improve:
        best_metric = tuning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1

    #Save best model
    save_checkpoint({'epoch'       : epoch+1,
                     'state_dict'  : model.state_dict(),
                     'optimizer'   : optimizer.state_dict(),
                     'scheduler'   : scheduler.state_dict(),
                     'best_metric' : best_metric}, is_improve, args.savedir)
    
    #Early stoping
    if n_no_improve >= args.patience:
        print('No improvement. Breaking out of loop.')
        break
    print('Train accuracy: ', mean_epoch_metric)
    print('Epoch [{}/{}]: Loss = {:.4f}, Val Acurracy = {:.4f}, Epoch time = {:.2f}'.
          format(epoch+1, args.num_epochs, np.mean(loss_epoch), tuning_metric, (time.time()-epoch_start_time)))
    
print('---------- %s seconds ---------' % time.time()-start_time)

IndexError: index out of range in self

In [198]:
e = nn.Embedding(10,3)
e(torch.tensor([[1,2,3],[1,2,3]]))

tensor([[[ 0.2927, -0.5446,  0.1714],
         [ 0.0929,  0.1798,  0.0761],
         [-1.8047,  0.3559,  1.4479]],

        [[ 0.2927, -0.5446,  0.1714],
         [ 0.0929,  0.1798,  0.0761],
         [-1.8047,  0.3559,  1.4479]]], grad_fn=<EmbeddingBackward>)

In [235]:
embedding(torch.tensor([[1,2,3],[1,2,3]]))

[[tensor([ -1.0961,  -1.0184,  -1.8950,  -1.9984,   4.2174,   6.2988,  -0.3170,
           -1.5824,   2.4694,   2.3756,  -1.1883,  -1.4300,  -0.6092,   0.6152,
            1.1048,   0.8866,   2.0087,  -1.8453,   3.2080,  -0.1376,  -4.1967,
           -1.6238,   3.1863,   0.6211,   1.6738,   1.9099,   1.2532,   3.1336,
            3.2359,  -4.5080,   1.6890,   2.0606,  -0.0722,   2.1841,   6.5726,
           -0.1412,  -1.5676,   0.1572,   3.5281,  -0.0361,   1.6923,   0.3356,
           -1.2028,   5.2517,  -1.0029,   2.4605,  -4.6152,   3.2214,   0.1949,
            1.4327,   1.9920,  -0.3237,  -2.1740,   0.9460,  -3.8526,   3.9340,
            1.6828,   4.4286,  -0.1994,   1.8463,  -2.1925, -10.4011,  -2.1528,
            2.8122,  -3.9068,   0.2875,  -0.2148,   5.8414,   0.1621,  -1.0202,
           -3.9861,  -3.2815,   0.9894,  -0.5335,  -1.1370,   0.7730,   0.8782,
            1.1881,  -0.6197,   6.8253,   3.5462,   2.0978,   0.7641,  -2.7847,
           -0.8488,   2.3285,   1.4681, 

In [234]:
def embedding(windows):
    emb = []
    for words in windows:
        w_emb = []
        for w in words:
            w_emb.append(emb_dict[ngram_data.id2w[w.item()]])
        emb.append(w_emb)
    return emb

In [216]:
a = torch.tensor([1,2,4])

In [224]:
a[0].item()

1