<a href="https://colab.research.google.com/github/AndrewPochapsky/chatbot/blob/master/ChatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import torch
from torch import nn, Tensor
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import re
import numpy as np
from pathlib import Path
from collections import Counter
from ast import literal_eval
from fastai.layers import CrossEntropyFlat
import spacy
import pickle
import random
import math

In [0]:
base_path = Path('drive/My Drive/datasets/cornell movie-dialogs corpus')


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


# Word2Vec


Text Preprocessing

In [0]:
def preprocess(s):
    s = s.replace('\n',' ').lower()
    return s

def tokenize(corpus, vocab = None):
    tokenizer = spacy.blank("en").tokenizer
    doc = tokenizer(corpus)
    tokens = []
    for token in doc:
        if(token.text.strip() != ""):
            if(vocab != None and token.text not in vocab):
                tokens.append('xxxunk')
            else:
                tokens.append(token.text)
    return tokens

def process_dataset():
    all_words = ""
    with open(base_path/'movie_lines.txt', encoding = 'ISO-8859-1') as f:
        for line in f:
            parts = line.split(' +++$+++ ')
            all_words += parts[-1]
    return all_words

def generate_vocab(tokens, min_freq = 0):
    all_unique_words_counter = Counter(tokens)
    vocab = {}
    vocab['xxxpad'] = 0
    vocab['xxxeos'] = 1
    vocab['xxxbos'] = 2
    vocab['xxxunk'] = 3
    index = 4
    for w in all_unique_words_counter.keys():
        if(all_unique_words_counter[w] >= min_freq and w.strip() != ""):
            vocab[w] = index
            index += 1
    return vocab

def replace_with_unk(tokens, vocab):
    for i in range(len(tokens)):
        tokens[i] = tokens[i] if tokens[i] in vocab else 'xxxunk'
        

def subsample(tokens, t = 1e-5):
    """
        Paper: https://arxiv.org/pdf/1310.4546.pdf
    """
    sampled_tokens = []
    counter = Counter(tokens)
    for token in tokens:
        f_w = counter[token]/len(tokens)
        p_w = 1 - math.sqrt(t/f_w)
        val = random.uniform(0, 1)
        if(val >= p_w):
            sampled_tokens.append(token)
            
    return sampled_tokens
        
        
        
        
def create_training_matrices(vocab, all_words, window_size = 5):	
	"""
        Returns x_train: Tensor()
    """
	numTotalWords = len(all_words)
	xTrain=[]
	yTrain=[]
	for i in range(numTotalWords):
		wordsAfter = all_words[i + 1:i + window_size + 1]
		wordsBefore = all_words[max(0, i - window_size):i]
		wordsAdded = wordsAfter + wordsBefore
		for word in wordsAdded:
			xTrain.append(vocab[all_words[i]])
			yTrain.append(vocab[word])
	return Tensor(xTrain), Tensor(yTrain)


In [6]:

full_corpus = process_dataset() 
full_corpus = preprocess(full_corpus)
print('Begin Tokenization')
tokens = tokenize(full_corpus) # list of string
print('Generating vocab')
vocab = generate_vocab(tokens, min_freq = 2)

replace_with_unk(tokens, vocab)

print(len(tokens))
print('Subsampling data')
tokens = subsample(tokens, 1e-5)
print(len(tokens))
print('Getting training data')
x_train, y_train = create_training_matrices(vocab, tokens, window_size = 3)

Begin Tokenization
Generating vocab
4194111
Subsampling data
666195
Getting training data


SkipGram Model

In [0]:
class SkipGramModel(nn.Module):
    def __init__(self, emb_size, emb_dim):
        super(SkipGramModel, self).__init__()
        self.emb_size = emb_size
        self.emb_dim = emb_dim
        self.center_embeddings = nn.Embedding(emb_size, emb_dim, sparse = True)
        self.context_embeddings = nn.Embedding(emb_size, emb_dim, sparse = True)
        self.init_emb()
    
    def init_emb(self):
        #initrange = 0.5 / self.emb_dim
        #self.center_embeddings.weight.data.uniform_(-initrange, initrange)
        nn.init.kaiming_uniform_(self.center_embeddings.weight, a=math.sqrt(5))
        self.context_embeddings.weight.data.uniform_(-0, 0)
        
    def forward(self, pos_center, pos_context, neg_context):
        losses = []
        emb_center = self.center_embeddings(pos_center.long())
        emb_context = self.context_embeddings(pos_context.long())
        score = torch.mul(emb_center, emb_context).squeeze()
        #print(score.shape)
        score = torch.sum(score, dim = 1)
        score = F.logsigmoid(score) # I think it is logsigmoid since we are doing nll loss func?
        losses.append(sum(score))
        
        
        neg_emb_context = self.context_embeddings(neg_context.long())
        #print(neg_emb_context.shape)
        neg_score = torch.bmm(neg_emb_context, emb_center.unsqueeze(2)).squeeze()
        neg_score = torch.sum(neg_score, dim = 1)
        neg_score = F.logsigmoid(-1 * neg_score)
        losses.append(sum(neg_score))
        return -1 * sum(losses)
        
        
        

In [0]:
train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size = 128, shuffle = True, num_workers = 4)

def map_to_index(np_array, vocab):
    output = torch.zeros(np_array.shape)
    for i in range(len(np_array)):
        output[i] = Tensor(list(map(lambda x: vocab[x], np_array[i])))
    return output

In [11]:
lr = 3e-3

num_epochs = 10
neg_sample_size = 5
emb_dim = 100
emb_size = len(vocab.keys())
model = SkipGramModel(emb_size, emb_dim)
model.load_state_dict(torch.load(base_path/'word2vec0.pt'))
optim = torch.optim.SGD(model.parameters(), lr = lr) #cant use mom or wd since that would require calculating for all the params, too expensive
i = 0
for epoch in range(num_epochs):
    total_loss = 0
    i = 0
    for xb, yb in train_dl:
        #neg sampling
        neg_context = np.random.choice(
            tokens,
            size=(len(xb), neg_sample_size)
        )
        
        neg_context = map_to_index(neg_context, vocab)
        #print(type(neg_context))
        optim.zero_grad()
        loss = model(xb, yb, neg_context)
        total_loss += loss
        loss.backward()
        optim.step()
        if(i % 100 == 0):
            print('completed {0}/{1} batches. Avg loss per batch: {2}'.format(i, len(train_dl), total_loss/(i)))
        i+=1
    
    torch.save(model.state_dict(), base_path/'word2vec{0}.pt'.format(epoch))
    
    
torch.save(model.state_dict(), base_path/'word2vec.pt')

completed 0/31234 batches. Avg loss per batch: inf
completed 100/31234 batches. Avg loss per batch: 153.0912322998047
completed 200/31234 batches. Avg loss per batch: 152.4522247314453
completed 300/31234 batches. Avg loss per batch: 152.2268524169922
completed 400/31234 batches. Avg loss per batch: 152.18731689453125
completed 500/31234 batches. Avg loss per batch: 152.10638427734375
completed 600/31234 batches. Avg loss per batch: 152.03564453125
completed 700/31234 batches. Avg loss per batch: 151.99029541015625
completed 800/31234 batches. Avg loss per batch: 151.93702697753906
completed 900/31234 batches. Avg loss per batch: 151.91429138183594
completed 1000/31234 batches. Avg loss per batch: 151.90428161621094
completed 1100/31234 batches. Avg loss per batch: 151.91932678222656
completed 1200/31234 batches. Avg loss per batch: 151.88522338867188
completed 1300/31234 batches. Avg loss per batch: 151.81710815429688
completed 1400/31234 batches. Avg loss per batch: 151.7704772949218

KeyboardInterrupt: ignored

# Seq2Seq

Get the data

In [0]:
line_map = {}
with open(base_path/'movie_lines.txt', encoding = 'ISO-8859-1') as f:
    for line in f:
        parts = line.split(' +++$+++ ')
        line_num = parts[0]
        #-2 to get rid of \n
        text = parts[-1][:-1]
        line_map[line_num] = text

In [8]:
table = []
with open(base_path/'movie_conversations.txt', encoding = 'ISO-8859-1') as f:
    for line in f:
        parts = line.split(' +++$+++ ')
        #get the referenced line numbers
        line_nums = re.findall('L[0-9]+', parts[-1])
        #form pairs
        
        for i in range(len(line_nums) - 1):
            pair = (line_nums[i], line_nums[i+1])
            #df.loc[df['column_name'] == some_value]
            first = line_map[line_nums[i]]
            second = line_map[line_nums[i+1]]
            table.append([tokenize(preprocess(first), vocab = vocab), tokenize(preprocess(second), vocab = vocab)])
        
data_df = pd.DataFrame(table, columns = ['in', 'out'])
data_df.to_csv(base_path/'processed_data.csv', index = False)
data_df.head()

Unnamed: 0,in,out
0,"[can, we, make, this, quick, ?, xxxunk, xxxunk...","[well, ,, i, thought, we, 'd, start, with, xxx..."
1,"[well, ,, i, thought, we, 'd, start, with, xxx...","[not, the, hacking, and, gagging, and, spittin..."
2,"[not, the, hacking, and, gagging, and, spittin...","[okay, ..., then, how, 'bout, we, try, out, so..."
3,"[you, 're, asking, me, out, ., that, 's, so, c...","[forget, it, .]"
4,"[no, ,, no, ,, it, 's, my, fault, --, we, did,...","[cameron, .]"


In [0]:
def converter(x):
    #convert "list" to list
    return literal_eval(x)

converters={'in': converter, 'out': converter}
df = pd.read_csv(base_path/'processed_data.csv', converters = converters)


Build the model

In [9]:
emb_dim = 100
emb_size = len(vocab.keys())
word2vec_model = SkipGramModel(emb_size, emb_dim)
word2vec_model.load_state_dict(torch.load(base_path/'word2vec0.pt'))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [40]:
np.percentile([len(row['in']) for index, row in df.iterrows()], 90)

28.0

In [41]:
np.percentile([len(row['out']) for index, row in df.iterrows()], 90)

29.0

In [0]:
def create_embedding_matrix(word2vec_model, emb_size, emb_dim, padding_idx, eos_idx):
    new_emb = nn.Embedding(emb_size, emb_dim, padding_idx=padding_idx)
    old_emb_weights = word2vec_model.center_embeddings.weight.data
    for i in range(len(old_emb_weights)):
        new_emb.weight.data[i] = old_emb_weights[i]
    
    #init the new embeddings to zero
    new_emb.weight.data[padding_idx].uniform_(-old_emb_weights.mean(), old_emb_weights.mean())
    new_emb.weight.data[eos_idx].uniform_(-old_emb_weights.mean(), old_emb_weights.mean())
    
    return new_emb

In [0]:
enc_emb = create_embedding_matrix(word2vec_model, emb_size, emb_dim, vocab['xxxpad'], vocab['xxxeos'])    
dec_emb = create_embedding_matrix(word2vec_model, emb_size, emb_dim, vocab['xxxpad'], vocab['xxxeos'])    

In [0]:
def get_max_x_and_y(df):
    max_x = -1
    for x in df['in']:
        if(len(x) > max_x):
            max_x = len(x)
            
    max_y = -1
    for y in df['out']:
        if(len(y) > max_y):
            max_y = len(y)
    max_y += 1
    with open(base_path/'max.txt', 'w+') as f:
        f.write(str(max_x) + ',' + str(max_y))
    
    return max_x, max_y

In [13]:
get_max_x_and_y(df)

(370, 683)

In [0]:
max_x = None
max_y = None
with open(base_path/'max.txt') as f:
    for line in f:
        max_x, max_y = line.split(',')
        max_x = int(max_x)
        max_y = int(max_y)

In [0]:
def get_x_y_tensors(df, vocab, max_x = 30, max_y = 30):
    num_rows = len(df.index)
    num_valid_samples = 0
    for index, row in df.iterrows():
        #do < instead of <= so there is room for a token if I choose to add
        if(len(row['in']) < max_x and len(row['out']) < max_y):
            num_valid_samples += 1
    res_x = torch.zeros(num_valid_samples, max_x).long() 
    res_y = torch.zeros(num_valid_samples, max_y).long()
    tensor_idx = 0
    for row_idx in range(num_rows):
        if(row_idx % 10000 == 0): print('done {0}/{1} samples'.format(row_idx, num_rows))
        
        x, y = df.iloc[row_idx, :]
        
        if(len(x) >= max_x or len(y) >= max_y): continue
        
        x_tensor = torch.zeros(max_x) + vocab['xxxpad']
        y_tensor = torch.zeros(max_y) + vocab['xxxpad']

        num_padding = max_x - len(x)
         
        #populate the rest of it with actual input
        token_index = 0;
        for i in range(num_padding, max_x):
            x_tensor[i] = vocab[x[token_index]] 
            token_index += 1
               
            
        #add input to the output
        for i in range(len(y)):
            y_tensor[i] = vocab[y[i]]
            
        #add end of stream token
        y_tensor[len(y)] = vocab['xxxeos']
            
        res_x[tensor_idx] = x_tensor.long()
        res_y[tensor_idx] = y_tensor.long()
        tensor_idx += 1
        
    return res_x, res_y
    

In [0]:
#x and y obtained from get_x_y_tensors()
def split_train_valid(all_x, all_y, valid_pct = 0.2):
    num_x_samples, x_len = all_x.shape
    num_y_samples, y_len = all_y.shape
    
    assert num_x_samples == num_y_samples
    
    x_train = torch.zeros(num_x_samples, x_len)
    y_train = torch.zeros(num_y_samples, y_len)
    
    x_valid = torch.zeros(num_x_samples, x_len)
    y_valid = torch.zeros(num_y_samples, y_len)
    
    train_idx = 0
    valid_idx = 0
    
    for x, y in zip(all_x, all_y):
        rand_num = random.uniform(0, 1)
        if(rand_num >= 0.2):
            x_train[train_idx] = x.squeeze(0)
            y_train[train_idx] = y.squeeze(0)
            train_idx += 1
        else:
            x_valid[valid_idx] = x.squeeze(0)
            y_valid[valid_idx] = y.squeeze(0)
            valid_idx += 1
    
    train_ds = TensorDataset(x_train[:train_idx], y_train[:train_idx])
    valid_ds = TensorDataset(x_valid[:valid_idx], y_valid[:valid_idx])
    
    return train_ds, valid_ds
    
    

In [8]:
x, y = get_x_y_tensors(df, vocab)

NameError: ignored

In [0]:
with open(base_path/'x_tensors.pkl', 'wb') as f:
    pickle.dump(x, f)

with open(base_path/'y_tensors.pkl', 'wb') as f:
    pickle.dump(y, f)

In [0]:
def get_saved_tensors():
    x = None
    y = None
    with open(base_path/'x_tensors.pkl', 'rb') as f:
        x = pickle.load(f)
    with open(base_path/'y_tensors.pkl', 'rb') as f:
        y = pickle.load(f)
    return x, y

In [0]:
x, y = get_saved_tensors()

In [22]:
len(train_ds) + len(valid_ds) == len(x)

True

AttributeError: ignored

In [0]:
class Seq2SeqModel(nn.Module):
    def __init__(self, encoder_emb, decoder_emb, num_hidden, output_length, num_layers, pad_idx):
        super().__init__()
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.pad_idx = pad_idx
        self.output_length = output_length
        
        self.encoder_emb_size = encoder_emb.embedding_dim
        self.decoder_emb_size = decoder_emb.embedding_dim
        self.decoder_vocab_size = decoder_emb.num_embeddings
        
        self.encoder_emb = encoder_emb
        self.encoder_emb_drop = nn.Dropout(0.15)
        self.encoder_gru = nn.GRU(self.encoder_emb_size, self.num_hidden, num_layers = self.num_layers, dropout = 0.25, batch_first = True)
        self.encoder_out = nn.Linear(self.num_hidden, self.decoder_emb_size, bias = False)
        
        self.decoder_emb = decoder_emb
        self.decoder_gru = nn.GRU(self.decoder_emb_size, self.decoder_emb_size, num_layers = self.num_layers, dropout = 0.1, batch_first = True)
        self.out_drop = nn.Dropout(0.35)
        self.out = nn.Linear(self.decoder_emb_size, self.decoder_vocab_size)
        self.out.weight.data = self.decoder_emb.weight.data
        
    def encoder(self, bs, inp):
        h = self.init_hidden(bs)
        emb = self.encoder_emb_drop(self.encoder_emb(inp))
        _, h = self.encoder_gru(emb, h)
        h = self.encoder_out(h)
        return h
    
    def decoder(self, decoder_inp, h):
        emb = self.decoder_emb(decoder_inp).unsqueeze(1)
        #print("decoder emb shape: " + str(emb.shape))
        out_pred, h = self.decoder_gru(emb, h)
        #print("out_pred shape: " + str(out_pred.shape))
        out_pred = self.out(self.out_drop(out_pred[:,0]))
        return h, out_pred
        
    def forward(self, inp, eos_index):
        #print("input shape: " + str(inp.shape))
        bs, seq_len = inp.size()
        h = self.encoder(bs, inp)
        #print("hidden shape: " + str(h.shape))
        dec_inp = inp.new_zeros(bs).long()
        res = []
        for i in range(self.output_length):
            #print('i: ' + str(i))
            h, out_pred = self.decoder(dec_inp, h)
            dec_inp = out_pred.max(1)[1]
            res.append(out_pred)
            if (dec_inp==eos_index).all(): break
        return torch.stack(res, dim = 1)
        
    def init_hidden(self, bs): return next(self.parameters()).new_zeros(self.num_layers, bs, self.num_hidden)
        
        

In [0]:
def seq2seq_loss(out, targ, pad_idx):
    bs,targ_len = targ.size()
    #print("targ size: " + str(targ.size()))
    _,out_len, vs = out.size()
    #print("out size " + str(out.size()))
    if targ_len > out_len: out  = F.pad(out,  (0, 0, 0, targ_len - out_len, 0, 0), value=pad_idx)
    if out_len > targ_len: targ = F.pad(targ, (0, out_len - targ_len, 0, 0), value=pad_idx)
        
    #print("targ size: " + str(targ.shape))
    #print("out size " + str(out.shape) )
    return CrossEntropyFlat()(out.float().cuda(), targ.cuda())

In [73]:
train_ds, valid_ds = split_train_valid(x, y)
train_dl = DataLoader(train_ds, batch_size = 128, shuffle = True, num_workers = 4)
valid_dl = DataLoader(valid_ds, batch_size = 128, shuffle = True, num_workers = 4)
anti_vocab = generate_anti_vocab(vocab)
num_epochs = 100
lr = 3e-3
model = Seq2SeqModel(enc_emb, dec_emb, 128, 30, 2, vocab['xxxpad']).cuda()
optim = torch.optim.Adam(model.parameters(), lr = lr)
torch.cuda.empty_cache()
for epoch in range(num_epochs):
    total_loss = 0
    for xb, yb in train_dl:
       
        pred = model(xb.cuda(), vocab['xxxeos'])
        print(tensor_to_str(pred[0].argmax(1), anti_vocab))
        print(tensor_to_str(yb[0], anti_vocab))
        loss = seq2seq_loss(pred, yb, vocab['xxxpad'])
        total_loss += loss
        loss.backward()
        optim.step()
        optim.zero_grad()
        
        #TODO: do an accuracy check with validation set
    print('Epoch: {0}. Train loss: {1}.'.format(epoch + 1, total_loss/len(train_dl)))
        

['they', 'they', 'disgraceful', 'turnoff', 'voltaire', 'silence', 'recklessness', "where'm", 'yeah', 'taco', 'fairytale', 'señor', 'disgraceful', 'ejaculate', 'i', 'the', 'excuse', 'disgraceful', 'taco', 'señor', 'disgraceful', 'he', 'yeah', 'taco', 'disgraceful', 'disgraceful', 'señor', 'disgraceful', ',', 'señor']
['your', 'choice', '.', 'look', 'at', 'the', 'time', '!', 'come', '.', 'there', "'s", 'someone', 'i', 'want', 'you', 'meet', '-', 'about', 'a', 'story', 'i', "'m", 'thinking', 'of', 'publishing', '.', 'xxxeos', 'xxxpad', 'xxxpad']
['bench', 'hejira', 'fron', 'uh', 'morg', 'cuter', 'you', 'i', 'morg', 'i', ',', 'i', ',', ',', ',', 'wright', 'rallying', ',', 'wright', 'wright', 'i', 'wright', 'the', 'the', ',', 'taco', 'rinaldi', ',', 'i', ',']
['what', 'is', 'that', '?', 'what', 'are', 'you', 'doing', 'with', 'your', 'hands', '?', 'talk', 'to', 'me', ',', 'you', "'re", 'talking', 'like', 'that', 'girl', 'sheila', '.', 'xxxeos', 'xxxpad', 'xxxpad', 'xxxpad', 'xxxpad', 'xxxpad

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/connection.py

KeyboardInterrupt: ignored

BrokenPipeError: [Errno 32] Broken pipe


In [0]:
torch.save(model.state_dict(), base_path/'fake.pt')

In [54]:
model = Seq2SeqModel(enc_emb, dec_emb, 128, 30, 2, vocab['xxxpad']).cuda()
model.load_state_dict(torch.load(base_path/'seq2seq.pt'))
model.eval()

Seq2SeqModel(
  (encoder_emb): Embedding(32653, 100, padding_idx=0)
  (encoder_emb_drop): Dropout(p=0.15)
  (encoder_gru): GRU(100, 128, num_layers=2, batch_first=True, dropout=0.25)
  (encoder_out): Linear(in_features=128, out_features=100, bias=False)
  (decoder_emb): Embedding(32653, 100, padding_idx=0)
  (decoder_gru): GRU(100, 100, num_layers=2, batch_first=True, dropout=0.1)
  (out_drop): Dropout(p=0.35)
  (out): Linear(in_features=100, out_features=32653, bias=True)
)

In [0]:
def generate_anti_vocab(vocab):
    res = dict()
    for key in vocab:
        res[vocab[key]] = key
    return res

def str_to_tensor(inp, vocab):
    tokens = tokenize(preprocess(inp), vocab = vocab)
    res = torch.zeros(len(tokens))
    for i in range(len(tokens)):
        res[i] = vocab[tokens[i]]
    return res.unsqueeze(0).long()
    
def tensor_to_str(inp, anti_vocab):
    res = []
    for i in range(len(inp)):
        res.append(anti_vocab[inp[i].item()])
    return res
    

In [0]:
anti_vocab = generate_anti_vocab(vocab)

In [0]:
t = str_to_tensor("hi there, I am a friend of yours", vocab)

In [55]:
pred = model(t.cuda(), vocab['xxxeos'])

tensor([[ 6.7114,  5.3207, -9.9145,  ..., -9.8984, -4.3586, -6.7311]],
       device='cuda:0', grad_fn=<AddmmBackward>)
tensor([0], device='cuda:0')
tensor([[ 6.7114,  5.3207, -9.9145,  ..., -9.8984, -4.3586, -6.7311]],
       device='cuda:0', grad_fn=<AddmmBackward>)
tensor([0], device='cuda:0')
tensor([[ 6.7114,  5.3207, -9.9145,  ..., -9.8984, -4.3586, -6.7311]],
       device='cuda:0', grad_fn=<AddmmBackward>)
tensor([0], device='cuda:0')
tensor([[ 6.7114,  5.3207, -9.9145,  ..., -9.8984, -4.3586, -6.7311]],
       device='cuda:0', grad_fn=<AddmmBackward>)
tensor([0], device='cuda:0')
tensor([[ 6.7114,  5.3207, -9.9145,  ..., -9.8984, -4.3586, -6.7311]],
       device='cuda:0', grad_fn=<AddmmBackward>)
tensor([0], device='cuda:0')
tensor([[ 6.7114,  5.3207, -9.9145,  ..., -9.8984, -4.3586, -6.7311]],
       device='cuda:0', grad_fn=<AddmmBackward>)
tensor([0], device='cuda:0')
tensor([[ 6.7114,  5.3207, -9.9145,  ..., -9.8984, -4.3586, -6.7311]],
       device='cuda:0', grad_fn=<Ad

In [56]:
pred.shape, t.shape

(torch.Size([1, 30, 32653]), torch.Size([1, 9]))

In [57]:
pred.argmax(2)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]], device='cuda:0')