In [2]:
import torch 
import torchvision
import os
import operator
import numpy as np
import torch.nn as nn
import torchvision.transforms as transforms
import pandas as pd

from PIL import Image
from collections import Counter
from string import punctuation
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset


In [3]:
def prepare_caption(caption):
    caption = ''.join([char for char in caption if not char in punctuation]).lower()
    return caption
prepare_caption('My name is Nancy.')

'my name is nancy'

In [4]:
transform = transforms.Compose([transforms.Resize((224, 224)),
                                transforms.ToTensor(),
                               transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

In [5]:
class Flickr30k(Dataset):
    
    def __init__(self, root_dir, csv_file, transform=None, topk=5000):
        self.df = pd.read_csv(os.path.join(root_dir, csv_file), delimiter='|')
        self.df.iloc[19999][' comment_number'] = ' 4'
        self.df.iloc[19999][' comment'] = ' A dog runs across the grass .'
        self.captions = {}
        self.vocab = Counter()
        for idx, row in self.df.iterrows():
            img_path = row['image_name']
            caption = prepare_caption(row[' comment'])
            if img_path not in self.captions:
                self.captions.update({img_path: [caption]})
            else:
                self.captions[img_path].append(caption)
                
            for word in caption.split():
                self.vocab[word] += 1
                
        self.transform = transform
        self.root_dir = root_dir
        self.topk = topk
        self.word2index = {word: index for index, (word, count) in enumerate(sorted(self.vocab.items(), key=operator.itemgetter(1), reverse=True)[:topk])}
        self.index2word = {index: word for index, (word, count) in enumerate(sorted(self.vocab.items(), key=operator.itemgetter(1), reverse=True)[:topk])}
    
    def __len__(self):
        return len(self.captions)
    
    def __getitem__(self, x):
        img_name = self.df.iloc[x * 5, 0]
        img = Image.open(os.path.join(self.root_dir, 'flickr30k_images', img_name))
        caption = sorted(self.captions[img_name], key=len)[-1]
        caption_encoded = []
        for word in caption.split():
            if word not in self.word2index:
                caption_encoded.append(self.topk)
            else:
                caption_encoded.append(self.word2index[word])
        
        if self.transform:
            img = self.transform(img)
        return img, caption, caption_encoded

In [6]:
def pad_seq(tensors):
    seq_len = max([tensor.shape[0] for tensor in tensors])
    for i in range(len(tensors)):
        if tensors[i].shape[0] < seq_len:
            tensors[i] = torch.cat([tensors[i], torch.zeros(seq_len - tensors[i].shape[0])], dim=-1)
    return tensors

In [7]:
pad_seq([torch.Tensor([1, 2]), torch.Tensor([1, 3, 5])])

[tensor([1., 2., 0.]), tensor([1., 3., 5.])]

In [8]:
def collate_fn(batch):
    imgs = [example[0] for example in batch]
    captions = [example[1] for example in batch]
    captions_encoded = [torch.Tensor(example[2]) for example in batch]
    
    imgs = torch.stack(imgs, dim=0)
    captions_encoded = pad_seq(captions_encoded)
    captions_encoded = torch.stack(captions_encoded, dim=0)
    
    return imgs, captions, captions_encoded

In [9]:
dataset = Flickr30k('/mnt/c/Users/MAX/Downloads/flickr30k_images', 'results.csv', transform=transform)

In [10]:
data_loader = DataLoader(dataset, batch_size=2, drop_last=True, collate_fn=collate_fn)

In [9]:
torch.zeros((1, 2, 3), device=torch.device('cpu'))

tensor([[[0., 0., 0.],
         [0., 0., 0.]]])

In [11]:
for idx, data in enumerate(data_loader):
    print(idx, data[0], data[2])

0 tensor([[[[-2.0665, -2.0494, -2.0323,  ...,  1.1700,  1.6324,  1.6324],
          [-2.0665, -2.0323, -2.0494,  ...,  0.9988,  0.4851,  1.0844],
          [-2.0152, -2.0494, -2.0323,  ..., -0.9363, -0.9363, -0.3198],
          ...,
          [-0.7822, -0.4568, -0.5082,  ...,  0.2111,  0.2967,  0.1768],
          [-0.2342,  0.3138,  0.2282,  ...,  0.3823,  0.2111,  0.3481],
          [ 0.8104,  0.5364,  0.0741,  ...,  0.4851,  0.5364,  0.3138]],

         [[-1.9307, -1.9307, -1.9307,  ...,  1.9209,  2.2710,  2.3410],
          [-1.9657, -1.9307, -1.9307,  ...,  1.8158,  1.4132,  1.9909],
          [-1.8606, -1.9307, -1.9307,  ...,  0.1001,  0.1527,  0.8704],
          ...,
          [-0.2325, -0.1625, -0.2150,  ...,  1.1155,  1.0805,  0.9930],
          [ 0.2927,  0.8529,  0.9055,  ...,  1.1506,  0.9405,  1.1331],
          [ 1.2731,  1.2031,  1.0980,  ...,  1.0805,  1.2031,  0.9755]],

         [[-1.7173, -1.7173, -1.7173,  ...,  2.2391,  2.5180,  2.6051],
          [-1.7522, -1.7173,

3 tensor([[[[-1.9638, -1.9809, -1.9638,  ..., -1.2445, -1.2788, -1.3130],
          [-1.9809, -1.9809, -1.9638,  ..., -1.2445, -1.2617, -1.2788],
          [-1.9809, -1.9809, -1.9638,  ..., -1.2103, -1.2274, -1.2445],
          ...,
          [-0.7137, -0.6623, -0.6452,  ..., -0.5424, -0.5424, -0.5596],
          [-0.6281, -0.5767, -0.5424,  ..., -0.4397, -0.4226, -0.4054],
          [-0.9020, -0.9192, -0.8849,  ..., -0.4054, -0.5082, -0.5424]],

         [[-1.8782, -1.8957, -1.8782,  ..., -1.1253, -1.1429, -1.1954],
          [-1.8782, -1.8606, -1.8606,  ..., -1.1078, -1.1253, -1.1429],
          [-1.8782, -1.8957, -1.8782,  ..., -1.0728, -1.1253, -1.1429],
          ...,
          [-1.0028, -1.0028, -0.9503,  ..., -0.7577, -0.7752, -0.8277],
          [-0.9853, -0.9678, -0.9153,  ..., -0.6877, -0.6877, -0.7052],
          [-1.1078, -1.1253, -1.0903,  ..., -0.6352, -0.7402, -0.7577]],

         [[-1.6824, -1.6999, -1.6824,  ..., -1.1421, -1.1247, -1.1421],
          [-1.6999, -1.6824,

KeyboardInterrupt: 

In [12]:
class Identity(nn.Module):
    
    def forward(self, x):
        return x

In [13]:
class Encoder(nn.Module):
    
    def __init__(self):
        super(Encoder, self).__init__()
    
        self.model = torchvision.models.resnet18(pretrained=True)
        
    
    def forward(self, x):
        x = self.model.conv1(x)
        x = self.model.bn1(x)
        x = self.model.relu(x)
        x = self.model.maxpool(x)
        
        x = self.model.layer1(x)
        x = self.model.layer2(x)
        x = self.model.layer3(x)
        x = self.model.layer4(x)
        
        return x

In [14]:
class Attention(nn.Module):
    
    def __init__(self, features_dim, hidden_dim):
        super(Attention, self).__init__()
        
        self.linear1 = nn.Linear(features_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)
        
    def forward(self, features, hidden):
        features = self.linear1(features)
        hidden = hidden[0]
        
        hidden = self.linear2(hidden)
        #print(hidden.shape)
        #rint(features.shape)
        
        score = torch.nn.functional.tanh(hidden.permute(1, 0, 2) + features)
        
        attention_weights = torch.nn.functional.softmax(self.linear3(score), dim=1)
        
        attention_vectors = attention_weights * score
        attention_vectors = torch.mean(attention_vectors, dim=1)
        
        return attention_vectors

In [15]:
class Decoder(nn.Module):
    
    def __init__(self, hidden_dim, embedding_dim, num_layers, vocab_len):
        super(Decoder, self).__init__()
        
        self.embeddings = nn.Embedding(vocab_len, embedding_dim)
        self.lstm = nn.LSTM(hidden_dim + embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_len)
        self.attention = Attention(embedding_dim, hidden_dim)
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        
    def init_hidden(self, batch_size):
        if torch.cuda.is_available():
            hidden = (torch.zeros((batch_size, self.num_layers, self.hidden_dim), device=torch.device('cuda')),
                     torch.zeros((batch_size, self.num_layers, self.hidden_dim), device=torch.device('cuda'))
                     )
        else:
            hidden = (torch.zeros((self.num_layers, batch_size, self.hidden_dim), device=torch.device('cpu')),
                     torch.zeros((self.num_layers, batch_size, self.hidden_dim), device=torch.device('cpu'))
                     )
            
        return hidden
    
    def forward(self, x, enc_embed, hidden):
        embeds = self.embeddings(x)
        embeds = torch.unsqueeze(embeds, 1)
        attention_vectors = self.attention(enc_embed, hidden)
        attention_vectors = torch.unsqueeze(attention_vectors, 1)
        #print(attention_vectors.shape)
        x = torch.cat([embeds, attention_vectors], dim=-1)
        x, hidden = self.lstm(x, hidden)
        x = self.linear(x)
        return x, hidden

In [35]:
def loss_with_mask(pred_vec, target_vec, loss_func):
    indices = torch.nonzero(target_vec)
    pred_vec = torch.squeeze(pred_vec[indices])
    target_vec = torch.squeeze(target_vec[indices])
    return loss_func(pred_vec, target_vec)

In [38]:
def train(encoder, decoder, loader):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(decoder.parameters())
    for idx, data in enumerate(loader):
        optimizer.zero_grad()
        
        inputs = data[0].float()
        batch_size = inputs.shape[0]
        
        enc_embeds = encoder(inputs)
        enc_embeds = enc_embeds.view(batch_size, encoder.model.inplanes, -1)
        labels = data[2].long()
        hidden = decoder.init_hidden(batch_size)
        #print(hidden[0].shape)
        outputs = []
        for i in range(labels.shape[1]):
            output, hidden = decoder(labels[:, i], enc_embeds, hidden)
            outputs.append(output)
        outputs = torch.cat(outputs, dim=1)
        outputs = outputs.view(-1, dataset.topk + 1)
        labels = labels.view(-1)
        loss = loss_with_mask(outputs, labels, criterion)
        print(loss)
        loss.backward()
        optimizer.step()
        

In [39]:
encoder = Encoder()
decoder = Decoder(50, 49, 1, dataset.topk + 1)
train(encoder, decoder, data_loader)

tensor(8.5253, grad_fn=<NllLossBackward>)
tensor(8.5019, grad_fn=<NllLossBackward>)
tensor(8.5167, grad_fn=<NllLossBackward>)
tensor(8.5151, grad_fn=<NllLossBackward>)
tensor(8.4851, grad_fn=<NllLossBackward>)
tensor(8.4684, grad_fn=<NllLossBackward>)
tensor(8.5224, grad_fn=<NllLossBackward>)
tensor(8.4725, grad_fn=<NllLossBackward>)
tensor(8.5056, grad_fn=<NllLossBackward>)
tensor(8.5117, grad_fn=<NllLossBackward>)
tensor(8.4515, grad_fn=<NllLossBackward>)
tensor(8.4776, grad_fn=<NllLossBackward>)
tensor(8.3997, grad_fn=<NllLossBackward>)
tensor(8.4261, grad_fn=<NllLossBackward>)
tensor(8.3812, grad_fn=<NllLossBackward>)
tensor(8.4380, grad_fn=<NllLossBackward>)
tensor(8.3851, grad_fn=<NllLossBackward>)
tensor(8.3488, grad_fn=<NllLossBackward>)
tensor(8.3761, grad_fn=<NllLossBackward>)
tensor(8.4050, grad_fn=<NllLossBackward>)
tensor(8.3560, grad_fn=<NllLossBackward>)
tensor(8.3374, grad_fn=<NllLossBackward>)
tensor(8.4507, grad_fn=<NllLossBackward>)
tensor(8.3950, grad_fn=<NllLossBac

tensor(5.7406, grad_fn=<NllLossBackward>)
tensor(5.3980, grad_fn=<NllLossBackward>)
tensor(5.4187, grad_fn=<NllLossBackward>)
tensor(5.3071, grad_fn=<NllLossBackward>)
tensor(5.2472, grad_fn=<NllLossBackward>)
tensor(5.0912, grad_fn=<NllLossBackward>)
tensor(5.7971, grad_fn=<NllLossBackward>)
tensor(5.4197, grad_fn=<NllLossBackward>)
tensor(3.9008, grad_fn=<NllLossBackward>)
tensor(5.8115, grad_fn=<NllLossBackward>)
tensor(6.6119, grad_fn=<NllLossBackward>)
tensor(4.4230, grad_fn=<NllLossBackward>)
tensor(6.0158, grad_fn=<NllLossBackward>)
tensor(5.1391, grad_fn=<NllLossBackward>)
tensor(5.5620, grad_fn=<NllLossBackward>)
tensor(6.0677, grad_fn=<NllLossBackward>)
tensor(5.3539, grad_fn=<NllLossBackward>)
tensor(5.6776, grad_fn=<NllLossBackward>)
tensor(5.4220, grad_fn=<NllLossBackward>)
tensor(4.7071, grad_fn=<NllLossBackward>)
tensor(4.8962, grad_fn=<NllLossBackward>)
tensor(5.5667, grad_fn=<NllLossBackward>)
tensor(4.6754, grad_fn=<NllLossBackward>)
tensor(4.7373, grad_fn=<NllLossBac

tensor(3.5717, grad_fn=<NllLossBackward>)
tensor(4.2215, grad_fn=<NllLossBackward>)
tensor(4.0000, grad_fn=<NllLossBackward>)
tensor(4.8401, grad_fn=<NllLossBackward>)
tensor(5.1259, grad_fn=<NllLossBackward>)
tensor(4.6033, grad_fn=<NllLossBackward>)
tensor(4.2447, grad_fn=<NllLossBackward>)
tensor(3.8893, grad_fn=<NllLossBackward>)
tensor(5.2012, grad_fn=<NllLossBackward>)
tensor(4.2365, grad_fn=<NllLossBackward>)
tensor(4.5247, grad_fn=<NllLossBackward>)
tensor(4.7347, grad_fn=<NllLossBackward>)
tensor(4.8696, grad_fn=<NllLossBackward>)
tensor(4.8847, grad_fn=<NllLossBackward>)
tensor(5.1674, grad_fn=<NllLossBackward>)
tensor(5.6542, grad_fn=<NllLossBackward>)
tensor(4.4528, grad_fn=<NllLossBackward>)
tensor(5.2004, grad_fn=<NllLossBackward>)
tensor(5.9788, grad_fn=<NllLossBackward>)
tensor(4.4032, grad_fn=<NllLossBackward>)
tensor(5.5761, grad_fn=<NllLossBackward>)
tensor(3.5433, grad_fn=<NllLossBackward>)
tensor(4.2962, grad_fn=<NllLossBackward>)
tensor(4.1039, grad_fn=<NllLossBac

tensor(4.6713, grad_fn=<NllLossBackward>)
tensor(3.7500, grad_fn=<NllLossBackward>)
tensor(3.7102, grad_fn=<NllLossBackward>)
tensor(3.4064, grad_fn=<NllLossBackward>)
tensor(4.4385, grad_fn=<NllLossBackward>)
tensor(3.9938, grad_fn=<NllLossBackward>)
tensor(3.0593, grad_fn=<NllLossBackward>)
tensor(4.0963, grad_fn=<NllLossBackward>)
tensor(3.8354, grad_fn=<NllLossBackward>)
tensor(4.5117, grad_fn=<NllLossBackward>)
tensor(4.9329, grad_fn=<NllLossBackward>)
tensor(5.0522, grad_fn=<NllLossBackward>)
tensor(3.6796, grad_fn=<NllLossBackward>)
tensor(3.0592, grad_fn=<NllLossBackward>)
tensor(3.1663, grad_fn=<NllLossBackward>)
tensor(5.1910, grad_fn=<NllLossBackward>)
tensor(5.4728, grad_fn=<NllLossBackward>)
tensor(2.0747, grad_fn=<NllLossBackward>)
tensor(3.7275, grad_fn=<NllLossBackward>)
tensor(3.0516, grad_fn=<NllLossBackward>)
tensor(3.3805, grad_fn=<NllLossBackward>)
tensor(3.8008, grad_fn=<NllLossBackward>)
tensor(4.5291, grad_fn=<NllLossBackward>)
tensor(3.7877, grad_fn=<NllLossBac

tensor(3.4252, grad_fn=<NllLossBackward>)
tensor(2.9462, grad_fn=<NllLossBackward>)
tensor(4.1905, grad_fn=<NllLossBackward>)
tensor(2.6925, grad_fn=<NllLossBackward>)
tensor(3.1748, grad_fn=<NllLossBackward>)
tensor(2.7737, grad_fn=<NllLossBackward>)
tensor(2.7760, grad_fn=<NllLossBackward>)
tensor(3.9567, grad_fn=<NllLossBackward>)
tensor(2.9598, grad_fn=<NllLossBackward>)
tensor(2.3484, grad_fn=<NllLossBackward>)
tensor(3.5625, grad_fn=<NllLossBackward>)
tensor(3.7149, grad_fn=<NllLossBackward>)
tensor(2.3105, grad_fn=<NllLossBackward>)
tensor(2.9545, grad_fn=<NllLossBackward>)
tensor(3.2658, grad_fn=<NllLossBackward>)
tensor(4.2074, grad_fn=<NllLossBackward>)
tensor(3.3370, grad_fn=<NllLossBackward>)
tensor(2.9739, grad_fn=<NllLossBackward>)
tensor(3.0982, grad_fn=<NllLossBackward>)
tensor(4.1362, grad_fn=<NllLossBackward>)
tensor(5.3405, grad_fn=<NllLossBackward>)
tensor(3.8536, grad_fn=<NllLossBackward>)
tensor(2.7042, grad_fn=<NllLossBackward>)
tensor(3.5646, grad_fn=<NllLossBac

tensor(3.1579, grad_fn=<NllLossBackward>)
tensor(3.6062, grad_fn=<NllLossBackward>)
tensor(3.3212, grad_fn=<NllLossBackward>)
tensor(3.4619, grad_fn=<NllLossBackward>)
tensor(2.5628, grad_fn=<NllLossBackward>)
tensor(2.6417, grad_fn=<NllLossBackward>)
tensor(3.0752, grad_fn=<NllLossBackward>)
tensor(3.7124, grad_fn=<NllLossBackward>)
tensor(3.6634, grad_fn=<NllLossBackward>)
tensor(2.5161, grad_fn=<NllLossBackward>)
tensor(2.5721, grad_fn=<NllLossBackward>)
tensor(3.2782, grad_fn=<NllLossBackward>)
tensor(3.1428, grad_fn=<NllLossBackward>)
tensor(3.3855, grad_fn=<NllLossBackward>)
tensor(3.3140, grad_fn=<NllLossBackward>)
tensor(3.5507, grad_fn=<NllLossBackward>)
tensor(2.4115, grad_fn=<NllLossBackward>)
tensor(4.3388, grad_fn=<NllLossBackward>)
tensor(2.7450, grad_fn=<NllLossBackward>)
tensor(3.1600, grad_fn=<NllLossBackward>)
tensor(2.0164, grad_fn=<NllLossBackward>)
tensor(2.9598, grad_fn=<NllLossBackward>)
tensor(4.4596, grad_fn=<NllLossBackward>)
tensor(2.7991, grad_fn=<NllLossBac

tensor(3.1723, grad_fn=<NllLossBackward>)
tensor(2.5016, grad_fn=<NllLossBackward>)
tensor(1.7389, grad_fn=<NllLossBackward>)
tensor(2.1454, grad_fn=<NllLossBackward>)
tensor(2.7720, grad_fn=<NllLossBackward>)
tensor(3.9200, grad_fn=<NllLossBackward>)
tensor(2.8074, grad_fn=<NllLossBackward>)
tensor(2.3191, grad_fn=<NllLossBackward>)
tensor(2.2136, grad_fn=<NllLossBackward>)
tensor(2.9048, grad_fn=<NllLossBackward>)
tensor(2.3309, grad_fn=<NllLossBackward>)
tensor(3.0026, grad_fn=<NllLossBackward>)
tensor(2.7180, grad_fn=<NllLossBackward>)
tensor(3.6717, grad_fn=<NllLossBackward>)
tensor(1.3671, grad_fn=<NllLossBackward>)
tensor(1.6310, grad_fn=<NllLossBackward>)
tensor(3.0149, grad_fn=<NllLossBackward>)
tensor(3.1661, grad_fn=<NllLossBackward>)
tensor(2.0775, grad_fn=<NllLossBackward>)
tensor(2.2622, grad_fn=<NllLossBackward>)
tensor(3.2203, grad_fn=<NllLossBackward>)
tensor(2.0724, grad_fn=<NllLossBackward>)
tensor(2.5609, grad_fn=<NllLossBackward>)
tensor(2.9621, grad_fn=<NllLossBac

tensor(4.0986, grad_fn=<NllLossBackward>)
tensor(3.6394, grad_fn=<NllLossBackward>)
tensor(2.3689, grad_fn=<NllLossBackward>)
tensor(1.8921, grad_fn=<NllLossBackward>)
tensor(2.0530, grad_fn=<NllLossBackward>)
tensor(3.0841, grad_fn=<NllLossBackward>)
tensor(3.2009, grad_fn=<NllLossBackward>)
tensor(3.0721, grad_fn=<NllLossBackward>)
tensor(2.5347, grad_fn=<NllLossBackward>)
tensor(2.1583, grad_fn=<NllLossBackward>)
tensor(2.3694, grad_fn=<NllLossBackward>)
tensor(3.7558, grad_fn=<NllLossBackward>)
tensor(2.1000, grad_fn=<NllLossBackward>)
tensor(2.2027, grad_fn=<NllLossBackward>)
tensor(3.5140, grad_fn=<NllLossBackward>)
tensor(3.4671, grad_fn=<NllLossBackward>)
tensor(2.7547, grad_fn=<NllLossBackward>)
tensor(2.4467, grad_fn=<NllLossBackward>)
tensor(3.2748, grad_fn=<NllLossBackward>)
tensor(1.7792, grad_fn=<NllLossBackward>)
tensor(2.0040, grad_fn=<NllLossBackward>)
tensor(2.2897, grad_fn=<NllLossBackward>)
tensor(1.9517, grad_fn=<NllLossBackward>)
tensor(1.5670, grad_fn=<NllLossBac

tensor(1.9706, grad_fn=<NllLossBackward>)
tensor(1.0990, grad_fn=<NllLossBackward>)
tensor(3.2405, grad_fn=<NllLossBackward>)
tensor(1.7257, grad_fn=<NllLossBackward>)
tensor(3.3527, grad_fn=<NllLossBackward>)
tensor(1.5220, grad_fn=<NllLossBackward>)
tensor(2.3210, grad_fn=<NllLossBackward>)
tensor(1.9606, grad_fn=<NllLossBackward>)
tensor(2.8480, grad_fn=<NllLossBackward>)
tensor(3.5099, grad_fn=<NllLossBackward>)
tensor(2.2303, grad_fn=<NllLossBackward>)
tensor(1.5272, grad_fn=<NllLossBackward>)
tensor(2.3991, grad_fn=<NllLossBackward>)
tensor(2.8371, grad_fn=<NllLossBackward>)
tensor(2.4263, grad_fn=<NllLossBackward>)
tensor(2.5559, grad_fn=<NllLossBackward>)
tensor(1.6477, grad_fn=<NllLossBackward>)
tensor(2.5664, grad_fn=<NllLossBackward>)
tensor(2.1239, grad_fn=<NllLossBackward>)
tensor(2.3040, grad_fn=<NllLossBackward>)
tensor(1.3696, grad_fn=<NllLossBackward>)
tensor(1.8817, grad_fn=<NllLossBackward>)
tensor(2.3318, grad_fn=<NllLossBackward>)
tensor(2.2111, grad_fn=<NllLossBac

tensor(2.5258, grad_fn=<NllLossBackward>)
tensor(2.0939, grad_fn=<NllLossBackward>)
tensor(1.3546, grad_fn=<NllLossBackward>)
tensor(2.2773, grad_fn=<NllLossBackward>)
tensor(1.6076, grad_fn=<NllLossBackward>)
tensor(2.0342, grad_fn=<NllLossBackward>)
tensor(2.5397, grad_fn=<NllLossBackward>)
tensor(2.0330, grad_fn=<NllLossBackward>)
tensor(2.1831, grad_fn=<NllLossBackward>)
tensor(2.6820, grad_fn=<NllLossBackward>)
tensor(1.9975, grad_fn=<NllLossBackward>)
tensor(1.7977, grad_fn=<NllLossBackward>)
tensor(2.0774, grad_fn=<NllLossBackward>)
tensor(2.2092, grad_fn=<NllLossBackward>)
tensor(1.5425, grad_fn=<NllLossBackward>)
tensor(1.2570, grad_fn=<NllLossBackward>)
tensor(1.6625, grad_fn=<NllLossBackward>)
tensor(2.2510, grad_fn=<NllLossBackward>)
tensor(1.8171, grad_fn=<NllLossBackward>)
tensor(1.3150, grad_fn=<NllLossBackward>)
tensor(2.0322, grad_fn=<NllLossBackward>)
tensor(2.0272, grad_fn=<NllLossBackward>)
tensor(2.6408, grad_fn=<NllLossBackward>)
tensor(2.0217, grad_fn=<NllLossBac

tensor(1.4716, grad_fn=<NllLossBackward>)
tensor(2.8398, grad_fn=<NllLossBackward>)
tensor(1.5620, grad_fn=<NllLossBackward>)
tensor(1.6591, grad_fn=<NllLossBackward>)
tensor(2.0011, grad_fn=<NllLossBackward>)
tensor(2.0216, grad_fn=<NllLossBackward>)
tensor(1.5051, grad_fn=<NllLossBackward>)
tensor(0.9599, grad_fn=<NllLossBackward>)
tensor(0.9733, grad_fn=<NllLossBackward>)
tensor(1.0937, grad_fn=<NllLossBackward>)
tensor(1.3530, grad_fn=<NllLossBackward>)
tensor(0.7169, grad_fn=<NllLossBackward>)
tensor(2.2361, grad_fn=<NllLossBackward>)
tensor(2.5534, grad_fn=<NllLossBackward>)
tensor(3.4084, grad_fn=<NllLossBackward>)
tensor(1.4421, grad_fn=<NllLossBackward>)
tensor(2.6219, grad_fn=<NllLossBackward>)
tensor(2.5983, grad_fn=<NllLossBackward>)
tensor(2.0087, grad_fn=<NllLossBackward>)
tensor(1.7585, grad_fn=<NllLossBackward>)
tensor(1.8137, grad_fn=<NllLossBackward>)
tensor(2.2468, grad_fn=<NllLossBackward>)
tensor(2.4250, grad_fn=<NllLossBackward>)
tensor(1.1596, grad_fn=<NllLossBac

tensor(1.4807, grad_fn=<NllLossBackward>)
tensor(2.1457, grad_fn=<NllLossBackward>)
tensor(1.8721, grad_fn=<NllLossBackward>)
tensor(2.3387, grad_fn=<NllLossBackward>)
tensor(1.2211, grad_fn=<NllLossBackward>)
tensor(1.9752, grad_fn=<NllLossBackward>)
tensor(0.9337, grad_fn=<NllLossBackward>)
tensor(2.4635, grad_fn=<NllLossBackward>)
tensor(1.9764, grad_fn=<NllLossBackward>)
tensor(0.7993, grad_fn=<NllLossBackward>)
tensor(2.6692, grad_fn=<NllLossBackward>)
tensor(2.1515, grad_fn=<NllLossBackward>)
tensor(0.9279, grad_fn=<NllLossBackward>)
tensor(1.0001, grad_fn=<NllLossBackward>)
tensor(2.3055, grad_fn=<NllLossBackward>)
tensor(2.3333, grad_fn=<NllLossBackward>)
tensor(2.7727, grad_fn=<NllLossBackward>)
tensor(2.8170, grad_fn=<NllLossBackward>)
tensor(1.2801, grad_fn=<NllLossBackward>)
tensor(1.3045, grad_fn=<NllLossBackward>)
tensor(2.4070, grad_fn=<NllLossBackward>)
tensor(1.7404, grad_fn=<NllLossBackward>)
tensor(3.5007, grad_fn=<NllLossBackward>)
tensor(2.2219, grad_fn=<NllLossBac

tensor(2.0416, grad_fn=<NllLossBackward>)
tensor(2.6427, grad_fn=<NllLossBackward>)
tensor(1.3150, grad_fn=<NllLossBackward>)
tensor(1.3339, grad_fn=<NllLossBackward>)
tensor(2.9264, grad_fn=<NllLossBackward>)
tensor(1.0266, grad_fn=<NllLossBackward>)
tensor(1.9966, grad_fn=<NllLossBackward>)
tensor(1.7240, grad_fn=<NllLossBackward>)
tensor(1.6542, grad_fn=<NllLossBackward>)
tensor(0.8079, grad_fn=<NllLossBackward>)
tensor(1.1095, grad_fn=<NllLossBackward>)
tensor(1.3902, grad_fn=<NllLossBackward>)
tensor(0.8875, grad_fn=<NllLossBackward>)
tensor(1.0873, grad_fn=<NllLossBackward>)
tensor(0.2256, grad_fn=<NllLossBackward>)
tensor(1.7853, grad_fn=<NllLossBackward>)
tensor(2.5464, grad_fn=<NllLossBackward>)
tensor(0.3309, grad_fn=<NllLossBackward>)
tensor(2.3984, grad_fn=<NllLossBackward>)
tensor(1.2320, grad_fn=<NllLossBackward>)
tensor(2.0407, grad_fn=<NllLossBackward>)
tensor(1.3757, grad_fn=<NllLossBackward>)
tensor(1.0714, grad_fn=<NllLossBackward>)
tensor(1.7220, grad_fn=<NllLossBac

tensor(0.9089, grad_fn=<NllLossBackward>)
tensor(0.7758, grad_fn=<NllLossBackward>)
tensor(1.9633, grad_fn=<NllLossBackward>)
tensor(1.2415, grad_fn=<NllLossBackward>)
tensor(2.4009, grad_fn=<NllLossBackward>)
tensor(1.8903, grad_fn=<NllLossBackward>)
tensor(1.3047, grad_fn=<NllLossBackward>)
tensor(2.1737, grad_fn=<NllLossBackward>)
tensor(1.6018, grad_fn=<NllLossBackward>)
tensor(2.0877, grad_fn=<NllLossBackward>)
tensor(1.8077, grad_fn=<NllLossBackward>)
tensor(1.8644, grad_fn=<NllLossBackward>)
tensor(2.7560, grad_fn=<NllLossBackward>)
tensor(1.7090, grad_fn=<NllLossBackward>)
tensor(1.0367, grad_fn=<NllLossBackward>)
tensor(2.3758, grad_fn=<NllLossBackward>)
tensor(0.6482, grad_fn=<NllLossBackward>)
tensor(0.8972, grad_fn=<NllLossBackward>)
tensor(1.7064, grad_fn=<NllLossBackward>)
tensor(1.8309, grad_fn=<NllLossBackward>)
tensor(2.0935, grad_fn=<NllLossBackward>)
tensor(3.1472, grad_fn=<NllLossBackward>)
tensor(3.2088, grad_fn=<NllLossBackward>)
tensor(1.7008, grad_fn=<NllLossBac

tensor(1.0132, grad_fn=<NllLossBackward>)
tensor(0.9673, grad_fn=<NllLossBackward>)
tensor(1.5105, grad_fn=<NllLossBackward>)
tensor(3.0477, grad_fn=<NllLossBackward>)
tensor(1.5149, grad_fn=<NllLossBackward>)
tensor(2.9421, grad_fn=<NllLossBackward>)
tensor(0.7822, grad_fn=<NllLossBackward>)
tensor(1.0710, grad_fn=<NllLossBackward>)
tensor(2.0277, grad_fn=<NllLossBackward>)
tensor(1.7894, grad_fn=<NllLossBackward>)
tensor(3.0552, grad_fn=<NllLossBackward>)
tensor(1.7397, grad_fn=<NllLossBackward>)
tensor(1.8724, grad_fn=<NllLossBackward>)
tensor(1.2655, grad_fn=<NllLossBackward>)
tensor(0.4692, grad_fn=<NllLossBackward>)
tensor(0.9168, grad_fn=<NllLossBackward>)
tensor(0.9908, grad_fn=<NllLossBackward>)
tensor(1.6020, grad_fn=<NllLossBackward>)
tensor(1.4295, grad_fn=<NllLossBackward>)
tensor(1.8089, grad_fn=<NllLossBackward>)
tensor(1.5576, grad_fn=<NllLossBackward>)
tensor(1.0998, grad_fn=<NllLossBackward>)
tensor(0.5722, grad_fn=<NllLossBackward>)
tensor(0.9039, grad_fn=<NllLossBac

tensor(1.6374, grad_fn=<NllLossBackward>)
tensor(1.5361, grad_fn=<NllLossBackward>)
tensor(0.7182, grad_fn=<NllLossBackward>)
tensor(0.9817, grad_fn=<NllLossBackward>)
tensor(1.4558, grad_fn=<NllLossBackward>)
tensor(2.6493, grad_fn=<NllLossBackward>)
tensor(0.8651, grad_fn=<NllLossBackward>)
tensor(0.5490, grad_fn=<NllLossBackward>)
tensor(1.2260, grad_fn=<NllLossBackward>)
tensor(3.6564, grad_fn=<NllLossBackward>)
tensor(1.4628, grad_fn=<NllLossBackward>)
tensor(0.7712, grad_fn=<NllLossBackward>)
tensor(0.4339, grad_fn=<NllLossBackward>)
tensor(1.9072, grad_fn=<NllLossBackward>)
tensor(1.1633, grad_fn=<NllLossBackward>)
tensor(0.8820, grad_fn=<NllLossBackward>)
tensor(1.0730, grad_fn=<NllLossBackward>)
tensor(1.1986, grad_fn=<NllLossBackward>)
tensor(2.0548, grad_fn=<NllLossBackward>)
tensor(0.8526, grad_fn=<NllLossBackward>)
tensor(0.8452, grad_fn=<NllLossBackward>)
tensor(2.9190, grad_fn=<NllLossBackward>)
tensor(2.9875, grad_fn=<NllLossBackward>)
tensor(2.1872, grad_fn=<NllLossBac

tensor(1.0433, grad_fn=<NllLossBackward>)
tensor(0.9256, grad_fn=<NllLossBackward>)
tensor(1.4018, grad_fn=<NllLossBackward>)
tensor(0.4207, grad_fn=<NllLossBackward>)
tensor(1.0733, grad_fn=<NllLossBackward>)
tensor(1.0050, grad_fn=<NllLossBackward>)
tensor(1.4956, grad_fn=<NllLossBackward>)
tensor(0.9575, grad_fn=<NllLossBackward>)
tensor(1.6136, grad_fn=<NllLossBackward>)
tensor(1.6619, grad_fn=<NllLossBackward>)
tensor(0.9288, grad_fn=<NllLossBackward>)
tensor(2.0418, grad_fn=<NllLossBackward>)
tensor(1.9198, grad_fn=<NllLossBackward>)
tensor(1.5554, grad_fn=<NllLossBackward>)
tensor(1.3758, grad_fn=<NllLossBackward>)
tensor(1.1191, grad_fn=<NllLossBackward>)
tensor(1.1862, grad_fn=<NllLossBackward>)
tensor(1.4256, grad_fn=<NllLossBackward>)
tensor(1.4568, grad_fn=<NllLossBackward>)
tensor(1.0142, grad_fn=<NllLossBackward>)
tensor(1.5214, grad_fn=<NllLossBackward>)
tensor(2.6692, grad_fn=<NllLossBackward>)
tensor(0.8372, grad_fn=<NllLossBackward>)
tensor(1.3947, grad_fn=<NllLossBac

tensor(1.5858, grad_fn=<NllLossBackward>)
tensor(1.9379, grad_fn=<NllLossBackward>)
tensor(0.8924, grad_fn=<NllLossBackward>)
tensor(1.1747, grad_fn=<NllLossBackward>)
tensor(0.9612, grad_fn=<NllLossBackward>)
tensor(1.8529, grad_fn=<NllLossBackward>)
tensor(1.2112, grad_fn=<NllLossBackward>)
tensor(2.3669, grad_fn=<NllLossBackward>)
tensor(1.8146, grad_fn=<NllLossBackward>)
tensor(1.3174, grad_fn=<NllLossBackward>)
tensor(2.1992, grad_fn=<NllLossBackward>)
tensor(0.8453, grad_fn=<NllLossBackward>)
tensor(1.1426, grad_fn=<NllLossBackward>)
tensor(0.7008, grad_fn=<NllLossBackward>)
tensor(1.0096, grad_fn=<NllLossBackward>)
tensor(1.3738, grad_fn=<NllLossBackward>)
tensor(0.4137, grad_fn=<NllLossBackward>)
tensor(1.8733, grad_fn=<NllLossBackward>)
tensor(1.7442, grad_fn=<NllLossBackward>)
tensor(0.9648, grad_fn=<NllLossBackward>)
tensor(0.9754, grad_fn=<NllLossBackward>)
tensor(1.1956, grad_fn=<NllLossBackward>)
tensor(1.1059, grad_fn=<NllLossBackward>)
tensor(1.4673, grad_fn=<NllLossBac

tensor(0.7182, grad_fn=<NllLossBackward>)
tensor(1.1246, grad_fn=<NllLossBackward>)
tensor(0.9079, grad_fn=<NllLossBackward>)
tensor(1.5537, grad_fn=<NllLossBackward>)
tensor(1.5854, grad_fn=<NllLossBackward>)
tensor(0.5438, grad_fn=<NllLossBackward>)
tensor(0.5836, grad_fn=<NllLossBackward>)
tensor(2.4767, grad_fn=<NllLossBackward>)
tensor(0.3825, grad_fn=<NllLossBackward>)
tensor(1.5519, grad_fn=<NllLossBackward>)
tensor(2.0066, grad_fn=<NllLossBackward>)
tensor(1.2820, grad_fn=<NllLossBackward>)
tensor(0.6941, grad_fn=<NllLossBackward>)
tensor(1.6471, grad_fn=<NllLossBackward>)
tensor(2.2626, grad_fn=<NllLossBackward>)
tensor(0.5198, grad_fn=<NllLossBackward>)
tensor(1.6516, grad_fn=<NllLossBackward>)
tensor(1.7376, grad_fn=<NllLossBackward>)
tensor(1.2301, grad_fn=<NllLossBackward>)
tensor(1.2014, grad_fn=<NllLossBackward>)
tensor(0.9359, grad_fn=<NllLossBackward>)
tensor(1.2002, grad_fn=<NllLossBackward>)
tensor(1.0046, grad_fn=<NllLossBackward>)
tensor(1.8913, grad_fn=<NllLossBac

tensor(1.4670, grad_fn=<NllLossBackward>)
tensor(1.2548, grad_fn=<NllLossBackward>)
tensor(1.2577, grad_fn=<NllLossBackward>)
tensor(1.7700, grad_fn=<NllLossBackward>)
tensor(0.8446, grad_fn=<NllLossBackward>)
tensor(1.2552, grad_fn=<NllLossBackward>)
tensor(0.3501, grad_fn=<NllLossBackward>)
tensor(1.8341, grad_fn=<NllLossBackward>)
tensor(1.0929, grad_fn=<NllLossBackward>)
tensor(0.7003, grad_fn=<NllLossBackward>)
tensor(0.5828, grad_fn=<NllLossBackward>)
tensor(1.2089, grad_fn=<NllLossBackward>)
tensor(0.8033, grad_fn=<NllLossBackward>)
tensor(1.9971, grad_fn=<NllLossBackward>)
tensor(0.6962, grad_fn=<NllLossBackward>)
tensor(1.0255, grad_fn=<NllLossBackward>)
tensor(2.7336, grad_fn=<NllLossBackward>)
tensor(1.4363, grad_fn=<NllLossBackward>)
tensor(0.9962, grad_fn=<NllLossBackward>)
tensor(0.7118, grad_fn=<NllLossBackward>)
tensor(1.1488, grad_fn=<NllLossBackward>)
tensor(0.7383, grad_fn=<NllLossBackward>)
tensor(1.1381, grad_fn=<NllLossBackward>)
tensor(0.6568, grad_fn=<NllLossBac

tensor(0.8816, grad_fn=<NllLossBackward>)
tensor(0.9274, grad_fn=<NllLossBackward>)
tensor(0.3510, grad_fn=<NllLossBackward>)
tensor(1.9694, grad_fn=<NllLossBackward>)
tensor(0.9795, grad_fn=<NllLossBackward>)
tensor(0.7929, grad_fn=<NllLossBackward>)
tensor(0.6969, grad_fn=<NllLossBackward>)
tensor(1.0919, grad_fn=<NllLossBackward>)
tensor(1.6444, grad_fn=<NllLossBackward>)
tensor(1.8821, grad_fn=<NllLossBackward>)
tensor(0.4904, grad_fn=<NllLossBackward>)
tensor(0.7498, grad_fn=<NllLossBackward>)
tensor(0.6506, grad_fn=<NllLossBackward>)
tensor(1.0351, grad_fn=<NllLossBackward>)
tensor(0.4478, grad_fn=<NllLossBackward>)
tensor(1.3971, grad_fn=<NllLossBackward>)
tensor(0.6252, grad_fn=<NllLossBackward>)
tensor(1.1736, grad_fn=<NllLossBackward>)
tensor(0.8097, grad_fn=<NllLossBackward>)
tensor(0.4449, grad_fn=<NllLossBackward>)
tensor(0.0402, grad_fn=<NllLossBackward>)
tensor(0.8865, grad_fn=<NllLossBackward>)
tensor(1.6614, grad_fn=<NllLossBackward>)
tensor(1.0088, grad_fn=<NllLossBac

tensor(0.5638, grad_fn=<NllLossBackward>)
tensor(0.2805, grad_fn=<NllLossBackward>)
tensor(0.7595, grad_fn=<NllLossBackward>)
tensor(0.6070, grad_fn=<NllLossBackward>)
tensor(0.4201, grad_fn=<NllLossBackward>)
tensor(0.3520, grad_fn=<NllLossBackward>)
tensor(0.9566, grad_fn=<NllLossBackward>)
tensor(1.5850, grad_fn=<NllLossBackward>)
tensor(2.7901, grad_fn=<NllLossBackward>)
tensor(0.7412, grad_fn=<NllLossBackward>)
tensor(0.7221, grad_fn=<NllLossBackward>)
tensor(0.5632, grad_fn=<NllLossBackward>)
tensor(0.6744, grad_fn=<NllLossBackward>)
tensor(0.8179, grad_fn=<NllLossBackward>)
tensor(1.3373, grad_fn=<NllLossBackward>)
tensor(0.6257, grad_fn=<NllLossBackward>)
tensor(0.6912, grad_fn=<NllLossBackward>)
tensor(0.8710, grad_fn=<NllLossBackward>)
tensor(1.6136, grad_fn=<NllLossBackward>)
tensor(1.3766, grad_fn=<NllLossBackward>)
tensor(2.6070, grad_fn=<NllLossBackward>)
tensor(0.8424, grad_fn=<NllLossBackward>)
tensor(1.3318, grad_fn=<NllLossBackward>)
tensor(0.8122, grad_fn=<NllLossBac

tensor(1.6415, grad_fn=<NllLossBackward>)
tensor(1.3292, grad_fn=<NllLossBackward>)
tensor(0.9559, grad_fn=<NllLossBackward>)
tensor(0.7356, grad_fn=<NllLossBackward>)
tensor(0.6735, grad_fn=<NllLossBackward>)
tensor(0.6435, grad_fn=<NllLossBackward>)
tensor(0.9764, grad_fn=<NllLossBackward>)
tensor(0.2532, grad_fn=<NllLossBackward>)
tensor(0.6536, grad_fn=<NllLossBackward>)
tensor(1.0427, grad_fn=<NllLossBackward>)
tensor(0.0518, grad_fn=<NllLossBackward>)
tensor(0.5350, grad_fn=<NllLossBackward>)
tensor(0.4186, grad_fn=<NllLossBackward>)
tensor(0.6885, grad_fn=<NllLossBackward>)
tensor(0.1184, grad_fn=<NllLossBackward>)
tensor(1.7208, grad_fn=<NllLossBackward>)
tensor(0.2936, grad_fn=<NllLossBackward>)
tensor(0.5093, grad_fn=<NllLossBackward>)
tensor(0.8935, grad_fn=<NllLossBackward>)
tensor(0.9852, grad_fn=<NllLossBackward>)
tensor(0.7536, grad_fn=<NllLossBackward>)
tensor(0.6567, grad_fn=<NllLossBackward>)
tensor(0.6973, grad_fn=<NllLossBackward>)
tensor(0.9264, grad_fn=<NllLossBac

tensor(1.6328, grad_fn=<NllLossBackward>)
tensor(0.1965, grad_fn=<NllLossBackward>)
tensor(0.3624, grad_fn=<NllLossBackward>)
tensor(0.7889, grad_fn=<NllLossBackward>)
tensor(0.6982, grad_fn=<NllLossBackward>)
tensor(0.3836, grad_fn=<NllLossBackward>)
tensor(1.4383, grad_fn=<NllLossBackward>)
tensor(0.7143, grad_fn=<NllLossBackward>)
tensor(1.5027, grad_fn=<NllLossBackward>)
tensor(0.8794, grad_fn=<NllLossBackward>)
tensor(0.3885, grad_fn=<NllLossBackward>)
tensor(0.5468, grad_fn=<NllLossBackward>)
tensor(1.0176, grad_fn=<NllLossBackward>)
tensor(0.5287, grad_fn=<NllLossBackward>)
tensor(0.0983, grad_fn=<NllLossBackward>)
tensor(1.1176, grad_fn=<NllLossBackward>)
tensor(0.8183, grad_fn=<NllLossBackward>)
tensor(0.8609, grad_fn=<NllLossBackward>)
tensor(0.7017, grad_fn=<NllLossBackward>)
tensor(0.5525, grad_fn=<NllLossBackward>)
tensor(0.8755, grad_fn=<NllLossBackward>)
tensor(1.8193, grad_fn=<NllLossBackward>)
tensor(0.3085, grad_fn=<NllLossBackward>)
tensor(0.3968, grad_fn=<NllLossBac

tensor(0.5813, grad_fn=<NllLossBackward>)
tensor(0.6210, grad_fn=<NllLossBackward>)
tensor(0.7622, grad_fn=<NllLossBackward>)
tensor(0.4857, grad_fn=<NllLossBackward>)
tensor(0.3058, grad_fn=<NllLossBackward>)
tensor(1.8431, grad_fn=<NllLossBackward>)
tensor(1.3229, grad_fn=<NllLossBackward>)
tensor(0.4120, grad_fn=<NllLossBackward>)
tensor(0.7641, grad_fn=<NllLossBackward>)
tensor(0.5114, grad_fn=<NllLossBackward>)
tensor(1.0858, grad_fn=<NllLossBackward>)
tensor(0.5440, grad_fn=<NllLossBackward>)
tensor(0.2678, grad_fn=<NllLossBackward>)
tensor(1.4370, grad_fn=<NllLossBackward>)
tensor(1.0033, grad_fn=<NllLossBackward>)
tensor(0.4251, grad_fn=<NllLossBackward>)
tensor(0.5535, grad_fn=<NllLossBackward>)
tensor(0.1782, grad_fn=<NllLossBackward>)
tensor(0.1166, grad_fn=<NllLossBackward>)
tensor(0.2737, grad_fn=<NllLossBackward>)
tensor(1.2231, grad_fn=<NllLossBackward>)
tensor(0.8563, grad_fn=<NllLossBackward>)
tensor(0.3981, grad_fn=<NllLossBackward>)
tensor(0.8361, grad_fn=<NllLossBac

tensor(0.9388, grad_fn=<NllLossBackward>)
tensor(0.0750, grad_fn=<NllLossBackward>)
tensor(1.2281, grad_fn=<NllLossBackward>)
tensor(1.1275, grad_fn=<NllLossBackward>)
tensor(0.7632, grad_fn=<NllLossBackward>)
tensor(0.6954, grad_fn=<NllLossBackward>)
tensor(0.6957, grad_fn=<NllLossBackward>)
tensor(1.2688, grad_fn=<NllLossBackward>)
tensor(1.4063, grad_fn=<NllLossBackward>)
tensor(1.0642, grad_fn=<NllLossBackward>)
tensor(0.4863, grad_fn=<NllLossBackward>)
tensor(0.5974, grad_fn=<NllLossBackward>)
tensor(0.2079, grad_fn=<NllLossBackward>)
tensor(1.5355, grad_fn=<NllLossBackward>)
tensor(0.8567, grad_fn=<NllLossBackward>)
tensor(1.3388, grad_fn=<NllLossBackward>)
tensor(1.5526, grad_fn=<NllLossBackward>)
tensor(0.2936, grad_fn=<NllLossBackward>)
tensor(1.2026, grad_fn=<NllLossBackward>)
tensor(1.1386, grad_fn=<NllLossBackward>)
tensor(0.5594, grad_fn=<NllLossBackward>)
tensor(0.8708, grad_fn=<NllLossBackward>)
tensor(0.3841, grad_fn=<NllLossBackward>)
tensor(0.5235, grad_fn=<NllLossBac

tensor(0.0570, grad_fn=<NllLossBackward>)
tensor(0.3701, grad_fn=<NllLossBackward>)
tensor(1.3711, grad_fn=<NllLossBackward>)
tensor(0.7609, grad_fn=<NllLossBackward>)
tensor(0.8663, grad_fn=<NllLossBackward>)
tensor(0.6004, grad_fn=<NllLossBackward>)
tensor(0.1903, grad_fn=<NllLossBackward>)
tensor(0.6136, grad_fn=<NllLossBackward>)
tensor(0.5679, grad_fn=<NllLossBackward>)
tensor(1.0014, grad_fn=<NllLossBackward>)
tensor(0.4849, grad_fn=<NllLossBackward>)
tensor(0.2003, grad_fn=<NllLossBackward>)
tensor(0.5962, grad_fn=<NllLossBackward>)
tensor(0.6831, grad_fn=<NllLossBackward>)
tensor(0.8259, grad_fn=<NllLossBackward>)
tensor(0.7287, grad_fn=<NllLossBackward>)
tensor(0.7282, grad_fn=<NllLossBackward>)
tensor(0.5833, grad_fn=<NllLossBackward>)
tensor(0.1412, grad_fn=<NllLossBackward>)
tensor(1.0206, grad_fn=<NllLossBackward>)
tensor(0.2556, grad_fn=<NllLossBackward>)
tensor(0.7259, grad_fn=<NllLossBackward>)
tensor(0.4601, grad_fn=<NllLossBackward>)
tensor(2.1270, grad_fn=<NllLossBac

tensor(0.5029, grad_fn=<NllLossBackward>)
tensor(0.5405, grad_fn=<NllLossBackward>)
tensor(0.3701, grad_fn=<NllLossBackward>)
tensor(1.6478, grad_fn=<NllLossBackward>)
tensor(0.9442, grad_fn=<NllLossBackward>)
tensor(0.0496, grad_fn=<NllLossBackward>)
tensor(0.6288, grad_fn=<NllLossBackward>)
tensor(0.4861, grad_fn=<NllLossBackward>)
tensor(0.3729, grad_fn=<NllLossBackward>)
tensor(0.2031, grad_fn=<NllLossBackward>)
tensor(0.4459, grad_fn=<NllLossBackward>)
tensor(1.2611, grad_fn=<NllLossBackward>)
tensor(1.8908, grad_fn=<NllLossBackward>)
tensor(1.0032, grad_fn=<NllLossBackward>)
tensor(1.0166, grad_fn=<NllLossBackward>)
tensor(0.2999, grad_fn=<NllLossBackward>)
tensor(0.0803, grad_fn=<NllLossBackward>)
tensor(0.6875, grad_fn=<NllLossBackward>)
tensor(0.9959, grad_fn=<NllLossBackward>)
tensor(0.4154, grad_fn=<NllLossBackward>)
tensor(0.8972, grad_fn=<NllLossBackward>)
tensor(0.2211, grad_fn=<NllLossBackward>)
tensor(0.1529, grad_fn=<NllLossBackward>)
tensor(0.3067, grad_fn=<NllLossBac

tensor(0.3744, grad_fn=<NllLossBackward>)
tensor(0.4872, grad_fn=<NllLossBackward>)
tensor(0.9263, grad_fn=<NllLossBackward>)
tensor(0.3476, grad_fn=<NllLossBackward>)
tensor(1.3148, grad_fn=<NllLossBackward>)
tensor(0.8318, grad_fn=<NllLossBackward>)
tensor(0.6435, grad_fn=<NllLossBackward>)
tensor(0.0388, grad_fn=<NllLossBackward>)
tensor(0.2311, grad_fn=<NllLossBackward>)
tensor(0.0589, grad_fn=<NllLossBackward>)
tensor(0.1963, grad_fn=<NllLossBackward>)
tensor(0.1989, grad_fn=<NllLossBackward>)
tensor(0.4878, grad_fn=<NllLossBackward>)
tensor(0.9773, grad_fn=<NllLossBackward>)
tensor(0.5719, grad_fn=<NllLossBackward>)
tensor(0.4182, grad_fn=<NllLossBackward>)
tensor(0.5276, grad_fn=<NllLossBackward>)
tensor(1.4960, grad_fn=<NllLossBackward>)
tensor(0.6044, grad_fn=<NllLossBackward>)
tensor(0.8019, grad_fn=<NllLossBackward>)
tensor(0.5182, grad_fn=<NllLossBackward>)
tensor(1.2773, grad_fn=<NllLossBackward>)
tensor(0.3054, grad_fn=<NllLossBackward>)
tensor(0.4110, grad_fn=<NllLossBac

tensor(0.8577, grad_fn=<NllLossBackward>)
tensor(0.2220, grad_fn=<NllLossBackward>)
tensor(0.9915, grad_fn=<NllLossBackward>)
tensor(0.9038, grad_fn=<NllLossBackward>)
tensor(0.3148, grad_fn=<NllLossBackward>)
tensor(1.3135, grad_fn=<NllLossBackward>)
tensor(1.1377, grad_fn=<NllLossBackward>)
tensor(0.6153, grad_fn=<NllLossBackward>)
tensor(1.1646, grad_fn=<NllLossBackward>)
tensor(0.2487, grad_fn=<NllLossBackward>)
tensor(0.9003, grad_fn=<NllLossBackward>)
tensor(0.3917, grad_fn=<NllLossBackward>)
tensor(0.7440, grad_fn=<NllLossBackward>)
tensor(0.2964, grad_fn=<NllLossBackward>)
tensor(1.0382, grad_fn=<NllLossBackward>)
tensor(0.6667, grad_fn=<NllLossBackward>)
tensor(0.5027, grad_fn=<NllLossBackward>)
tensor(1.3291, grad_fn=<NllLossBackward>)
tensor(0.4470, grad_fn=<NllLossBackward>)
tensor(1.3830, grad_fn=<NllLossBackward>)
tensor(0.4145, grad_fn=<NllLossBackward>)
tensor(0.6825, grad_fn=<NllLossBackward>)
tensor(2.0712, grad_fn=<NllLossBackward>)
tensor(0.0647, grad_fn=<NllLossBac

tensor(0.2986, grad_fn=<NllLossBackward>)
tensor(0.2294, grad_fn=<NllLossBackward>)
tensor(0.1868, grad_fn=<NllLossBackward>)
tensor(0.2148, grad_fn=<NllLossBackward>)
tensor(1.9249, grad_fn=<NllLossBackward>)
tensor(1.3864, grad_fn=<NllLossBackward>)
tensor(0.7316, grad_fn=<NllLossBackward>)
tensor(1.2608, grad_fn=<NllLossBackward>)
tensor(0.7066, grad_fn=<NllLossBackward>)
tensor(0.8004, grad_fn=<NllLossBackward>)
tensor(0.4794, grad_fn=<NllLossBackward>)
tensor(0.5576, grad_fn=<NllLossBackward>)
tensor(0.5736, grad_fn=<NllLossBackward>)
tensor(0.5921, grad_fn=<NllLossBackward>)
tensor(0.3806, grad_fn=<NllLossBackward>)
tensor(1.3655, grad_fn=<NllLossBackward>)
tensor(0.2781, grad_fn=<NllLossBackward>)
tensor(0.1359, grad_fn=<NllLossBackward>)
tensor(0.7203, grad_fn=<NllLossBackward>)
tensor(0.3004, grad_fn=<NllLossBackward>)
tensor(1.3298, grad_fn=<NllLossBackward>)
tensor(1.6932, grad_fn=<NllLossBackward>)
tensor(0.6000, grad_fn=<NllLossBackward>)
tensor(1.0707, grad_fn=<NllLossBac

tensor(1.1528, grad_fn=<NllLossBackward>)
tensor(0.0221, grad_fn=<NllLossBackward>)
tensor(0.5557, grad_fn=<NllLossBackward>)
tensor(0.7142, grad_fn=<NllLossBackward>)
tensor(0.9478, grad_fn=<NllLossBackward>)
tensor(0.3609, grad_fn=<NllLossBackward>)
tensor(0.4092, grad_fn=<NllLossBackward>)
tensor(0.2840, grad_fn=<NllLossBackward>)
tensor(0.2831, grad_fn=<NllLossBackward>)
tensor(0.2816, grad_fn=<NllLossBackward>)
tensor(0.6431, grad_fn=<NllLossBackward>)
tensor(0.2654, grad_fn=<NllLossBackward>)
tensor(0.5091, grad_fn=<NllLossBackward>)
tensor(0.8283, grad_fn=<NllLossBackward>)
tensor(1.4805, grad_fn=<NllLossBackward>)
tensor(0.5840, grad_fn=<NllLossBackward>)
tensor(0.2735, grad_fn=<NllLossBackward>)
tensor(0.3810, grad_fn=<NllLossBackward>)
tensor(0.9881, grad_fn=<NllLossBackward>)
tensor(0.3626, grad_fn=<NllLossBackward>)
tensor(0.4316, grad_fn=<NllLossBackward>)
tensor(0.0112, grad_fn=<NllLossBackward>)
tensor(0.6154, grad_fn=<NllLossBackward>)
tensor(0.5743, grad_fn=<NllLossBac

tensor(0.4425, grad_fn=<NllLossBackward>)
tensor(0.8578, grad_fn=<NllLossBackward>)
tensor(0.2610, grad_fn=<NllLossBackward>)
tensor(0.5250, grad_fn=<NllLossBackward>)
tensor(0.1803, grad_fn=<NllLossBackward>)
tensor(0.4045, grad_fn=<NllLossBackward>)
tensor(0.2637, grad_fn=<NllLossBackward>)
tensor(0.9219, grad_fn=<NllLossBackward>)
tensor(1.4683, grad_fn=<NllLossBackward>)
tensor(0.8259, grad_fn=<NllLossBackward>)
tensor(0.6936, grad_fn=<NllLossBackward>)
tensor(0.5689, grad_fn=<NllLossBackward>)
tensor(0.0554, grad_fn=<NllLossBackward>)
tensor(0.3573, grad_fn=<NllLossBackward>)
tensor(0.0256, grad_fn=<NllLossBackward>)
tensor(0.2565, grad_fn=<NllLossBackward>)
tensor(0.5594, grad_fn=<NllLossBackward>)
tensor(0.3640, grad_fn=<NllLossBackward>)
tensor(0.3225, grad_fn=<NllLossBackward>)
tensor(0.3280, grad_fn=<NllLossBackward>)
tensor(2.0253, grad_fn=<NllLossBackward>)
tensor(0.0461, grad_fn=<NllLossBackward>)
tensor(0.0107, grad_fn=<NllLossBackward>)
tensor(0.2635, grad_fn=<NllLossBac

KeyboardInterrupt: 

In [56]:
batch_size = 1
seq_len = 10
embedding_dim = 49
hidden_dim = 10
num_layers = 1
vocab_len = 29
model = Decoder(hidden_dim, embedding_dim, num_layers, vocab_len)
attention = Attention(embedding_dim, hidden_dim)
encoder_output = torch.rand(batch_size, 512, 49)
#inputs from embeddings [batch_size, seq_len, embedding_dim + hidden_dim]
input_str = torch.rand(batch_size, seq_len, embedding_dim)
hidden = torch.rand(2, num_layers, batch_size, hidden_dim)

for i in range(input_str.shape[1]):
    
    attention_vector = attention(encoder_output, hidden)
    lstm_input = torch.unsqueeze(torch.cat([input_str[:, i, :], attention_vector], dim=-1), 1)
    output, hidden = model(lstm_input, hidden)
    print(hidden)
    

(tensor([[[-0.1206,  0.0362, -0.2240,  0.2395, -0.0155, -0.0090,  0.0532,
           0.1678,  0.1748,  0.0265]]], grad_fn=<ViewBackward>), tensor([[[-0.4015,  0.0666, -0.2668,  0.4081, -0.0563, -0.0465,  0.1942,
           0.5321,  1.1125,  0.0434]]], grad_fn=<ViewBackward>))
(tensor([[[-0.1229, -0.2185, -0.4701,  0.4513, -0.1097, -0.0544,  0.0641,
           0.0982,  0.2417, -0.0161]]], grad_fn=<ViewBackward>), tensor([[[-0.3627, -0.7442, -0.5807,  0.9857, -0.2122, -0.3104,  0.3643,
           0.7520,  0.9708, -0.0301]]], grad_fn=<ViewBackward>))
(tensor([[[-0.1978, -0.4159, -0.3750,  0.3909, -0.0475, -0.1323,  0.1368,
           0.1838,  0.1406,  0.0098]]], grad_fn=<ViewBackward>), tensor([[[-0.4194, -1.0675, -0.6118,  1.0504, -0.1487, -0.6193,  0.3856,
           0.7525,  0.7458,  0.0186]]], grad_fn=<ViewBackward>))
(tensor([[[-0.1297, -0.4165, -0.4629,  0.4339, -0.0909, -0.0684,  0.1993,
           0.1639,  0.0761, -0.0210]]], grad_fn=<ViewBackward>), tensor([[[-0.3372, -1.0228, -0

In [27]:
print(x.shape)
print(hidden)

torch.Size([1, 10, 29])
(tensor([[[ 0.1567, -0.3082,  0.2041, -0.2354,  0.0321, -0.0569,  0.0033,
          -0.0245,  0.1133,  0.1696]]], grad_fn=<ViewBackward>), tensor([[[ 0.4709, -0.5611,  0.3368, -0.3590,  0.0493, -0.1002,  0.0086,
          -0.0744,  0.4667,  0.2902]]], grad_fn=<ViewBackward>))


In [40]:
def factorial(num):
    array = [1, 1]
    if num == 0 or num == 1:
        return 1
    for i in range(2, num + 1):
        array.append(array[-1] * i)
    return array[-1]

In [50]:
string = str(factorial(255))
i = len(string) - 1
while string[i] == '0':
    i -= 1
print(string)

3350850684932979117652665123754814942022584063591740702576779884286208799035732771005626138126763314259280802118502282445926550135522251856727692533193070412811083330325659322041700029792166250734253390513754466045711240338462701034020262992581378423147276636643647155396305352541105541439434840109915068285430675068591638581980604162940383356586739198268782104924614076605793562865241982176207428620969776803149467431386807972438247689158656000000000000000000000000000000000000000000000000000000000000000


In [51]:
print(len("000000000000000000000000000000000000000000000000000000000000000"))

63


In [55]:
import math
def is_prime(num):
    for i in range(2, math.ceil(num**(1/2)) + 1):
        if num % i == 0 and num != i:
            return False
    return True

In [56]:
is_prime(2)

True

In [57]:
counter = 0
for i in range(2, 100000):
    if is_prime(i):
        counter += 1
        print("#: ", counter, "number: ", i)
    if counter == 238:
        break

#:  1 number:  2
#:  2 number:  3
#:  3 number:  5
#:  4 number:  7
#:  5 number:  11
#:  6 number:  13
#:  7 number:  17
#:  8 number:  19
#:  9 number:  23
#:  10 number:  29
#:  11 number:  31
#:  12 number:  37
#:  13 number:  41
#:  14 number:  43
#:  15 number:  47
#:  16 number:  53
#:  17 number:  59
#:  18 number:  61
#:  19 number:  67
#:  20 number:  71
#:  21 number:  73
#:  22 number:  79
#:  23 number:  83
#:  24 number:  89
#:  25 number:  97
#:  26 number:  101
#:  27 number:  103
#:  28 number:  107
#:  29 number:  109
#:  30 number:  113
#:  31 number:  127
#:  32 number:  131
#:  33 number:  137
#:  34 number:  139
#:  35 number:  149
#:  36 number:  151
#:  37 number:  157
#:  38 number:  163
#:  39 number:  167
#:  40 number:  173
#:  41 number:  179
#:  42 number:  181
#:  43 number:  191
#:  44 number:  193
#:  45 number:  197
#:  46 number:  199
#:  47 number:  211
#:  48 number:  223
#:  49 number:  227
#:  50 number:  229
#:  51 number:  233
#:  52 number:  23

In [62]:
array = list(range(1, 2009))

In [63]:
while len(array) != 1:
    array = array[1::2]
    print(array)

[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 200, 202, 204, 206, 208, 210, 212, 214, 216, 218, 220, 222, 224, 226, 228, 230, 232, 234, 236, 238, 240, 242, 244, 246, 248, 250, 252, 254, 256, 258, 260, 262, 264, 266, 268, 270, 272, 274, 276, 278, 280, 282, 284, 286, 288, 290, 292, 294, 296, 298, 300, 302, 304, 306, 308, 310, 312, 314, 316, 318, 320, 322, 324, 326, 328, 330, 332, 334, 336, 338, 340, 342, 344, 346, 348, 350, 352, 354, 356, 358, 360, 362, 364, 366, 368, 370, 372, 374, 376, 378, 380, 382, 384, 386, 388, 390, 392, 394, 396, 398, 400, 402, 404, 406, 408, 410, 412, 414, 416, 418, 420, 42

In [64]:
array

[1024]