In [1]:
from raw_program.models import EncoderCNN, DecoderWithAttention

In [2]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import numpy as np


# Pick the compute device once for the whole notebook: the CUDA device when
# torch reports it as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
import sys
from pycocotools.coco import COCO
import nltk
from raw_program.data_loader import get_loader
from torchvision import transforms

import os

# Location passed to get_loader as `cocoapi_loc`.
# Taken from the COCOAPIROOT environment variable when it is set (and non-empty)
# so the notebook can run on other machines without editing this cell; otherwise
# the original author's local path is used unchanged.
COCOAPIROOT = os.environ.get("COCOAPIROOT") or r"D:\学习资料\实验室"

# Minimum word count threshold forwarded to the loader's vocabulary builder.
vocab_threshold = 5

# Number of samples per batch.
batch_size = 10

# Pre-processing pipeline applied to each training image.
transform_train = transforms.Compose([
    transforms.Resize(size=256),                      # shorter image edge becomes 256
    transforms.RandomCrop(size=224),                  # random 224x224 window
    transforms.RandomHorizontalFlip(),                # horizontal mirror with probability 0.5
    transforms.ToTensor(),                            # PIL Image -> tensor
    transforms.Normalize(mean=(0.485, 0.456, 0.406),  # per-channel statistics for the
                         std=(0.229, 0.224, 0.225)),  # pre-trained model
])

# Build the training data loader.
# NOTE(review): with vocab_from_file=False the recorded run rebuilt the vocabulary
# from the captions before tokenising them, which is slow; confirm in get_loader
# whether an existing vocabulary file could be reused instead.
data_loader = get_loader(
    transform=transform_train,
    mode='train',
    batch_size=batch_size,
    vocab_threshold=vocab_threshold,
    vocab_from_file=False,
    cocoapi_loc=COCOAPIROOT,
)

loading annotations into memory...
Done (t=1.13s)
creating index...
index created!
[0/591753] 正在读取captions并根据其分词建立词典...
[100000/591753] 正在读取captions并根据其分词建立词典...
[200000/591753] 正在读取captions并根据其分词建立词典...
[300000/591753] 正在读取captions并根据其分词建立词典...
[400000/591753] 正在读取captions并根据其分词建立词典...
[500000/591753] 正在读取captions并根据其分词建立词典...
初始化vocab.pkl文件成功
loading annotations into memory...
Done (t=1.12s)
creating index...
index created!
正在对caption分词...


100%|████████████████████████████████████████████████████████████████████████| 591753/591753 [01:02<00:00, 9455.18it/s]


In [4]:
import numpy as np
import torch.utils.data as data

# Ask the dataset for one set of training indices and show them.
# NOTE(review): how get_train_indices chooses them is not visible here - see
# raw_program.data_loader.
indices = data_loader.dataset.get_train_indices()
print('sampled indices:', indices)

# Restrict the loader's batch sampler to exactly those indices.
new_sampler = data.SubsetRandomSampler(indices)
data_loader.batch_sampler.sampler = new_sampler

# Pull a single batch out of the loader.
images, captions = next(iter(data_loader))

# Report the batch shapes.
print('images.shape:', images.shape)
print('captions.shape:', captions.shape)

sampled indices: [132219, 405439, 5594, 427264, 303941, 111948, 224063, 249155, 33978, 313804]
images.shape: torch.Size([10, 3, 224, 224])
captions.shape: torch.Size([10, 11])


In [5]:
# Embedding dimensionality. It is not passed to the encoder in this cell; the
# decoder cell reuses it as `embed_dim`.
embed_size = 256

#-#-#-# Do NOT modify the code below this line. #-#-#-#

# Build the encoder with its default arguments and place it on the selected device.
encoder = EncoderCNN()
encoder.to(device)

# The image batch sampled earlier has to live on the same device as the encoder.
images = images.to(device)

# Forward pass: image batch -> encoder features.
features = encoder(images)

print('type(features):', type(features))
print('features.shape:', features.shape)

# Project requirements: the encoder returns a plain tensor whose leading
# dimension equals the batch size.
assert type(features) is torch.Tensor, "Encoder output needs to be a PyTorch Tensor."
assert features.shape[0] == batch_size, "The shape of the encoder output is incorrect."



type(features): <class 'torch.Tensor'>
features.shape: torch.Size([10, 14, 14, 2048])


In [6]:
# Decoder hyper-parameters.
hidden_size = 512        # not passed to the decoder in this cell
attention_dim = 512      # 1st positional argument of DecoderWithAttention
embed_dim = embed_size   # 2nd positional argument; reuses the value from the encoder cell
decoder_dim = 512        # 3rd positional argument

#-#-#-# Do NOT modify the code below this line. #-#-#-#

# Store the size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the decoder.
decoder = DecoderWithAttention(attention_dim, embed_dim, decoder_dim, vocab_size)

# Move the decoder to GPU if CUDA is available.
decoder.to(device)

# Move last batch of captions (from Step 1) to GPU if CUDA is available.
captions = captions.to(device)

# One length entry per caption in the batch. Every row of the 2-D `captions`
# tensor has the same width, so that width is used as the length of each caption.
# - The width is read from captions.shape[1] rather than len(captions[1]), which
#   indexed the *second sample* and therefore failed for a batch of one.
# - dtype is explicit so the result is an integer tensor regardless of how the
#   installed PyTorch infers a dtype from an integer fill value.
# The tensor is left on the CPU, as in the original cell.
caption_lengths = torch.full((captions.shape[0], 1), captions.shape[1], dtype=torch.long)

# Pass the encoder output and captions through the decoder.
outputs, encode_captions, decode_length, alphas, sort_ind = decoder(features, captions, caption_lengths)

print('type(outputs):', type(outputs))
print('outputs.shape:', outputs.shape)

# Check that your decoder satisfies some requirements of the project! :D
assert type(outputs)==torch.Tensor, "Decoder output needs to be a PyTorch Tensor."
# Compare the whole shape at once so that an output of the wrong rank fails with
# the assertion message instead of an IndexError on outputs.shape[2].
expected_shape = (batch_size, captions.shape[1] - 1, vocab_size)
assert tuple(outputs.shape) == expected_shape, "The shape of the decoder output is incorrect."

type(outputs): <class 'torch.Tensor'>
outputs.shape: torch.Size([10, 10, 10321])


In [7]:
# Inspect the shape of the caption batch that was fed to the decoder.
captions.shape

torch.Size([10, 11])

In [8]:
# Rebuild the per-caption length tensor: one row per caption, each holding the
# width of the 2-D `captions` batch. The width comes from captions.shape[1]
# (len(captions[1]) indexed the second sample and failed for a batch of one),
# and dtype is explicit so the result is an integer tensor on any PyTorch version.
caption_lengths = torch.full((captions.shape[0], 1), captions.shape[1], dtype=torch.long)

In [9]:
# Display the per-caption length tensor built in the previous cell.
caption_lengths

tensor([[11],
        [11],
        [11],
        [11],
        [11],
        [11],
        [11],
        [11],
        [11],
        [11]])

In [10]:
# Display the vocabulary size, i.e. len(data_loader.dataset.vocab).
vocab_size

10321

In [11]:
# Width of the caption batch; the decoder shape check expects
# outputs.shape[1] to be one less than this value.
captions.shape[1]

11

In [12]:
# Third value returned by the decoder call; in the recorded run every entry
# equals captions.shape[1] - 1.
decode_length

[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]

In [13]:
# Second value returned by the decoder call.
# NOTE(review): presumably the input captions reordered by `sort_ind` - confirm
# in DecoderWithAttention.forward.
encode_captions

tensor([[   0,    3,   29,  383,   49,    9,  464,  123,    3,   20,    1],
        [   0,    3,  376,  253,    6,  254,   16,    3, 8567,   12,    1],
        [   0,  110,   16,  196, 1000, 2466, 1066, 2429,   16,   48,    1],
        [   0,   61,   30, 3258,  373,    3,  665, 1227,  427,   12,    1],
        [   0,    3, 3292, 3351,  379,   39,   10,  123,    3,   70,    1],
        [   0,    9, 1829,  277,   35,  155,   39,    9,  118,   12,    1],
        [   0,    3, 4876,   21,   35,  195,   49,  234,  924,   12,    1],
        [   0,    9, 5339,  471,  104,  753, 2165,  123,  730,   12,    1],
        [   0,    3,  113, 2618,   39,    9,   33,  265,  300,   12,    1],
        [   0,   34,   67,  263, 1059,  331,  936,  676,  234,  878,    1]],
       device='cuda:0')

In [14]:
# Shape of the decoder output; the decoder cell checks it against
# (batch_size, captions.shape[1] - 1, vocab_size).
outputs.shape

torch.Size([10, 10, 10321])