In [1]:
import torch
import torch.nn as nn
from torchvision import transforms
import sys
import os  # only needed here to read the optional COCOAPIROOT override

# Location handed to get_loader() as `cocoapi_loc`.  The original value is a
# machine-specific absolute path; it is kept as the default, but setting the
# COCOAPIROOT environment variable now overrides it without editing the code.
COCOAPIROOT = os.environ.get("COCOAPIROOT", r"D:\学习资料\实验室")
from pycocotools.coco import COCO
from raw_program.data_loader import get_loader
from raw_program.model import EncoderCNN, DecoderRNN
import math


# Training hyperparameters and bookkeeping settings.

# Number of samples per mini-batch.
batch_size = 128
# Minimum word count threshold used when building the vocabulary.
vocab_threshold = 5
# When True, an existing vocab file is loaded instead of being rebuilt.
vocab_from_file = False
# Dimensionality of the image and word embeddings.
embed_size = 512
# Number of features in the hidden state of the RNN decoder.
hidden_size = 512
# Number of training epochs.
num_epochs = 3
# Model weights are saved every `save_every` epochs.
save_every = 1
# The running status line is committed (newline) every `print_every` steps.
print_every = 100
# File that receives the per-step training loss and perplexity.
log_file = 'training_log.txt'

# Image preprocessing pipeline applied to every training image.
transform_train = transforms.Compose([
    # Resize so that the smaller image edge becomes 256 pixels.
    transforms.Resize(size=256),
    # Take a 224x224 crop from a random location.
    transforms.RandomCrop(size=224),
    # Mirror the image horizontally with probability 0.5.
    transforms.RandomHorizontalFlip(p=0.5),
    # Convert the PIL image to a tensor.
    transforms.ToTensor(),
    # Per-channel normalization for the pre-trained model.
    transforms.Normalize(mean=(0.485, 0.456, 0.406),
                         std=(0.229, 0.224, 0.225)),
])

# Build the training data loader from the settings chosen above.
data_loader = get_loader(
    transform=transform_train,
    mode='train',
    batch_size=batch_size,
    vocab_threshold=vocab_threshold,
    vocab_from_file=vocab_from_file,
    cocoapi_loc=COCOAPIROOT,
)

# Vocabulary size, taken from the vocabulary object attached to the dataset.
vocab_size = len(data_loader.dataset.vocab)

# Instantiate the encoder and the decoder.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Use the GPU when one is available, otherwise fall back to the CPU, and move
# both models there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Loss function.  It is placed on `device` so that the CUDA-availability
# decision is made in exactly one place (the `device` line above) instead of
# being repeated here.
criterion = nn.CrossEntropyLoss().to(device)

# Parameters handed to the optimizer: only the encoder's `embed` sub-module
# plus every parameter of the decoder.
params = list(encoder.embed.parameters()) + list(decoder.parameters())

# Optimizer.
optimizer = torch.optim.Adam(params, lr=0.001)

# Number of training steps per epoch: number of captions divided by the batch
# size, rounded up.
total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size)

loading annotations into memory...
Done (t=1.25s)
creating index...
index created!
[0/591753] 正在读取captions并根据其分词建立词典...
[100000/591753] 正在读取captions并根据其分词建立词典...
[200000/591753] 正在读取captions并根据其分词建立词典...
[300000/591753] 正在读取captions并根据其分词建立词典...
[400000/591753] 正在读取captions并根据其分词建立词典...
[500000/591753] 正在读取captions并根据其分词建立词典...
初始化vocab.pkl文件成功
loading annotations into memory...
Done (t=1.19s)
creating index...
index created!
正在对caption分词...


100%|████████████████████████████████████████████████████████████████████████| 591753/591753 [01:08<00:00, 8632.36it/s]


In [3]:
# Sanity check: the model weights can be written to disk and read back.
import os

# Directory that holds the checkpoints.  torch.save() does not create missing
# directories, so make sure it exists before writing.
model_dir = './models'
os.makedirs(model_dir, exist_ok=True)

decoder_file = 'decoder-0.pkl'
encoder_file = 'encoder-0.pkl'

# Save
torch.save(decoder.state_dict(), os.path.join(model_dir, decoder_file))
torch.save(encoder.state_dict(), os.path.join(model_dir, encoder_file))

# Load (tensors are first read onto the CPU, then copied into the live models)
encoder.load_state_dict(torch.load(os.path.join(model_dir, encoder_file),  map_location='cpu'))
decoder.load_state_dict(torch.load(os.path.join(model_dir, decoder_file),  map_location='cpu'))

<All keys matched successfully>

In [5]:
import torch.utils.data as data
import numpy as np

import requests
import time


# Training loop: runs `num_epochs` epochs of `total_step` steps each, logs the
# loss/perplexity of every step to `log_file`, and saves the encoder/decoder
# weights every `save_every` epochs.

# `os` is needed for the checkpoint paths below; it is imported here so this
# cell does not rely on the optional save/load check cell having been run.
import os

# Select True if training on local desktop. False to train on GPU workspace.
# (Kept for compatibility; the keep-alive code that consulted it was dead,
# commented-out code and has been removed.)
local = False
start_time = time.time()

# torch.save() does not create missing directories.
os.makedirs('./models', exist_ok=True)

# Widest status line printed so far.  In-place ('\r') updates are padded to
# this width so a shorter line fully overwrites a longer previous one instead
# of leaving stale trailing characters on screen.
status_width = 0
# True while the cursor sits at the end of an in-place status line that has
# not been terminated by a newline yet.
status_line_open = False

# The context manager guarantees the log file is closed even when training is
# interrupted or a step raises.
with open(log_file, 'w') as f:
    for epoch in range(1, num_epochs+1):
        for i_step in range(1, total_step+1):
            # Ask the dataset for a fresh set of training indices (per the
            # original note these are sampled according to caption length;
            # see get_train_indices) and install a sampler restricted to them.
            indices = data_loader.dataset.get_train_indices()
            new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
            data_loader.batch_sampler.sampler = new_sampler
            # Fetch one batch using the sampler installed above.
            images, captions = next(iter(data_loader))
            # Move the batch to the training device.
            images = images.to(device)
            captions = captions.to(device)
            # Reset gradients.
            decoder.zero_grad()
            encoder.zero_grad()
            # Forward pass through the encoder and the decoder.
            features = encoder(images)
            outputs = decoder(features, captions)
            # Loss over all positions, flattened to (N, vocab_size) vs (N,).
            loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))
            # Backward pass.
            loss.backward()
            # Parameter update.
            optimizer.step()
            # Read the scalar loss once and reuse it for both statistics.
            loss_value = loss.item()
            stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, num_epochs, i_step, total_step, loss_value, np.exp(loss_value))
            status_width = max(status_width, len(stats))
            padded_stats = stats.ljust(status_width)
            # Update the status line in place.
            print('\r' + padded_stats, end="")
            sys.stdout.flush()
            status_line_open = True
            # Append the (unpadded) statistics to the log file.
            f.write(stats + '\n')
            f.flush()
            # Every `print_every` steps, commit the status line with a newline.
            if i_step % print_every == 0:
                print('\r' + padded_stats)
                status_line_open = False

        # Save the weights.
        if epoch % save_every == 0:
            torch.save(decoder.state_dict(), os.path.join('./models', 'decoder-%d.pkl' % epoch))
            torch.save(encoder.state_dict(), os.path.join('./models', 'encoder-%d.pkl' % epoch))

end_time = time.time()
# Terminate a pending in-place status line so the duration message starts on
# its own line instead of being appended to the last statistics line.
if status_line_open:
    print()
print("本次训练时长：{}".format(end_time-start_time))

Epoch [1/3], Step [100/4624], Loss: 3.6885, Perplexity: 39.98566
Epoch [1/3], Step [200/4624], Loss: 3.4065, Perplexity: 30.16089
Epoch [1/3], Step [300/4624], Loss: 3.9250, Perplexity: 50.6533
Epoch [1/3], Step [400/4624], Loss: 3.0100, Perplexity: 20.2865
Epoch [1/3], Step [500/4624], Loss: 3.0308, Perplexity: 20.7130
Epoch [1/3], Step [600/4624], Loss: 3.0018, Perplexity: 20.1212
Epoch [1/3], Step [700/4624], Loss: 2.8469, Perplexity: 17.2340
Epoch [1/3], Step [800/4624], Loss: 2.7958, Perplexity: 16.3752
Epoch [1/3], Step [900/4624], Loss: 2.6767, Perplexity: 14.5367
Epoch [1/3], Step [1000/4624], Loss: 2.7064, Perplexity: 14.9753
Epoch [1/3], Step [1100/4624], Loss: 2.5397, Perplexity: 12.6756
Epoch [1/3], Step [1200/4624], Loss: 2.4240, Perplexity: 11.2910
Epoch [1/3], Step [1300/4624], Loss: 2.4809, Perplexity: 11.9524
Epoch [1/3], Step [1400/4624], Loss: 2.4782, Perplexity: 11.9195
Epoch [1/3], Step [1500/4624], Loss: 2.5077, Perplexity: 12.2761
Epoch [1/3], Step [1600/4624], L

Epoch [3/3], Step [3500/4624], Loss: 1.9405, Perplexity: 6.96227
Epoch [3/3], Step [3600/4624], Loss: 2.0742, Perplexity: 7.95798
Epoch [3/3], Step [3700/4624], Loss: 1.9367, Perplexity: 6.93571
Epoch [3/3], Step [3800/4624], Loss: 1.8853, Perplexity: 6.58820
Epoch [3/3], Step [3900/4624], Loss: 1.8539, Perplexity: 6.38444
Epoch [3/3], Step [4000/4624], Loss: 1.9714, Perplexity: 7.18109
Epoch [3/3], Step [4100/4624], Loss: 1.9455, Perplexity: 6.99726
Epoch [3/3], Step [4200/4624], Loss: 1.8973, Perplexity: 6.66810
Epoch [3/3], Step [4300/4624], Loss: 1.9660, Perplexity: 7.14173
Epoch [3/3], Step [4400/4624], Loss: 1.9410, Perplexity: 6.96560
Epoch [3/3], Step [4500/4624], Loss: 1.9476, Perplexity: 7.01180
Epoch [3/3], Step [4600/4624], Loss: 2.0703, Perplexity: 7.92712
Epoch [3/3], Step [4624/4624], Loss: 1.9401, Perplexity: 6.95954本次训练时长：137305.35981154442
