In [1]:
from cgi import test
import os
from collections import Counter
import numpy as np
import json
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image



  from cgi import test


In [2]:
# 图像路径和文本描述文件的路径
image_path = "./data/deepfashion-multimodal/images/"
train_text = "./data/deepfashion-multimodal/train_captions.json"
test_text = "./data/deepfashion-multimodal/test_captions.json"

# 定义一个用于深度学习的数据集类
class MyDataset(Dataset):
    # 构造函数：初始化数据集
    def __init__(self, image_paths, train_captions, test_captions, transform=None):
        self.image_paths = image_paths  # 图像文件路径
        self.train_captions = train_captions  # 训练集的文本描述
        self.test_captions = test_captions  # 测试集的文本描述
        self.transform = transform  # 图像的转换方法

    # 返回数据集的长度，即图像的数量
    def __len__(self):
        return len(self.image_paths)

    # 根据索引获取数据集中的一个项
    def __getitem__(self, idx):
        idx_name = self.image_paths[idx]  # 获取对应索引的图像路径
        file_name = idx_name.split("/")[-1]  # 从路径中提取文件名
        image = Image.open(idx_name).convert("RGB")  # 打开图像并转换为RGB格式
        if self.transform:  # 如果提供了转换方法，则应用转换
            image = self.transform(image)
        # print(file_name)
        return image, self.train_captions[file_name]  # 返回图像及其对应的训练描述
    
    # 根据索取返回文件名
    def get_file_name(self, idx):
        idx_name = self.image_paths[idx]  # 获取对应索引的图像路径
        file_name = idx_name.split("/")[-1]  # 从路径中提取文件名
        return file_name
    
    # 获取训练集的文本描述
    def get_train_captions(self):
        return self.train_captions
    
# 定义一个用于深度学习的数据集类
class MyDatasetForTest(Dataset):
    # 构造函数：初始化数据集
    def __init__(self, image_paths, train_captions, test_captions, transform=None):
        self.image_paths = image_paths  # 图像文件路径
        self.train_captions = train_captions  # 训练集的文本描述
        self.test_captions = test_captions  # 测试集的文本描述
        self.transform = transform  # 图像的转换方法

    # 返回数据集的长度，即图像的数量
    def __len__(self):
        return len(self.image_paths)

    # 根据索引获取数据集中的一个项
    def __getitem__(self, idx):
        idx_name = self.image_paths[idx]  # 获取对应索引的图像路径
        file_name = idx_name.split("/")[-1]  # 从路径中提取文件名
        image = Image.open(idx_name).convert("RGB")  # 打开图像并转换为RGB格式
        if self.transform:  # 如果提供了转换方法，则应用转换
            image = self.transform(image)
        # print(file_name)
        return image, self.test_captions[file_name]  # 返回图像及其对应的训练描述
    
    # 根据索取返回文件名
    def get_file_name(self, idx):
        idx_name = self.image_paths[idx]  # 获取对应索引的图像路径
        file_name = idx_name.split("/")[-1]  # 从路径中提取文件名
        return file_name
    
    # 获取训练集的文本描述
    def get_train_captions(self):
        return self.test_captions


In [3]:
# 读取训练集和测试集的文本描述
train_captions = json.load(open(train_text, 'r'))  # 从JSON文件加载训练集的文本描述
test_captions = json.load(open(test_text, 'r'))    # 从JSON文件加载测试集的文本描述

image_paths = []  # 初始化用于存储图像路径的列表

# 遍历训练集的文本描述，并构建图像的完整路径列表
for key in train_captions.keys():
    image_paths.append(image_path + key)  # 将图像的基本路径和图像文件名结合，添加到列表中
    
image_path_test = []
# 遍历测试集的文本描述，并构建图像的完整路径列表
for key in test_captions.keys():
    image_path_test.append(image_path + key)  # 将图像的基本路径和图像文件名结合，添加到列表中


In [4]:
# 定义一个图像转换流程，包括调整尺寸、转换为张量、标准化
transform = transforms.Compose(
    [
        transforms.Resize((256, 256)),  # 首先，调整图像尺寸为256x256像素
        transforms.ToTensor(),  # 接着，将图像数据转换为张量（Tensor）格式
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # 最后，对图像进行标准化处理
    ]
)


In [5]:
# 获取图像描述的键和值
image_dic = train_captions.keys()
image_descriptions = train_captions.values()

# 构建词汇表
vocab = Counter()
for description in image_descriptions:
    vocab.update(description.split())  # 更新词汇表的计数

# 设置出现次数阈值，移除低频词汇（这里阈值为-1，意味着不实际移除任何词汇）
threshold = -1
words = [word for word, count in vocab.items() if count >= threshold]

# 为每个单词创建索引映射
idx_to_word = {idx: word for idx, word in enumerate(words, 1)}

# 添加特殊标记到词汇表
idx_to_word[0] = "<pad>"  # 填充标记
idx_to_word[len(idx_to_word)] = "<end>"  # 结束标记
idx_to_word[len(idx_to_word)] = "<start>"  # 开始标记

# 为描述添加开始和结束标记
for key, description in train_captions.items():
    train_captions[key] = "<start> " + description + " <end>"

# 使用 <pad> 填充文本至最大长度
max_length = max(len(description.split()) for description in image_descriptions)
for key, description in train_captions.items():
    train_captions[key] += " <pad>" * (max_length - len(description.split()))
for key, description in test_captions.items():
    test_captions[key] += " <pad>" * (max_length - len(description.split()))

# 创建从单词到索引的映射
word_to_idx = {word: idx for idx, word in idx_to_word.items()}
vocab_size_len = len(idx_to_word)

# 将每个单词转换为其索引
train_captions = {
    key: [word_to_idx[word] for word in value.split()]
    for key, value in train_captions.items()
}
test_captions = {
    key: [word_to_idx[word] for word in value.split()]
    for key, value in test_captions.items()
}

# 打印训练集中前五个描述的键值对和索引
for key, value, idx in zip(train_captions.keys(), train_captions.values(), range(5)):
    print(key, value, idx)
    if idx == 4:
        break


WOMEN-Jackets_Coats-id_00005611-01_4_full.jpg [159, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 13, 14, 15, 16, 17, 1, 18, 3, 16, 14, 5, 19, 1, 8, 16, 20, 9, 15, 4, 10, 11, 12, 21, 22, 23, 24, 25, 26, 27, 28, 7, 8, 9, 29, 12, 21, 30, 16, 31, 32, 33, 34, 35, 36, 21, 30, 4, 37, 158, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 0
WOMEN-Tees_Tanks-id_00005033-03_4_full.jpg [159, 38, 39, 40, 4, 41, 6, 42, 8, 9, 43, 12, 44, 4, 32, 45, 46, 1, 47, 24, 32, 5, 48, 1, 49, 50, 28, 20, 8, 9, 10, 11, 12, 1, 22, 24, 32, 51, 158, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 1
WOMEN-Rompers_Jumpsuits-id_00000245-01_1_front.jpg [159, 38, 39, 52, 4, 41, 6, 7, 8, 9, 10, 11, 12, 44, 4, 32, 53, 46, 21, 54, 24, 32, 5, 55, 1, 56, 50, 28, 7, 8, 9, 10, 11, 12, 57, 16, 32, 33, 34, 35, 36, 1, 22, 24, 32, 58, 57, 16, 25, 59, 34, 35, 60, 158, 0, 

In [6]:
# 将训练集的文本描述转换为PyTorch张量
for key, value in train_captions.items():
    train_captions[key] = torch.tensor(value, dtype=torch.long)

# 创建一个数据集实例
dataset = MyDataset(image_paths, train_captions, test_captions, transform)
dataset_test = MyDatasetForTest(image_path_test, train_captions, test_captions, transform)

# 遍历数据集的第一个元素，打印图像和文本描述的形状和内容
for i in range(1):
    # 打印图像和文本描述的张量形状
    print(dataset[i][0].shape, dataset[i][1].shape)
    # 打印图像和文本描述的内容
    print(dataset[i][0], dataset[i][1])


torch.Size([3, 256, 256]) torch.Size([95])
tensor([[[2.2318, 2.2318, 2.2318,  ..., 2.1804, 2.1804, 2.1804],
         [2.2318, 2.2147, 2.2318,  ..., 2.1804, 2.1804, 2.1804],
         [2.2318, 2.2318, 2.2318,  ..., 2.1804, 2.1804, 2.1804],
         ...,
         [2.1804, 2.1633, 2.1633,  ..., 2.0777, 2.0777, 2.0777],
         [2.1804, 2.1633, 2.1633,  ..., 2.0948, 2.0777, 2.0777],
         [2.1804, 2.1804, 2.1633,  ..., 2.0948, 2.0948, 2.0777]],

        [[2.4111, 2.4111, 2.4111,  ..., 2.3585, 2.3585, 2.3585],
         [2.4111, 2.3936, 2.4111,  ..., 2.3585, 2.3585, 2.3585],
         [2.4111, 2.4111, 2.4111,  ..., 2.3585, 2.3585, 2.3585],
         ...,
         [2.3235, 2.3410, 2.3410,  ..., 2.2185, 2.2185, 2.2185],
         [2.3235, 2.3410, 2.3410,  ..., 2.2360, 2.2185, 2.2185],
         [2.3235, 2.3410, 2.3410,  ..., 2.2360, 2.2360, 2.2185]],

        [[2.6226, 2.6226, 2.6226,  ..., 2.5703, 2.5703, 2.5703],
         [2.6226, 2.6051, 2.6226,  ..., 2.5703, 2.5703, 2.5703],
         [2.622

In [7]:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn import Transformer

# 使用预训练的CNN来提取图像特征
class FeatureExtractorCNN(nn.Module):
    def __init__(self):
        super(FeatureExtractorCNN, self).__init__()
        # 载入预训练的ResNet-18模型
        self.resnet = models.resnet18(pretrained=True)
        # 移除最后一个层，以获取特征
        self.resnet = nn.Sequential(*list(self.resnet.children())[:-1])

    def forward(self, images):
        # 调整图像尺寸并重塑
        batch_size = images.shape[0]
        images = nn.functional.interpolate(
            images, scale_factor=3, mode="bilinear", align_corners=False
        )
        images = images.unfold(2, 256, 256).unfold(3, 256, 256)
        images = images.contiguous().view(-1, 3, 256, 256)

        # 通过ResNet提取特征
        features = self.resnet(images)
        features = features.view(batch_size, 9, -1)
        return features

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F


# 首先定义注意力机制层
class Attention(nn.Module):
    def __init__(self, feature_dim, decoder_dim, attention_dim):
        super(Attention, self).__init__()
        self.attention = nn.Linear(feature_dim + decoder_dim, attention_dim)
        self.v = nn.Linear(attention_dim, 1, bias=False)

    def forward(self, features, hidden_state):
        # 组合来自上一解码状态的特征和隐藏状态
        hidden_state = hidden_state.unsqueeze(1).repeat(1, features.size(1), 1)
        combined = torch.cat((features, hidden_state), 2)
        attention_scores = self.attention(combined)
        attention_scores = self.v(torch.tanh(attention_scores)).squeeze(2)
        alpha = F.softmax(attention_scores, dim=1)
        context = (features * alpha.unsqueeze(2)).sum(dim=1)
        return context, alpha

# 定义解码器RNN
class DecoderRNN(nn.Module):
    def __init__(self, attention_dim, embed_dim, decoder_dim, vocab_size):
        super(DecoderRNN, self).__init__()
        self.decoder_dim = decoder_dim
        self.vocab_size = vocab_size
        self.attention = Attention(feature_dim=512, decoder_dim=decoder_dim, attention_dim=attention_dim)
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.LSTMCell(embed_dim + 512, decoder_dim, bias=True)
        self.fc = nn.Linear(decoder_dim, vocab_size)
        self.init_weights()

    def init_weights(self):
        self.embedding.weight.data.uniform_(-0.1, 0.1)
        self.fc.bias.data.fill_(0)
        self.fc.weight.data.uniform_(-0.1, 0.1)

    def forward(self, features, captions):
        embeddings = self.embedding(captions)
        h, c = self.init_hidden_state(features)  # 初始化隐藏状态和细胞状态
        seq_length = len(captions[0])
        batch_size = features.size(0)
        preds = torch.zeros(batch_size, seq_length, self.vocab_size).to(features.device)
        
        for t in range(seq_length):
            context, alpha = self.attention(features, h)
            rnn_input = torch.cat((embeddings[:, t, :], context), 1)
            h, c = self.rnn(rnn_input, (h, c))
            out = self.fc(h)
            preds[:, t, :] = out
        
        return preds

    def init_hidden_state(self, features):
        mean_features = features.mean(dim=1)
        h = torch.zeros(size=(features.size(0), self.decoder_dim)).to(features.device)
        c = torch.zeros(size=(features.size(0), self.decoder_dim)).to(features.device)
        return h, c



In [9]:
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, num_workers=0)
# check dataset

for i, (image, caption) in enumerate(dataloader):
    #打印图像和文本描述的张量形状
    # print(image.shape, caption.shape)
    #打印一个文本描述的内容
    # print(caption[0])
    if i == 0:
        break

# torch.Size([16, 3, 256, 256]) torch.Size([16, 95])
# tensor([159,   1,   2,   3,   4,  83,   6,   7,   8,   9,  66,  11,  12,  44,
#           4,  32,  92,  46,   1,  18,   3,  16,  14,  65,  19,   1,   8,  16,
#          20,   9,  15,   4,  10,  11,  12,  57,  16,  32,  33,  34,  35,  36,
#         158,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
#           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
#           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
#           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim

# 实例化模型
cnn = FeatureExtractorCNN()
cnn = nn.DataParallel(cnn)

attention_dim = 256
embed_dim = 256
decoder_dim = 512
vocab_size = vocab_size_len  

decoder = DecoderRNN(attention_dim, embed_dim, decoder_dim, vocab_size)
decoder =nn.DataParallel(decoder)



# 训练参数
epoch = 5
lr = 0.001
# todo:创建优化器
optimizer = optim.Adam(list(decoder.parameters()), lr=lr)



# check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

decoder.to(device)
cnn = cnn.to(device)




cuda


In [12]:

# 损失函数，忽略填充索引
pad_index = 0
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

#训练循环
for e in range(epoch):
    for i, (image, caption) in enumerate(dataloader):
        # 将图像和描述移至计算设备
        image = image.to(device)
        caption = caption.to(device)

        # 重置梯度
        optimizer.zero_grad()
        
        # 提取图像特征
        features = cnn(image)
        # src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(features, caption[:, :-1])
        
        # 打印图像特征的形状
        # print(features.shape)
        # 模型前向传播
        outputs = decoder(features, caption[:, :-1])
        
        # #打印输出的形状
        # print("outputs",outputs.shape)
        # print(outputs)
        # #打印caption的形状
        # print("caption:",caption.shape)
        # print(caption)
        
        loss = criterion(outputs.reshape(-1, 160), caption[:, 1:].reshape(-1))

        # 反向传播和优化
        loss.backward()  # 计算梯度
        optimizer.step()  # 更新参数

        # 每10步打印一次损失，呈现一次输出
        if i % 10 == 0:
            print(f"Epoch {e}, Step {i}, Loss: {loss.item()}")
            
            outputs_str = []
            outputs_str.append([])
            for j in range(outputs.shape[1]):
                outputs_str[0].append(idx_to_word[torch.argmax(outputs[0][j]).item()])
            print("outputs_str:",outputs_str)
            
            captions_str = []
            captions_str.append([])
            for j in range(caption.shape[1]):
                captions_str[0].append(idx_to_word[caption[0][j].item()])
            print("captions_str:",captions_str)

    # 保存模型至当前目录的model_a文件夹下
    # torch.save({
    #     'epoch': e,
    #     'decoder_state_dict': decoder.state_dict(),
    #     
    #     'optimizer_state_dict': optimizer.state_dict(),
    # }, f'model_a/checkpoint_{e}.pth')


KeyboardInterrupt: 

In [30]:
# 写一个简单的前端
import gradio as gr

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

def generate_caption(model, image_features, word2idx, idx2word, max_length_inf=93):
    outputs_inf = [word2idx["<start>"]]
    image_features.to(device)

    for k in range(max_length_inf - 1):
        
        with torch.no_grad():
            
            # 将当前生成的字幕转换为张量
            current_caption = torch.LongTensor(outputs_inf).unsqueeze(0).to(device)
            
            # 生成下一个单词的预测
            out = model(image_features, current_caption)

        # 获取最后一个时间步的最大值索引
        last_time_step_max_index = out.argmax(dim=2)[0, -1].item()

        next_word = last_time_step_max_index


        # 检查是否达到结束标记
        if next_word == word2idx['<end>']:
            break  # 一旦生成<end>标记，立即停止生成

        outputs_inf.append(next_word)
        
    # 转换序列为文字（跳过第一个元素）
    caption_inf = [idx2word[idx] for idx in outputs_inf[1:]]

    return ' '.join(caption_inf)


def predict(images):
    #对图像进行预处理
    images = Image.fromarray(images)
    images = transform(images)
    images = images.unsqueeze(0)
    # 使用模型对图像进行描述生成    
    image_feature = cnn(images)
    # print(test_feature.shape)
    image_feature.to(device)

    output = generate_caption(decoder_inf, image_feature, word_to_idx, idx_to_word)
    # 返回描述文本
    description = output
    
    return description


# load model

decoder_inf = DecoderRNN(attention_dim=256, embed_dim=256, decoder_dim=512, vocab_size=160)
decoder_inf.to(device)
checkpoint = torch.load(f'model_a/checkpoint_3.pth')
decoder_inf.load_state_dict(checkpoint['decoder_state_dict'])
decoder_inf.eval()


interface = gr.Interface(
    fn=predict,
    inputs="image",
    outputs="text",
    live=True,  # 实时更新输出
    title="图片描述模型",
    description="上传一张图片，获取描述文本。",
)

interface.launch()

cuda
Running on local URL:  http://127.0.0.1:7876

To create a public link, set `share=True` in `launch()`.




In [11]:
# 生成训练集中的描述
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

def generate_caption(model, image_features, word2idx, idx2word, max_length_inf=93):
    outputs_inf = [word2idx["<start>"]]
    image_features.to(device)
    # print_tensor_devices(image_features)

    for k in range(max_length_inf - 1):
        
        with torch.no_grad():
            
            
            # 将当前生成的字幕转换为张量
            current_caption = torch.LongTensor(outputs_inf).unsqueeze(0).to(device)
            
            # 生成下一个单词的预测
            out = model(image_features, current_caption)
            #打印out的形状
            # print("out:",out.shape)
        # 获取最后一个时间步的最大值索引
        last_time_step_max_index = out.argmax(dim=2)[0, -1].item()

        next_word = last_time_step_max_index


        # 检查是否达到结束标记
        if next_word == word2idx['<end>']:
            break  # 一旦生成<end>标记，立即停止生成

        outputs_inf.append(next_word)
        

    # 转换序列为文字
    caption_inf = [idx2word[idx] for idx in outputs_inf]

    return ' '.join(caption_inf)

# load model

decoder_inf = DecoderRNN(attention_dim=256, embed_dim=256, decoder_dim=512, vocab_size=160)
decoder_inf.to(device)
checkpoint = torch.load(f'model_a/checkpoint_3.pth')
decoder_inf.load_state_dict(checkpoint['decoder_state_dict'])
decoder_inf.eval()

# test
test_data = dataset[10154]
train_captions = test_data[1]
# print(test_data[0].shape, test_data[1].shape)# 3,256,256 95

file_name = dataset.get_file_name(10154)
print(file_name)

# print(train_captions)

test_image = test_data[0].unsqueeze(0).to(device)
test_feature = cnn(test_image)
# print(test_feature.shape)

test_feature.to(device)

output = generate_caption(decoder_inf, test_feature, word_to_idx, idx_to_word)

print(output)

train_captions_str = []
train_captions_str.append([])
for j in range(train_captions.shape[0]):
    train_captions_str[0].append(idx_to_word[train_captions[j].item()])
print("train_captions_str:",train_captions_str)



cuda
torch.Size([3, 256, 256]) torch.Size([95])
WOMEN-Tees_Tanks-id_00002951-02_7_additional.jpg
tensor([159,  21,  47,  24,  32, 126,  40,  28,  11, 133,  79,   9,  32,   5,
         48,   1,  40,  16,  28,   7,   8,   9,  72,  13,  16,  84,   1,  49,
         50,  28,  20,   8,   9,  10,  11,  12, 158,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])
torch.Size([1, 9, 512])
<start> The female is wearing a short-sleeve shirt with solid color patterns. The shirt is with cotton fabric. It has a round neckline. The pants the female wears is of long length. The pants are with denim fabric and pure color patterns. The female is wearing a ring on her finger.
train_captions_str: [['<start>', 'This', 'person', 'wears', 'a',

In [18]:
# 生成测试集中的描述
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

def generate_caption(model, image_features, word2idx, idx2word, max_length_inf=93):
    outputs_inf = [word2idx["<start>"]]
    image_features.to(device)

    for k in range(max_length_inf - 1):
        
        with torch.no_grad():
            
            # 将当前生成的字幕转换为张量
            current_caption = torch.LongTensor(outputs_inf).unsqueeze(0).to(device)
            
            # 生成下一个单词的预测
            out = model(image_features, current_caption)

        # 获取最后一个时间步的最大值索引
        last_time_step_max_index = out.argmax(dim=2)[0, -1].item()

        next_word = last_time_step_max_index


        # 检查是否达到结束标记
        if next_word == word2idx['<end>']:
            break  # 一旦生成<end>标记，立即停止生成

        outputs_inf.append(next_word)
        
    # 转换序列为文字（跳过第一个元素）
    caption_inf = [idx2word[idx] for idx in outputs_inf[1:]]

    return ' '.join(caption_inf)

# load model

decoder_inf = DecoderRNN(attention_dim=256, embed_dim=256, decoder_dim=512, vocab_size=160)
decoder_inf.to(device)
checkpoint = torch.load(f'model_a/checkpoint_3.pth')
decoder_inf.load_state_dict(checkpoint['decoder_state_dict'])
decoder_inf.eval()

# test
test_data_test = dataset_test[10]
test_captions = test_data_test[1]
test_captions = torch.tensor(test_captions)

# print(test_data_test[0].shape)
file_name = dataset_test.get_file_name(10)
print(file_name)
# print(test_captions)

test_image = test_data_test[0].unsqueeze(0).to(device)
test_feature = cnn(test_image)
# print(test_feature.shape)

test_feature.to(device)


output = generate_caption(decoder_inf, test_feature, word_to_idx, idx_to_word)

print(output)

test_captions_str = []
test_captions_str.append([])
for j in range(test_captions.shape[0]):
    test_captions_str[0].append(idx_to_word[test_captions[j].item()])
print("test_captions_str:",test_captions_str)



cuda
torch.Size([3, 256, 256])
WOMEN-Tees_Tanks-id_00002201-18_7_additional.jpg
tensor([  1,   2,   3,   4,  83,   6,   7,   8,   9,  43,  12,  44,   4,  32,
         92,  46,   1,  18,   3,  16,  14,  83,  19,   1,   8,  16, 100,   9,
         15,   4,  10,  11,  12,  21, 122,   4,  32,  93,  67,  68,  94,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])
torch.Size([1, 9, 512])
The upper clothing has short sleeves, cotton fabric and graphic patterns. It has a round neckline. The lower clothing is of short length. The fabric is cotton and it has pure color patterns. The person wears a ring. There is an accessory on her wrist.
test_captions_str: [['The', 'upper', 'clothing', 'has', 'short', 'sleeves,', 'cotton', 'fabric', 'and', 'graphic

In [1]:
# 跑完测试集，生成json文件
import json
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

def generate_caption(model, image_features, word2idx, idx2word, max_length_inf=93):
    outputs_inf = [word2idx["<start>"]]
    image_features.to(device)

    for k in range(max_length_inf - 1):
        
        with torch.no_grad():
            
            # 将当前生成的字幕转换为张量
            current_caption = torch.LongTensor(outputs_inf).unsqueeze(0).to(device)
            
            # 生成下一个单词的预测
            out = model(image_features, current_caption)

        # 获取最后一个时间步的最大值索引
        last_time_step_max_index = out.argmax(dim=2)[0, -1].item()

        next_word = last_time_step_max_index


        # 检查是否达到结束标记
        if next_word == word2idx['<end>']:
            break  # 一旦生成<end>标记，立即停止生成

        outputs_inf.append(next_word)
        
    # 转换序列为文字（跳过第一个元素）
    caption_inf = [idx2word[idx] for idx in outputs_inf[1:]]

    return ' '.join(caption_inf)

# load model

decoder_inf = DecoderRNN(attention_dim=256, embed_dim=256, decoder_dim=512, vocab_size=160)
decoder_inf.to(device)
checkpoint = torch.load(f'model_a/checkpoint_3.pth')
decoder_inf.load_state_dict(checkpoint['decoder_state_dict'])
decoder_inf.eval()

# test
# 创建字典
data = {}

for i in range(2538):
    test_data_test = dataset_test[i]
    test_captions = test_data_test[1]
    test_captions = torch.tensor(test_captions)

    file_name = dataset_test.get_file_name(i)

    test_image = test_data_test[0].unsqueeze(0).to(device)
    test_feature = cnn(test_image)

    test_feature.to(device)

    output = generate_caption(decoder_inf, test_feature, word_to_idx, idx_to_word)
    # 将生成的字幕添加到字典中
    data[file_name] = output
    # 每隔10个打印一次
    if i % 10 == 0:
        print(f"Step {i}, Caption: {output}")
    
    
# 将字典转换为 JSON 字符串
json_data = json.dumps(data, indent=2)
# print(json_data)
#将json_data写入文件
with open('results.json', 'w') as f:
    f.write(json_data)



NameError: name 'torch' is not defined

In [14]:
# 一个用来看词典的块
outputs_str = []
outputs_str.append([])
for j in range(160):
    outputs_str[0].append(idx_to_word[j])
print("outputs_str:",outputs_str)
# 提取所有的词  
all_words = [word for sublist in outputs_str for word in sublist if word != '<pad>' and word != '<start>' and word != '<end>']  
  
# 计算词的数量  
word_count = len(all_words)  
  
print(f"词典里一共有 {word_count} 个词。")

outputs_str: [['<pad>', 'The', 'upper', 'clothing', 'has', 'long', 'sleeves,', 'cotton', 'fabric', 'and', 'solid', 'color', 'patterns.', 'neckline', 'of', 'it', 'is', 'v-shape.', 'lower', 'length.', 'denim', 'This', 'lady', 'also', 'wears', 'an', 'outer', 'clothing,', 'with', 'complicated', 'female', 'wearing', 'a', 'ring', 'on', 'her', 'finger.', 'neckwear.', 'Her', 'tank', 'shirt', 'no', 'chiffon', 'graphic', 'It', 'round', 'neckline.', 'person', 'pants.', 'pants', 'are', 'ring.', 'top', 'v-shape', 'woman', 'trousers.', 'trousers', 'There', 'belt.', 'accessory', 'wrist.', 'sweater', 'lattice', 'fabric.', 'the', 'three-point', 'pure', 'in', 'his', 'neck.', 'long-sleeve', 'plaid', 'its', 'lapel.', 'socks', 'shoes.', 'suspenders', 'short-sleeve', 'T-shirt', 'patterns', 'shorts.', 'crew.', 'shorts', 'short', 'round.', 'sleeveless', 'floral', 'hat.', 'this', 'pair', 'socks.', 'three-quarter', 'crew', 'hat', 'head.', 'lapel', 'sleeves', 'trousers,', 'pants,', 'waist.', 'leather', 'cotton.'

In [ ]:
# 测试指标
import evaluate
import json
import numpy as np

test_text = "./data/deepfashion-multimodal/test_captions.json"

# read from result.json
test_data = json.load(open("./results.json", 'r'))
real_data = json.load(open(test_text, 'r'))

test_selected_data = {}
real_selected_data = {}

indice = 0
for key, value in test_data.items():
    real_selected_data[key] = real_data[key]
    test_selected_data[key] = value
    indice += 1

eval = evaluate.DeepFashionEvalCap(real_selected_data, test_selected_data)
eval.evaluate()