# Homework 3

## Part A

### Import python library

In [1]:
import os
import random
from tqdm import tqdm
from collections import defaultdict
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
import torch.utils.data
import torch.utils.data.dataset

### Define hyperparameters

In [2]:
K=3
MAX_SEQ_LEN=30
EPOCH=5
BATCH_SIZE=8

### Define paths

In [3]:
GLOVE_FILE='/home/ubuntu/worker/model/distilled_glove.42B.300d.txt' # distilled
CV_FILE='/home/ubuntu/worker/model/alexnet-owt-4df8aa71.pth'
ALEXNET_FILE='/home/ubuntu/worker/model/alexnet-owt-4df8aa71.pth'
VGG_FILE='/home/ubuntu/worker/model/vgg19_bn-c79401a0.pth'
flickr_root_dir = '/home/ubuntu/worker/dataset/flickr30k'
flickr_caption_filename = flickr_root_dir + '/results_20130124.token'
flickr_image_dir = flickr_root_dir + '/flickr30k-images'

### Preparing data

In [4]:
def load_glove_embedding(glove_file):
    # 0: pad, 1: unk
    # dim: 300
    glove_embeddings = [np.zeros(300), np.random.rand(300)]
    word2id_dict = defaultdict(int)
    id2word_dict = dict()
    with open(glove_file) as f:
        for index, line in enumerate(f):
            items = line.strip().split(' ')
            word, embedding = items[0], np.array([float(v) for v in items[1:]])
            glove_embeddings.append(embedding)
            word2id_dict[word] = index
            id2word_dict[index] = word
    return np.array(glove_embeddings), word2id_dict, id2word_dict

In [5]:
def load_caption(flickr_caption_filename, tokenize=False, numerize=False, word2id_dict=None):
    assert (not (tokenize == False and numerize == True))   # 想要numerize，那么必须先tokenize

    captions = defaultdict(list)
    lengths = defaultdict(list)
    with open(flickr_caption_filename) as f:
        for line in tqdm(f.readlines()):
            image_filename, caption = line.strip().split('\t')
            image_filename = image_filename[:-2]
            if tokenize:
                caption = caption.strip().split(' ')
            if numerize:
                caption = [word2id_dict[w.lower()] for w in caption]
                length = min(MAX_SEQ_LEN, len(caption))
                caption = caption[:MAX_SEQ_LEN] + [0] * (MAX_SEQ_LEN - len(caption))
            captions[image_filename].append(caption)
            lengths[image_filename].append(length)
    return captions, lengths

In [6]:
def load_single_image(image_path):
    image = Image.open(image_path)
    image = image.resize((227, 227))
    image = np.expand_dims(image, axis=0)
    image = image / 127.5
    image = image - 1.0  # 归一化到[-1, 1]之间
    return image

#### Define dataloader

In [7]:
class Flickr_Dataset_MemoryFriendly(torch.utils.data.Dataset):
    def __init__(self, flickr_caption_filename, flickr_image_dir, word2id_dict=None):
        captions_dict, lengths_dict = load_caption(flickr_caption_filename, tokenize=True, numerize=True, word2id_dict=word2id_dict)
        self.flickr_image_dir = flickr_image_dir
        image_filenames = os.listdir(flickr_image_dir)

        self.captions, self.images, self.lengths = [], [], []
        for filename in image_filenames:
            for caption, length in zip(captions_dict[filename], lengths_dict[filename]):
                self.captions.append(caption)
                self.lengths.append(length)
                self.images.append(os.path.join(self.flickr_image_dir, filename))

        self.pos_captions = torch.tensor(self.captions, dtype=torch.long)
        self.pos_lengths = torch.tensor(self.lengths, dtype=torch.int64)
        self.neg_captions, self.neg_lengths = [], []
        for _ in range(K):
            index = list(range(self.pos_captions.shape[0]))
            random.shuffle(index)
            self.neg_captions.append(self.pos_captions[index])
            self.neg_lengths.append(self.pos_lengths[index])
        self.neg_captions = torch.stack(self.neg_captions, dim=1)
        self.neg_lengths = torch.stack(self.neg_lengths, dim=1)

    def __getitem__(self, i):
        input_image = load_single_image(self.images[i])
        input_image = torch.tensor(input_image, dtype=torch.float).squeeze().permute(2, 0, 1)
        return input_image, self.pos_captions[i], self.pos_lengths[i], self.neg_captions[i], self.neg_lengths[i]

    def __len__(self):
        return len(self.captions)

### Define neural networks

In [8]:
class AlexNet(nn.Module):

    def __init__(self, num_classes=1000, output_feat=False):
        super(AlexNet, self).__init__()
        self.output_feat = output_feat
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """
        Pass the input through the net.
        Args:
            x (Tensor): input tensor
        Returns:
            output (Tensor): output tensor
        """
        x = self.features(x)
        x = x.view(-1, 256 * 6 * 6)  # reduce the dimensions for linear layer input
        if self.output_feat:
            return x
        else:
            return self.classifier(x)

In [9]:
class Text_Representer(nn.Module):
    def __init__(self, embedding_matrix, use_rnn=True):
        super(Text_Representer, self).__init__()
        self.use_rnn = use_rnn
        self.embedding_matrix = embedding_matrix
        self.vocab_num, self.embed_dim = self.embedding_matrix.shape
        self.embedding = nn.Embedding(self.vocab_num, self.embed_dim)
        self.embedding.weight = embedding_matrix

        self.gru = nn.GRU(input_size=self.embed_dim, hidden_size=256 * 6, batch_first=True)
        self.linear = nn.Linear(in_features=256 * 6, out_features=256 * 6 * 6)

    def forward(self, sentence, length):
        x = self.embedding(sentence)
        if self.use_rnn:
            # x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True)
            x = torch.nn.utils.rnn.pack_padded_sequence(x, length, batch_first=True, enforce_sorted=False) 
            _, x = self.gru(x)
            x = self.linear(x[0])
        else:
            x = torch.sum(x, dim=1)
            x = self.linear(x)
        return x

In [10]:
class Cross_Modal_Retriever(nn.Module):
    def __init__(self, embedding_matrix, cv_weight_file=None):
        super(Cross_Modal_Retriever, self).__init__()
        self.cv_net = AlexNet(output_feat=True)
        if cv_weight_file:
            self.cv_net.load_state_dict(torch.load(cv_weight_file))
        self.nlp_net = Text_Representer(embedding_matrix=embedding_matrix)

    def forward(self, image, pos_caption, pos_length, neg_captions, neg_lengths):
        cv_feats = self.cv_net(image)
        nlp_pos_feats = self.nlp_net(pos_caption, pos_length.view(-1))
        neg_pos_feats = self.nlp_net(neg_captions.view(-1, neg_captions.shape[2]), neg_lengths.view(-1))
        neg_pos_feats = neg_pos_feats.view(neg_captions.shape[0], neg_captions.shape[1], -1)
        # similarity = torch.diagonal(torch.matmul(nlp_feats, cv_feats.T))
        return cv_feats, nlp_pos_feats, neg_pos_feats

### Define loss function for metric learning

In [11]:
class TripletLoss(nn.Module):
    '''
    Compute normal triplet loss or soft margin triplet loss given triplets
    '''
    def __init__(self, margin=None):
        super(TripletLoss, self).__init__()
        self.margin = margin
        if self.margin is None:  # if no margin assigned, use soft-margin
            self.Loss = nn.SoftMarginLoss()
        else:
            self.Loss = nn.TripletMarginLoss(margin=margin, p=2)

    def forward(self, anchor, pos, neg):
        if self.margin is None:
            num_samples = anchor.shape[0]
            y = torch.ones((num_samples, 1)).view(-1)
            if anchor.is_cuda: y = y.cuda()
            ap_dist = torch.norm(anchor-pos, 2, dim=1).view(-1)
            an_dist = torch.norm(anchor-neg, 2, dim=1).view(-1)
            loss = self.Loss(an_dist - ap_dist, y)
        else:
            loss = self.Loss(anchor, pos, neg)

        return loss

### Train and evaluate

In [27]:
##### debug 词向量部分
embedding_matrix, word2id_dict, _ = load_glove_embedding(GLOVE_FILE)
# print(embedding_matrix.shape)
##### debug部分
retriever = Cross_Modal_Retriever(embedding_matrix=nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float),
                                                                requires_grad=True),
                                    cv_weight_file=CV_FILE).cuda()
# train_dataset = Flickr_Dataset(flickr_caption_filename, flickr_image_dir, word2id_dict=word2id_dict)
# test_dataset = Flickr_Dataset(flickr_caption_filename, flickr_image_dir, word2id_dict=word2id_dict)
all_dataset = Flickr_Dataset(flickr_caption_filename, flickr_image_dir, word2id_dict=word2id_dict)

dataset_size = len(all_dataset)
train_size, test_size = int(0.8 * dataset_size), int(0.2 * dataset_size)
train_dataset, test_dataset = torch.utils.data.random_split(all_dataset, [train_size, test_size])

100%|██████████| 158915/158915 [00:01<00:00, 98051.40it/s]


In [28]:
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True,
                                               num_workers=2)
test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset,
                                               batch_size=1,
                                               shuffle=False,
                                               num_workers=2)

In [29]:
loss_function = TripletLoss(margin=None)
optimizer = torch.optim.Adam(params=retriever.parameters(), lr=1e-4)

In [30]:
for epoch in range(EPOCH):
    total_loss = []
    retriever.train()
    for step, (image, pos_caption, pos_length, neg_captions, neg_lengths) in enumerate(train_dataloader):
        [image, pos_caption, neg_captions] = [tensor.cuda() for tensor in [image, pos_caption,  neg_captions]]
        cv_feats, nlp_pos_feat, nlp_neg_feats = retriever(image, pos_caption, pos_length, neg_captions, neg_lengths)

        print_losses = []

        loss = loss_function(cv_feats, nlp_pos_feat, nlp_neg_feats[:, 0, :])
        # total_loss = loss
        # print_losses.append(loss.item())
        for i in range(1, K):
            loss += loss_function(cv_feats, nlp_pos_feat, nlp_neg_feats[:, i, :])
            # total_loss += loss
            # print_losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss.append(loss.item())
        if step % 50 == 0:
            print('    step %s: loss is %s' % (step + 1, np.mean(total_loss)))

    print('EPOCH %s: Loss is %s' % ((epoch + 1), np.mean(total_loss)))

    results = []
    for step, (image, pos_caption, pos_length, neg_captions, neg_lengths) in enumerate(test_dataloader):
        [image, pos_caption, neg_captions] = [tensor.cuda() for tensor in [image, pos_caption,  neg_captions]]
        retriever.eval()
        cv_feats, nlp_pos_feat, nlp_neg_feats = retriever(image, pos_caption, pos_length, neg_captions, neg_lengths)

        dists = [torch.norm(cv_feats - nlp_pos_feat, 2, dim=1).view(-1).item()]
        for i in range(nlp_neg_feats.shape[1]):
            dist = torch.norm(cv_feats - nlp_neg_feats[:, i, :], 2, dim=1).view(-1)
            dists.append(dist.item())

        if np.argmin(dists) != 0:
            results.append(0)
        else:
            results.append(1)

    print("acc: %s" % (np.mean(results)))

    step 1: loss is 2.0792458057403564
    step 51: loss is 2.0498646310731474
    step 101: loss is 1.974066696544685
    step 151: loss is 1.887848048810138
    step 201: loss is 1.8166883970374492
    step 251: loss is 1.752949283892415
    step 301: loss is 1.704050165870261
    step 351: loss is 1.6602387197336919
    step 401: loss is 1.618179939036952
    step 451: loss is 1.5801041681327734
    step 501: loss is 1.5460684075802862
    step 551: loss is 1.5180851604025507
    step 601: loss is 1.4891209725333927
    step 651: loss is 1.4636205660033337
    step 701: loss is 1.4391313158666525
    step 751: loss is 1.414180849347705
    step 801: loss is 1.392053521453367
    step 851: loss is 1.372014387303878
    step 901: loss is 1.3520850143607264
    step 951: loss is 1.3332064521550882
EPOCH 1: Loss is 1.3173130331504752
acc: 0.6747317748481892
    step 1: loss is 0.7976595163345337
    step 51: loss is 0.8621252576510111
    step 101: loss is 0.854474663734436
    step 151

KeyboardInterrupt: 

## Part B

## Part C

### Define other neural networks
#### VGG-19

In [12]:
class VGG(nn.Module):
    def __init__(self, features, num_classes=1000, output_feat=True, init_weights=True):
        super(VGG, self).__init__()
        self.features = features
        self.output_feat = output_feat
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )
        if init_weights:
            self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        if self.output_feat:
            return x
        else:
            x = self.classifier(x)
            return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(
                    m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
                
def make_layers(cfg, batch_norm=False):
    layers = []
    in_channels = 3
    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        else:
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            if batch_norm:
                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    return nn.Sequential(*layers)

cfg = {
    'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'B': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'E': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}

def vgg19_bn(pretrained=False, cv_weight_file=None, **kwargs):
    """VGG 19-layer model (configuration 'E') with batch normalization
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    if pretrained:
        kwargs['init_weights'] = False
    model = VGG(make_layers(cfg['E'], batch_norm=True), output_feat=True, **kwargs)
    if pretrained:
        model.load_state_dict(torch.load(cv_weight_file))
    return model

In [13]:
class Cross_Modal_Retriever_v2(nn.Module):
    def __init__(self, embedding_matrix, alexnet_weight_file, vgg19_weight_file=None):
        super(Cross_Modal_Retriever_v2, self).__init__()
        self.alexnet = AlexNet(output_feat=True)
        self.alexnet.load_state_dict(torch.load(alexnet_weight_file))
        self.vgg19 = vgg19_bn(pretrained=True, cv_weight_file=vgg19_weight_file)
        self.vgg_linear = nn.Linear(25088, 256 * 6 * 6)
        
        self.glove_net = Text_Representer(embedding_matrix=embedding_matrix)
        self.rnn_net = Text_Representer(embedding_matrix=embedding_matrix, use_rnn=True)

    def forward(self, image, pos_caption, pos_length, neg_captions, neg_lengths):
        alexnet_feat = self.alexnet(image)
        vgg19_feat = self.vgg_linear(self.vgg19(image))
        print(vgg19_feat.shape, alexnet_feat.shape)
        cv_feats = torch.cat([alexnet_feat, vgg19_feat], dim=-1)
        
        glove_pos_feats = self.glove_net(pos_caption, pos_length.view(-1))
        rnn_pos_feats = self.rnn_net(pos_caption, pos_length.view(-1))
        nlp_pos_feats = torch.cat([glove_pos_feats, rnn_pos_feats], dim=-1)   
        
        glove_neg_feats = self.glove_net(neg_captions.view(-1, neg_captions.shape[2]), neg_lengths.view(-1))
        glove_neg_feats = glove_neg_feats.view(neg_captions.shape[0], neg_captions.shape[1], -1)
        rnn_neg_feats = self.rnn_net(neg_captions.view(-1, neg_captions.shape[2]), neg_lengths.view(-1))
        rnn_neg_feats = rnn_neg_feats.view(neg_captions.shape[0], neg_captions.shape[1], -1)
        nlp_neg_feats = torch.cat([glove_neg_feats, rnn_neg_feats], dim=-1) 
        
        return cv_feats, nlp_pos_feats, nlp_neg_feats

In [14]:
##### debug 词向量部分
embedding_matrix, word2id_dict, _ = load_glove_embedding(GLOVE_FILE)
# print(embedding_matrix.shape)
##### debug部分
retriever = Cross_Modal_Retriever_v2(embedding_matrix=nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float),
                                                                requires_grad=True),
                                     alexnet_weight_file=ALEXNET_FILE,
                                     vgg19_weight_file=VGG_FILE).cuda()
# train_dataset = Flickr_Dataset(flickr_caption_filename, flickr_image_dir, word2id_dict=word2id_dict)
# test_dataset = Flickr_Dataset(flickr_caption_filename, flickr_image_dir, word2id_dict=word2id_dict)
all_dataset = Flickr_Dataset_MemoryFriendly(flickr_caption_filename, flickr_image_dir, word2id_dict=word2id_dict)

dataset_size = len(all_dataset)
train_size, test_size = int(0.8 * dataset_size), int(0.2 * dataset_size)
train_dataset, test_dataset = torch.utils.data.random_split(all_dataset, [train_size, test_size])

100%|██████████| 158915/158915 [00:01<00:00, 134404.95it/s]


In [15]:
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=1,
                                               shuffle=True,
                                               num_workers=2)
test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset,
                                               batch_size=1,
                                               shuffle=False,
                                               num_workers=2)
loss_function = TripletLoss(margin=None)
optimizer = torch.optim.Adam(params=retriever.parameters(), lr=1e-4)

In [23]:
for epoch in range(EPOCH):
    total_loss = []
    retriever.train()
    for step, (image, pos_caption, pos_length, neg_captions, neg_lengths) in enumerate(train_dataloader):
        [image, pos_caption, neg_captions] = [tensor.cuda() for tensor in [image, pos_caption,  neg_captions]]
        cv_feats, nlp_pos_feat, nlp_neg_feats = retriever(image, pos_caption, pos_length, neg_captions, neg_lengths)

        print_losses = []

        loss = loss_function(cv_feats, nlp_pos_feat, nlp_neg_feats[:, 0, :])
        # total_loss = loss
        # print_losses.append(loss.item())
        for i in range(1, K):
            loss += loss_function(cv_feats, nlp_pos_feat, nlp_neg_feats[:, i, :])
            # total_loss += loss
            # print_losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss.append(loss.item())
        if step % 50 == 0:
            print('    step %s: loss is %s' % (step + 1, np.mean(total_loss)))

    print('EPOCH %s: Loss is %s' % ((epoch + 1), np.mean(total_loss)))

    results = []
    for step, (image, pos_caption, pos_length, neg_captions, neg_lengths) in enumerate(test_dataloader):
        [image, pos_caption, neg_captions] = [tensor.cuda() for tensor in [image, pos_caption,  neg_captions]]
        retriever.eval()
        cv_feats, nlp_pos_feat, nlp_neg_feats = retriever(image, pos_caption, pos_length, neg_captions, neg_lengths)

        dists = [torch.norm(cv_feats - nlp_pos_feat, 2, dim=1).view(-1).item()]
        for i in range(nlp_neg_feats.shape[1]):
            dist = torch.norm(cv_feats - nlp_neg_feats[:, i, :], 2, dim=1).view(-1)
            dists.append(dist.item())

        if np.argmin(dists) != 0:
            results.append(0)
        else:
            results.append(1)

    print("acc: %s" % (np.mean(results)))

torch.Size([1, 9216]) torch.Size([1, 9216])


RuntimeError: CUDA out of memory. Tried to allocate 882.00 MiB (GPU 0; 7.43 GiB total capacity; 5.36 GiB already allocated; 598.94 MiB free; 6.21 GiB reserved in total by PyTorch)