In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip '/content/drive/My Drive/Final Project ML/data/polyvore_outfits.zip' -d '/content/data'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/data/polyvore_outfits/images/153602568.jpg  
  inflating: /content/data/polyvore_outfits/images/191618273.jpg  
  inflating: /content/data/polyvore_outfits/images/213612958.jpg  
  inflating: /content/data/polyvore_outfits/images/179384818.jpg  
  inflating: /content/data/polyvore_outfits/images/94195633.jpg  
  inflating: /content/data/polyvore_outfits/images/129168533.jpg  
  inflating: /content/data/polyvore_outfits/images/200202718.jpg  
  inflating: /content/data/polyvore_outfits/images/171126342.jpg  
  inflating: /content/data/polyvore_outfits/images/209723920.jpg  
  inflating: /content/data/polyvore_outfits/images/192162041.jpg  
  inflating: /content/data/polyvore_outfits/images/157431662.jpg  
  inflating: /content/data/polyvore_outfits/images/178970104.jpg  
  inflating: /content/data/polyvore_outfits/images/85760017.jpg  
  inflating: /content/data/polyvore_outfits/images/184692392.jpg  

In [3]:
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo

model_urls = {
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
}

class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

class ResNet(nn.Module):

    def __init__(self, block, layers, embedding_size=64):
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc_embed = nn.Linear(512 * block.expansion, embedding_size)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc_embed(x)

        return x

def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model."""
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']), strict=False)
    return model


In [4]:
from PIL import Image
import os
import os.path
import torch.utils.data
import torchvision.transforms as transforms
import numpy as np
import json
import torch
import pickle
import h5py
from sklearn.metrics import roc_auc_score
from torch.autograd import Variable

def default_image_loader(path):
    return Image.open(path).convert('RGB')

def parse_iminfo(question, im2index, id2im, gt = None):
    """ Maps the questions from the FITB and compatibility tasks back to
        their index in the precomputed matrix of features

        question: List of images to measure compatibility between
        im2index: Dictionary mapping an image name to its location in a
                  precomputed matrix of features
        gt: optional, the ground truth outfit set this item belongs to
    """
    questions = []
    is_correct = np.zeros(len(question), np.bool)
    for index, im_id in enumerate(question):
        set_id = im_id.split('_')[0]
        if gt is None:
            gt = set_id

        im = id2im[im_id]
        questions.append((im2index[im], im))
        is_correct[index] = set_id == gt

    return questions, is_correct, gt

def load_typespaces(rootdir, rand_typespaces, num_rand_embed):
    """ loads a mapping of pairs of types to the embedding used to
        compare them

        rand_typespaces: Boolean indicator of randomly assigning type
                         specific spaces to their embedding
        num_rand_embed: number of embeddings to use when
                        rand_typespaces is true
    """
    typespace_fn = os.path.join(rootdir, 'typespaces.p')
    typespaces = pickle.load(open(typespace_fn,'rb'))
    if not rand_typespaces:
        ts = {}
        for index, t in enumerate(typespaces):
            ts[t] = index

        typespaces = ts
        return typespaces

    # load a previously created random typespace or create one
    # if none exist
    width = 0
    fn = os.path.join(rootdir, 'typespaces_rand_%i.p') % num_rand_embed
    if os.path.isfile(fn):
        typespaces = pickle.load(open(fn, 'rb'))
    else:
        spaces = np.random.permutation(len(typespaces))
        width = np.ceil(len(spaces) / float(num_rand_embed))
        ts = {}
        for index, t in enumerate(spaces):
            ts[typespaces[t]] = int(np.floor(index / width))

        typespaces = ts
        pickle.dump(typespaces, open(fn, 'wb'))

    return typespaces


def load_compatibility_questions(fn, im2index, id2im):
    """ Returns the list of compatibility questions for the
        split """
    with open(fn, 'r') as f:
        lines = f.readlines()

    compatibility_questions = []
    for line in lines:
        data = line.strip().split()
        compat_question, _, _ = parse_iminfo(data[1:], im2index, id2im)
        compatibility_questions.append((compat_question, int(data[0])))

    return compatibility_questions

def load_fitb_questions(fn, im2index, id2im):
    """ Returns the list of fill in the blank questions for the
        split """
    data = json.load(open(fn, 'r'))
    questions = []
    for item in data:
        question = item['question']
        q_index, _, gt = parse_iminfo(question, im2index, id2im)
        answer = item['answers']
        a_index, is_correct, _ = parse_iminfo(answer, im2index, id2im, gt)
        questions.append((q_index, a_index, is_correct))

    return questions

class TripletImageLoader(torch.utils.data.Dataset):
    def __init__(self, args, split, meta_data, text_dim = None, transform=None, loader=default_image_loader):
        rootdir = os.path.join(args.datadir, 'polyvore_outfits', args.polyvore_split)
        self.impath = os.path.join(args.datadir, 'polyvore_outfits', 'images')
        self.is_train = split == 'train'
        data_json = os.path.join(rootdir, '%s.json' % split)
        outfit_data = json.load(open(data_json, 'r'))

        # get list of images and make a mapping used to quickly organize the data
        im2type = {}
        category2ims = {}
        imnames = set()
        id2im = {}
        for outfit in outfit_data:
            outfit_id = outfit['set_id']
            for item in outfit['items']:
                im = item['item_id']
                category = meta_data[im]['semantic_category']
                im2type[im] = category

                if category not in category2ims:
                    category2ims[category] = {}

                if outfit_id not in category2ims[category]:
                    category2ims[category][outfit_id] = []

                category2ims[category][outfit_id].append(im)
                id2im['%s_%i' % (outfit_id, item['index'])] = im
                imnames.add(im)

        imnames = list(imnames)
        im2index = {}
        for index, im in enumerate(imnames):
            im2index[im] = index

        self.data = outfit_data
        self.imnames = imnames
        self.im2type = im2type
        self.typespaces = load_typespaces(rootdir, args.rand_typespaces, args.num_rand_embed)
        self.transform = transform
        self.loader = loader
        self.split = split

        if self.is_train:
            self.text_feat_dim = text_dim
            self.desc2vecs = {}
            featfile = os.path.join(rootdir, 'train_hglmm_pca6000.txt')
            with open(featfile, 'r') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue

                    vec = line.split(',')
                    label = ','.join(vec[:-self.text_feat_dim])
                    vec = np.array([float(x) for x in vec[-self.text_feat_dim:]], np.float32)
                    assert(len(vec) == text_dim)
                    self.desc2vecs[label] = vec
            #DEBUG
            #print("Size of desc2vecs:", len(self.desc2vecs))
            #print("Sample keys from desc2vecs:", list(self.desc2vecs.keys())[:5])
            self.im2desc = {}
            for im in imnames:
                raw_desc = meta_data[im]['title']
                desc = meta_data[im]['title']
                if not desc:
                    desc = meta_data[im]['url_name']
                    raw_desc = desc

                #desc = desc.replace('\n','').encode('ascii', 'ignore').strip().lower()
                # Convert to string, remove new lines, strip and lower the case
                desc = desc.replace('\n','').strip().lower()

                #print(f"Raw description: {raw_desc}, Processed description: {desc}")
                # sometimes descriptions didn't map to any known words so they were
                # removed, so only add those which have a valid feature representation
                if desc and desc in self.desc2vecs:
                    self.im2desc[im] = desc
                    #print(f"Description mapped for image {im}: {desc}")
                #else: #debug
                    #print(f"No mapping for image {im}: {desc}")

            #DEBUG
            #print("Size of im2desc:", len(self.im2desc))
            #print("Sample mappings from im2desc:", {k: self.im2desc[k] for k in list(self.im2desc.keys())[:5]})
            # At train time we pull the list of outfits and enumerate the pairwise
            # comparisons between them to train with.  Negatives are pulled by the
            # __get_item__ function
            pos_pairs = []
            max_items = 0
            for outfit in outfit_data:
                items = outfit['items']
                cnt = len(items)
                max_items = max(cnt, max_items)
                outfit_id = outfit['set_id']
                for j in range(cnt-1):
                    for k in range(j+1, cnt):
                        pos_pairs.append([outfit_id, items[j]['item_id'], items[k]['item_id']])

            self.pos_pairs = pos_pairs
            self.category2ims = category2ims
            self.max_items = max_items
        else:
            # pull the two task's questions for test and val splits
            fn = os.path.join(rootdir, 'fill_in_blank_%s.json' % split)
            self.fitb_questions = load_fitb_questions(fn, im2index, id2im)
            fn = os.path.join(rootdir, 'compatibility_%s.txt' % split)
            self.compatibility_questions = load_compatibility_questions(fn, im2index, id2im)

    def load_train_item(self, image_id):
        """ Returns a single item in the triplet and its data
        """
        imfn = os.path.join(self.impath, '%s.jpg' % image_id)
        img = self.loader(imfn)
        if self.transform is not None:
            img = self.transform(img)

        #this keeps on going to has_text = 0 meaning no image_id in self.im2desc
        #inspect self.im2desc
        if image_id in self.im2desc:
            text = self.im2desc[image_id]
            text_features = self.desc2vecs[text]
            has_text = 1
        else:
            text_features = np.zeros(self.text_feat_dim, np.float32)
            has_text = 0.

        has_text = np.float32(has_text)
        item_type = self.im2type[image_id]
        # Debug statements
        #print(f"Image ID: {image_id}, Has Text: {has_text}, Text Features Shape: {text_features.shape}")
        return img, text_features, has_text, item_type

    def sample_negative(self, outfit_id, item_id, item_type):
        """ Returns a randomly sampled item from a different set
            than the outfit at data_index, but of the same type as
            item_type

            data_index: index in self.data where the positive pair
                        of items was pulled from
            item_type: the coarse type of the item that the item
                       that was paired with the anchor
        """
        item_out = item_id
        candidate_sets = list(self.category2ims[item_type].keys())  # Convert dict_keys to a list
        attempts = 0
        while item_out == item_id and attempts < 100:
            choice = np.random.choice(candidate_sets)
            items = self.category2ims[item_type][choice]
            #item_index = np.random.choice(range(len(items)))
            item_index = np.random.choice(len(items))  # Pass an integer to np.random.choice
            item_out = items[item_index]
            attempts += 1

        return item_out


    def get_typespace(self, anchor, pair):
        """ Returns the index of the type specific embedding
            for the pair of item types provided as input
        """
        query = (anchor, pair)
        if query not in self.typespaces:
            query = (pair, anchor)

        return self.typespaces[query]

    def test_compatibility(self, embeds, metric):
        """ Returns the area under a roc curve for the compatibility
            task

            embeds: precomputed embedding features used to score
                    each compatibility question
            metric: a function used to score the elementwise product
                    of a pair of embeddings, if None euclidean
                    distance is used
        """
        scores = []
        labels = np.zeros(len(self.compatibility_questions), np.int32)
        for index, (outfit, label) in enumerate(self.compatibility_questions):
            labels[index] = label
            n_items = len(outfit)
            outfit_score = 0.0
            num_comparisons = 0.0
            for i in range(n_items-1):
                item1, img1 = outfit[i]
                type1 = self.im2type[img1]
                for j in range(i+1, n_items):
                    item2, img2 = outfit[j]
                    type2 = self.im2type[img2]
                    condition = self.get_typespace(type1, type2)
                    embed1 = embeds[item1][condition].unsqueeze(0)
                    embed2 = embeds[item2][condition].unsqueeze(0)
                    if metric is None:
                        outfit_score += torch.nn.functional.pairwise_distance(embed1, embed2, 2)
                    else:
                        outfit_score += metric(Variable(embed1 * embed2)).data

                    num_comparisons += 1.

            outfit_score /= num_comparisons
            scores.append(outfit_score)

        scores = torch.cat(scores).squeeze().cpu().numpy()
        #scores = np.load('feats.npy')
        #print(scores)
        #assert(False)
        #np.save('feats.npy', scores)
        auc = roc_auc_score(labels, 1 - scores)
        return auc

    def test_fitb(self, embeds, metric):
        """ Returns the accuracy of the fill in the blank task

            embeds: precomputed embedding features used to score
                    each compatibility question
            metric: a function used to score the elementwise product
                    of a pair of embeddings, if None euclidean
                    distance is used
        """
        correct = 0.
        n_questions = 0.
        for q_index, (questions, answers, is_correct) in enumerate(self.fitb_questions):
            answer_score = np.zeros(len(answers), dtype=np.float32)
            for index, (answer, img1) in enumerate(answers):
                type1 = self.im2type[img1]
                score = 0.0
                for question, img2 in questions:
                    type2 = self.im2type[img2]
                    condition = self.get_typespace(type1, type2)
                    embed1 = embeds[question][condition].unsqueeze(0)
                    embed2 = embeds[answer][condition].unsqueeze(0)
                    if metric is None:
                        score += torch.nn.functional.pairwise_distance(embed1, embed2, 2)
                    else:
                        score += metric(Variable(embed1 * embed2)).data

                answer_score[index] = score.squeeze().cpu().numpy()

            correct += is_correct[np.argmin(answer_score)]
            n_questions += 1

        # scores are based on distances so need to convert them so higher is better
        acc = correct / n_questions
        return acc

    def __getitem__(self, index):
        if self.is_train:
            outfit_id, anchor_im, pos_im = self.pos_pairs[index]
            img1, desc1, has_text1, anchor_type = self.load_train_item(anchor_im)
            img2, desc2, has_text2, item_type = self.load_train_item(pos_im)

            neg_im = self.sample_negative(outfit_id, pos_im, item_type)
            img3, desc3, has_text3, _ = self.load_train_item(neg_im)
            condition = self.get_typespace(anchor_type, item_type)
            return img1, desc1, has_text1, img2, desc2, has_text2, img3, desc3, has_text3, condition

        anchor = self.imnames[index]
        img1 = self.loader(os.path.join(self.impath, '%s.jpg' % anchor))
        if self.transform is not None:
            img1 = self.transform(img1)

        return img1


    def shuffle(self):
        np.random.shuffle(self.pos_pairs)

    def __len__(self):
        if self.is_train:
            return len(self.pos_pairs)

        return len(self.imnames)

In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

def make_fc_1d(f_in, f_out):
    return nn.Sequential(nn.Linear(f_in, f_out),
                         nn.BatchNorm1d(f_out,eps=0.001,momentum=0.01),
                         nn.ReLU(inplace=True))

def selective_margin_loss(pos_samples, neg_samples, margin, has_sample):
    """ pos_samples: Distance between positive pair
        neg_samples: Distance between negative pair
        margin: minimum desired margin between pos and neg samples
        has_sample: Indicates if the sample should be used to calcuate the loss
    """
    margin_diff = torch.clamp((pos_samples - neg_samples) + margin, min=0, max=1e6)
    num_sample = max(torch.sum(has_sample), 1)
    return torch.sum(margin_diff * has_sample) / num_sample

def accuracy(pos_samples, neg_samples):
    """ pos_samples: Distance between positive pair
        neg_samples: Distance between negative pair
    """
    is_cuda = pos_samples.is_cuda
    margin = 0
    pred = (pos_samples - neg_samples - margin).cpu().data
    acc = (pred > 0).sum()*1.0/pos_samples.size()[0]
    acc = torch.from_numpy(np.array([acc], np.float32))
    if is_cuda:
        acc = acc.cuda()

    return Variable(acc)

class EmbedBranch(nn.Module):
    def __init__(self, feat_dim, embedding_dim):
        super(EmbedBranch, self).__init__()
        self.fc1 = make_fc_1d(feat_dim, embedding_dim)
        self.fc2 = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)

        # L2 normalize each feature vector
        norm = torch.norm(x, p=2, dim=1) + 1e-10
        norm = norm.unsqueeze(1)  # Add an extra dimension to make the shape [256, 1]
        x = x / norm  # Now this division should work correctly
        #x = x / norm.expand_as(x)
        return x

class Tripletnet(nn.Module):
    def __init__(self, args, embeddingnet, text_dim, criterion):
        super(Tripletnet, self).__init__()
        self.embeddingnet = embeddingnet
        self.text_branch = EmbedBranch(text_dim, args.dim_embed)
        self.metric_branch = None
        if args.learned_metric:
            self.metric_branch = nn.Linear(args.dim_embed, 1, bias=False)

            # initilize as having an even weighting across all dimensions
            weight = torch.zeros(1,args.dim_embed)/float(args.dim_embed)
            self.metric_branch.weight = nn.Parameter(weight)

        self.criterion = criterion
        self.margin = args.margin

    def image_forward(self, x, y, z):
        """ x: Anchor data
            y: Distant (negative) data
            z: Close (positive) data
        """
        # conditions only available on the anchor sample
        c = x.conditions
        embedded_x, masknorm_norm_x, embed_norm_x, general_x = self.embeddingnet(x.images, c)
        embedded_y, masknorm_norm_y, embed_norm_y, general_y = self.embeddingnet(y.images, c)
        embedded_z, masknorm_norm_z, embed_norm_z, general_z = self.embeddingnet(z.images, c)
        mask_norm = (masknorm_norm_x + masknorm_norm_y + masknorm_norm_z) / 3
        embed_norm = (embed_norm_x + embed_norm_y + embed_norm_z) / 3
        loss_embed = embed_norm / np.sqrt(len(x))
        loss_mask = mask_norm / len(x)
        if self.metric_branch is None:
            dist_a = F.pairwise_distance(embedded_x, embedded_y, 2)
            dist_b = F.pairwise_distance(embedded_x, embedded_z, 2)
        else:
            dist_a = self.metric_branch(embedded_x*embedded_y)
            dist_b = self.metric_branch(embedded_x*embedded_z)

        # Debug: Print distances
        #print(f"Dist A (Anchor-Negative): {dist_a.mean().item()}, Dist B (Anchor-Positive): {dist_b.mean().item()}")

        target = torch.FloatTensor(dist_a.size()).fill_(1)
        if dist_a.is_cuda:
            target = target.cuda()
        target = Variable(target)

        # type specific triplet loss
        loss_triplet = self.criterion(dist_a, dist_b, target)

        # Debug: Print loss value
        #print(f"Triplet Loss: {loss_triplet.item()}")

        acc = accuracy(dist_a, dist_b)

        # calculate image similarity loss on the general embedding
        disti_p = F.pairwise_distance(general_y, general_z, 2)
        disti_n1 = F.pairwise_distance(general_y, general_x, 2)
        disti_n2 = F.pairwise_distance(general_z, general_x, 2)
        loss_sim_i1 = self.criterion(disti_p, disti_n1, target)
        loss_sim_i2 = self.criterion(disti_p, disti_n2, target)
        loss_sim_i = (loss_sim_i1 + loss_sim_i2) / 2.

        # Debug: Print image similarity losses
        #print(f"Loss Sim I1: {loss_sim_i1.item()}, Loss Sim I2: {loss_sim_i2.item()}")

        return acc, loss_triplet, loss_sim_i, loss_mask, loss_embed, general_x, general_y, general_z

    def text_forward(self, x, y, z):
        """ x: Anchor data
            y: Distant (negative) data
            z: Close (positive) data
        """
        desc_x = self.text_branch(x.text)
        desc_y = self.text_branch(y.text)
        desc_z = self.text_branch(z.text)
        distd_p = F.pairwise_distance(desc_y, desc_z, 2)
        distd_n1 = F.pairwise_distance(desc_x, desc_y, 2)
        distd_n2 = F.pairwise_distance(desc_x, desc_z, 2)
        has_text = x.has_text * y.has_text * z.has_text
        loss_sim_t1 = selective_margin_loss(distd_p, distd_n1, self.margin, has_text)
        loss_sim_t2 = selective_margin_loss(distd_p, distd_n2, self.margin, has_text)
        loss_sim_t = (loss_sim_t1 + loss_sim_t2) / 2.
        return loss_sim_t, desc_x, desc_y, desc_z

    def calc_vse_loss(self, desc_x, general_x, general_y, general_z, has_text):
        """ Both y and z are assumed to be negatives because they are not from the same
            item as x

            desc_x: Anchor language embedding
            general_x: Anchor visual embedding
            general_y: Visual embedding from another item from input triplet
            general_z: Visual embedding from another item from input triplet
            has_text: Binary indicator of whether x had a text description
        """
        distd1_p = F.pairwise_distance(general_x, desc_x, 2)
        distd1_n1 = F.pairwise_distance(general_y, desc_x, 2)
        distd1_n2 = F.pairwise_distance(general_z, desc_x, 2)
        loss_vse_1 = selective_margin_loss(distd1_p, distd1_n1, self.margin, has_text)
        loss_vse_2 = selective_margin_loss(distd1_p, distd1_n2, self.margin, has_text)
        return (loss_vse_1 + loss_vse_2) / 2.

    def forward(self, x, y, z):
        """ x: Anchor data
            y: Distant (negative) data
            z: Close (positive) data
        """
        acc, loss_triplet, loss_sim_i, loss_mask, loss_embed, general_x, general_y, general_z = self.image_forward(x, y, z)
        loss_sim_t, desc_x, desc_y, desc_z = self.text_forward(x, y, z)
        loss_vse_x = self.calc_vse_loss(desc_x, general_x, general_y, general_z, x.has_text)
        loss_vse_y = self.calc_vse_loss(desc_y, general_y, general_x, general_z, y.has_text)
        loss_vse_z = self.calc_vse_loss(desc_z, general_z, general_x, general_y, z.has_text)
        loss_vse = (loss_vse_x + loss_vse_y + loss_vse_z) / 3.
        return acc, loss_triplet, loss_mask, loss_embed, loss_vse, loss_sim_t, loss_sim_i

In [6]:
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable

class ListModule(nn.Module):
    def __init__(self, *args):
        super(ListModule, self).__init__()
        idx = 0
        for module in args:
            self.add_module(str(idx), module)
            idx += 1

    def __getitem__(self, idx):
        if idx < 0 or idx >= len(self._modules):
            raise IndexError('index {} is out of range'.format(idx))
        it = iter(self._modules.values())
        for i in range(idx):
            next(it)
        return next(it)

    def __iter__(self):
        return iter(self._modules.values())

    def __len__(self):
        return len(self._modules)

class TypeSpecificNet(nn.Module):
    def __init__(self, args, embeddingnet, n_conditions):
        """
        Args:
            args: Input arguments from the main script
            embeddingnet: Network that projects inputs into an embedding of embedding_size
            n_conditions: Number of different similarity notions
        """
        super(TypeSpecificNet, self).__init__()

        self.embeddingnet = embeddingnet

        # Fully connected layers for transforming the general embedding to type-specific embedding
        masks = [nn.Linear(args.dim_embed, args.dim_embed) for _ in range(n_conditions)]
        self.masks = nn.ModuleList(masks)

    def forward(self, x, c=None):
        """
        x: input image data
        c: type-specific embedding to compute for the images, returns all embeddings
           when None including the general embedding concatenated onto the end
        """
        embedded_x = self.embeddingnet(x)

        if c is None:
            masked_embedding = [mask(embedded_x).unsqueeze(1) for mask in self.masks]
            masked_embedding = torch.cat(masked_embedding, 1)
            embedded_x = embedded_x.unsqueeze(1)

            # L2 normalization
            norm = torch.norm(masked_embedding, p=2, dim=2, keepdim=True) + 1e-10
            masked_embedding = masked_embedding / norm

            return torch.cat((masked_embedding, embedded_x), 1)

        mask_norm = 0.
        masked_embedding = []
        for embed, condition in zip(embedded_x, c):
            mask = self.masks[condition]
            masked_embedding.append(mask(embed.unsqueeze(0)))
            mask_norm += mask.weight.norm(1)

        masked_embedding = torch.cat(masked_embedding)

        embed_norm = embedded_x.norm(2)
        # L2 normalization
        norm = torch.norm(masked_embedding, p=2, dim=1, keepdim=True) + 1e-10
        masked_embedding = masked_embedding / norm

        return masked_embedding, mask_norm, embed_norm, embedded_x



In [6]:
# import torch
# import torch.nn as nn
# import numpy as np
# from torch.autograd import Variable

# class ListModule(nn.Module):
#     def __init__(self, *args):
#         super(ListModule, self).__init__()
#         idx = 0
#         for module in args:
#             self.add_module(str(idx), module)
#             idx += 1

#     def __getitem__(self, idx):
#         if idx < 0 or idx >= len(self._modules):
#             raise IndexError('index {} is out of range'.format(idx))
#         it = iter(self._modules.values())
#         for i in range(idx):
#             next(it)
#         return next(it)

#     def __iter__(self):
#         return iter(self._modules.values())

#     def __len__(self):
#         return len(self._modules)

# class TypeSpecificNet(nn.Module):
#     def __init__(self, args, embeddingnet, n_conditions):
#         """ args: Input arguments from the main script
#             embeddingnet: The network that projects the inputs into an embedding of embedding_size
#             embedding_size: Number of dimensions of the embedding output from the embeddingnet
#             n_conditions: Integer defining number of different similarity notions
#         """
#         super(TypeSpecificNet, self).__init__()
#         # Boolean indicating whether masks are learned or fixed
#         learnedmask = args.learned

#         # Boolean indicating whether masks are initialized in equally sized disjoint
#         # sections or random otherwise
#         prein = args.prein

#         # Indicates that there isn't a 1:1 relationship between type specific spaces
#         # and pairs of items categories
#         if args.rand_typespaces:
#             n_conditions = int(np.ceil(n_conditions / float(args.num_rand_embed)))

#         self.learnedmask = learnedmask
#         self.embeddingnet = embeddingnet

#         # When true a fully connected layer is learned to transform the general
#         # embedding to the type specific embedding
#         self.fc_masks = args.use_fc

#         # When true we l2 normalize the output type specific embeddings
#         self.l2_norm = args.l2_embed

#         if self.fc_masks:
#             # learn a fully connected layer rather than a mask to project the general embedding
#             # into the type specific space
#             masks = []
#             for i in range(n_conditions):
#                 masks.append(nn.Linear(args.dim_embed, args.dim_embed))
#             self.masks = ListModule(*masks)
#         else:
#             # create the mask
#             if learnedmask:
#                 if prein:
#                     # define masks
#                     self.masks = torch.nn.Embedding(n_conditions, args.dim_embed)
#                     # initialize masks
#                     mask_array = np.zeros([n_conditions, args.dim_embed])
#                     mask_array.fill(0.1)
#                     mask_len = int(args.dim_embed / n_conditions)
#                     for i in range(n_conditions):
#                         mask_array[i, i*mask_len:(i+1)*mask_len] = 1
#                     # no gradients for the masks
#                     self.masks.weight = torch.nn.Parameter(torch.Tensor(mask_array), requires_grad=True)
#                 else:
#                     # define masks with gradients
#                     self.masks = torch.nn.Embedding(n_conditions, args.dim_embed)
#                     # initialize weights
#                     self.masks.weight.data.normal_(0.9, 0.7) # 0.1, 0.005
#             else:
#                 # define masks
#                 self.masks = torch.nn.Embedding(n_conditions, args.dim_embed)
#                 # initialize masks
#                 mask_array = np.zeros([n_conditions, args.dim_embed])
#                 mask_len = int(args.dim_embed / n_conditions)
#                 for i in range(n_conditions):
#                     mask_array[i, i*mask_len:(i+1)*mask_len] = 1
#                 # no gradients for the masks
#                 self.masks.weight = torch.nn.Parameter(torch.Tensor(mask_array), requires_grad=False)


#     def forward(self, x, c = None):
#         """ x: input image data
#             c: type specific embedding to compute for the images, returns all embeddings
#                when None including the general embedding concatenated onto the end
#         """
#         embedded_x = self.embeddingnet(x)
#         if c is None:
#             # used during testing, wants all type specific embeddings returned for an image
#             if self.fc_masks:
#                 masked_embedding = []
#                 for mask in self.masks:
#                     masked_embedding.append(mask(embedded_x).unsqueeze(1))

#                 masked_embedding = torch.cat(masked_embedding, 1)
#                 embedded_x = embedded_x.unsqueeze(1)
#             else:
#                 masks = Variable(self.masks.weight.data)
#                 masks = masks.unsqueeze(0).repeat(embedded_x.size(0), 1, 1)
#                 embedded_x = embedded_x.unsqueeze(1)
#                 masked_embedding = embedded_x.expand_as(masks) * masks

#             if self.l2_norm:
#                 norm = torch.norm(masked_embedding, p=2, dim=2) + 1e-10
#                 norm = norm.unsqueeze(2)  # Add a third dimension
#                 masked_embedding = masked_embedding / norm.expand_as(masked_embedding)

#             return torch.cat((masked_embedding, embedded_x), 1)

#         if self.fc_masks:
#             mask_norm = 0.
#             masked_embedding = []
#             for embed, condition in zip(embedded_x, c):
#                  mask = self.masks[condition]
#                  masked_embedding.append(mask(embed.unsqueeze(0)))
#                  mask_norm += mask.weight.norm(1)

#             masked_embedding = torch.cat(masked_embedding)
#         else:
#             self.mask = self.masks(c)
#             if self.learnedmask:
#                 self.mask = torch.nn.functional.relu(self.mask)

#             masked_embedding = embedded_x * self.mask
#             mask_norm = self.mask.norm(1)

#         embed_norm = embedded_x.norm(2)
#         if self.l2_norm:
#             # norm = torch.norm(masked_embedding, p=2, dim=1) + 1e-10
#             # masked_embedding = masked_embedding / norm.expand_as(masked_embedding)

#             norm = torch.norm(masked_embedding, p=2, dim=1, keepdim=True) + 1e-10
#             masked_embedding = masked_embedding / norm

#         return masked_embedding, mask_norm, embed_norm, embedded_x

In [7]:
class Args:
    def __init__(self):
        self.batch_size = 32 #256 default
        self.epochs = 5  #10
        self.start_epoch = 1
        self.lr = 5e-5
        self.seed = 1
        self.no_cuda = False
        self.log_interval = 250
        self.resume = ''  # Path to the checkpoint
        self.name = 'Type_Specific_Fashion_Compatibility'
        self.polyvore_split = 'nondisjoint'
        self.datadir = '/content/data'  # Adjusted path
        self.test = False
        self.dim_embed = 64
        #self.use_fc = True    #default False
        #self.learned = False
        self.prein = False
        self.rand_typespaces = False
        self.num_rand_embed = 4
        #self.l2_embed = True    #default False
        self.learned_metric = False
        self.margin = 0.3
        self.embed_loss = 5e-4
        self.mask_loss = 5e-4
        self.vse_loss = 5e-3
        self.sim_t_loss = 5e-5
        self.sim_i_loss = 5e-5

args = Args()

In [8]:
from __future__ import print_function
import argparse
import os
import sys
import shutil
import json

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torch.autograd import Variable
import torch.backends.cudnn as cudnn

In [10]:
def main():
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    fn = os.path.join(args.datadir, 'polyvore_outfits', 'polyvore_item_metadata.json')
    meta_data = json.load(open(fn, 'r'))
    text_feature_dim = 6000
    kwargs = {'num_workers': 8, 'pin_memory': True} if args.cuda else {}
    test_loader = torch.utils.data.DataLoader(
        TripletImageLoader(args, 'test', meta_data,
                           transform=transforms.Compose([
                               transforms.Resize(112),
                               transforms.CenterCrop(112),
                               transforms.ToTensor(),
                               normalize,
                           ])),
        batch_size=args.batch_size, shuffle=False, **kwargs)

    model = resnet50(pretrained=True, embedding_size=args.dim_embed)
    csn_model = TypeSpecificNet(args, model, len(test_loader.dataset.typespaces))

    criterion = torch.nn.MarginRankingLoss(margin = args.margin)
    tnet = Tripletnet(args, csn_model, text_feature_dim, criterion)
    if args.cuda:
        tnet.cuda()

    train_loader = torch.utils.data.DataLoader(
        TripletImageLoader(args, 'train', meta_data,
                           text_dim=text_feature_dim,
                           transform=transforms.Compose([
                               transforms.Resize(112),
                               transforms.CenterCrop(112),
                               transforms.RandomHorizontalFlip(),
                               transforms.ToTensor(),
                               normalize,
                           ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    val_loader = torch.utils.data.DataLoader(
        TripletImageLoader(args, 'valid', meta_data,
                           transform=transforms.Compose([
                               transforms.Resize(112),
                               transforms.CenterCrop(112),
                               transforms.ToTensor(),
                               normalize,
                           ])),
        batch_size=args.batch_size, shuffle=False, **kwargs)

    best_acc = 0
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc = checkpoint['best_prec1']
            tnet.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})"
                    .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True
    if args.test:
        test_acc = test(test_loader, tnet)
        sys.exit()

    parameters = filter(lambda p: p.requires_grad, tnet.parameters())
    optimizer = optim.Adam(parameters, lr=args.lr)
    n_parameters = sum([p.data.nelement() for p in tnet.parameters()])
    print('  + Number of params: {}'.format(n_parameters))

    for epoch in range(args.start_epoch, args.epochs + 1):
        # update learning rate
        adjust_learning_rate(optimizer, epoch)
        # train for one epoch
        train(train_loader, tnet, criterion, optimizer, epoch)
        # evaluate on validation set
        acc = test(val_loader, tnet)

        # remember best acc and save checkpoint
        is_best = acc > best_acc
        best_acc = max(acc, best_acc)
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': tnet.state_dict(),
            'best_prec1': best_acc,
        }, is_best)

    checkpoint = torch.load('runs/%s/'%(args.name) + 'model_best.pth.tar')
    tnet.load_state_dict(checkpoint['state_dict'])
    test_acc = test(test_loader, tnet)

def train(train_loader, tnet, criterion, optimizer, epoch):
    losses = AverageMeter()
    accs = AverageMeter()
    emb_norms = AverageMeter()
    mask_norms = AverageMeter()

    # switch to train mode
    tnet.train()
    for batch_idx, (img1, desc1, has_text1, img2, desc2, has_text2, img3, desc3, has_text3, condition) in enumerate(train_loader):
        anchor = TrainData(img1, desc1, has_text1, condition)
        close = TrainData(img2, desc2, has_text2)
        far = TrainData(img3, desc3, has_text3)

        # compute output
        acc, loss_triplet, loss_mask, loss_embed, loss_vse, loss_sim_t, loss_sim_i = tnet(anchor, far, close)

        # Log individual loss components for debugging
        #print(f"Batch: {batch_idx}, Loss Triplet: {loss_triplet.item()}, Loss Mask: {loss_mask.item()}, Loss Embed: {loss_embed.item()}, Loss VSE: {loss_vse.item()}, Loss Sim T: {loss_sim_t.item()}, Loss Sim I: {loss_sim_i.item()}")

        # encorages similar text inputs (sim_t) and image inputs (sim_i) to
        # embed close to each other, images operate on the general embedding
        loss_sim = args.sim_t_loss * loss_sim_t + args.sim_i_loss * loss_sim_i

        # cross-modal similarity regularizer on the general embedding
        loss_vse_w = args.vse_loss * loss_vse

        # sparsity and l2 regularizer
        loss_reg = args.embed_loss * loss_embed + args.mask_loss * loss_mask

        loss = loss_triplet + loss_reg
        if args.vse_loss > 0:
            loss += loss_vse_w
        if args.sim_t_loss > 0 or args.sim_i_loss > 0:
            loss += loss_sim

        num_items = len(anchor)
        # measure accuracy and record loss
        # losses.update(loss_triplet.data[0], num_items)
        # accs.update(acc.data[0], num_items)
        # emb_norms.update(loss_embed.data[0])
        # mask_norms.update(loss_mask.data[0])

        losses.update(loss_triplet.item(), num_items)
        accs.update(acc.item(), num_items)
        emb_norms.update(loss_embed.item())
        mask_norms.update(loss_mask.item())

        # compute gradient and do optimizer step
        optimizer.zero_grad()

        if loss == loss:
            loss.backward()
            optimizer.step()

            # # Add a check for any non-zero gradients
            # for name, param in tnet.named_parameters():
            #     if param.grad is not None:
            #         # Check if the gradient is zero
            #         if torch.sum(param.grad) == 0:
            #             print(f"Gradient zero for parameter: {name}")
            #         else:
            #             print(f"Gradient non-zero for parameter: {name}")
            #     else:
            #         print(f"No gradient for parameter (requires_grad might be False): {name}")

        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{}]\t'
                  'Loss: {:.4f} ({:.4f}) \t'
                  'Acc: {:.2f}% ({:.2f}%) \t'
                  'Emb_Norm: {:.2f} ({:.2f})'.format(
                epoch, batch_idx * num_items, len(train_loader.dataset),
                losses.val, losses.avg,
                100. * accs.val, 100. * accs.avg, emb_norms.val, emb_norms.avg))


def test(test_loader, tnet):
    # switch to evaluation mode
    tnet.eval()
    embeddings = []

    # for test/val data we get images only from the data loader
    for batch_idx, images in enumerate(test_loader):
        if args.cuda:
            images = images.cuda()
        images = Variable(images)
        embeddings.append(tnet.embeddingnet(images).data)

    embeddings = torch.cat(embeddings)
    metric = tnet.metric_branch
    auc = test_loader.dataset.test_compatibility(embeddings, metric)
    acc = test_loader.dataset.test_fitb(embeddings, metric)
    total = auc + acc
    print('\n{} set: Compat AUC: {:.2f} FITB: {:.1f}\n'.format(
        test_loader.dataset.split,
        round(auc, 2), round(acc * 100, 1)))

    return total

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Saves checkpoint to disk"""
    directory = "runs/%s/"%(args.name)
    if not os.path.exists(directory):
        os.makedirs(directory)
    filename = directory + filename
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'runs/%s/'%(args.name) + 'model_best.pth.tar')

class TrainData():
    def __init__(self, images, text, has_text, conditions = None):
        has_text = has_text.float()
        if args.cuda:
            images, text, has_text = images.cuda(), text.cuda(), has_text.cuda()
        images, text, has_text = Variable(images), Variable(text), Variable(has_text)

        if conditions is not None:
            if args.cuda:
                conditions = conditions.cuda()

            conditions = Variable(conditions)

        self.images = images
        self.text = text
        self.has_text = has_text
        self.conditions = conditions

    def __len__(self):
        return self.images.size(0)

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = args.lr * ((1 - 0.015) ** epoch)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

if __name__ == '__main__':
    main()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  is_correct = np.zeros(len(question), np.bool)


  + Number of params: 24302080
Train Epoch: 1 [0/686851]	Loss: 0.3005 (0.3005) 	Acc: 40.62% (40.62%) 	Emb_Norm: 3.17 (3.17)
Train Epoch: 1 [8000/686851]	Loss: 0.2725 (0.2598) 	Acc: 53.12% (60.30%) 	Emb_Norm: 4.23 (3.57)
Train Epoch: 1 [16000/686851]	Loss: 0.2424 (0.2496) 	Acc: 71.88% (62.31%) 	Emb_Norm: 5.26 (4.20)
Train Epoch: 1 [24000/686851]	Loss: 0.2507 (0.2431) 	Acc: 65.62% (63.27%) 	Emb_Norm: 6.02 (4.69)
Train Epoch: 1 [32000/686851]	Loss: 0.2254 (0.2382) 	Acc: 65.62% (64.05%) 	Emb_Norm: 6.53 (5.08)
Train Epoch: 1 [40000/686851]	Loss: 0.2367 (0.2355) 	Acc: 71.88% (64.52%) 	Emb_Norm: 6.91 (5.41)
Train Epoch: 1 [48000/686851]	Loss: 0.3526 (0.2333) 	Acc: 43.75% (64.96%) 	Emb_Norm: 7.29 (5.69)
Train Epoch: 1 [56000/686851]	Loss: 0.2180 (0.2316) 	Acc: 65.62% (65.26%) 	Emb_Norm: 7.69 (5.95)
Train Epoch: 1 [64000/686851]	Loss: 0.1953 (0.2304) 	Acc: 75.00% (65.53%) 	Emb_Norm: 8.00 (6.18)
Train Epoch: 1 [72000/686851]	Loss: 0.2142 (0.2294) 	Acc: 75.00% (65.74%) 	Emb_Norm: 8.11 (6.39)
Trai