In [1]:
import torchvision.models as models
from torchvision import transforms
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.neighbors import KDTree
import pickle
import numpy as np
import gzip

import torch
import torch.nn as nn

In [2]:
def get_model():
    """Get image feature extraction model from pre-trained vgg16 model."""
    vgg_model = models.vgg16(pretrained=True)
    vgg_model.classifier = nn.Sequential(*list(vgg16.classifier.children())[:-2])
    return vgg_model

def get_features(model, cropped_image):
    """Extract features from image using given model."""
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    x = preprocess(cropped_image)
    x = x.unsqueeze(0)
    features = model(x)
    return features

In [3]:
class GloVe():

    def __init__(self, file_path):
        self.dimension = None
        self.embedding = dict()
        with open(file_path, 'r') as f:
            for line in f.readlines():
                strs = line.strip().split()
                word = strs[0]
                vector = torch.FloatTensor(list(map(float, strs[1:])))
                self.embedding[word] = vector
                if self.dimension is None:
                    self.dimension = len(vector)

    def _fix_word(self, word):
        terms = word.replace('_', ' ').split(' ')
        ret = self.zeros()
        cnt = 0
        for term in terms:
            v = self.embedding.get(term)
            if v is None:
                subterms = term.split('-')
                subterm_sum = self.zeros()
                subterm_cnt = 0
                for subterm in subterms:
                    subv = self.embedding.get(subterm)
                    if subv is not None:
                        subterm_sum += subv
                        subterm_cnt += 1
                if subterm_cnt > 0:
                    v = subterm_sum / subterm_cnt
            if v is not None:
                ret += v
                cnt += 1
        return ret / cnt if cnt > 0 else None

    def __getitem__(self, words):
        if type(words) is str:
            words = [words]
        ret = self.zeros()
        cnt = 0
        for word in words:
            v = self.embedding.get(word)
            if v is None:
                v = self._fix_word(word)
            if v is not None:
                ret += v
                cnt += 1
        if cnt > 0:
            return ret / cnt
        else:
            return self.zeros()
    
    def zeros(self):
        return torch.zeros(self.dimension)

In [4]:
glove = GloVe('materials/glove.6B.300d.txt')
glove['name'].shape

torch.Size([300])

In [5]:
with open('materials/train_classes.txt', 'r') as infile:
    train_classes = [str.strip(line) for line in infile]

with open('materials/zsl_classes.txt', 'r') as infile:
    zsl_classes = [str.strip(line) for line in infile]

In [6]:
def to_categorical(y, num_classes):
    return np.eye(num_classes, dtype='uint8')[y]

In [7]:
def load_data(data_path):
    """read data, create datasets.
    
    Data is the pre-trained vgg model feature outputs.
    """
    # READ DATA
    with gzip.GzipFile(data_path, 'rb') as infile:
        data = pickle.load(infile)

    # ONE-HOT-ENCODE DATA
    label_encoder   = LabelEncoder()
    label_encoder.fit(train_classes)

    training_data = [instance for instance in data if instance[0] in train_classes]
    zero_shot_data = [instance for instance in data if instance[0] not in train_classes]
    # SHUFFLE TRAINING DATA
    np.random.shuffle(training_data)

    ### SPLIT DATA FOR TRAINING
    train_size  = 300
    train_data  = list()
    valid_data  = list()
    for class_label in train_classes:
        ct = 0
        for instance in training_data:
            if instance[0] == class_label:
                if ct < train_size:
                    train_data.append(instance)
                    ct+=1
                    continue
                valid_data.append(instance)

    # SHUFFLE TRAINING AND VALIDATION DATA
    np.random.shuffle(train_data)
    np.random.shuffle(valid_data)

    train_data = [(instance[1], to_categorical(label_encoder.transform([instance[0]]), num_classes=15))for instance in train_data]
    valid_data = [(instance[1], to_categorical(label_encoder.transform([instance[0]]), num_classes=15)) for instance in valid_data]

    # FORM X_TRAIN AND Y_TRAIN
    x_train, y_train    = zip(*train_data)
    x_train, y_train    = np.squeeze(np.asarray(x_train)), np.squeeze(np.asarray(y_train))
    # L2 NORMALIZE X_TRAIN
    x_train = normalize(x_train, norm='l2')

    # FORM X_VALID AND Y_VALID
    x_valid, y_valid = zip(*valid_data)
    x_valid, y_valid = np.squeeze(np.asarray(x_valid)), np.squeeze(np.asarray(y_valid))
    # L2 NORMALIZE X_VALID
    x_valid = normalize(x_valid, norm='l2')


    # FORM X_ZSL AND Y_ZSL
    y_zsl, x_zsl = zip(*zero_shot_data)
    x_zsl, y_zsl = np.squeeze(np.asarray(x_zsl)), np.squeeze(np.asarray(y_zsl))
    # L2 NORMALIZE X_ZSL
    x_zsl = normalize(x_zsl, norm='l2')

    print("-> data loading is completed.")
    return (x_train, x_valid, x_zsl), (y_train, y_valid, y_zsl)

In [8]:
NUM_CLASS = 15
NUM_ATTR = 300
BATCH_SIZE = 128
EPOCH = 1000
DATA_PATH = 'materials/zeroshot_data.pkl'

(x_train, x_valid, x_zsl), (y_train, y_valid, y_zsl) = load_data(DATA_PATH)

-> data loading is completed.


In [9]:
def build_model():
    """We set the weights of final layer to the embedding value of labels,
    then use them to compare with the image features."""
    
    final_untrained_fc = nn.Linear(NUM_ATTR, NUM_CLASS, bias=False)
    final_untrained_af = nn.Softmax(dim=1)

    class_vectors = np.load('materials/class_vectors.npy', allow_pickle=True)
    training_vectors    = sorted([(label, vec) for (label, vec) in class_vectors if label in train_classes], key=lambda x: x[0])
    classnames, vectors = zip(*training_vectors)
    vectors             = np.asarray(vectors, dtype=np.float32)

    for p in final_untrained_fc.parameters():
        p.requires_grad = False
    for p in final_untrained_af.parameters():
        p.requires_grad = False

    with torch.no_grad():
        final_untrained_fc.weight.data = torch.from_numpy(vectors)

    model = nn.Sequential(
        nn.Linear(4096, 1024),
        nn.ReLU(),
        nn.BatchNorm1d(1024),
        nn.Dropout(0.8),
        nn.Linear(1024, 512),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Linear(256, NUM_ATTR),
        nn.ReLU(),
        final_untrained_fc,
        final_untrained_af,
    )
    return model

In [10]:
def train_model(model, train_data, valid_data):
    x_train, y_train = train_data
    x_valid, y_valid = valid_data
    x_valid, y_valid = torch.from_numpy(x_valid), torch.from_numpy(y_valid)
    optimizer = torch.optim.Adam(lr=1e-5, params=model.parameters())
    rng = np.random.default_rng()

    for epoch in range(10000):
        rints = rng.integers(low=0, high=len(y_train), size=BATCH_SIZE)
        x_train_batch, y_train_batch = x_train[rints], y_train[rints]
        x_train_batch, y_train_batch = torch.from_numpy(x_train_batch), torch.from_numpy(y_train_batch)
        optimizer.zero_grad()
        y_train_batch_pred = model(x_train_batch)
        loss = nn.CrossEntropyLoss()(y_train_batch_pred, torch.max(y_train_batch, 1)[1])
        loss.backward()
        optimizer.step()

        if epoch % 1000 == 0:
            print('Training Loss: {}'.format(loss.item()))
            with torch.no_grad():
                y_valid_pred = model(x_valid)
                loss = nn.CrossEntropyLoss()(y_valid_pred, torch.max(y_valid, 1)[1])
            print('Validation Loss: {}'.format(loss.mean().item()))

    print('Finished Training')

In [11]:
model = build_model()
train_model(model, (x_train, y_train), (x_valid, y_valid))

Training Loss: 2.709284543991089
Validation Loss: 2.708547830581665
Training Loss: 2.3672308921813965
Validation Loss: 2.3981149196624756
Training Loss: 2.1713616847991943
Validation Loss: 2.3326947689056396
Training Loss: 2.125985622406006
Validation Loss: 2.304955005645752
Training Loss: 2.056572675704956
Validation Loss: 2.2991750240325928
Training Loss: 2.002394437789917
Validation Loss: 2.2781758308410645
Training Loss: 1.984486699104309
Validation Loss: 2.2689290046691895
Training Loss: 1.9162209033966064
Validation Loss: 2.2736241817474365
Training Loss: 1.9294459819793701
Validation Loss: 2.2614598274230957
Training Loss: 1.9332287311553955
Validation Loss: 2.270838499069214
Finished Training


Use wordnet to generate label embedding.


In [12]:
zsl_model = nn.Sequential(*list(model.children())[:-2])

# EVALUATION OF ZERO-SHOT LEARNING PERFORMANCE
WORD2VECPATH = 'materials/class_vectors.npy'
class_vectors       = sorted(np.load(WORD2VECPATH, allow_pickle=True), key=lambda x: x[0])
classnames, vectors = zip(*class_vectors)
classnames          = list(classnames)
vectors             = np.asarray(vectors, dtype=np.float)

tree        = KDTree(vectors)
pred_zsl    = zsl_model(torch.from_numpy(x_zsl))

pred_zsl = pred_zsl.detach().numpy()
top5, top3, top1 = 0, 0, 0

for i, pred in enumerate(pred_zsl):
    pred = np.expand_dims(pred, axis=0)
    dist_5, index_5 = tree.query(pred, k=5)
    pred_labels = [classnames[index] for index in index_5[0]]
    true_label = y_zsl[i]

    if true_label in pred_labels:
        top5 += 1
    if true_label in pred_labels[:3]:
        top3 += 1
    if true_label in pred_labels[0:1]:
        top1 += 1

print()

print("ZERO SHOT LEARNING SCORE")
print("-> Top-5 Accuracy: %.2f" % (top5 / float(len(x_zsl))))
print("-> Top-3 Accuracy: %.2f" % (top3 / float(len(x_zsl))))
print("-> Top-1 Accuracy: %.2f" % (top1 / float(len(x_zsl))))


ZERO SHOT LEARNING SCORE
-> Top-5 Accuracy: 0.54
-> Top-3 Accuracy: 0.25
-> Top-1 Accuracy: 0.00


Use GloVe to generate label embedding.

In [13]:
def build_glove_model():
    """We set the weights of final layer to the embedding value of labels,
    then use them to compare with the image features."""
    
    final_untrained_fc = nn.Linear(NUM_ATTR, NUM_CLASS, bias=False)
    final_untrained_af = nn.Softmax(dim=1)

    training_vectors = []
    for train_class in train_classes:
        training_vectors.append(glove[train_class].unsqueeze(dim=0))
    vectors = torch.cat(training_vectors)

    for p in final_untrained_fc.parameters():
        p.requires_grad = False
    for p in final_untrained_af.parameters():
        p.requires_grad = False

    with torch.no_grad():
        final_untrained_fc.weight.data = vectors

    model = nn.Sequential(
        nn.Linear(4096, 1024),
        nn.ReLU(),
        nn.BatchNorm1d(1024),
        nn.Dropout(0.8),
        nn.Linear(1024, 512),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Linear(256, NUM_ATTR),
        nn.ReLU(),
        final_untrained_fc,
        final_untrained_af,
    )
    return model

In [14]:
glove_model = build_glove_model()
train_model(glove_model, (x_train, y_train), (x_valid, y_valid))

Training Loss: 2.7092411518096924
Validation Loss: 2.7084641456604004
Training Loss: 2.353914976119995
Validation Loss: 2.4145851135253906
Training Loss: 2.252676248550415
Validation Loss: 2.3482372760772705
Training Loss: 2.2158567905426025
Validation Loss: 2.3300528526306152
Training Loss: 2.161053419113159
Validation Loss: 2.314211368560791
Training Loss: 2.062382698059082
Validation Loss: 2.296159029006958
Training Loss: 2.0416183471679688
Validation Loss: 2.3054888248443604
Training Loss: 2.0135385990142822
Validation Loss: 2.3002572059631348
Training Loss: 2.065131187438965
Validation Loss: 2.3012030124664307
Training Loss: 1.9573040008544922
Validation Loss: 2.297089099884033
Finished Training


In [15]:
zsl_model = nn.Sequential(*list(model.children())[:-2])

all_class_vectors = []

for label in train_classes + zsl_classes:
    all_class_vectors.append(glove[label].unsqueeze(dim=0))
class_vectors = torch.cat(all_class_vectors)

tree = KDTree(class_vectors)
pred_zsl = zsl_model(torch.from_numpy(x_zsl))


pred_zsl = pred_zsl.detach().numpy()
top5, top3, top1 = 0, 0, 0

for i, pred in enumerate(pred_zsl):
    pred = np.expand_dims(pred, axis=0)
    dist_5, index_5 = tree.query(pred, k=5)
    pred_labels = [classnames[index] for index in index_5[0]]
    true_label = y_zsl[i]

    if true_label in pred_labels:
        top5 += 1
    if true_label in pred_labels[:3]:
        top3 += 1
    if true_label in pred_labels[0:1]:
        top1 += 1

print()

print("ZERO SHOT LEARNING SCORE")
print("-> Top-5 Accuracy: %.2f" % (top5 / float(len(x_zsl))))
print("-> Top-3 Accuracy: %.2f" % (top3 / float(len(x_zsl))))
print("-> Top-1 Accuracy: %.2f" % (top1 / float(len(x_zsl))))


ZERO SHOT LEARNING SCORE
-> Top-5 Accuracy: 0.30
-> Top-3 Accuracy: 0.16
-> Top-1 Accuracy: 0.00
