In [1]:
import argparse
import os
import pickle
import sys
from collections import Counter
from collections import defaultdict
from pathlib import Path

import numpy as np
import scipy.io
from scipy import io
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, \
                            balanced_accuracy_score
from sklearn.model_selection import train_test_split

try:
    from tqdm.auto import tqdm
except ImportError:
    # tqdm only draws a progress bar; fall back to a transparent wrapper so
    # the notebook still runs where it is not installed.
    def tqdm(iterable, **kwargs):
        return iterable

# # directory reach
# directory = Path(os.path.abspath(__file__))
# # setting path
# sys.path.append(os.path.abspath(directory.parent.parent.parent))
from src.helper import read_dataset, \
                       load_class_attribute_matrix, \
                       load_classes
from src.datasets import aPY, AwA, LAD, SUN, CUB


In [None]:
# --- Configuration ----------------------------------------------------------
# Root data folder. The trailing slash matters: every path below is built by
# plain string concatenation.
data = 'data/'

# Dataset to process. This used to be read from `args.data`, but no argument
# parser is ever built in this notebook, so the cell failed with a NameError
# on a fresh kernel.
# NOTE(review): 'AWA2' is a placeholder default -- set it to the dataset
# folder you actually want to run.
dataset_name = 'AWA2'
dataset = dataset_name + '/'
iterations = 1

# --- Load the dataset through the project helper ----------------------------
ds = read_dataset(dataset, data)

images, image_idx, idx_image, imageid_class = ds.get_images()
classes, class_idx, idx_class, classid_imageid = ds.get_classes()
attributes, attr_idx, idx_attr, image_attr = ds.get_attributes()

if dataset == 'SUN/':
    # Round the SUN per-image attribute values to the nearest integer.
    image_attr = np.rint(image_attr)

seen_samples, seen_old_to_new, seen_new_to_old = ds.get_seen_sample()
unseen_samples, unseen_old_to_new, unseen_new_to_old = ds.get_unseen_sample()

if dataset == 'AWA2/':
    class_attributes = ds.get_class_attribute_matrix('att_splits.mat',
                                                     continuous=True,
                                                     bound=None,
                                                     lb=None, ub=None)
    # AWA2 values are rescaled by 1/100.
    # NOTE(review): presumably percentages -> fractions; confirm.
    class_attributes = class_attributes/100
else:
    class_attributes = ds.get_class_attribute_matrix(continuous=True,
                                                     bound=None,
                                                     lb=None, ub=None)

embeddings = ds.get_embeddings()


# --- Build the train / held-out / test index sets ---------------------------
data_folder = data + dataset

split_file = 'att_splits.mat'
splits = scipy.io.loadmat(f'{data}{dataset}splits/{split_file}')

train = splits['trainval_loc']
#val = splits['val_loc']
test = splits['test_unseen_loc']

# Reproducible 80/20 partition of the trainval locations. The size expression
# is intentionally left in its original floating-point form so the split is
# identical to earlier runs.
np.random.seed(0)
num_trainval = train.shape[0]
index_train = sorted(list(np.random.choice(list(range(num_trainval)),
                                           size=int(num_trainval/100*80),
                                           replace=False)))
index_val = sorted(list(set(range(num_trainval)).difference(set(index_train))))

train_val = train[index_val]
train = train[index_train]


# --- Classes present in each split ------------------------------------------
res = 'res101.mat'
resnet101 = scipy.io.loadmat(f'{data}{dataset}splits/{res}')

# The stored locations are shifted by one before indexing.
# NOTE(review): presumably 1-based MATLAB indices; confirm.
train_classes = np.unique(np.squeeze([i[0][0] \
                                      for i in resnet101['labels'][train-1]]))

test_classes = np.unique(np.squeeze([i[0][0] \
                                     for i in resnet101['labels'][test-1]]))


In [None]:
def _zero_based_rows(samples):
    """Return a flat int array of row indices, one less than each sample location."""
    locs = np.asarray(samples)
    if locs.ndim > 1:
        # Locations arrive as a column (one entry per row); keep the first
        # entry of every row, as the original per-row `i[0]` lookup did.
        locs = locs.reshape(locs.shape[0], -1)[:, 0]
    return locs.astype(np.int64) - 1


def train_data_loaders(class_idx, unseen_classes, seen_classes, 
                       seen_samples, unseen_samples, 
                       image_attr, embeddings, 
                       keep_train, index_train, 
                       dataset, seed=1, 
                       n_iter=3, max_iter=500):
    """Train one logistic-regression detector per attribute column.

    For every column of ``image_attr`` the seen samples are labelled 1 when
    the attribute value is > 0 and 0 otherwise. ``n_iter`` train/validation
    splits (67/33, ``random_state=0..n_iter-1``) are drawn from the seen
    samples; a class-balanced ``LogisticRegression`` is fitted on each and
    scored on the validation part and on the unseen samples.

    Parameters
    ----------
    class_idx, unseen_classes, seen_classes, keep_train, index_train
        Not used by the computation; kept so existing callers keep working.
    seen_samples, unseen_samples
        Sample locations, offset by one relative to the rows of
        ``image_attr`` / ``embeddings``. Either a flat sequence or a column.
    image_attr
        Array-like of shape (num_images, num_attributes).
    embeddings
        Per-image features; either an object exposing ``.numpy()`` or
        anything ``np.asarray`` accepts.
    dataset
        Dataset folder name *with* trailing slash (e.g. ``'AWA2/'``); used to
        build the output paths under ``results/``.
    seed
        ``random_state`` of the classifier.
    n_iter
        Number of train/validation splits per attribute.
    max_iter
        ``max_iter`` of the classifier.

    Returns
    -------
    scores : defaultdict(list)
        ``scores[a]`` holds one tuple per split:
        ``(val_acc, val_balanced_acc, test_acc, test_balanced_acc)``, or
        ``(-100, -100, -100, -100)`` when fitting/scoring raised ValueError.
    num_examples : dict
        Positive/negative counts per attribute for seen and unseen samples.
    models : dict
        ``models[a]`` is the classifier from the last split that was fitted
        successfully for attribute ``a``, or ``None`` if none could be fitted.

    Side effects
    ------------
    Pickles ``scores`` and ``models`` into ``results/{dataset}detectors/``,
    creating the directory if needed.
    """
    seen_rows = _zero_based_rows(seen_samples)
    unseen_rows = _zero_based_rows(unseen_samples)

    # Split attribute matrices
    attr_matrix = np.asarray(image_attr)
    img_att_seen = attr_matrix[seen_rows]
    img_att_unseen = attr_matrix[unseen_rows]

    # Split feature matrices (convert once instead of once per split)
    if hasattr(embeddings, 'numpy'):
        features = embeddings.numpy()
    else:
        features = np.asarray(embeddings)
    X_tmp = features[seen_rows]
    X_test = features[unseen_rows]

    # Initialize return
    scores = defaultdict(list)
    num_examples = {}
    models = {}
    # One detector per attribute column. The previous code read the count
    # from an undefined name `M`, which raised NameError.
    num_attributes = img_att_seen.shape[1]

    for a in tqdm(range(num_attributes)):
        # Assign 0/1 labels to all the examples: 1 where the attribute is
        # present (> 0). Vectorised; the old per-index membership test was
        # quadratic in the number of samples.
        y_tmp = (img_att_seen[:, a] > 0).astype(int)
        y_test = (img_att_unseen[:, a] > 0).astype(int)

        num_examples[a] = {"Num pos train": int(y_tmp.sum()),
                           "Num pos test": int(y_test.sum()),
                           "Num neg train": int((y_tmp == 0).sum()),
                           "Num neg test": int((y_test == 0).sum())}

        # Track the classifier per attribute so a failed fit can never leak
        # the previous attribute's model into `models[a]`.
        last_fitted = None
        for s in range(n_iter):
            # Train logistic models for different splits of train and validation
            X_train, X_val, y_train, y_val = train_test_split(X_tmp, y_tmp,
                                                              test_size=0.33,
                                                              random_state=s)

            try:
                clf = LogisticRegression(random_state=seed,
                                         max_iter=max_iter,
                                         class_weight='balanced'
                                         ).fit(X_train, y_train)
                last_fitted = clf

                y_val_pred = clf.predict(X_val)
                valpoint = clf.score(X_val, y_val)
                balanced_val = balanced_accuracy_score(y_val, y_val_pred)

                y_test_pred = clf.predict(X_test)
                testpoint = clf.score(X_test, y_test)
                balanced_test = balanced_accuracy_score(y_test, y_test_pred)

            except ValueError:
                print('Not samples from the two classes -> Assign -100 to identify nan')
                scores[a] += [(-100, -100, -100, -100)]
                continue

            scores[a] += [(valpoint, balanced_val,
                           testpoint, balanced_test)]

        models[a] = last_fitted

    # Make sure the output folder exists before writing the pickles.
    os.makedirs(f'results/{dataset}detectors', exist_ok=True)

    with open(f'results/{dataset}detectors/score_detectors_held_out.pickle', 'wb') as handle:
        pickle.dump(scores,
                    handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(f'results/{dataset}detectors/models_held_out.pickle', 'wb') as handle:
        pickle.dump(models,
                    handle, protocol=pickle.HIGHEST_PROTOCOL)

    return scores, num_examples, models


In [None]:
# Call with keyword arguments: the previous positional call supplied nine
# values for ten required parameters (TypeError) and shifted every argument
# after `unseen_samples` into the wrong slot. The held-out trainval subset
# (`train`, `train_classes`) is the "seen" side; the test-unseen split is the
# "unseen" side. `class_attributes` is no longer passed because the function
# signature has no parameter for it.
scores, num_examples, models = train_data_loaders(
    class_idx=class_idx,
    unseen_classes=test_classes,
    seen_classes=train_classes,
    seen_samples=train,
    unseen_samples=test,
    image_attr=image_attr,
    embeddings=embeddings,
    keep_train=None,  # not used by the function body
    index_train=index_train,
    dataset=dataset,
    seed=50, n_iter=1, max_iter=700)