In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# local helper modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [8]:
from nus_wide_data_util import TwoPartyNusWideDataLoader, retrieve_top_k_labels


def get_data_with_multi_classes(data_dir, labels):
    """Load the training split in multi-class mode and print what was loaded.

    Args:
        data_dir: Dataset directory handed to ``TwoPartyNusWideDataLoader``.
        labels: Target labels forwarded to the loader as ``target_labels``
            (this notebook passes either a list of label names or ``None``).

    Returns:
        None. The requested labels and the ``.shape`` of the loaded image
        features, text features and labels are printed instead.
    """
    print("[INFO] target_label: {0}".format(labels))

    loader = TwoPartyNusWideDataLoader(data_dir)
    # NOTE(review): the original author's note suggests `get_test_data` takes the
    # same keyword arguments if the test split should be inspected instead.
    image_part, text_part, label_part = loader.get_train_data(
        target_labels=labels,
        binary_classification=False,
    )

    # One (message template, loaded object) pair per line of the shape report.
    shape_reports = (
        ("[INFO] image shape: {}", image_part),
        ("[INFO] text shape: {}", text_part),
        ("[INFO] labels shape: {}", label_part),
    )
    for template, loaded in shape_reports:
        print(template.format(loaded.shape))


def get_data_with_binary_classes(data_dir, labels):
    """Load the training split in binary-classification mode and print its shapes.

    Args:
        data_dir: Dataset directory handed to ``TwoPartyNusWideDataLoader``.
        labels: Target labels forwarded to the loader as ``target_labels``
            (this notebook passes lists of one, two or three label names).

    Returns:
        None. The ``.shape`` of the loaded image features, text features and
        labels is printed instead.
    """
    # The loader is built with binary_negative_label=0 for this mode.
    loader = TwoPartyNusWideDataLoader(data_dir, binary_negative_label=0)
    # NOTE(review): the original author's note suggests `get_test_data` takes the
    # same keyword arguments if the test split should be inspected instead.
    image_part, text_part, label_part = loader.get_train_data(
        target_labels=labels,
        binary_classification=True,
    )

    # One (message template, loaded object) pair per line of the shape report.
    shape_reports = (
        ("[INFO] image shape: {}", image_part),
        ("[INFO] text shape: {}", text_part),
        ("[INFO] labels shape: {}", label_part),
    )
    for template, loaded in shape_reports:
        print(template.format(loaded.shape))


### Load top 10 labels: 

'sky', 'clouds', 'person', 'water', 'animal', 'grass', 'buildings', 'window', 'plants', 'lake'


In [6]:
# Placeholder value: replace it with your local dataset directory before running
# the cells below (presumably the NUS-WIDE data root; confirm against
# nus_wide_data_util).
data_dir = "Input Your Directory Here"

In [23]:
# Ask the data utility for the top 10 labels; how labels are ranked is decided
# inside retrieve_top_k_labels.
top_k_labels = retrieve_top_k_labels(data_dir, top_k=10)
print(top_k_labels)

['sky', 'clouds', 'person', 'water', 'animal', 'grass', 'buildings', 'window', 'plants', 'lake']


### Load data with 10 classes using `get_data_with_multi_classes`

In [34]:
# The same ten labels that retrieve_top_k_labels printed above, written out explicitly.
target_labels = ['sky', 'clouds', 'person', 'water', 'animal', 'grass', 'buildings', 'window', 'plants', 'lake']
get_data_with_multi_classes(data_dir, target_labels)

[INFO] target_label: ['sky', 'clouds', 'person', 'water', 'animal', 'grass', 'buildings', 'window', 'plants', 'lake']
[INFO] load data with labels:['sky', 'clouds', 'person', 'water', 'animal', 'grass', 'buildings', 'window', 'plants', 'lake'] for multi-classification.
[INFO] load rows with valid label (58619, 10)
[INFO] load image feature (Train_Normalized_CM55.dat) with (225) dimension.
[INFO] load image feature (Train_Normalized_EDH.dat) with (73) dimension.
[INFO] load image feature (Train_Normalized_CH.dat) with (64) dimension.
[INFO] load image feature (Train_Normalized_WT.dat) with (128) dimension.
[INFO] load image feature (Train_Normalized_CORR.dat) with (144) dimension.
[INFO] load all image feature with shape: (161789, 634)
[INFO] load all text feature (Train_Tags1k.dat) with shape: (161789, 1000).
[INFO] image shape: (58619, 634)
[INFO] text shape: (58619, 1000)
[INFO] labels shape: (58619, 10)


### Load data with `target_labels = None` using `get_data_with_multi_classes`, which loads the top 5 classes by default

In [35]:
# None leaves the label choice to the loader; the recorded output shows it
# selecting 'sky', 'clouds', 'person', 'water' and 'animal'.
target_labels = None
get_data_with_multi_classes(data_dir, target_labels)

[INFO] target_label: None
[INFO] load data with labels:['sky', 'clouds', 'person', 'water', 'animal'] for multi-classification.
[INFO] load rows with valid label (60157, 5)
[INFO] load image feature (Train_Normalized_CM55.dat) with (225) dimension.
[INFO] load image feature (Train_Normalized_EDH.dat) with (73) dimension.
[INFO] load image feature (Train_Normalized_CH.dat) with (64) dimension.
[INFO] load image feature (Train_Normalized_WT.dat) with (128) dimension.
[INFO] load image feature (Train_Normalized_CORR.dat) with (144) dimension.
[INFO] load all image features with shape: (161789, 634)
[INFO] load all text features (Train_Tags1k.dat) with shape: (161789, 1000).
[INFO] image shape: (60157, 634)
[INFO] text shape: (60157, 1000)
[INFO] labels shape: (60157, 5)


### Loading data with 2 classes using `get_data_with_multi_classes` raises an exception

In [36]:
# Intentional failure demo: per the recorded output, two labels are rejected in
# multi-class mode. The error is caught and printed so that the rest of the
# notebook still runs.
target_labels = ['sky', 'clouds']
try:
    get_data_with_multi_classes(data_dir, target_labels)
except Exception as exc:
    print(f"Exception occur:{exc}")

[INFO] target_label: ['sky', 'clouds']
Exception occur:Multi-classification does not support the number of classes smaller than or equal to 2


### Load data with 2 classes using `get_data_with_binary_classes` 

In [37]:
# Two explicit labels: the recorded output reports this as ['sky'] vs ['person'].
target_labels = ["sky", "person"]
get_data_with_binary_classes(data_dir, target_labels)

[INFO] load data with labels:['sky'] vs ['person'] for binary-classification.
[INFO] load rows with valid label (68278, 2)
[INFO] load image feature (Train_Normalized_CM55.dat) with (225) dimension.
[INFO] load image feature (Train_Normalized_EDH.dat) with (73) dimension.
[INFO] load image feature (Train_Normalized_CH.dat) with (64) dimension.
[INFO] load image feature (Train_Normalized_WT.dat) with (128) dimension.
[INFO] load image feature (Train_Normalized_CORR.dat) with (144) dimension.
[INFO] load all image features with shape: (161789, 634)
[INFO] load all text features (Train_Tags1k.dat) with shape: (161789, 1000).
[INFO] # of positive samples: 40812, # of negative samples: 27466
[INFO] image shape: (68278, 634)
[INFO] text shape: (68278, 1000)
[INFO] labels shape: (68278,)


### Load data with only one target label using `get_data_with_binary_classes` 

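**When using `get_data_with_binary_classes` with only one target label, that label is treated as the positive label, and the remaining labels, drawn from the top-K labels (top 5 here), are treated as negative labels. As the outputs below show, five labels are used in total: `water` vs. `sky`, `clouds`, `person`, `animal`, and `plants` vs. `sky`, `clouds`, `person`, `water`.**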

In [38]:
# Single target label that is itself one of the top 5 labels.
target_labels = ["water"]
get_data_with_binary_classes(data_dir, target_labels)

[INFO] load data with labels:['water'] vs ['sky', 'clouds', 'person', 'animal'] for binary-classification.
[INFO] load rows with valid label (60157, 5)
[INFO] load image feature (Train_Normalized_CM55.dat) with (225) dimension.
[INFO] load image feature (Train_Normalized_EDH.dat) with (73) dimension.
[INFO] load image feature (Train_Normalized_CH.dat) with (64) dimension.
[INFO] load image feature (Train_Normalized_WT.dat) with (128) dimension.
[INFO] load image feature (Train_Normalized_CORR.dat) with (144) dimension.
[INFO] load all image features with shape: (161789, 634)
[INFO] load all text features (Train_Tags1k.dat) with shape: (161789, 1000).
[INFO] # of positive samples: 5454, # of negative samples: 54703
[INFO] image shape: (60157, 634)
[INFO] text shape: (60157, 1000)
[INFO] labels shape: (60157,)


In [40]:
# Single target label from outside the top 5 ('plants' is ninth in the top-10 list above).
target_labels = ["plants"]
get_data_with_binary_classes(data_dir, target_labels)

[INFO] load data with labels:['plants'] vs ['sky', 'clouds', 'person', 'water'] for binary-classification.
[INFO] load rows with valid label (51702, 5)
[INFO] load image feature (Train_Normalized_CM55.dat) with (225) dimension.
[INFO] load image feature (Train_Normalized_EDH.dat) with (73) dimension.
[INFO] load image feature (Train_Normalized_CH.dat) with (64) dimension.
[INFO] load image feature (Train_Normalized_WT.dat) with (128) dimension.
[INFO] load image feature (Train_Normalized_CORR.dat) with (144) dimension.
[INFO] load all image features with shape: (161789, 634)
[INFO] load all text features (Train_Tags1k.dat) with shape: (161789, 1000).
[INFO] # of positive samples: 4835, # of negative samples: 46867
[INFO] image shape: (51702, 634)
[INFO] text shape: (51702, 1000)
[INFO] labels shape: (51702,)


### Load data with more than two classes using `get_data_with_binary_classes`, which raises an exception

In [41]:
# Intentional failure demo: per the recorded output, three labels are rejected in
# binary mode. The error is caught and printed so that the rest of the notebook
# still runs.
target_labels = ["sky", "person", "plants"]
try:
    get_data_with_binary_classes(data_dir, target_labels)
except Exception as exc:
    print(f"Exception occur:{exc}")

Exception occur:binary classification does not support 3 # of labels, which are ['sky', 'person', 'plants'].
