In [54]:
import numpy as np
from sklearn.model_selection import train_test_split
from pathlib import Path
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

In [2]:
main_folder = Path('imageclef2011_feats/')
with open('trainset_gt_annotations.txt') as f:
    y_all = []
    x_all = []
    for l in f.readlines():
        y_all.append([int(o) for o in l.split()[1:]])
        filepath = str(main_folder/l.split()[0]).split('.')[0] + '.jpg_ft.npy'
        x_all.append(np.load(filepath))

with open('concepts_2011.txt') as f:
    labels_names = [l.split()[1] for l in f.readlines()[1:]]

## generate dataset

In [3]:
labels_names.index('Spring'), labels_names.index('Summer'), labels_names.index('Autumn'), labels_names.index('Winter')

(9, 10, 11, 12)

In [4]:
X_spring, X_summer, X_autumn, X_winter = [], [], [], []
for xo, yo in zip(x_all, y_all):
    if yo[9] == 1 or yo[10] == 1 or yo[11] == 1 or yo[12] == 1:
        if yo[9] == 1:
            X_spring.append(xo)
        elif yo[10] == 1:
            X_summer.append(xo)
        elif yo[11] == 1:
            X_autumn.append(xo)
        else: 
            X_winter.append(xo)

## First test out spring

In [5]:
train_size = 0.6
val_size = 0.15
test_size = 0.25
# spring
spring_train_x, spring_test_x = np.split(X_spring, [int((train_size+val_size)*len(X_spring))])
spring_train_x, spring_val_x = np.split(spring_train_x, [int(train_size*len(X_spring))])
# Summner
summer_train_x, summer_test_x = np.split(X_summer, [int((train_size+val_size)*len(X_summer))])
summer_train_x, summer_val_x = np.split(summer_train_x, [int(train_size*len(X_summer))])
# Autumn
autumn_train_x, autumn_test_x = np.split(X_autumn, [int((train_size+val_size)*len(X_autumn))])
autumn_train_x, autumn_val_x = np.split(autumn_train_x, [int(train_size*len(X_autumn))])
# Winter
winter_train_x, winter_test_x = np.split(X_winter, [int((train_size+val_size)*len(X_winter))])
winter_train_x, winter_val_x = np.split(winter_train_x, [int(train_size*len(X_winter))])


In [47]:
train_x = np.concatenate([spring_train_x, summer_train_x, autumn_train_x, winter_train_x])
spring_train_y = len(spring_train_x)*[1] + len(summer_train_x)*[0] + len(autumn_train_x)*[0] + len(winter_train_x)*[0]
summer_train_y = len(spring_train_x)*[0] + len(summer_train_x)*[1] + len(autumn_train_x)*[0] + len(winter_train_x)*[0]
autumn_train_y = len(spring_train_x)*[0] + len(summer_train_x)*[0] + len(autumn_train_x)*[1] + len(winter_train_x)*[0]
winter_train_y = len(spring_train_x)*[0] + len(summer_train_x)*[0] + len(autumn_train_x)*[0] + len(winter_train_x)*[1]

val_x = np.concatenate([spring_val_x, summer_val_x, autumn_val_x, winter_val_x])
val_y = len(spring_val_x)*[0] + len(summer_val_x)*[1] + len(autumn_val_x)*[2] + len(winter_val_x)*[3]

test_x = np.concatenate([spring_test_x, summer_test_x, autumn_test_x, winter_test_x])
test_y = len(spring_test_x)*[0] + len(summer_test_x)*[1] + len(autumn_test_x)*[2] + len(winter_test_x)*[3]

## Why split classwise
Split classwise means split the class proportionally between training and test set.
Split randomly is just randomly split the dataset with no constraints so it cannot make sure that the proportions of the class in training, validation and test set is the same.
You asked us to split classwise so that training set is a good representation of the test set.

## using sklearn.svm.SVC using 'RBF' kernel (default)

In [57]:
Cs = [1e-2, 1e-1, 1e-1**0.5, 1, 1e1**0.5, 1e1, 1e2**0.5]
balanced_accs = []

In [58]:
for c in Cs: # Training
    clf_spring = SVC(C=c, probability=True)
    clf_spring.fit(train_x, spring_train_y)
    clf_summer = SVC(C=c, probability=True)
    clf_summer.fit(train_x, summer_train_y)
    clf_autumn = SVC(C=c, probability=True)
    clf_autumn.fit(train_x, autumn_train_y)
    clf_winter = SVC(C=c, probability=True)
    clf_winter.fit(train_x, winter_train_y)
    # val
    val_spring = clf_spring.predict_proba(val_x)[:, 1]
    val_summer = clf_summer.predict_proba(val_x)[:, 1]
    val_autumn = clf_autumn.predict_proba(val_x)[:, 1]
    val_winter = clf_winter.predict_proba(val_x)[:, 1]
    val_prob = np.vstack([val_spring, val_summer, val_autumn, val_winter]).T
    val_predict = val_prob.argmax(axis=1)
    acc = accuracy_score(val, val_y)
    balanced_acc = balanced_accuracy_score(val, val_y)
    balanced_accs.append(balanced_acc)





In [59]:
balanced_accs

[0.7179166666666666,
 0.7179166666666666,
 0.7179166666666666,
 0.7179166666666666,
 0.7179166666666666,
 0.7179166666666666,
 0.7179166666666666]

In [60]:
# test on the test set

In [63]:
train_x = np.concatenate([train_x, val_x])
spring_val_y = len(spring_val_x)*[1] + len(summer_val_x)*[0] + len(autumn_val_x)*[0] + len(winter_val_x)*[0]
summer_val_y = len(spring_val_x)*[0] + len(summer_val_x)*[1] + len(autumn_val_x)*[0] + len(winter_val_x)*[0]
autumn_val_y = len(spring_val_x)*[0] + len(summer_val_x)*[0] + len(autumn_val_x)*[1] + len(winter_val_x)*[0]
winter_val_y = len(spring_val_x)*[0] + len(summer_val_x)*[0] + len(autumn_val_x)*[0] + len(winter_val_x)*[1]
spring_train_y = np.concatenate([spring_train_y, spring_val_y])
summer_train_y = np.concatenate([summer_train_y, summer_val_y])
autumn_train_y = np.concatenate([autumn_train_y, autumn_val_y])
winter_train_y = np.concatenate([winter_train_y, winter_val_y])

In [66]:
c = 0.1

clf_spring = SVC(C=c, probability=True)
clf_spring.fit(train_x, spring_train_y)
clf_summer = SVC(C=c, probability=True)
clf_summer.fit(train_x, summer_train_y)
clf_autumn = SVC(C=c, probability=True)
clf_autumn.fit(train_x, autumn_train_y)
clf_winter = SVC(C=c, probability=True)
clf_winter.fit(train_x, winter_train_y)
# val
val_spring = clf_spring.predict_proba(test_x)[:, 1]
val_summer = clf_summer.predict_proba(test_x)[:, 1]
val_autumn = clf_autumn.predict_proba(test_x)[:, 1]
val_winter = clf_winter.predict_proba(test_x)[:, 1]
val_prob = np.vstack([val_spring, val_summer, val_autumn, val_winter]).T
val_predict = val_prob.argmax(axis=1)
acc = accuracy_score(val, val_y)
balanced_acc = balanced_accuracy_score(val, val_y)
balanced_accs.append(balanced_acc)

