### Import Libraries

In [1]:
import tensorflow as tf
from tensorflow.image import resize
from tensorflow.keras.backend import clear_session
from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from keras.metrics import  Recall, CategoricalAccuracy
from IPython.display import clear_output

from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import concatenate as concat
from scipy.stats import entropy
import os

#A custom library for helper functions
from src.helper import *
np.random.seed(0)

### Build Datasets

In [2]:
path=os.path.join(os.getcwd(),'train')
label_dict={'cat':0,'dog':1}
dataset=np.array([(os.path.join(path,i),label_dict[i.split('.')[0]]) for i in os.listdir(path)])

In [3]:
dataset[0:3]

array([['C:\\Users\\arind\\Documents\\Active_Learning\\dataset\\cat.0.jpg',
        '0'],
       ['C:\\Users\\arind\\Documents\\Active_Learning\\dataset\\cat.1.jpg',
        '0'],
       ['C:\\Users\\arind\\Documents\\Active_Learning\\dataset\\cat.10.jpg',
        '0']], dtype='<U62')

In [4]:
X,y=dataset[::,0],dataset[::,1]
y = y.astype(int)
y = to_categorical(y)

#Shuffle the dataset
p = np.random.permutation(len(X))
X,y = X[p], y[p]

#Strip off 10% samples for hold out test set
test_idxs = np.random.choice(len(X), size=int(0.1*len(X)), replace=False, p=None)
x_test, y_test = X[test_idxs],y[test_idxs]

#Delete the test set samples from X,y 
X = np.delete(X, test_idxs)
y = np.delete(y, test_idxs, axis = 0)

#usual train-val split
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.11, random_state=42)

In contrast to the previous notebook , in this experiment we will not use the full training set. Instead we will use only a subset of 7000 samples. Feel free to change the initial_seed yourself, say 3000. We will use a __seed__ dataset to build the initial model and keep the remaining samples in a __pool__

In [5]:
initial_seed = 7000
x_seed , x_pool = x_train[0:initial_seed], x_train[initial_seed:]
y_seed , y_pool = y_train[0:initial_seed], y_train[initial_seed:]

In [6]:
print(f"Samples in Seed set: {x_seed.shape[0]}")
print(f"Samples in Pool: {x_pool.shape[0]}")
print(f"Samples in Validation set: {x_val.shape[0]}")
print(f"Samples in Test set: {x_test.shape[0]}")

Samples in Seed set: 7000
Samples in Pool: 13025
Samples in Validation set: 2475
Samples in Test set: 2500


A quick check for data imbalance 

In [7]:
for i in [y_seed, y_pool, y_test, y_val]:
    print(np.unique(i, return_counts = True, axis = 0))

(array([[0., 1.],
       [1., 0.]], dtype=float32), array([3559, 3441], dtype=int64))
(array([[0., 1.],
       [1., 0.]], dtype=float32), array([6526, 6499], dtype=int64))
(array([[0., 1.],
       [1., 0.]], dtype=float32), array([1211, 1289], dtype=int64))
(array([[0., 1.],
       [1., 0.]], dtype=float32), array([1204, 1271], dtype=int64))


We build the tensorflow dataset objects again. Note how the __train_dataset__ is now built from the seed set.

In [8]:
#The buid_dataset is a custom function that returns tensor batches

val_dataset=build_dataset(x_val,y_val,repeat=False,batch=256)
test_dataset=build_dataset(x_test,y_test,repeat=False,batch=256)
pool_dataset=build_dataset(x_pool,y_pool,repeat=False,batch=256, shuffle = False)

BATCH_SIZE=16
STEPS_PER_EPOCH=len(x_train)/BATCH_SIZE

train_dataset=build_dataset(x_seed,y_seed,batch=BATCH_SIZE)
input_shape=train_dataset.element_spec[0].shape[1:]

### Build Seed Model

In [9]:
model=simple_model(input_shape)
model.compile(
        loss = "categorical_crossentropy",
        optimizer = Adam(),
        metrics= CategoricalAccuracy()
    )
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 64, 64, 32)        896       
                                                                 
 batch_normalization (BatchN  (None, 64, 64, 32)       128       
 ormalization)                                                   
                                                                 
 max_pooling2d (MaxPooling2D  (None, 32, 32, 32)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 32, 32, 32)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 30, 30, 64)        18496     
                                                                 
 batch_normalization_1 (Batc  (None, 30, 30, 64)       2

In [10]:
checkpoint=ModelCheckpoint(filepath='model/model_al.h5',
                           monitor='val_loss',save_best_only=True,verbose=1)

csv_logger=keras.callbacks.CSVLogger('logger/trainlog_al.csv',
                                     separator=',',append=False)

early_stopper=keras.callbacks.EarlyStopping(monitor='val_loss',
                                            min_delta=0.001,
                                            restore_best_weights=True,
                                            patience=10)

callbacks_list=[checkpoint,early_stopper,csv_logger]

In [11]:

model.fit(train_dataset,steps_per_epoch=STEPS_PER_EPOCH,epochs=200,
          validation_data=val_dataset,validation_steps=None,
          callbacks=callbacks_list)


Epoch 1/200
Epoch 1: val_loss improved from inf to 0.55950, saving model to model\model_al.h5
Epoch 2/200
Epoch 2: val_loss did not improve from 0.55950
Epoch 3/200
Epoch 3: val_loss did not improve from 0.55950
Epoch 4/200
Epoch 4: val_loss improved from 0.55950 to 0.42460, saving model to model\model_al.h5
Epoch 5/200
Epoch 5: val_loss did not improve from 0.42460
Epoch 6/200
Epoch 6: val_loss improved from 0.42460 to 0.40107, saving model to model\model_al.h5
Epoch 7/200
Epoch 7: val_loss did not improve from 0.40107
Epoch 8/200
Epoch 8: val_loss improved from 0.40107 to 0.39841, saving model to model\model_al.h5
Epoch 9/200
Epoch 9: val_loss did not improve from 0.39841
Epoch 10/200
Epoch 10: val_loss did not improve from 0.39841
Epoch 11/200
Epoch 11: val_loss did not improve from 0.39841
Epoch 12/200
Epoch 12: val_loss did not improve from 0.39841
Epoch 13/200
Epoch 13: val_loss did not improve from 0.39841
Epoch 14/200
Epoch 14: val_loss did not improve from 0.39841
Epoch 15/200

<keras.callbacks.History at 0x2952a5a20d0>

### Initial Model Evaluation on Test Dataset

Now we have two models

- An initial model trained on the seed dataset
- A baseline model that we got from the 1st notebook , built on the entire training dataset.

How do these two models perform on the test data ?  

In [12]:
model = keras.models.load_model("model/model_al.h5")

In [13]:
print("-" * 100)
print(model.evaluate(test_dataset, verbose=0,return_dict=True))

----------------------------------------------------------------------------------------------------
{'loss': 0.38912251591682434, 'categorical_accuracy': 0.8339999914169312}


### Baseline Model Evaluation on Test Dataset

In [1]:
try:
    
    model_full = keras.models.load_model("model/model_full.h5")
    _, acc_baseline = model_full.evaluate(test_dataset)

    print("-" * 100)
    print(model_full.evaluate(test_dataset, verbose=0,return_dict=True))
    
except FileNotFoundError:
    print("model file model_full.h5 not found. Make sure to run 01_Training_Full.ipynb entirely")

NameError: name 'keras' is not defined

So , the model built on an initial seed data is almost 10% behind in term of accuracy. Can we reach this baseline performance by incrementally querying samples from the pool.  

### Entering the AL Loop

We will now iteratively query the pool for samples and add them to the seed set. Every time we can pick __sampling_size__ number of points from the pool with the largest entropy.

- Step_1: test the current model on the test set. If it exceeds or equals the baseline accuracy , then we exit the AL loop. Otherwise we proceed.
- Step_2: measure the uncertainties in the pool dataset. In other words we query the pool dataset. For this experiment, we will use the entropy measure. Pick the top 200 samples with maximum entropy, append them to the seed dataset and delete them from the pool.
- Step_3: re-compile the model to reset the optimizer states and fit again. Save model if there is an improvement in loss. Go back to Step_1

In [15]:
sampling_size=200
num_iterations = int(x_pool.shape[0]/sampling_size)

In [16]:
al_history = []
csv_logger=keras.callbacks.CSVLogger('logger/trainlog_al.csv',
                                 separator=',',append=True)
callbacks_list=[checkpoint,early_stopper,csv_logger]

In [17]:
for iteration in range(num_iterations):
    
    #Step_1
    loss, acc = model.evaluate(test_dataset, verbose=0)
    print(f"Test Set Accuracy after {iteration} iteration {acc}")
    al_history.append([loss, acc, x_seed.shape[0], x_pool.shape[0]])
    if acc >= acc_baseline:
        print("Terminating Training")
        break
    
    #Step_2
    #Use the current model to predict the pool dataset
    y_pool_proba = model.predict(pool_dataset)
    
    #Pick the index of the top entropy samples in pool
    pool_max_ents = np.argsort(entropy(y_pool_proba.T))[-sampling_size:]
    
    #Acquire those samples from pool
    x_sample = x_pool[pool_max_ents]
    y_sample = y_pool[pool_max_ents]
    
    #Add these samples to the seed dataset
    y_seed = concat((y_seed,y_sample),axis=0)
    x_seed = concat((x_seed,x_sample),axis=0)
     
    #Delete the acquired samples from pool
    x_pool = np.delete(x_pool, pool_max_ents, 0 )
    y_pool = np.delete(y_pool, pool_max_ents, 0 )

    #Build the tensorflow dataset object for this iteration
    pool_dataset = build_dataset(x_pool,y_pool,repeat=False,batch=256,
                                 shuffle = False)
    train_dataset = build_dataset(x_seed,y_seed,batch=BATCH_SIZE) 

    print(f"Samples in seed dataset {x_seed.shape[0]} , in pool dataset {x_pool.shape[0]}")
    print("-" * 100)

    #Step_3
    model.compile(
        loss = "binary_crossentropy",
        optimizer = Adam(),
        metrics = CategoricalAccuracy()
    )
    
    history = model.fit(train_dataset,steps_per_epoch=STEPS_PER_EPOCH,epochs=100,
          validation_data=val_dataset,validation_steps=None,
          callbacks=callbacks_list)
    
    #If the fit method generated a new best model , load it for
    #the next iteration
    model = keras.models.load_model("model/model_al.h5")
    clear_output()
    clear_session()

Test Set Accuracy after 22 iteration 0.8871999979019165
Terminating Training


In [20]:
df = pd.DataFrame(al_history, columns = ['Test Loss', 'Test Accuracy', 'Seed', 'Pool'])
df

Unnamed: 0,Test Loss,Test Accuracy,Seed,Pool
0,0.389123,0.834,7000,13025
1,0.389123,0.834,7200,12825
2,0.368032,0.8376,7400,12625
3,0.387399,0.8452,7600,12425
4,0.369284,0.8556,7800,12225
5,0.369284,0.8556,8000,12025
6,0.343127,0.8576,8200,11825
7,0.343127,0.8576,8400,11625
8,0.343127,0.8576,8600,11425
9,0.36089,0.8568,8800,11225


In [21]:
df.to_csv('logger/AL_tracking.csv', index = False)