----
## Pseudo Labeling

In [1]:
import numpy as np
from tqdm import tqdm
import pandas as pd
from os.path import join as opj
from mpl_toolkits.mplot3d import Axes3D
import pylab
import pickle
from operator import itemgetter

from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = 10, 10
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score

from keras import optimizers
from keras.layers import Dropout
from keras.utils import to_categorical
from keras.models import load_model

import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


### Predict the test data labels using first pass VGG16 model

Here we use the model produced by the VGG16 first pass to predict the labels of the unlabeled 'test.json' dataset provided.

In [2]:
# Load the datasets
# test.json is the unlabeled dataset that the first-pass model will score.
df_validation = pd.read_json('test.json')
# Open the pickle through a context manager so the file handle is closed as
# soon as the object has been loaded (a bare open() leaves it dangling).
# NOTE(review): pickle executes arbitrary code on load; only use trusted files.
with open('HH_HV_Com_validation_75.p', 'rb') as handle:
    Valid_75 = pickle.load(handle)
# First-pass VGG16 model saved by an earlier notebook.
vgg_Pseudo_dec5 = load_model('models/vgg16.h5')

In [3]:
# Score the unlabeled images with the first pass VGG16 model.
# Column 1 of the prediction matrix is kept as a float vector
# (presumably the iceberg-class probability -- TODO confirm class order).
yproba_vgg_aug_params_dec5_Pseudo = (
    vgg_Pseudo_dec5
    .predict(Valid_75)[:, 1]
    .astype(float)
)

In [4]:
# Function to create csv file for kaggle submission
def output_submission(yproba, df_validation, filename='First_Pass_VGG16.csv'):
    """Write a submission CSV with the columns ``id`` and ``is_iceberg``.

    Parameters
    ----------
    yproba : array-like
        One predicted probability per row of ``df_validation``. The values are
        flattened to 1-D before use, so both ``(n,)`` and ``(n, 1)`` inputs work.
    df_validation : pandas.DataFrame
        Frame that provides the ``id`` column.
    filename : str, optional
        Destination CSV path. Defaults to ``'First_Pass_VGG16.csv'``.

    Raises
    ------
    ValueError
        If the number of probabilities differs from the number of ids.
    """
    ids = df_validation['id'].values
    probas = np.asarray(yproba).ravel()

    if len(probas) != len(ids):
        raise ValueError(
            'yproba has %d values but df_validation has %d rows'
            % (len(probas), len(ids))
        )

    # Pair ids and probabilities by position. Concatenating two frames along
    # axis=1 aligns on index labels instead, which pads with NaN and misaligns
    # rows whenever df_validation does not carry a default 0..n-1 index.
    df_submission = pd.DataFrame({'id': ids, 'is_iceberg': probas},
                                 columns=['id', 'is_iceberg'])
    df_submission.to_csv(filename, index=False)

In [5]:
# Persist the first pass predictions; the CSV is read back by the pseudo labeling step.
output_submission(
    yproba=yproba_vgg_aug_params_dec5_Pseudo,
    df_validation=df_validation,
)

### Create pseudo-labeled dataset for input to VGG16

The pseudo-labeling method is taken from the following paper on machine learning for SAR images: <br> Gao, F., Yue, Z., Wang, J., Sun, J., Yang, E., & Zhou, H. A Novel Active Semi-Supervised Convolutional Neural Network Algorithm for SAR Image Recognition. Computational Intelligence and Neuroscience. <br>

The pseudo-labeling technique takes the predicted probability (yproba) of each unlabeled observation, selects the most confident predictions, and merges them, with their predicted labels, back into the original labeled dataset. This expands the size of the training set used to retrain the CNN.

In [11]:
# Function taking in the num images of each class, and writing out the pseudo labeled dataset with that many additional images
def Pseudo_Label(num_images):
    """Build the pseudo labeled training set and pickle it under ``test_train/``.

    The first-pass probabilities are read from 'First_Pass_VGG16.csv'. The
    ``num_images`` rows with the lowest probability and the ``num_images`` rows
    with the highest probability are selected, thresholded at 0.5 into hard
    0/1 labels, and appended (together with the matching entries of the
    notebook-level ``Valid_75`` array) to the original training data loaded
    from 'HH_HV_Com_75.p' and 'train.json'.

    Parameters
    ----------
    num_images : int
        Number of observations taken from each end of the sorted probabilities.

    Returns
    -------
    None
        The merged images and labels are written to
        'test_train/HH_HV_Com_Pseudo_vgg16_Dec5.p' and
        'test_train/y_Pseudo_vgg16_Dec5.p'.

    Raises
    ------
    ValueError
        If ``num_images`` is negative, if ``2 * num_images`` exceeds the number
        of predictions (the two selections would overlap and duplicate rows),
        or if there are more predictions than images in ``Valid_75``.
    """
    import os  # local import: only needed to create the output directory

    if num_images < 0:
        raise ValueError('num_images must be non-negative, got %r' % (num_images,))

    # Load predicted labels from previous VGG16 data. Every column after the
    # first one is averaged row-wise into a single probability per observation.
    predictions = pd.read_csv('First_Pass_VGG16.csv')
    probas = predictions.iloc[:, 1:].mean(axis=1).values

    if 2 * num_images > len(probas):
        raise ValueError(
            'cannot select 2 * %d images from only %d predictions'
            % (num_images, len(probas))
        )
    if len(probas) > len(Valid_75):
        raise ValueError(
            'found %d predictions but only %d images in Valid_75'
            % (len(probas), len(Valid_75))
        )

    # Sort by probability (i.e. confidence). A stable sort keeps tied
    # probabilities in their original row order.
    order = np.argsort(probas, kind='mergesort')

    # Take num_images rows from each end. The tail is sliced from an explicit
    # start position because order[-num_images:] returns *everything* when
    # num_images == 0.
    tail_start = len(order) - num_images
    selected = np.concatenate((order[:num_images], order[tail_start:]))

    # Force labels to be 0 or 1
    y_valid_vgg16 = (probas[selected] > .5).astype(float)

    # Images are matched to probabilities by row position.
    HH_HV_Com_Valid_75_vgg16 = np.asarray(Valid_75)[selected]

    # Load training set and merge with Pseudo labeled set
    # NOTE(review): pickle executes arbitrary code on load; only use trusted files.
    with open('HH_HV_Com_75.p', 'rb') as handle:
        HH_HV_Com_75 = pickle.load(handle)
    df_train = pd.read_json('train.json')
    y = np.array(df_train.is_iceberg)
    HH_HV_Com_Pseudo_vgg16 = np.concatenate((HH_HV_Com_75, HH_HV_Com_Valid_75_vgg16), axis=0)
    y_Pseudo_vgg16 = np.concatenate((y, y_valid_vgg16), axis=0)

    # Make sure the output directory exists so the dumps below cannot fail
    # after all the work above has been done.
    if not os.path.isdir('test_train'):
        os.makedirs('test_train')

    with open('test_train/HH_HV_Com_Pseudo_vgg16_Dec5.p', 'wb') as handle:
        pickle.dump(HH_HV_Com_Pseudo_vgg16, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('test_train/y_Pseudo_vgg16_Dec5.p', 'wb') as handle:
        pickle.dump(y_Pseudo_vgg16, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# Write the pseudo labeled dataset to disk, taking 1600 observations
# from each end of the sorted first pass probabilities.
Pseudo_Label(num_images=1600)