In [None]:
import csv
#import io
#import matplotlib.pyplot as plt
import numpy as np
#import pandas as pd
import tensorflow as tf
import utils

from sklearn.model_selection import StratifiedShuffleSplit
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow_addons.metrics import F1Score

In [None]:
# Name of the dataset the experiment is run on (used as the key into DATASETS)
TEST_SET = 'eanci'

# Fraction of the data used for training (passed as train_size to the stratified
# split); the remainder is held out for testing. Ex: 0.8 is an 80/20 train/test split.
# Float on interval (0.0, 1.0)
TRAIN_TEST_SPLIT = 0.9

# Ratio defining how many synthetic minority samples should be created.
# 1.0 results in a fully balanced set.
# Float on interval (0.0, 1.0]
BALANCE_RATIO = 1.0

# Number of rounds of active learning to perform
AL_ROUNDS = 5

# Fraction of the valid links to add in each round of active learning.
# NOTE: the ratio is applied to the number of valid links in the full dataset.
AL_STEP_SIZE = 0.05

# Number of trials of the experiment
N_TRIALS = 1

# Training epochs
N_EPOCHS = 15

# Training batch size
BATCH_SIZE = 32

# Dimension of the embedding layer
EMBEDDING_DIM = 8

# Metrics used to measure training performance
METRICS = [
    'loss',
    'binary_accuracy',
    'recall',
    'precision',
    'f1_score',
]

# TensorBoard callback that writes its logs to the 'logs' folder.
# It is not active by default: the `callbacks=[TENSORBOARD_CALLBACK]` argument of
# model.fit() in the training loop is commented out and must be re-enabled to use it.
TENSORBOARD_CALLBACK = tf.keras.callbacks.TensorBoard(log_dir='logs')

# Verbosity: 0 = silent, 1 = progress bar, 2 = one line per epoch
# TRAIN_VERBOSITY is passed to model.fit(), TEST_VERBOSITY to model.evaluate().
TRAIN_VERBOSITY = 1
TEST_VERBOSITY = 1

In [None]:
# Dataset name -> (data file, labels file).
# Every dataset follows the same layout: data/<name>/<name>_data.txt and
# data/<name>/<name>_labels.txt. 'kepler' is currently disabled; add it to the
# tuple below to re-enable it.
DATASET_PATHS = {
    name: (f'data/{name}/{name}_data.txt', f'data/{name}/{name}_labels.txt')
    for name in ('albergate', 'eanci', 'etour', 'itrust', 'modis', 'smos')
}

In [None]:
# Load all datasets and associated labels.
# DATASETS maps dataset name -> (documents, labels, mean document length), where the
# mean length is the truncated average number of whitespace-separated tokens.

DATASETS = {}

for set_name, paths in DATASET_PATHS.items():
    # Load dataset metadocs: one per line, keeping the first field of each parsed row
    with open(paths[0], newline='') as datafile:
        data = [row[0] for row in csv.reader(datafile, delimiter='\n')]

    # Load dataset labels: one integer per line
    # (presumably aligned with the metadocs by position -- verify against the data files)
    with open(paths[1], newline='') as labelfile:
        labels = [int(row[0]) for row in csv.reader(labelfile, delimiter='\n')]

    # Token count of every metadoc, used for the per-dataset average below
    lengths = [len(seq.split()) for seq in data]

    DATASETS[set_name] = (
        np.array(data, dtype=object),
        np.array(labels),
        int(sum(lengths) / len(lengths)),
    )

In [None]:
# Use the text vectorization layer to normalize, split, and map strings to integers.
# The layer is created once with default settings and shared by every model built in
# the training loop; its vocabulary is assigned there with set_vocabulary() before
# each model is built.
vectorize_layer = TextVectorization()

In [None]:
# Generate a list of N_TRIALS unique random seeds in [0, 1000) to use with the
# StratifiedShuffleSplit objects (one seed per trial).
unique_seeds = set()

# Keep drawing until enough distinct values have been collected; duplicates are
# absorbed by the set.
while len(unique_seeds) < N_TRIALS:
    unique_seeds.add(np.random.randint(0, 1000))

RANDOM_SEEDS = list(unique_seeds)

In [None]:
# Active-learning experiment.
# For each trial: hold out a stratified test set, start from a small balanced training
# set, then for AL_ROUNDS rounds train a fresh model, evaluate it on the test set and
# move the most uncertain remaining samples into the training set.

# Store metric averages for each trial
# NOTE(review): trial_averages / trial_history are initialised but never filled in by
# this cell; the model.evaluate() results are only printed.
trial_averages = {metric: [] for metric in METRICS}

# Documents, integer labels (1 = valid link, 0 = invalid link) and mean document length
# of the dataset under test.
all_docs, all_labels, mean_doc_length = DATASETS[TEST_SET]

# Number of valid links in the full dataset and how many of them to add per AL round.
# Both are loop-invariant, so they are computed once.
total_valid = int(np.sum(all_labels))
AL_VALID_LINKS = int(AL_STEP_SIZE * total_valid)

# Share of the *full* dataset used as the initial training set. It is divided by
# TRAIN_TEST_SPLIT below because the initial split is taken from the training portion
# only (0.1 / 0.9 for the default configuration).
INITIAL_TRAIN_FRACTION = 0.1

# Perform N_TRIALS of experiment
for trial, RANDOM_SEED in enumerate(RANDOM_SEEDS):

    # Store metric averages for each fold of a single trial
    trial_history = {metric: [] for metric in METRICS}

    skf = StratifiedShuffleSplit(n_splits=1, train_size=TRAIN_TEST_SPLIT, random_state=RANDOM_SEED)

    print('\n\n******************** TRIAL %d ********************' % (trial + 1))
    for train, test in skf.split(all_docs, all_labels):

        # Class counts of the held-out test set (reported in the log only).
        # The invalid count is relative to the test set, not the whole dataset.
        test_valid = int(np.sum(all_labels[test]))
        test_invalid = len(test) - test_valid

        # The initial split uses a fixed seed, so only the train/test split varies per trial.
        initial = StratifiedShuffleSplit(
            n_splits=1,
            train_size=(INITIAL_TRAIN_FRACTION / TRAIN_TEST_SPLIT),
            random_state=123
        )

        for initial_pos, remaining_pos in initial.split(all_docs[train], all_labels[train]):

            # split() yields positions *within the train subset*; map them back to
            # indices into the full dataset so test samples can never be selected.
            initial_set = train[initial_pos]
            remaining_data = train[remaining_pos]

            # Create balanced initial training set
            # (utils.balance_data presumably synthesises minority samples -- see BALANCE_RATIO)
            data, labels = utils.balance_data(
                all_docs[initial_set],
                all_labels[initial_set],
                mean_doc_length,
                BALANCE_RATIO
            )

            for iteration in range(AL_ROUNDS):

                # This will cause the model to build an index of strings to integers.
                # Per TF: It's important to only use training data when creating vocabulary (using the test set would leak information).
                vectorize_layer.set_vocabulary(utils.get_vocabulary(data))

                input_dim = len(vectorize_layer.get_vocabulary())

                # Embed vocabulary into embedding_dim dimensions.
                # Embedding tutorial uses size, Text Classification tutorial uses size + 1 for input_dim
                embedding_layer = tf.keras.layers.Embedding(input_dim, EMBEDDING_DIM, name='embedding')

                # Define model structure (a fresh model is trained from scratch every round)
                model = Sequential([
                    vectorize_layer,
                    embedding_layer,
                    #Dropout(0.2),
                    GlobalAveragePooling1D(),
                    #Dropout(0.2),
                    Dense(16, activation='relu'),
                    Dense(1, activation='sigmoid')
                ])

                # Create model
                model.compile(
                    optimizer='adam',
                    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), # tutorials use true for training, false for production
                    metrics=[tf.metrics.BinaryAccuracy(threshold=0.5), tf.keras.metrics.Recall(), tf.keras.metrics.Precision(), F1Score(1, threshold=0.5)]
                )

                print('\n\n*************** TRAIN SET - %d%% ***************' %(iteration*10+10))

                print('TRAIN:', len(data))
                print('TEST:', len(test), test_valid, '(valid)', test_invalid, '(invalid)')
                print('REMAIN:', len(remaining_data))

                print('\n******* TRAIN *******')
                # Train model
                # Verbosity: 0 = silent, 1 = progress bar, 2 = one line per epoch
                history = model.fit(
                    data,
                    labels,
                    batch_size=BATCH_SIZE,
                    epochs=N_EPOCHS,
                    #callbacks=[TENSORBOARD_CALLBACK],
                    verbose=TRAIN_VERBOSITY
                )

                print('\n******* TEST *******')
                # Test
                model.evaluate(
                    all_docs[test],
                    all_labels[test],
                    verbose=TEST_VERBOSITY
                )

                print('\n******* PREDICT *******')
                # Active learning step
                predictions = model.predict(all_docs[remaining_data])

                # Find weakest predictions and append them to training set
                if iteration < AL_ROUNDS - 1:
                    predictions = np.concatenate(predictions, axis=0)
                    entropies = np.zeros((len(predictions),), dtype=float)

                    for k in range(len(predictions)):
                        entropies[k] = utils.calculate_entropy(predictions[k])

                    # Positions into remaining_data, highest entropy (weakest prediction) first
                    ranked = np.argsort(entropies)[::-1]
                    ranked_labels = all_labels[remaining_data[ranked]]

                    # Take up to AL_VALID_LINKS valid links, then fill up to 2*AL_VALID_LINKS
                    # samples in total with invalid links. Slicing (instead of a zero-filled
                    # buffer) means that when too few candidates exist nothing spurious is added.
                    valid_picks = ranked[ranked_labels == 1][:AL_VALID_LINKS]
                    invalid_picks = ranked[ranked_labels == 0][:2 * AL_VALID_LINKS - len(valid_picks)]
                    to_add = np.concatenate((valid_picks, invalid_picks))

                    # Move the selected samples from the remaining pool into the training set
                    selected = remaining_data[to_add]
                    data = np.concatenate((data, all_docs[selected]), axis=0)
                    labels = np.concatenate((labels, all_labels[selected]), axis=0)

                    remaining_data = np.setdiff1d(remaining_data, selected)

                #print(history.history.items())