In [None]:
import io
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import utils

from random import sample, shuffle
from sklearn.model_selection import StratifiedShuffleSplit
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow_addons.metrics import F1Score

In [None]:
# Fraction of the mixed training set used to fit the model in each CV split; the
# remainder is used for validation (the value is passed as `train_size`).
# Ex: 0.8 is an 80/20 train/validation split.
# Float on the open interval (0.0, 1.0).
TRAIN_TEST_SPLIT = 0.8

# Fraction of the cross training data to hold out for testing. This test set will
# not change across folds, but will change across trials.
# Ex. ratio=0.1 means a 90/10 split where 90% of a cross training dataset is
# available for training and 10% is held for testing.
CROSS_TRAIN_HOLDOUT_RATIO = 0.1

# Fraction of the cross training dataset to mix into the primary training data.
# NOTE: this cannot be larger than 1 - CROSS_TRAIN_HOLDOUT_RATIO.
# Ex. ratio=0.1 means a sample 10% the size of the ORIGINAL cross training
# dataset is used for cross training. The training data will never overlap with
# the holdout data.
CROSS_TRAIN_RATIO = 0.1

# Number of cross validation folds. Default: 10
N_FOLDS = 10

# Number of trials of n fold cv
N_TRIALS = 10

# Training epochs
N_EPOCHS = 15

# Training batch size
BATCH_SIZE = 32

# Dimension of the embedding layer.
EMBEDDING_DIM = 8

# Metrics to measure training performance
METRICS = ['loss', 'binary_accuracy', 'recall', 'precision', 'f1_score']

# TensorBoard callback; logs are written to the 'logs' folder
TENSORBOARD_CALLBACK = tf.keras.callbacks.TensorBoard(log_dir='logs')

# Verbosity: 0 = silent, 1 = progress bar, 2 = one line per epoch
VERBOSITY = 0

# Fail fast on inconsistent settings instead of erroring deep inside a training run.
assert 0.0 < TRAIN_TEST_SPLIT < 1.0, 'TRAIN_TEST_SPLIT must be in (0.0, 1.0)'
assert 0.0 < CROSS_TRAIN_HOLDOUT_RATIO < 1.0, 'CROSS_TRAIN_HOLDOUT_RATIO must be in (0.0, 1.0)'
assert 0.0 <= CROSS_TRAIN_RATIO <= 1.0 - CROSS_TRAIN_HOLDOUT_RATIO, \
    'CROSS_TRAIN_RATIO can not be larger than 1 - CROSS_TRAIN_HOLDOUT_RATIO'
assert N_FOLDS > 0 and N_TRIALS > 0, 'N_FOLDS and N_TRIALS must be positive'

In [None]:
# Datasets to load. Each one lives in data/<name>/ and provides a
# '<name>_data_balanced.txt' file and a '<name>_labels_balanced.txt' file.
_DATASET_NAMES = (
    'albergate',
    'eanci',
    'etour',
    'itrust',
    #'kepler',
    'modis',
    'smos',
)

# Maps dataset name -> (data file path, labels file path)
DATASET_PATHS = {
    name: (
        'data/{0}/{0}_data_balanced.txt'.format(name),
        'data/{0}/{0}_labels_balanced.txt'.format(name),
    )
    for name in _DATASET_NAMES
}

In [None]:
def format_output(trial_averages):
    """Build the two result tables for one set of per-trial metric averages.

    Args:
        trial_averages: dict mapping a metric name to a list with one value
            per trial.

    Returns:
        A tuple ``(trial_table, averages_table)``. ``trial_table`` has one
        column per metric and one row per trial. ``averages_table`` has one
        column per ``<metric>_avg``; it is indexed by those same names, so every
        row repeats the averages across all trials.
    """
    # Average of each metric across all trials
    result_averages = {
        metric + '_avg': sum(values) / len(values)
        for metric, values in trial_averages.items()
    }

    trial_table = pd.DataFrame(trial_averages)
    averages_table = pd.DataFrame(result_averages, index=result_averages.keys())
    return trial_table, averages_table

In [None]:
def mix_data(data_labels, ct_data_labels, CROSS_TRAIN_HOLDOUT_RATIO, CT_SEED):
    """Mix a sample of the cross-train dataset into the primary dataset.

    The cross-train dataset is split once (stratified by label) into a held-out
    test part and a training pool. A random sample of the training pool is
    appended to the primary data and the combined set is shuffled.

    Reads the notebook globals ``CROSS_TRAIN_RATIO`` (size of the sample as a
    fraction of the whole cross-train dataset) and ``primary_set`` /
    ``cross_train_set`` (used only in the progress printout).

    Args:
        data_labels: ``(documents, labels)`` lists of the primary dataset.
        ct_data_labels: ``(documents, labels)`` lists of the cross-train dataset.
        CROSS_TRAIN_HOLDOUT_RATIO: fraction of the cross-train dataset that is
            held out for testing.
        CT_SEED: seed for the holdout split and for sampling/shuffling.

    Returns:
        ``(training_data, labels, ct_test_data, ct_test_labels)``: the mixed,
        shuffled training documents and labels as numpy arrays, followed by the
        held-out cross-train documents and labels.

    Raises:
        ValueError: if the requested cross-train sample is larger than the
            training pool left after the holdout.
    """
    from random import Random  # local import: dedicated, seeded generator

    ct_all_data = np.array(ct_data_labels[0], dtype=object)
    ct_all_labels = np.array(ct_data_labels[1])

    # The ratio is the HELD-OUT share, so it must be passed as test_size.
    # (Passing it as train_size would hold out 1 - ratio and leave only `ratio`
    # of the data to sample from.)
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=CROSS_TRAIN_HOLDOUT_RATIO, random_state=CT_SEED)
    train_idx, test_idx = next(splitter.split(ct_all_data, ct_all_labels))

    ct_test_data = ct_all_data[test_idx]
    ct_test_labels = ct_all_labels[test_idx]

    # Pool of (document, label) pairs that may be mixed into the primary data
    ct_train_pairs = [(ct_data_labels[0][index], ct_data_labels[1][index]) for index in train_idx]

    n_cross = int(len(ct_all_data) * CROSS_TRAIN_RATIO)
    if n_cross > len(ct_train_pairs):
        raise ValueError(
            'CROSS_TRAIN_RATIO requests %d cross-train samples but only %d remain after the holdout'
            % (n_cross, len(ct_train_pairs))
        )

    print(primary_set, ':', len(data_labels[0]))
    print(cross_train_set, ':', len(ct_all_data))

    # Seed sampling and shuffling from CT_SEED so a given seed reproduces the mix
    rng = Random(None if CT_SEED is None else int(CT_SEED))

    mixed = list(zip(data_labels[0], data_labels[1]))
    mixed.extend(rng.sample(ct_train_pairs, k=n_cross))
    rng.shuffle(mixed)

    training_data = [datapoint for datapoint, _ in mixed]
    labels = [label for _, label in mixed]

    print('Mixed :', len(training_data))

    return np.array(training_data), np.array(labels), ct_test_data, ct_test_labels

In [None]:
def plot_f1scores(averages, datasets, phase):
    """Draw and save a heatmap of F1 scores for every dataset pairing.

    Args:
        averages: dict of F1 scores keyed ``'<primary>|<cross_train>'``.
        datasets: list of dataset names.
        phase: label used in the chart title and in the output file name.

    The x axis lists the primary sets in ascending order, the y axis lists the
    cross-train sets in descending order. Cells where both are the same dataset
    are shown as 0. The figure is saved to ``trials/<phase>_heatmap``.
    """
    # Axis orderings
    x_datasets = sorted(datasets)
    y_datasets = x_datasets[::-1]

    def cell_score(primary, cross_train):
        # Diagonal cells have no score; everything is rounded to two decimals
        raw = 0 if primary == cross_train else averages[primary+'|'+cross_train]
        return float(format(raw, '.2f'))

    f1_scores = np.array([
        [cell_score(primary, cross_train) for primary in x_datasets]
        for cross_train in y_datasets
    ])

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.imshow(f1_scores, cmap='Greys')

    # Axis labels and chart title
    ax.set_xlabel('Primary Set')
    ax.set_ylabel('Cross Train Set')
    ax.set_title(' '.join([phase.upper(), "F1 Scores"]))

    # One labelled tick per dataset on both axes
    ax.set_xticks(np.arange(len(x_datasets)))
    ax.set_yticks(np.arange(len(y_datasets)))
    ax.set_xticklabels(x_datasets)
    ax.set_yticklabels(y_datasets)

    # Rotate the x tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write each score into its cell
    for row_index, row in enumerate(f1_scores):
        for col_index, score in enumerate(row):
            ax.text(col_index, row_index, score,
                    ha="center", va="center", color="r")

    fig.tight_layout()
    plt.savefig('trials/'+phase+'_heatmap')

In [None]:
def save_results(fold_results, history):
    """Append one value per metric from a fold's results to ``history``.

    Args:
        fold_results: dict mapping a reported metric name to its result. A
            result is either a sequence of values (which is averaged) or a
            single value (which is stored as is).
        history: dict mapping a base metric name to a list; it is updated in
            place.

    Returns:
        The same ``history`` dict.

    Reported metric names may carry a ``_(run number)`` suffix, so a reported
    name is matched to the first ``history`` key it contains. ``f1_score``
    results hold one-element sequences instead of plain floats; element 0 of
    each is used.

    Only the errors that signal "this result is a single value, not a sequence"
    (``TypeError`` / ``IndexError``) trigger the fallback; anything else,
    including ``KeyboardInterrupt``, propagates to the caller.
    """
    for current_metric, results in fold_results.items():
        for metric in history:
            if metric not in current_metric:
                continue

            is_f1 = metric == 'f1_score'
            try:
                # Sequence of per-step results: store their mean
                values = [result[0] for result in results] if is_f1 else list(results)
                history[metric].append(sum(values) / len(values))
            except (TypeError, IndexError):
                # Single result: store it directly (unwrapping the F1 container)
                history[metric].append(results[0] if is_f1 else results)

            break
    return history

In [None]:
def _read_lines(path):
    """Return the lines of the text file at ``path`` without their line endings."""
    with open(path) as handle:
        return [line.rstrip('\n') for line in handle]


# Maps dataset name -> (list of documents, list of integer labels)
DATASETS = {}

for dataset, paths in DATASET_PATHS.items():
    # The files hold one document / one label per line. They are read as plain
    # lines rather than through csv.reader, which would apply CSV quote handling
    # to the text (a line starting with a double quote is parsed as a quoted
    # field and can absorb the lines that follow it).
    data = _read_lines(paths[0])
    labels = [int(line) for line in _read_lines(paths[1])]

    # Documents and labels are paired by position, so the counts must agree
    if len(data) != len(labels):
        raise ValueError(
            "Dataset '%s' has %d documents but %d labels" % (dataset, len(data), len(labels))
        )

    DATASETS[dataset] = (data, labels)

In [None]:
# Unique random seeds to use with StratifiedShuffleSplit objects.
# Sampling without replacement guarantees uniqueness and raises a ValueError if
# more seeds are requested than the range contains (a fill-until-full loop over
# a set would never finish in that case).

# One cross-train seed is consumed per ordered (primary, cross-train) pair per trial
_n_ct_seeds = len(DATASETS) * (len(DATASETS) - 1) * N_TRIALS

# One-shot iterator: each next() hands out a fresh cross-train seed
RANDOM_CT_SEEDS = iter(np.random.choice(10000, size=_n_ct_seeds, replace=False).tolist())

# One CV seed per trial, reused for every dataset pair
RANDOM_CV_SEEDS = np.random.choice(1000, size=N_TRIALS, replace=False).tolist()

In [None]:
# Run N_TRIALS trials of N_FOLDS-split cross validation for every ordered
# (primary, cross-train) pair of datasets. For each pair the per-trial metric
# averages are written to an Excel workbook under 'trials/' and the mean F1
# score of each phase is recorded for the heatmaps.

# Use the text vectorization layer to normalize, split, and map strings to integers.
# A single layer instance is shared by every model built below; its vocabulary
# is replaced at the start of each fold.
vectorize_layer = TextVectorization()

# Mean F1 score per dataset pair, keyed 'primary|cross_train'
train_f1scores = {}
validation_f1scores = {}
ct_f1scores = {}

for primary_set, data_labels in DATASETS.items():
    for cross_train_set, ct_data_labels in DATASETS.items():
        if primary_set == cross_train_set:
            # A dataset is not cross trained with itself
            pass
        else:
            print('\n\n*************************', primary_set.upper(), '+', cross_train_set.upper(), '*************************')
            
            # Store metric averages for each trial
            train_averages = dict([(metric,[]) for metric in METRICS])
            validation_averages = dict([(metric,[]) for metric in METRICS])
            ct_averages = dict([(metric,[]) for metric in METRICS])

            # Perform N_TRIALS of N_FOLDS CV
            for i,RANDOM_SEED in enumerate(RANDOM_CV_SEEDS):
                print('\n\n******************** TRIAL %d ********************' %(i+1))
                
                # Build this trial's mixed training set and cross-train test set.
                # NOTE: RANDOM_CT_SEEDS is a one-shot iterator holding exactly one seed per
                # pair per trial, so re-running this cell without re-running the seed cell
                # raises StopIteration here.
                training_data, labels, ct_test_data, ct_test_labels = mix_data(data_labels, ct_data_labels, CROSS_TRAIN_HOLDOUT_RATIO, next(RANDOM_CT_SEEDS))

                # Convert data and labels to numpy arrays for training and testing
                #training_data = np.array(training_data, dtype=object)
                #labels = np.array(labels)
                
                k=1 # Fold counter
                # Store metric averages for each fold of a single trial
                train_history = dict([(metric,[]) for metric in METRICS])
                validation_history = dict([(metric,[]) for metric in METRICS])
                ct_history = dict([(metric,[]) for metric in METRICS])
                
                # N_FOLDS stratified random splits of the mixed set, each with
                # TRAIN_TEST_SPLIT of the data used for training
                skf = StratifiedShuffleSplit(n_splits=N_FOLDS, train_size=TRAIN_TEST_SPLIT, random_state=RANDOM_SEED)

                for train, test in skf.split(training_data, labels):

                    # This will cause the model to build an index of strings to integers.
                    # Per TF: It's important to only use training data when creating vocabulary (using the test set would leak information).
                    # NOTE(review): utils.get_vocabulary is defined outside this notebook; presumably
                    # it returns the vocabulary of the given documents - confirm in utils.
                    vectorize_layer.set_vocabulary(utils.get_vocabulary(training_data[train]))
                    input_dim = len(vectorize_layer.get_vocabulary())

                    # Embed vocabulary into embedding_dim dimensions.
                    # Embedding tutorial uses size, Text Classification tutorial uses size + 1 for input_dim
                    embedding_layer = tf.keras.layers.Embedding(input_dim, EMBEDDING_DIM, name='embedding')

                    # Define model structure (a fresh model is built for every fold)
                    model = Sequential([
                        vectorize_layer,
                        embedding_layer,
                        #Dropout(0.2),
                        GlobalAveragePooling1D(),
                        #Dropout(0.2),
                        Dense(16, activation='relu'),
                        Dense(1, activation='sigmoid')
                    ])

                    # Create model
                    model.compile(optimizer='adam',
                              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), # tutorials use true for training, false for production
                              metrics=[
                                  tf.metrics.BinaryAccuracy(threshold=0.5),
                                  tf.keras.metrics.Recall(),
                                  tf.keras.metrics.Precision(),
                                  F1Score(1, threshold=0.5)
                              ]
                    )

                    print('\n\n*************** FOLD %d ***************' %k)

                    print('\n******* TRAIN *******')
                    # Train model
                    train_results = model.fit(
                        training_data[train],
                        labels[train],
                        batch_size=BATCH_SIZE, 
                        epochs=N_EPOCHS,
                        callbacks=[TENSORBOARD_CALLBACK],
                        verbose=VERBOSITY
                    )

                    print('\n******* VALIDATION *******')
                    # Test model with validation data
                    validation_results = model.evaluate(
                        training_data[test],
                        labels[test],
                        callbacks=[TENSORBOARD_CALLBACK],
                        return_dict=True,
                        verbose=VERBOSITY
                    )

                    print('\n******* CT TEST *******')
                    # Test model with cross train data
                    ct_results = model.evaluate(
                        ct_test_data,
                        ct_test_labels,
                        callbacks=[TENSORBOARD_CALLBACK],
                        return_dict=True,
                        verbose=VERBOSITY
                    )
                    
                    # Record one value per metric for this fold. Training results are
                    # per-epoch lists (save_results averages them); evaluation results
                    # are passed as the dicts returned by model.evaluate.
                    train_history = save_results(train_results.history, train_history)
                    validation_history = save_results(validation_results, validation_history)
                    ct_history = save_results(ct_results, ct_history)

                    # If we are in the last fold of the trial, average the metric results 
                    # across all n folds and append to trial_averages
                    if k == N_FOLDS:
                        for metric, results in train_history.items():
                            train_averages[metric].append(sum(results)/len(results))
                        for metric, results in validation_history.items():
                            validation_averages[metric].append(sum(results)/len(results))
                        for metric, results in ct_history.items():
                            ct_averages[metric].append(sum(results)/len(results))

                    k += 1

            # One workbook per dataset pair; the name encodes the pair, the cross-train
            # ratio and the number of trials and folds
            RESULTS_FILE = 'trials/'+primary_set+'_'+str(CROSS_TRAIN_RATIO)+'_'+cross_train_set+'_'+str(N_TRIALS)+'_T_'+ str(N_FOLDS)+'_fCV.xlsx'
            
            '''
            Write all results to an excel file.
            The first sheet shows metric averages for each trial. The second sheet contains the averages across all trials.
            '''
            trial_table, averages_table = format_output(train_averages)
            
            # The first writer creates the workbook; every later writer re-opens it in
            # append mode to add one more sheet
            with pd.ExcelWriter(RESULTS_FILE) as writer:
                trial_table.to_excel(writer, sheet_name='Training Trials')

            # Every row of averages_table repeats the same averages, so only the first is written
            with pd.ExcelWriter(RESULTS_FILE, mode='a') as writer:
                averages_table.iloc[0].to_excel(writer, sheet_name='Training Averages', header=False)
                
            trial_table, averages_table = format_output(validation_averages)
            
            with pd.ExcelWriter(RESULTS_FILE, mode='a') as writer:
                trial_table.to_excel(writer, sheet_name='Validation Trials')

            with pd.ExcelWriter(RESULTS_FILE, mode='a') as writer:
                averages_table.iloc[0].to_excel(writer, sheet_name='Validation Averages', header=False)
                
            trial_table, averages_table = format_output(ct_averages)
            
            with pd.ExcelWriter(RESULTS_FILE, mode='a') as writer:
                trial_table.to_excel(writer, sheet_name='CT Trials')

            with pd.ExcelWriter(RESULTS_FILE, mode='a') as writer:
                averages_table.iloc[0].to_excel(writer, sheet_name='CT Averages', header=False)
                
            # Mean F1 score across all trials for this pair, per phase
            train_f1scores[primary_set+'|'+cross_train_set] = sum(train_averages['f1_score'])/len(train_averages['f1_score'])
            validation_f1scores[primary_set+'|'+cross_train_set] = sum(validation_averages['f1_score'])/len(validation_averages['f1_score'])
            ct_f1scores[primary_set+'|'+cross_train_set] = sum(ct_averages['f1_score'])/len(ct_averages['f1_score'])

In [None]:
# Draw one F1 heatmap per evaluation phase
for phase_name, phase_scores in (
    ('training', train_f1scores),
    ('validation', validation_f1scores),
    ('cross training', ct_f1scores),
):
    plot_f1scores(phase_scores, list(DATASETS.keys()), phase_name)

In [None]:
# Display model information.
# `model` is whatever the training loop built last, i.e. the model of the final
# fold of the final trial of the last dataset pair.
model.summary()

In [None]:
# Retrieve the trained word embeddings of the last model built by the training
# loop, together with the vocabulary currently set on the shared vectorization
# layer (the one set for that same final fold).
weights = model.get_layer('embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [None]:
# Destination files for the exported embeddings: one TSV with the embedding
# vectors and one with the matching vocabulary words.
# NOTE(review): the paths are hard-coded to the smos dataset while the exported
# model is simply the last one trained - confirm this is the intended pairing.
OUTPUT_VECTOR_FILE = 'data/smos/smos_porter_balanced_vectors.tsv'
OUTPUT_METADATA_FILE = 'data/smos/smos_porter_balanced_metadata.tsv'

In [None]:
# Save embeddings to disk: one tab-separated vector per line in the vector file
# and the matching word on the same line of the metadata file.
# The context manager closes both files even if a write fails.
with io.open(OUTPUT_VECTOR_FILE, 'w', encoding='utf-8') as out_vec, \
        io.open(OUTPUT_METADATA_FILE, 'w', encoding='utf-8') as out_meta:
    for index, word in enumerate(vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        out_vec.write('\t'.join([str(x) for x in vec]) + '\n')
        out_meta.write(word + '\n')