In [None]:
import io
import csv
import numpy as np
import pandas as pd
import tensorflow as tf
import utils

from sklearn.model_selection import StratifiedShuffleSplit
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow_addons.metrics import F1Score

In [None]:
# Input corpus: the metadocument text file and its parallel label file
INPUT_DATA_FILE = 'data/smos/smos_data_porter_balanced.txt'
INPUT_LABEL_FILE = 'data/smos/smos_labels_porter_balanced.txt'

# Destination files for the trained embedding vectors and their vocabulary terms
OUTPUT_VECTOR_FILE = 'data/smos/smos_porter_balanced_vectors.tsv'
OUTPUT_METADATA_FILE = 'data/smos/smos_porter_balanced_metadata.tsv'

# Cross-validation setup
N_FOLDS = 10   # Number of cross validation folds. Default: 10
N_TRIALS = 50  # Number of times the N_FOLDS cross validation is repeated

# Results workbook. Only edit the first chunk of the path; the trial/fold
# suffix is derived from the two settings above.
RESULTS_FILE = ''.join([
    'data/smos/smos_',
    str(N_TRIALS),
    '_TRIALS_',
    str(N_FOLDS),
    '_FOLD_CV.xlsx',
])

# Training verbosity: 0 = silent, 1 = progress bar, 2 = one line per epoch
TRAIN_VERBOSITY = 0

In [None]:
# Fraction of the data used for training in each split; the remainder is held
# out for testing. Ex: 0.8 is an 80/20 train/test split.
# Float on interval (0.0, 1.0)
TRAIN_TEST_SPLIT = 0.8

# Number of words in a sequence (currently disabled).
# Note: if using a balanced set, a sequence length has already been set,
# so the max len (obtained below) should be used for SEQUENCE_LENGTH:
#
#     SEQUENCE_LENGTH = 220

# Number of training epochs
N_EPOCHS = 15

# Number of samples per training batch
BATCH_SIZE = 32

# TensorBoard callback passed to training; its logs are written to the 'logs' folder
TENSORBOARD_CALLBACK = tf.keras.callbacks.TensorBoard(log_dir='logs')

# Dimension of the embedding layer (length of each learned word vector)
EMBEDDING_DIM = 8

# Names of the metrics used to measure training performance
METRICS = [
    'loss',
    'binary_accuracy',
    'recall',
    'precision',
    'f1_score',
]

In [None]:
# Containers filled by the cells below: the metadocument strings, their
# integer labels, and the word count of each metadocument
training_data, labels, lengths = [], [], []

In [None]:
# Load dataset metadocs.
# The list is rebuilt from the file on every run instead of being appended to,
# so re-executing this cell cannot duplicate documents, and it keeps working
# after training_data has been converted to a numpy array further down.
with open(INPUT_DATA_FILE, newline='') as datafile:
    data_reader = csv.reader(datafile, delimiter='\n')

    # Keep the first field of every parsed row as the metadocument text
    training_data = [row[0] for row in data_reader]

In [None]:
# Load dataset labels.
# The list is rebuilt from the file on every run instead of being appended to,
# so re-executing this cell cannot duplicate labels, and it keeps working
# after labels has been converted to a numpy array further down.
with open(INPUT_LABEL_FILE, newline='') as labelfile:
    label_reader = csv.reader(labelfile, delimiter='\n')

    # The first field of every parsed row is the label, stored as an int
    labels = [int(row[0]) for row in label_reader]

In [None]:
# Sequence information for the loaded set.
# Note: if using a balanced set, a sequence length has already been set,
# so the max len printed here should be used for SEQUENCE_LENGTH.

# Word count of each metadocument (split on whitespace)
lengths.extend(len(seq.split()) for seq in training_data)

print('Number of metadocuments: ', len(training_data))
print('Vocab size: ', utils.vocabulary_size(training_data))
print('Avg seq length: ', sum(lengths) / len(lengths))
print('Min seq len: ', min(lengths))
print('Max seq len: ', max(lengths))

In [None]:
# Turn the Python lists into numpy arrays for training and testing; the
# cross-validation loop indexes both with the index arrays from each split.
training_data, labels = (
    np.array(training_data, dtype=object),  # strings are kept as Python objects
    np.array(labels),
)

In [None]:
# Use the text vectorization layer to normalize, split, and map strings to integers.
# The layer is created with its default settings; its vocabulary is assigned later,
# inside the cross-validation loop, from the training portion of each split.
vectorize_layer = TextVectorization()
    # Options that are currently disabled, kept for reference:
    #output_mode='int',
    #output_sequence_length=SEQUENCE_LENGTH

In [None]:
# Generate a list of N_TRIALS unique random seeds in [0, 1000) to use with the
# StratifiedShuffleSplit objects (one seed per trial).
# Only 1000 distinct seeds exist in that range, so asking for more can never
# succeed; fail loudly instead of searching forever.
if N_TRIALS > 1000:
    raise ValueError('N_TRIALS must not exceed 1000: seeds are drawn without repetition from [0, 1000)')

# Sampling without replacement guarantees uniqueness in a single call;
# tolist() converts the numpy integers to plain Python ints.
RANDOM_SEEDS = np.random.choice(1000, size=N_TRIALS, replace=False).tolist()

In [None]:
# Store metric averages for each trial: one list per metric, one entry per trial
trial_averages = {metric: [] for metric in METRICS}

# Perform N_TRIALS of N_FOLDS CV
for i, RANDOM_SEED in enumerate(RANDOM_SEEDS):
    # Store metric averages for each fold of a single trial
    trial_history = {metric: [] for metric in METRICS}
    skf = StratifiedShuffleSplit(n_splits=N_FOLDS, train_size=TRAIN_TEST_SPLIT, random_state=RANDOM_SEED)

    print('\n\n******************** TRIAL %d ********************' %(i+1))
    # k is the 1-based fold counter
    for k, (train, test) in enumerate(skf.split(training_data, labels), start=1):

        # This will cause the model to build an index of strings to integers.
        # Per TF: It's important to only use training data when creating vocabulary (using the test set would leak information).
        vectorize_layer.set_vocabulary(utils.get_vocabulary(training_data[train]))
        input_dim = len(vectorize_layer.get_vocabulary())

        # Embed vocabulary into embedding_dim dimensions.
        # Embedding tutorial uses size, Text Classification tutorial uses size + 1 for input_dim
        embedding_layer = tf.keras.layers.Embedding(input_dim, EMBEDDING_DIM, name='embedding')

        # Define model structure; a fresh model is built for every fold
        model = Sequential([
            vectorize_layer,
            embedding_layer,
            #Dropout(0.2),
            GlobalAveragePooling1D(),
            #Dropout(0.2),
            Dense(16, activation='relu'),
            Dense(1, activation='sigmoid')
        ])

        # Create model.
        # The last layer applies a sigmoid, so the loss receives probabilities (from_logits=False).
        model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                  metrics=[tf.metrics.BinaryAccuracy(threshold=0.5), tf.keras.metrics.Recall(), tf.keras.metrics.Precision(), F1Score(1, threshold=0.5)]
        )

        print('\n\n*************** FOLD %d ***************' %k)


        print('\n******* TRAIN *******')
        # Train model
        # Verbosity: 0 = silent, 1 = progress bar, 2 = one line per epoch
        history = model.fit(
            training_data[train],
            labels[train],
            batch_size=BATCH_SIZE,
            epochs=N_EPOCHS,
            callbacks=[TENSORBOARD_CALLBACK],
            verbose=TRAIN_VERBOSITY  # configured in the settings cell
        )

        print('\n******* TEST *******')
        # Test model
        # NOTE(review): the evaluation result is only printed, not stored; the
        # values recorded below come from history.history, i.e. the per-epoch
        # metrics of fit() on the training split - confirm this is intended.
        model.evaluate(training_data[test], labels[test])

        # Append current fold results to trial_history dict.
        # Metric names are appended with a _(run number) each trial, hence the
        # substring match against the base metric names.
        for current_metric, results in history.history.items():
            for metric in trial_history:
                if metric not in current_metric:
                    continue

                # F1Score stores results as a list of lists instead of list of
                # floats, so take the single entry of each per-epoch result
                if metric == 'f1_score':
                    values = [result[0] for result in results]
                else:
                    values = results

                # Record the mean over all epochs of this fold
                trial_history[metric].append(sum(values) / len(values))
                break

    # All folds of the trial are done: average the metric results
    # across the folds and append to trial_averages
    for metric, results in trial_history.items():
        trial_averages[metric].append(sum(results) / len(results))

In [None]:
# Mean of every metric across all trials, keyed as '<metric>_avg'
result_averages = {
    metric + '_avg': sum(results) / len(results)
    for metric, results in trial_averages.items()
}

# One row per trial, one column per metric
trial_table = pd.DataFrame(trial_averages)

# Table of the overall averages; the export cell only uses its first row
averages_table = pd.DataFrame(result_averages, index=result_averages.keys())

In [None]:
# Write all results to an excel file.
# The first sheet shows metric averages for each trial. The second sheet
# contains the averages across all trials.
# Both sheets go through a single writer, so the workbook is opened and saved
# once and no append-mode support is required from the Excel engine.
with pd.ExcelWriter(RESULTS_FILE) as writer:
    trial_table.to_excel(writer, sheet_name='Trials')
    # Only the first row of averages_table is exported, without a header row
    averages_table.iloc[0].to_excel(writer, sheet_name='Averages', header=False)

In [None]:
# Display model information.
# `model` is the variable assigned inside the cross-validation loop, so at this
# point it refers to the model built for the last fold of the last trial.
model.summary()

In [None]:
# Retrieve the trained word embeddings from the layer named 'embedding'
# (first entry of the layer's weight list) ...
embedding_weights = model.get_layer('embedding').get_weights()
weights = embedding_weights[0]

# ... and the vocabulary currently held by the vectorization layer
vocab = vectorize_layer.get_vocabulary()

In [None]:
# Save embeddings to disk: one tab-separated vector per line in the vector
# file and the matching word on the same line of the metadata file.
# The context manager closes both files even if a write fails part-way.
with io.open(OUTPUT_VECTOR_FILE, 'w', encoding='utf-8') as out_vec, \
        io.open(OUTPUT_METADATA_FILE, 'w', encoding='utf-8') as out_meta:
    for index, word in enumerate(vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        out_vec.write('\t'.join(str(x) for x in vec) + '\n')
        out_meta.write(word + '\n')