In [1]:
import io
import csv
import numpy as np
import tensorflow as tf
import utils

from sklearn.model_selection import StratifiedShuffleSplit
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow_addons.metrics import F1Score

In [None]:
# Paths to the input documents and to their labels.
INPUT_DATA_FILE = 'data/smos/smos_data_porter_balanced.txt'
INPUT_LABEL_FILE = 'data/smos/smos_labels_porter_balanced.txt'

# Number of cross-validation folds (default: 10).
N_FOLDS = 10

# Fraction of the data used for training in each fold; a float in (0.0, 1.0).
TRAIN_TEST_SPLIT = 0.8

In [None]:
# Empty accumulators filled by the following cells: the raw documents,
# their integer labels, and the whitespace-token count of each document.
training_data, labels, lengths = [], [], []

In [None]:
# Read the documents, keeping the first field of every row the csv reader
# yields (presumably one document per line -- confirm against the data file).
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell cannot load the file a second time and leave training_data out of step
# with labels.
with open(INPUT_DATA_FILE, newline='') as datafile:
    data_reader = csv.reader(datafile, delimiter='\n')
    training_data = [row[0] for row in data_reader]

In [None]:
# Read the labels, converting the first field of every row to an int.
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell cannot load the file a second time and leave labels out of step with
# training_data.
with open(INPUT_LABEL_FILE, newline='') as labelfile:
    label_reader = csv.reader(labelfile, delimiter='\n')
    labels = [int(row[0]) for row in label_reader]

In [None]:
# Record the number of whitespace-separated tokens in every document.
lengths.extend(len(doc.split()) for doc in training_data)

# Report corpus size, vocabulary size and sequence-length statistics.
total_tokens = sum(lengths)
print('Number of metadocuments: ', len(training_data))
print('Vocab size: ', utils.vocabulary_size(training_data))
print('Avg seq length: ', total_tokens / len(lengths))
print('Min seq len: ', min(lengths))
print('Max seq len: ', max(lengths))

In [None]:
# Turn the Python lists into numpy arrays so they can be indexed with the
# index arrays produced by the cross-validation splitter.
# dtype=object keeps each document as a Python string object.
labels = np.array(labels)
training_data = np.array(training_data, dtype=object)

In [None]:
# Batch size used for training.
BATCH_SIZE = 32

# Dimension of the embedding layer.
EMBEDDING_DIM = 8

# TensorBoard callback; its logs are written to the 'logs' folder.
TENSORBOARD_CALLBACK = tf.keras.callbacks.TensorBoard(log_dir='logs')

In [None]:
# Number of words in a sequence.
# If the data has already been balanced, this should be set to the max
# sequence length reported above, since the sequence length will have been
# fixed before balancing.
SEQUENCE_LENGTH = 200

# Text vectorization layer that normalizes, splits, and maps strings to
# integers, producing sequences of SEQUENCE_LENGTH integers.
# No `standardize` argument is passed, so the layer's default standardization
# applies.
vectorize_layer = TextVectorization(
    output_sequence_length=SEQUENCE_LENGTH,
    output_mode='int',
)

In [None]:
# Stratified shuffle-split cross-validation: N_FOLDS splits, each training on
# a TRAIN_TEST_SPLIT fraction of the data.
# NOTE: no random_state is passed, so the splits differ from run to run.
skf = StratifiedShuffleSplit(n_splits=N_FOLDS, train_size=TRAIN_TEST_SPLIT)

for fold, (train, test) in enumerate(skf.split(training_data, labels), start=1):

    # Build the string-to-integer index from this fold's training documents
    # only; using the test set here would leak information.
    # set_vocabulary is used in place of vectorize_layer.adapt.
    vectorize_layer.set_vocabulary(utils.get_vocabulary(training_data[train]))
    input_dim = len(vectorize_layer.get_vocabulary())

    # Embed the vocabulary into EMBEDDING_DIM dimensions.
    # Embedding tutorial uses size, Text Classification tutorial uses size + 1 for input_dim
    embedding_layer = tf.keras.layers.Embedding(input_dim, EMBEDDING_DIM, name='embedding')

    # Define model structure; a fresh model is built for every fold.
    model = Sequential([
        vectorize_layer,
        embedding_layer,
        #Dropout(0.2),
        GlobalAveragePooling1D(),
        #Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model. from_logits=False because the final Dense layer
    # already applies a sigmoid, so the loss receives probabilities.
    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        metrics=[
            tf.metrics.BinaryAccuracy(threshold=0.5),
            tf.keras.metrics.Recall(),
            tf.keras.metrics.Precision(),
            F1Score(1, threshold=0.5),
        ],
    )

    print('\n\n*************** FOLD %d ***************' % fold)

    print('\n******* TRAIN *******')
    # Train on this fold's training split, using the notebook-level
    # BATCH_SIZE and TENSORBOARD_CALLBACK constants.
    model.fit(
        training_data[train],
        labels[train],
        batch_size=BATCH_SIZE,
        #validation_data=val_ds,
        epochs=15,
        callbacks=[TENSORBOARD_CALLBACK]
    )

    print('\n******* TEST *******')
    # Evaluate on this fold's held-out split.
    model.evaluate(training_data[test], labels[test])

In [None]:
# Print the layer summary of `model`. At this point `model` is the one built
# in the last iteration of the cross-validation loop above.
model.summary()