In [None]:
import io
import csv
import numpy as np
import tensorflow as tf
import utils

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow_addons.metrics import F1Score

In [None]:
# Containers filled by the loading cells: one metadocument string per entry
# and one integer label per entry.
training_data, labels = [], []

In [None]:
# Plain-text inputs read line by line by the loading cells:
# data_file is read into training_data, label_file into labels.
data_file = 'data/smos/smos_data_porter_bal_shuf.txt'
label_file = 'data/smos/smos_labels_bal_shuf.txt'

In [None]:
# Load one metadocument per line.
# The lines are read directly instead of through csv.reader(delimiter='\n'):
# the csv parser interprets '"' as a quote character and yields an empty row
# for a blank line, which made `row[0]` raise IndexError.
with open(data_file) as datafile:
    stripped_lines = (line.rstrip('\n') for line in datafile)
    # Rebuild the list (rather than appending to it) so that re-running this
    # cell does not duplicate the documents; blank lines are skipped.
    training_data = [doc for doc in stripped_lines if doc]

In [None]:
# Load one integer label per line.
# Lines are read directly instead of through csv.reader(delimiter='\n'), which
# yields an empty row for a blank line and made `row[0]` raise IndexError.
with open(label_file) as labelfile:
    # Rebuild the list (rather than appending to it) so that re-running this
    # cell does not duplicate the labels; int() tolerates the trailing newline.
    labels = [int(line) for line in labelfile if line.strip()]

In [None]:
# Load and split dataset
batch_size = 32
seed = 123

# Arguments common to every view of the corpus directory.
balanced_dir = 'data/smos/train_bal_shuf'
loader_kwargs = dict(batch_size=batch_size, label_mode='binary', seed=seed)
# The training and validation subsets share the seed and the 80/20 split.
split_kwargs = dict(loader_kwargs, validation_split=0.2)

# Whole directory, no split.
full_ds = tf.keras.preprocessing.text_dataset_from_directory(
    balanced_dir, **loader_kwargs
)

# 'training' subset of the split.
train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    balanced_dir, subset='training', **split_kwargs
)

# 'validation' subset of the split.
val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    balanced_dir, subset='validation', **split_kwargs
)

In [None]:
# Print the label and raw text of the first two points of one training batch.
for text_batch, label_batch in train_ds.take(1):
    texts = text_batch.numpy()
    for idx in (0, 1):
        print(label_batch[idx].numpy(), texts[idx])

In [None]:
# Configure the dataset for performance: cache each dataset and prefetch with
# an automatically tuned buffer size.
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_ds, val_ds = (
    ds.cache().prefetch(buffer_size=AUTOTUNE) for ds in (train_ds, val_ds)
)

In [None]:

# Number of whitespace-separated tokens in each metadocument.
lengths = [len(doc.split()) for doc in training_data]

# Corpus statistics.
print('Number of metadocuments: ', len(training_data))
print('Vocab size: ', utils.vocabulary_size(training_data))
print('Avg seq length: ', sum(lengths) / len(lengths))
print('Min seq len: ', min(lengths))
print('Max seq len: ', max(lengths))

In [None]:
# Vocabulary size and number of words kept per sequence.
# When the data has already been balanced, sequence_length should be set to
# the max sequence length printed by the statistics cell, because the sequence
# length was already fixed before balancing.
sequence_length = 200
vocab_size = utils.vocabulary_size(training_data)

In [None]:
# Text vectorization layer: normalizes, splits and maps strings to integer
# ids, producing sequences of length sequence_length.
# No `standardize` argument is passed, so the layer's default standardization
# applies.
vectorize_layer = TextVectorization(
    output_mode='int',
    max_tokens=vocab_size,
    output_sequence_length=sequence_length,
)

# adapt() only needs the raw strings, so drop the labels and build the
# vocabulary from the text of the full (unsplit) dataset.
text_ds = full_ds.map(lambda text, _label: text)
vectorize_layer.adapt(text_ds)

In [None]:
# Size of each learned word vector.
embedding_dim = 8

# Embedding table mapping token ids to embedding_dim-dimensional vectors.
# The Embedding tutorial sizes the table with vocab_size, the Text
# Classification tutorial with vocab_size + 1; the latter is used here.
embedding_layer = Embedding(
    input_dim=vocab_size + 1,
    output_dim=embedding_dim,
    name='embedding',
)

In [None]:
# Layer stack: raw string -> token ids -> embeddings -> mean over the
# sequence -> small dense head with a sigmoid output.
# Optional Dropout(0.2) layers (currently disabled) can be inserted after the
# embedding and after the pooling layer.
classifier_layers = [
    vectorize_layer,
    embedding_layer,
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
]
model = Sequential(classifier_layers)

In [None]:
# The model's final Dense layer applies a sigmoid, so its outputs are
# probabilities rather than logits. The loss must therefore be built with
# from_logits=False; with from_logits=True the probabilities would be treated
# as logits by the loss. The 0.5 accuracy threshold matches probability
# outputs.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=[
        tf.metrics.BinaryAccuracy(threshold=0.5),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.Precision(),
    ],
)

# If the output layer is ever changed to emit raw logits (no sigmoid), switch
# to from_logits=True together with tf.metrics.BinaryAccuracy(threshold=0.0).
# F1Score (tensorflow_addons) is imported but not tracked here.

In [None]:
# TensorBoard callback configured with log_dir='logs'; passed to model.fit.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='logs')

In [None]:
# Print the training dataset object's repr as a quick sanity check.
print(train_ds)

In [None]:
# Train for 15 epochs on the training subset, validating on the validation
# subset and logging through the TensorBoard callback.
model.fit(
    x=train_ds,
    epochs=15,
    validation_data=val_ds,
    callbacks=[tensorboard_callback],
)

In [None]:
# Print the model's layer summary.
model.summary()

In [None]:
# Load the dataset under 'data/smos/train' with the same batch size and seed,
# then cache and prefetch it.
original_ds = (
    tf.keras.preprocessing.text_dataset_from_directory(
        'data/smos/train',
        label_mode='binary',
        batch_size=batch_size,
        seed=seed,
    )
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)

In [None]:
# Evaluate the trained model on original_ds (loaded from 'data/smos/train').
model.evaluate(original_ds)

In [None]:
# Retrieve the trained word embeddings: the first weight array of the layer
# named 'embedding', plus the vocabulary held by the vectorization layer.
trained_embedding = model.get_layer(name='embedding')
weights = trained_embedding.get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [None]:
# Save embeddings to disk: one tab-separated vector per line in the vectors
# file and the matching word per line in the metadata file.
# Both files are opened in a `with` block so they are flushed and closed even
# if writing fails part-way through.
with io.open('data/smos/smos_porter_balanced_vectors.tsv', 'w', encoding='utf-8') as out_vec, \
        io.open('data/smos/smos_porter_balanced_metadata.tsv', 'w', encoding='utf-8') as out_meta:
    for index, word in enumerate(vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        out_vec.write('\t'.join([str(x) for x in vec]) + '\n')
        out_meta.write(word + '\n')