In [None]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import utils

from imblearn.over_sampling import SMOTE
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
# Accumulators populated by the file-reading cells that follow:
# `training_data` collects the metadocument strings, `labels` the integer
# class labels.
training_data, labels = [], []

In [None]:
# Input locations, relative to the notebook's working directory.
# NOTE(review): the data file name says "shuffled" while the label file name
# does not -- confirm the two files are row-aligned before pairing them.
label_file = 'data/smos/smos_labels.txt'
data_file = 'data/smos/smos_data_porter_shuffled.txt'

In [None]:
# Load the metadocuments: the first field of every row in `data_file`
# becomes one entry of `training_data`.
with open(data_file, newline='') as datafile:
    data_reader = csv.reader(datafile, delimiter='\n')

    # Slice-assign instead of appending so that re-running this cell replaces
    # the previous contents rather than duplicating every row. The list object
    # itself is kept, so existing references to `training_data` stay valid.
    training_data[:] = [row[0] for row in data_reader]

In [None]:
# Load the class labels: the first field of every row in `label_file` is
# parsed as an int and stored in `labels`.
with open(label_file, newline='') as labelfile:
    label_reader = csv.reader(labelfile, delimiter='\n')

    # Slice-assign instead of appending so that re-running this cell replaces
    # the previous contents rather than duplicating every label. Nothing is
    # written to `labels` if any row fails to parse as an integer.
    labels[:] = [int(row[0]) for row in label_reader]

In [None]:
# Length of each metadocument, measured in whitespace-separated tokens.
lengths = [len(seq.split()) for seq in training_data]

In [None]:
# Summary statistics of the raw dataset, printed one per line as
# "<caption> <value>".
dataset_stats = [
    ('Number of metadocuments: ', len(training_data)),
    ('Vocab size: ', utils.vocabulary_size(training_data)),
    ('Avg seq length: ', sum(lengths) / len(lengths)),
    ('Min seq len: ', min(lengths)),
    ('Max seq len: ', max(lengths)),
]

for stat_caption, stat_value in dataset_stats:
    print(stat_caption, stat_value)

In [None]:
# Scatter plot of sequence length (y) against each metadocument's position
# in the dataset (x).
X = list(range(len(lengths)))

plt.scatter(X, lengths)
plt.show()

In [None]:
# Load the labelled text dataset from the directory tree (no train/validation
# split is performed here).
batch_size = 32
seed = 123

# `batch_size` and `seed` were previously declared but never handed to the
# loader. Passing them keeps the batch size explicit and, per the Keras
# documentation, seeds the loader's default shuffling so that the sample
# order -- and everything derived from it downstream -- is reproducible.
full_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'data/smos/train_s',
    batch_size=batch_size,
    seed=seed)

In [None]:
# Sizing parameters for the vectorize layer.
# Vocabulary size is taken from the raw training text via the project helper.
vocab_size = utils.vocabulary_size(training_data)
# Number of tokens kept per sequence; per the original author's note this is
# roughly the average sequence length over all sequences.
sequence_length = 200

In [None]:
# Use the text vectorization layer to split, prune and map strings to
# integers. No `standardize` argument is passed, so the layer uses its
# built-in default standardization (no custom function is involved).
# `output_sequence_length` pads/truncates every sample to a fixed length,
# since the samples are not all of the same length.
#
# In 'int' output mode the layer's vocabulary contains two reserved entries
# in addition to the real tokens: '' (padding, id 0) and '[UNK]'
# (out-of-vocabulary, id 1). `max_tokens` counts both of them, so it has to
# be vocab_size + 2; with vocab_size + 1 the least frequent real token would
# be pushed out of the vocabulary and encoded as [UNK].
vectorize_layer = TextVectorization(
    max_tokens=vocab_size + 2,
    output_mode='int',
    output_sequence_length=sequence_length)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
text_ds = full_ds.map(lambda x, y: x)
# TODO: per the TF documentation we should build our own vocabulary instead;
# how do we account for mangled words?
vectorize_layer.adapt(text_ds)

In [None]:
# Vocabulary held by the vectorization layer after adapt(); later cells
# enumerate it to map integer ids back to tokens.
tf_vocab = vectorize_layer.get_vocabulary()

In [None]:
# Report how many entries the layer's vocabulary holds.
print(len(tf_vocab))

In [None]:
# Tokens of the original (unbalanced) training text, as reported by utils.
vocab = utils.get_vocabulary(training_data).keys()

# id -> token lookup, used later to decode the balanced vectorized data.
inverse_vocab = dict(enumerate(tf_vocab))

# List every token known to the vectorize layer that does not occur in the
# original vocabulary (the original author attributes these to words mangled
# by the fixed sequence length).
for word in tf_vocab:
    if word not in vocab:
        print(word)

In [None]:
# Print the complete id -> token mapping.
print(inverse_vocab)

In [None]:
# Build np arrays for data balancing.
# The arrays are assembled from what `full_ds` actually yields instead of
# being pre-sized from len(training_data): that list comes from a different
# source (the text file), and a count mismatch would either raise an
# IndexError or leave all-zero rows with label 0 in the resampling input.
sequence_batches = []
label_batches = []

for batch_texts, batch_labels in full_ds:
    # Vectorize one whole batch at a time and keep the result as numpy.
    sequence_batches.append(vectorize_layer(batch_texts).numpy())
    label_batches.append(batch_labels.numpy())

if sequence_batches:
    # float64 matches the dtype of the np.zeros buffers used previously.
    vectorized_data = np.concatenate(sequence_batches).astype(np.float64)
    vectorized_labels = np.concatenate(label_batches).astype(np.float64)
else:
    # Empty dataset: keep the (0, sequence_length) / (0,) shapes.
    vectorized_data = np.zeros((0, sequence_length))
    vectorized_labels = np.zeros(0)

In [None]:
# Look for [UNK] entries (token id 1) in the vectorized sequences and print
# 'hit' once for every occurrence found.
unk_hits = int(np.count_nonzero(vectorized_data == 1))

for _hit in range(unk_hits):
    print('hit')

In [None]:
# Oversample with SMOTE so the class ratio reaches the requested
# sampling_strategy of 0.75. `random_state` is pinned to the notebook's
# `seed` so the synthetic samples are reproducible across runs.
# NOTE(review): SMOTE synthesizes rows by interpolating between existing
# feature vectors, so generated rows can presumably contain non-integer token
# ids; the file-writing cell truncates them with int() before decoding --
# confirm that the decoded text is meaningful.
smote = SMOTE(sampling_strategy=.75, random_state=seed)
balanced_data_enc, balanced_labels = smote.fit_resample(vectorized_data, vectorized_labels)

In [None]:
# Write the balanced set to three parallel files: one generated file name,
# one decoded metadocument, and one integer label per row.
with open('data/smos/smos_filenames_bal_shuf.txt', 'w', newline='') as fnfile, \
        open('data/smos/smos_data_porter_bal_shuf.txt', 'w', newline='') as datafile, \
        open('data/smos/smos_labels_bal_shuf.txt', 'w', newline='') as labelfile:
    filename_writer = csv.writer(fnfile, quoting=csv.QUOTE_MINIMAL)
    data_writer = csv.writer(datafile, quoting=csv.QUOTE_MINIMAL)
    label_writer = csv.writer(labelfile, quoting=csv.QUOTE_MINIMAL)

    for i, (row, label) in enumerate(zip(balanced_data_enc, balanced_labels)):
        # Map every (truncated-to-int) token id back to its token.
        decoded = [inverse_vocab[int(val)] for val in row]

        filename_writer.writerow(['file' + str(i)])
        # Id 0 decodes to '', so joining produces runs of consecutive spaces.
        data_writer.writerow([' '.join(decoded)])
        label_writer.writerow([int(label)])