In [None]:
import csv
import numpy as np
import tensorflow as tf
import utils

from imblearn.over_sampling import SMOTE
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
# Corpus container: one string per line of the data file (filled by the loading cell).
training_data = [] # list of strings
# Path of the corpus file that the loading cell opens.
filename = 'data/smos/smos_data_porter.txt'

In [None]:
# Read the corpus file named by `filename`, one sequence per line.
# delimiter='\n' makes csv.reader return each line as a single field, so
# row[0] is the whole line (a blank line yields an empty row and raises
# IndexError).
# The list is rebuilt here rather than appended to, so re-running this cell
# does not duplicate the corpus.
with open(filename, newline='') as datafile:
    data_reader = csv.reader(datafile, delimiter='\n')
    training_data = [row[0] for row in data_reader]

In [None]:
# Number of sequences currently held in training_data.
print(len(training_data))

In [None]:
# Token count (whitespace-split) of every sequence in the corpus.
lengths = [len(sequence.split()) for sequence in training_data]

In [None]:
# Label container: the loading cell appends one int per line of the labels file.
labels = []
# Path of the labels file; note this rebinds `filename`, which previously
# pointed at the corpus file.
filename = 'data/smos/smos_labels.txt'

In [None]:
# Read the labels file named by `filename`, one integer label per line.
# delimiter='\n' makes csv.reader return each line as a single field, so
# row[0] is the whole line (a blank line yields an empty row and raises
# IndexError; a non-integer line raises ValueError).
# The list is rebuilt here rather than appended to, so re-running this cell
# does not duplicate the labels.
with open(filename, newline='') as datafile:
    data_reader = csv.reader(datafile, delimiter='\n')
    labels = [int(row[0]) for row in data_reader]

In [None]:
# Report the corpus vocabulary size (as computed by utils.vocabulary_size)
# followed by the mean number of whitespace-split tokens per sequence.
corpus_vocab_size = utils.vocabulary_size(training_data)
print('Vocab size: ', corpus_vocab_size)

mean_seq_length = sum(lengths) / len(lengths)
print('Avg seq length: ', mean_seq_length)

In [None]:
# Load the labelled text dataset from disk (class labels are inferred from
# the sub-directory names). No train/validation split is made here.
# batch_size and seed are passed explicitly so the batching is visible and
# the shuffled order is reproducible across kernel restarts.
batch_size = 32
seed = 123

full_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'data/smos/train',
    batch_size=batch_size,
    seed=seed)

In [None]:
# Sanity check: show up to two (label, text) pairs from the first batch.
# Iterates full_ds (the only dataset this notebook creates); slicing keeps the
# cell from failing when the batch holds fewer than two samples.
for text_batch, label_batch in full_ds.take(1):
    for label, text in zip(label_batch.numpy()[:2], text_batch.numpy()[:2]):
        print(label, text)

In [None]:
# Fixed number of tokens per vectorized sequence, and the corpus vocabulary size.
# 200 is, per the original author's note, roughly the average sequence length
# over all sequences (the average is printed by an earlier cell).
sequence_length = 200
vocab_size = utils.vocabulary_size(training_data)

In [None]:
# Use the text vectorization layer to normalize, split, and map strings to
# integers. No custom standardization is passed, so the layer applies its
# default (lowercase + strip punctuation), which can alter tokens relative to
# the whitespace-split corpus.
# output_sequence_length pads/truncates every sample to the same length, as
# the samples are not all of the same length.
# max_tokens also counts the two reserved entries of 'int' mode -- ''
# (padding, index 0) and '[UNK]' (index 1) -- so vocab_size + 2 is needed to
# keep every corpus token; vocab_size + 1 would send the rarest token to [UNK].
vectorize_layer = TextVectorization(
    max_tokens=vocab_size + 2,
    output_mode='int',
    output_sequence_length=sequence_length)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
text_ds = full_ds.map(lambda x, y: x)
# TODO: per the TF documentation we could supply our own vocabulary instead of
# adapting; how do we account for mangled words?
vectorize_layer.adapt(text_ds)

In [None]:
# Vocabulary learned by the layer as a list, where position i is the token for
# integer index i. Only tf_vocab is bound: the former `vocab` alias was
# overwritten by the next cell (with a set), so it was dead and misleading.
tf_vocab = vectorize_layer.get_vocabulary()

In [None]:
# Distinct whitespace-separated tokens across the whole corpus.
vocab = {
    token
    for document in training_data
    for token in document.split()
}

In [None]:
# Number of distinct tokens in `vocab`.
print(len(vocab))

In [None]:
# Number of entries in the vocabulary returned by the TextVectorization layer.
print(len(tf_vocab))

In [None]:
# Build the index -> token lookup from the layer vocabulary, then print every
# layer token that does not appear in the whitespace-split corpus vocabulary
# (i.e. words that were mangled or introduced by the layer).
# Open question: what happens when we fix the sequence length and that chops
# a word?
inverse_vocab = dict(enumerate(tf_vocab))

for layer_token in tf_vocab:
    if layer_token not in vocab:
        print(layer_token)

In [None]:
# Dump the whole index -> token mapping for inspection (prints every entry).
print(inverse_vocab)

In [None]:
# Vectorize every batch of full_ds and stack the results into two arrays:
#   vectorized_data   -- one row of `sequence_length` token indices per sample
#   vectorized_labels -- the matching label per sample
# The arrays are sized from the dataset itself rather than from
# len(training_data): pre-allocating from the text file left silent all-zero
# rows (label 0) when the directory held fewer samples, and raised IndexError
# when it held more.
sequence_batches = []
label_batches = []

for text_batch, label_batch in full_ds:
    sequence_batches.append(vectorize_layer(text_batch).numpy())
    label_batches.append(label_batch.numpy())

if sequence_batches:
    # float64 matches the dtype of the np.zeros buffers used previously.
    vectorized_data = np.concatenate(sequence_batches).astype(np.float64)
    vectorized_labels = np.concatenate(label_batches).astype(np.float64)
else:
    # Empty dataset: keep the expected 2-D / 1-D shapes with zero rows.
    vectorized_data = np.zeros((0, sequence_length))
    vectorized_labels = np.zeros(0)

In [None]:
# check vectors for [UNK] in sequence: index 1 is the [UNK] token, and one
# 'hit' line is printed for every occurrence found in the vectorized data.
unk_occurrences = int(np.count_nonzero(vectorized_data == 1))
for _ in range(unk_occurrences):
    print('hit')

In [None]:
# Oversample the minority class with SMOTE until it reaches 75% of the
# majority class size. random_state reuses the notebook's `seed` so the
# synthetic samples are identical on every run.
balanced_data_enc, balanced_labels = SMOTE(sampling_strategy=.75, random_state=seed).fit_resample(vectorized_data, vectorized_labels)

In [None]:
# Write the balanced dataset as three parallel files -- a synthetic file name,
# the decoded text and the label -- where row k of each file describes the
# same sample.
with open('data/smos/smos_filenames_balanced.txt', 'w', newline='') as fnfile, \
        open('data/smos/smos_data_porter_balanced.txt', 'w', newline='') as datafile, \
        open('data/smos/smos_labels_balanced.txt', 'w', newline='') as labelfile:
    filename_writer = csv.writer(fnfile, quoting=csv.QUOTE_MINIMAL)
    data_writer = csv.writer(datafile, quoting=csv.QUOTE_MINIMAL)
    label_writer = csv.writer(labelfile, quoting=csv.QUOTE_MINIMAL)

    for sample_idx, (encoded_row, label) in enumerate(zip(balanced_data_enc, balanced_labels)):
        # Truncate each value to an int index and look up its token.
        tokens = [inverse_vocab[int(token_id)] for token_id in encoded_row]

        filename_writer.writerow(['file' + str(sample_idx)])
        # we will get double spaces due to a 0 being mapped to ''
        data_writer.writerow([' '.join(tokens)])
        label_writer.writerow([int(label)])