### Notebook to perform SMOTE

In [None]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import utils

from imblearn.over_sampling import SMOTE
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
# Input: stemmed metadocuments and their labels
INPUT_DATA_FILE = 'data/smos/smos_data_porter.txt'
INPUT_LABEL_FILE = 'data/smos/smos_labels_porter.txt'

# Output: balanced metadocuments and their labels
OUTPUT_DATA_FILE = 'data/smos/smos_data_balanced.txt'
OUTPUT_LABEL_FILE = 'data/smos/smos_labels_balanced.txt'

# Ratio defining how many synthetic minority samples should be created.
# 1.0 results in a fully balanced set. Float on the interval (0.0, 1.0].
BALANCE_RATIO = 1.0

# Number of words in a sequence for the vectorize layer.
# Sequence stats and the lengths plot are found below.
SEQUENCE_LENGTH = 220

In [None]:
# Containers filled by the cells below: raw metadocuments, their integer
# labels, and the number of words in each metadocument.
training_data, labels, lengths = [], [], []

In [None]:
# Load metadocs: every non-blank line of the input file is one metadocument.
# The list is rebuilt from scratch so that re-running this cell does not
# append a second copy of the data.
with open(INPUT_DATA_FILE, newline='') as datafile:
    # Each record holds a single field (the whole line). A newline is not a
    # valid csv delimiter on newer Python versions (ValueError), so the ASCII
    # unit separator is used instead: it is not expected to occur in the
    # text, so a line is never split (commas stay part of the document) while
    # quoted fields are still handled by the csv module.
    data_reader = csv.reader(datafile, delimiter='\x1f')

    # A blank line yields an empty row; skip it instead of failing on row[0].
    training_data = [row[0] for row in data_reader if row]

In [None]:
# Load labels: every non-blank line of the label file is one integer label.
# The list is rebuilt from scratch so that re-running this cell does not
# append a second copy of the labels.
with open(INPUT_LABEL_FILE, newline='') as labelfile:
    # One field per record. A newline is not a valid csv delimiter on newer
    # Python versions (ValueError), so the ASCII unit separator is used: it
    # is not expected to occur in the file, so each line stays one field.
    label_reader = csv.reader(labelfile, delimiter='\x1f')

    # A blank line yields an empty row; skip it instead of failing on row[0].
    labels = [int(row[0]) for row in label_reader if row]

In [None]:
# Sequence information for loaded set.
# The word count of every metadocument is recomputed on each run, so
# re-executing this cell cannot double-count documents in the statistics.
lengths = [len(seq.split()) for seq in training_data]

print('Number of metadocuments: ', len(training_data))
print('Vocab size: ', utils.vocabulary_size(training_data))
print('Avg seq length: ', sum(lengths) / len(lengths))
print('Min seq len: ', min(lengths))
print('Max seq len: ', max(lengths))

In [None]:
# Scatter plot: metadocument index on the x-axis, its sequence length on the y-axis
X = list(range(len(lengths)))

plt.scatter(X, lengths)
plt.show()

In [None]:
# Wrap every metadoc in a tensor and pair the documents with their labels in
# a TF dataset (batches of 32) so TextVectorization can be applied to it.
unbalanced_data = list(map(tf.convert_to_tensor, training_data))
unbalanced_ds = (
    tf.data.Dataset
    .from_tensor_slices((unbalanced_data, labels))
    .batch(32)
)

In [None]:
# Text vectorization layer: splits, prunes and maps strings to integers.
# Not all samples have the same length, so a fixed output sequence length
# (SEQUENCE_LENGTH) is set.
vectorize_layer = TextVectorization(
    output_sequence_length=SEQUENCE_LENGTH,
    output_mode='int',
)

# Give the layer the vocabulary derived from the training data
vectorize_layer.set_vocabulary(utils.get_vocabulary(training_data))

In [None]:
# Vocabulary as held by the vectorization layer (list structure).
# Later cells use a token's position in this list as its integer encoding.
tf_vocab = vectorize_layer.get_vocabulary()

In [None]:
# Show the vectorization layer's vocabulary for manual inspection
print(tf_vocab)

In [None]:
# Vocabulary computed directly from the unbalanced training data
vocab = utils.get_vocabulary(training_data)

# Inverse vocabulary, used to decode the balanced vectorized data.
# The index of a word in the vectorization layer's vocabulary is its int
# encoding, so the mapping is simply index -> word.
inverse_vocab = dict(enumerate(tf_vocab))

In [None]:
# Print any words that TF added to the vocabulary.
# Expected: empty string and [UNK]
for token in tf_vocab:
    if token in vocab:
        continue
    # The empty string would print as a blank line, so name it explicitly
    print('empty string' if token == '' else token)

In [None]:
# Show the index -> word mapping for manual inspection
print(inverse_vocab)

In [None]:
# Pre-allocate one row per metadocument for the encoded sequences,
# plus a matching array for the labels
vectorized_data = np.zeros((len(training_data), SEQUENCE_LENGTH))
vectorized_labels = np.zeros(len(training_data))

# Run every batch through the vectorization layer and copy the encoded
# sequences and their labels into the arrays, in dataset order
row_idx = 0
for docs, doc_labels in unbalanced_ds:
    encoded_docs = vectorize_layer(docs)
    for sequence, label in zip(encoded_docs, doc_labels):
        vectorized_data[row_idx] = sequence
        vectorized_labels[row_idx] = label
        row_idx += 1

In [None]:
# Check vectors for [UNK] and empty string in sequence.
# Encoding 0 is counted as the empty string and encoding 1 as [UNK].
num_empty = int(np.count_nonzero(vectorized_data == 0))
num_unk = int(np.count_nonzero(vectorized_data == 1))

# Total number of token slots. The two lengths are multiplied instead of
# repeating the document list SEQUENCE_LENGTH times just to measure it.
total_tokens = len(training_data) * SEQUENCE_LENGTH

print('Percent empty tokens: %f' % ((num_empty / total_tokens) * 100))
print('Percent unk tokens: %f' % ((num_unk / total_tokens) * 100))

In [None]:
# Perform SMOTE on loaded dataset.
# BALANCE_RATIO is passed as the sampling strategy; the result is the
# resampled encoded documents together with their labels.
# NOTE(review): no random_state is passed, so the synthetic samples
# presumably differ between runs — confirm whether reproducibility matters.
balanced_data_enc, balanced_labels = SMOTE(sampling_strategy=BALANCE_RATIO).fit_resample(vectorized_data, vectorized_labels)

In [None]:
# Create new data and label files for the balanced set.
# Both files are written in lockstep: row N of the data file belongs to
# row N of the label file.
with open(OUTPUT_DATA_FILE, 'w', newline='') as datafile, \
        open(OUTPUT_LABEL_FILE, 'w', newline='') as labelfile:
    data_writer = csv.writer(datafile, quoting=csv.QUOTE_MINIMAL)
    label_writer = csv.writer(labelfile, quoting=csv.QUOTE_MINIMAL)

    for row, label in zip(balanced_data_enc, balanced_labels):
        # Map every encoded value back to its word; int() drops any
        # fractional part before the lookup.
        decoded = [inverse_vocab[int(val)] for val in row]

        # We will get repeated spaces because 0 is mapped to ''
        data_writer.writerow([' '.join(decoded)])
        label_writer.writerow([int(label)])