In [1]:
import io
import csv
import numpy as np
import tensorflow as tf
import utils

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
# Location of the text corpus; the next cell reads one sample per line from it.
filename = 'data/smos/smos_data_porter_balanced.txt'
# Container for the samples (list of strings).
training_data = []

In [3]:
# Read the corpus, one sample per line.
# Plain line iteration replaces the csv.reader(delimiter='\n') workaround: the
# text is taken verbatim (no CSV quote processing) and a blank line becomes an
# empty sample instead of raising IndexError.
# The list is rebuilt rather than appended to, so re-running this cell does not
# duplicate the data.
with open(filename, newline='') as datafile:
    # newline='' leaves line endings untranslated, so strip both '\r' and '\n'.
    training_data = [line.rstrip('\r\n') for line in datafile]

In [4]:
# Sanity check: number of samples read into `training_data`.
print(len(training_data))

6146


In [5]:
# Token count of every training sample, where a token is a whitespace-separated
# chunk. Each sample is split exactly once.
lengths = [len(seq.split()) for seq in training_data]

# Longest sample in tokens (0 when there is no data).
# NOTE: the name `max` shadows the built-in; it is kept because the following
# cell prints it. The explicit loop avoids calling the built-in, so this cell
# stays re-runnable.
max = 0
for n_tokens in lengths:
    if max < n_tokens:
        max = n_tokens

In [6]:
# Longest sample length in whitespace-separated tokens (the variable `max`
# computed in the previous cell, not the built-in function).
print(max)

200


In [7]:
labels = [] # list of ints (each line is converted with int() when loaded)
# NOTE: this rebinds `filename`, which the corpus-loading cell also reads;
# re-running that earlier cell after this one would read the labels file instead.
filename = 'data/smos/smos_labels_balanced.txt'

In [8]:
# Read one integer label per line.
# Plain line iteration replaces the csv.reader(delimiter='\n') workaround;
# int() ignores the surrounding whitespace and line ending, and raises
# ValueError on a blank or non-numeric line.
# The list is rebuilt rather than appended to, so re-running this cell does not
# duplicate the labels.
with open(filename, newline='') as datafile:
    labels = [int(line) for line in datafile]

In [9]:
# Build three datasets from the same directory: the complete set, plus the
# training and validation subsets of a 0.2 validation split.
batch_size = 32
seed = 123

data_dir = 'data/smos/train_balanced'

# Arguments common to all three loads; the split loads also share the same seed.
loader_args = dict(batch_size=batch_size, label_mode='binary', seed=seed)
split_args = dict(loader_args, validation_split=0.2)

full_ds = tf.keras.preprocessing.text_dataset_from_directory(
    data_dir, **loader_args)

train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    data_dir, subset='training', **split_args)

val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    data_dir, subset='validation', **split_args)

Found 6146 files belonging to 2 classes.
Found 6146 files belonging to 2 classes.
Using 4917 files for training.
Found 6146 files belonging to 2 classes.
Using 1229 files for validation.


In [None]:
# Show the first two (label, text) pairs of a single training batch.
for text_batch, label_batch in train_ds.take(1):
    texts = text_batch.numpy()
    for i in (0, 1):
        print(label_batch[i].numpy(), texts[i])

In [10]:
# Cache both datasets and prefetch with TensorFlow's AUTOTUNE buffer size.
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_ds, val_ds = (
    ds.cache().prefetch(buffer_size=AUTOTUNE) for ds in (train_ds, val_ds)
)

In [11]:
# Corpus statistics: vocabulary size and mean tokens per sample.
vocab_count = utils.vocabulary_size(training_data)
print('Vocab size: ', vocab_count)

mean_length = sum(lengths) / len(lengths)
print('Avg seq length: ', mean_length)

Vocab size:  1059
Avg seq length:  162.92857142857142


In [12]:
# Vocabulary size and number of tokens per vectorized sequence.
# 200 equals the longest sample length printed earlier; the average printed
# earlier is ~163 tokens.
sequence_length = 200
vocab_size = utils.vocabulary_size(training_data)

In [13]:
# Text vectorization layer that turns raw strings into integer token ids.
# No `standardize` argument is given, so the layer's default standardization
# applies. `output_sequence_length` gives every sample the same output length,
# since the raw samples differ in length.
vectorize_layer = TextVectorization(
    output_mode='int',
    max_tokens=vocab_size,
    output_sequence_length=sequence_length,
)

# Drop the labels and let the layer build its vocabulary from the full dataset.
text_ds = full_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)

In [14]:
# Width of each learned word vector.
embedding_dim = 8

# Trainable lookup table mapping token ids to embedding_dim-sized vectors.
# Embedding tutorial uses size, Text Classification tutorial uses size + 1
embedding_layer = Embedding(
    input_dim=vocab_size + 1,
    output_dim=embedding_dim,
    name='embedding',
)

In [15]:
# Layer stack: raw strings -> token ids -> embeddings -> average over the
# sequence -> small dense head with a single un-activated output.
# Dropout(0.2) after the embedding and after the pooling layer is left disabled.
model_layers = [
    vectorize_layer,
    embedding_layer,
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1),
]

model = Sequential(model_layers)

In [16]:
# The final Dense layer has no activation, so the loss is told to expect
# logits and the accuracy metric thresholds the raw output at 0.0.
logit_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
logit_accuracy = tf.metrics.BinaryAccuracy(threshold=0.0)

model.compile(optimizer='adam', loss=logit_loss, metrics=logit_accuracy)

In [17]:
# TensorBoard callback writing its logs to the relative 'logs' directory;
# passed to model.fit in the training cell.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='logs')

In [18]:
# Train for 15 epochs, validating on val_ds and logging through the
# TensorBoard callback. The returned History object is the cell's output.
model.fit(
    x=train_ds,
    epochs=15,
    validation_data=val_ds,
    callbacks=[tensorboard_callback],
)

Epoch 1/15
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x1360bae20>

In [24]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 200)               0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 8)            8480      
_________________________________________________________________
global_average_pooling1d (Gl (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 16)                144       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 8,641
Trainable params: 8,641
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Dataset built from the files under data/smos/train, loaded with the same
# batch size, label mode and seed as above, then cached and prefetched.
original_ds = (
    tf.keras.preprocessing.text_dataset_from_directory(
        'data/smos/train',
        batch_size=batch_size,
        label_mode='binary',
        seed=seed,
    )
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)

Found 4556 files belonging to 2 classes.


In [20]:
# Evaluate the trained model on original_ds; the result lists the loss followed
# by the compiled metric (binary accuracy).
model.evaluate(original_ds)



[0.5017554759979248, 0.742317795753479]

In [None]:
# Retrieve the trained word embeddings
# `weights` is the first weight array of the layer named 'embedding';
# `vocab` is the vectorizer's token list, used to label the rows when saving.
weights = model.get_layer('embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [None]:
# Save embeddings to disk: one tab-separated vector per line in the vectors
# file, and the matching word on the same line number of the metadata file.
# Context managers close both files even if a write raises part-way through.
with io.open('data/smos_porter_vectors.tsv', 'w', encoding='utf-8') as out_vec:
    with io.open('data/smos_porter_metadata.tsv', 'w', encoding='utf-8') as out_meta:
        for index, word in enumerate(vocab):
            if index == 0:
                continue  # skip 0, it's padding.
            vec = weights[index]
            out_vec.write('\t'.join(str(x) for x in vec) + '\n')
            out_meta.write(word + '\n')