In [2]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import numpy as np
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [3]:
def labeler(example, index):
    """Pair `example` with `index` cast to int64 and return them as a tuple."""
    label = tf.cast(index, tf.int64)
    return example, label

In [4]:
# Pipeline-wide sizes.
BUFFER_SIZE = 200_000      # shuffle buffer size handed to Dataset.shuffle
BATCH_SIZE = 64            # examples per batch
VALIDATION_SIZE = 20_000   # examples taken from the front of the shuffled data for validation

In [5]:
# Input files. A file's position in this list becomes the integer label of
# every line read from it (index 0 -> train_pos.txt, index 1 -> train_neg.txt).
FILE_NAMES = ["./data/train_pos.txt", "./data/train_neg.txt"]
labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
    lines_dataset = tf.data.TextLineDataset(str(file_name))
    # Bind the current index as a default argument instead of closing over the
    # loop variable, so the label cannot silently become the *last* value of
    # `i` if the lambda is ever evaluated after the loop has advanced.
    labeled_dataset = lines_dataset.map(
        lambda ex, index=i: labeler(ex, index))
    labeled_data_sets.append(labeled_dataset)


# Chain the per-file datasets end to end into a single dataset.
all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

# reshuffle_each_iteration=False fixes one shuffled order for every pass over
# the data; the take()/skip() train/validation split further down relies on
# that to keep the two subsets disjoint.
all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

In [156]:
# Show the first ten (sentence, label) pairs of the shuffled dataset.
preview = all_labeled_data.take(10)
for sentence, sentence_label in preview:
    print("Sentence: ", sentence.numpy())
    print("Label:", sentence_label.numpy())

Sentence:  b'well this sucks ..'
Label: 1
Sentence:  b'the dead sea scrolls electronic library ( the dead sea scrolls electronic reference library the new and compreh ... <url>'
Label: 1
Sentence:  b'i love our fans ! there all so beautiful ! x <url>'
Label: 0
Sentence:  b"<user> very good , just as i was about to go to sleep you made me open this omg freaked me out won't be able to sleep now"
Label: 1
Sentence:  b"<user> i will come pick you up holding a bouquet of flowers and bring you to my abode . it's our second date ."
Label: 0
Sentence:  b"otay , i guess i better not sizzle too then . lol rt <user> yes ! ! rt <user> if i'm frazzled am i also frazzling ?"
Label: 0
Sentence:  b'<user> one thing you have taught me , is no matter how bad your past is . we can always look for a change in the future . thanks a lot'
Label: 0
Sentence:  b'jo twins for <user> . if you already have it i can give you another pic <url>'
Label: 0
Sentence:  b'olympus stylus 1040 carrying casecrown simply cam

In [157]:
# Validation set: the first VALIDATION_SIZE examples of the shuffled data, batched.
validation_data = all_labeled_data.take(VALIDATION_SIZE).padded_batch(BATCH_SIZE)

# Training set: everything after those examples, shuffled again, then batched.
remaining_examples = all_labeled_data.skip(VALIDATION_SIZE)
train_data = remaining_examples.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)

In [158]:
def custom_standardization(input_data):
    """Lowercase the text, remove '<user>' and '<url>' tokens, then strip punctuation.

    The placeholder tokens are removed before the punctuation pass; the
    punctuation pass deletes every character in `string.punctuation`.
    """
    punctuation_class = '[%s]' % re.escape(string.punctuation)

    cleaned = tf.strings.lower(input_data)
    for placeholder in ('<user>', '<url>'):
        cleaned = tf.strings.regex_replace(cleaned, placeholder, '')
    return tf.strings.regex_replace(cleaned, punctuation_class, '')

In [159]:
# Vocabulary cap passed to the vectorizer as max_tokens.
max_features = 10000
# NOTE(review): sequence_length is not referenced by any visible cell; the
# vectorizer below is built without an output length. Confirm whether it
# was meant to be passed as output_sequence_length.
sequence_length = 70

# Text-to-integer layer using the notebook's own standardization function.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
)

In [160]:
# Drop the labels so the vectorizer only sees text, then let it build its
# vocabulary from the training batches.
train_text = train_data.map(lambda sentences, _labels: sentences)
vectorize_layer.adapt(train_text)

In [161]:
def vectorize_text(text, label):
    """Apply `vectorize_layer` to `text` (with a trailing axis added); pass `label` through."""
    expanded = tf.expand_dims(text, -1)
    vectorized = vectorize_layer(expanded)
    return vectorized, label

In [162]:
# Peek at the 11th batch of raw training text. skip()/take() pulls just that
# one batch instead of first collecting every batch of the dataset into a
# Python list. (If there are fewer than 11 batches this prints nothing.)
for text_batch in train_text.skip(10).take(1):
    print(text_batch)

tf.Tensor(
[b'<user> hope your show went really good ! i love you so much'
 b"<user> they say that good things come to those who wait , the day you'll follow me"
 b'<user> happy birthday tomorrow ! ! what a great kick off ! !'
 b"<user> ah that's awesome ! thanks lady ! ! #gooniesneversaydie"
 b"<user> good donno if i am though it's just an idea x"
 b'who wanna phone call tonight ?'
 b"<user> aww i soo wish i could come ! ! but i've got uni exams and rehearsals miss u lotss ! ! hope to see u soon ! ! xoxo"
 b'nex stylin stencil ( runway fun with all kinds of features and accessory designer stickers included , anyone can ... <url>'
 b"i'm missing out on a niam twitcam ? ? ?"
 b'rt : <user> #retweetif your birthday falls between january and december--im on the map ! !'
 b'uncharted 2 : among thieves - game of the year edition ( video game uncharted 2 : among thieves - game of the yea ... <url>'
 b'monroe 37027 sensa-trac light truck shock absorber ( misc . the monroe sensa-trac truck sho

In [163]:
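# Not used: the model below has vectorize_layer as its first layer, so the
# raw-string datasets are passed to fit() directly. Kept for reference.
#train_ds = train_data.map(vectorize_text)
#val_ds = validation_data.map(vectorize_text)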

In [164]:
# Binary text classifier that takes raw strings as input: the adapted
# TextVectorization layer is the first layer of the model, so tokenization
# happens inside it.
model = tf.keras.Sequential([
    vectorize_layer,
    layers.Embedding(
        # One embedding row per entry in the adapted vocabulary.
        input_dim=len(vectorize_layer.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    # return_sequences=True hands the full per-token sequence to the next
    # recurrent layer rather than only the final state.
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(32)),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    # Single unit with no activation: the model outputs a raw logit.
    layers.Dense(1)
])


In [165]:
# The last layer is Dense(1) with no activation, so the model emits raw
# logits. The loss is told so via from_logits=True, and the accuracy metric
# has to match: a logit of 0.0 corresponds to a probability of 0.5, so the
# decision threshold on logits is 0.0. The plain 'accuracy' string would
# threshold the logits at 0.5 and under-count positive predictions.
# name='accuracy' keeps the history/log keys ('accuracy', 'val_accuracy')
# the same as before.
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0,
                                                       name='accuracy')])

In [167]:
# Train on the batched raw-text dataset and keep the per-epoch metrics.
epochs = 10
history = model.fit(
    x=train_data,
    epochs=epochs,
    validation_data=validation_data,
    # Validate on 30 batches at the end of each epoch rather than on the
    # whole validation set.
    validation_steps=30,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

KeyboardInterrupt: 