In [49]:
%pip install tensorflow tf-keras -U
%pip install pandas numpy

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [58]:
import os
import re
import shutil
import string
import tensorflow as tf
import pandas as pd
import numpy as np

# Seed Python's `random`, NumPy and TensorFlow in a single call.
# Seeding TensorFlow alone leaves NumPy's global RNG unseeded, and pandas'
# `DataFrame.sample` draws from that RNG when no `random_state` is passed,
# so the shuffle/split would differ between runs.
tf.keras.utils.set_random_seed(42)
print("TensorFlow version:", tf.__version__)

# Hyperparameters shared by the cells below.
VOCAB_SIZE = 512      # max tokens kept by the vectorizer; also the Embedding input size
EMBEDDING_DIM = 64    # width of each learned token embedding
MAX_LENGTH = 64       # token ids per title (vectorizer output length / model input length)
TRAINING_SPLIT = 0.8  # fraction of the rows used for training

TensorFlow version: 2.18.0


In [59]:
# Load the labelled titles; the columns used below are 'title' (text) and
# 'is_transaction' (cast to an integer label).
df = pd.read_csv("titles-labelled.csv")

# Shuffle with a fixed seed so the train/test membership is identical on every
# run; without `random_state` each execution yields a different split.
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED)

# Number of rows that go to the training split (the remainder is the test split).
train_size = int(len(df) * TRAINING_SPLIT)

titles = df['title'].values
labels = df['is_transaction'].values.astype(int)

# Split them: the first `train_size` shuffled rows train, the rest test.
train_titles = titles[:train_size]
train_labels = labels[:train_size]
test_titles = titles[train_size:]
test_labels = labels[train_size:]

# Create the datasets of (title, label) pairs.
train_dataset = tf.data.Dataset.from_tensor_slices((train_titles, train_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_titles, test_labels))

In [60]:
# Report how many (title, label) pairs ended up in each split.
n_train_pairs = train_dataset.cardinality()
n_test_pairs = test_dataset.cardinality()

print(f"There are {n_train_pairs} sentence-label pairs for training.\n")
print(f"There are {n_test_pairs} sentence-label pairs for validation.\n")

There are 8000 sentence-label pairs for training.

There are 2000 sentence-label pairs for validation.



In [61]:
# Text -> integer-id layer: vocabulary capped at VOCAB_SIZE tokens, every
# title emitted as a sequence of exactly MAX_LENGTH ids.
vectorizer = tf.keras.layers.TextVectorization(
    output_sequence_length=MAX_LENGTH,
    max_tokens=VOCAB_SIZE,
)

# Build the vocabulary from the training titles only; the labels are dropped.
title_only_ds = train_dataset.map(lambda text, _label: text)
vectorizer.adapt(title_only_ds)
vocab_size = vectorizer.vocabulary_size()

print("Vocabulary size: {}".format(vocab_size))

Vocabulary size: 512


In [62]:
def preprocess_dataset(dataset, text_vectorizer, batch_size=32, num_parallel_calls=32):
    """Vectorize the text of every (text, label) pair and batch the result.

    Labels are passed through unchanged; only the text element is transformed.

    Args:
        dataset (tf.data.Dataset): dataset yielding (text, label) pairs
        text_vectorizer (tf.keras.layers.TextVectorization): callable applied
            to each text element
        batch_size (int): number of examples per batch. Defaults to 32, the
            value that was previously hard-coded.
        num_parallel_calls (int): parallelism hint forwarded to
            ``dataset.map``. Defaults to 32, the value that was previously
            hard-coded; ``tf.data.AUTOTUNE`` can be passed to let tf.data
            choose it.

    Returns:
        tf.data.Dataset: batched dataset of (vectorized_text, label) pairs
    """
    # Convert each sentence to its integer sequence; the label is untouched.
    vectorized = dataset.map(
        lambda text, label: (text_vectorizer(text), label),
        num_parallel_calls=num_parallel_calls,
    )

    return vectorized.batch(batch_size)

In [63]:
# Run both splits through the same fitted vectorizer and batch them.
train_proc_dataset, test_proc_dataset = (
    preprocess_dataset(split, vectorizer)
    for split in (train_dataset, test_dataset)
)

# After batching, cardinality() counts batches rather than single examples.
n_train_batches = train_proc_dataset.cardinality()
n_test_batches = test_proc_dataset.cardinality()
print(f"Number of batches in the train dataset: {n_train_batches}")
print(f"Number of batches in the validation dataset: {n_test_batches}")

Number of batches in the train dataset: 250
Number of batches in the validation dataset: 63


In [64]:
# Binary classifier over vectorized titles: embed the token ids, average the
# embeddings across the sequence, then two hidden dense layers and a single
# sigmoid output unit.
layer_stack = [
    tf.keras.Input(shape=(MAX_LENGTH,)),
    tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dense(units=16, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
]
model = tf.keras.Sequential(layer_stack)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# NOTE(review): test_proc_dataset is used both as validation data during
# training and for the final evaluate() call, so the reported test metrics are
# not measured on data that was held out from model selection.
history = model.fit(
    train_proc_dataset,
    validation_data=test_proc_dataset,
    epochs=10,
)
test_loss, test_acc = model.evaluate(test_proc_dataset)

Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6612 - loss: 0.5961 - val_accuracy: 0.9550 - val_loss: 0.1566
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9490 - loss: 0.1639 - val_accuracy: 0.9550 - val_loss: 0.1128
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9580 - loss: 0.1260 - val_accuracy: 0.9650 - val_loss: 0.1000
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9600 - loss: 0.1137 - val_accuracy: 0.9640 - val_loss: 0.0961
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9613 - loss: 0.1080 - val_accuracy: 0.9640 - val_loss: 0.0942
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9618 - loss: 0.1048 - val_accuracy: 0.9640 - val_loss: 0.0930
Epoch 7/10
[1m250/250[0m 

In [65]:
# Persist the trained classifier in the native Keras format.
# The TextVectorization layer is not part of `model`, so the saved model takes
# integer token sequences of length MAX_LENGTH, not raw title strings.
MODEL_PATH = 'email_titles_nlp.keras'
model.save(MODEL_PATH)