In [137]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D, Dropout
from tensorflow.keras.utils import text_dataset_from_directory
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd
import re
import plotly.graph_objects as go
import datetime
import os


In [138]:
# Setup some initial parameters
batch_size = 64
validation_ratio = 0.2
seed = 42
max_features = 1000
sequence_length = 500

In [139]:
# Load datasets
raw_train_ds = text_dataset_from_directory(
    "temp/train",
    batch_size=batch_size,
    validation_split=validation_ratio,
    subset='training',
    seed=seed
)
raw_val_ds = text_dataset_from_directory(
    'temp/train',
    batch_size=batch_size,
    validation_split=validation_ratio,
    subset='validation',
    seed=seed
)
raw_test_ds = text_dataset_from_directory(
    'temp/test',
    batch_size=batch_size
)

Found 3124 files belonging to 131 classes.
Using 2500 files for training.
Found 3124 files belonging to 131 classes.
Using 624 files for validation.
Found 347 files belonging to 80 classes.


In [140]:
# Define a custom text standardization function to preprocess text data
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html, '[%s]' % re.escape('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'), '')

In [141]:
# Create a TextVectorization layer to vectorize text
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length
)
vectorize_layer.adapt(raw_train_ds.map(lambda text, label: text))

In [142]:
# Define the model architecture
model = Sequential([
    vectorize_layer,
    Embedding(max_features + 1, 16),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    GlobalAveragePooling1D(),
    Dropout(0.2),
    Dense(1)  # Output layer for regression; no activation function
])

In [143]:
# Compile the model
model.compile(
    loss="mean_squared_error",  # Using MSE for regression
    optimizer="adam",
    metrics=["mean_absolute_error"]  # Using MAE as a metric for regression
)

In [144]:
# Set the directory to store the logs
log_dir = os.path.join("logs", "fit", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=0)

In [145]:
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

In [146]:
# Train the model
epochs = 100
history = model.fit(
    raw_train_ds,
    validation_data=raw_val_ds,
    epochs=epochs,
    callbacks=[tensorboard_callback, early_stopping_callback]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100


In [149]:
# Creating a line plot for training and validation loss using Plotly
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(history.history['loss']))), 
                         y=history.history['loss'], 
                         mode='lines',
                         name='Training Loss'))
fig.add_trace(go.Scatter(x=list(range(len(history.history['val_loss']))), 
                         y=history.history['val_loss'], 
                         mode='lines',
                         name='Validation Loss'))

# Updating the layout of the plot
fig.update_layout(title='Loss over Epochs',
                  xaxis_title='Epochs',
                  yaxis_title='Loss',
                  legend_title='Legend')

# Display the plot
fig.show()

In [150]:
# Evaluate the model
model.evaluate(raw_test_ds)



[1147.5655517578125, 29.946840286254883]

In [123]:
# Prepare lists to hold the data
texts = []
actual_labels = []
predicted_labels = []

# Iterate over the entire test dataset
for text_batch, label_batch in raw_test_ds:
    # Make predictions for the current batch
    predictions = model.predict(text_batch)
    # Decode text data if it's in bytes format and store in the list
    texts.extend([text.numpy().decode('utf-8') for text in text_batch])
    # Store actual labels and predictions
    actual_labels.extend(label_batch.numpy())
    predicted_labels.extend(predictions.flatten())  # Flatten in case the predictions have an extra dimension

# Create a DataFrame from the collected data
data = {
    "Text": texts,
    "Actual Label": actual_labels,
    "Predicted Label": predicted_labels
}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_file_path = "model_predictions.csv"
df.to_csv(csv_file_path, index=False)

print(f"All predictions saved to {csv_file_path}")

All predictions saved to model_predictions.csv
