## Train an LSTM (RNN) to make predictions based on specific train and validation sets

In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense # Include LSTM layer
from tensorflow.python.client import device_lib
import time
from google.colab import drive
import os

# Show which compute devices (CPU/GPU) TensorFlow can see in this runtime
devices = device_lib.list_local_devices()
print(devices)

class LSTMTraining:
    """Train and save a binary text classifier (Embedding -> LSTM -> Dense).

    Expected call order: ``load_data`` -> ``preprocess_data`` ->
    ``build_model`` -> ``train_model`` -> ``save_model``. Each step stores its
    results on the instance for the next one.
    """

    def __init__(self, learning_rate: float, epochs: int, batch_size: int,
                 max_len: int, feature_col: str, label_col: str):
        """Store the hyperparameters and column names.

        Args:
            learning_rate: Learning rate passed to the Adam optimizer.
            epochs: Number of epochs passed to ``model.fit``.
            batch_size: Batch size passed to ``model.fit``.
            max_len: Length every token sequence is padded/truncated to.
            feature_col: Name of the CSV column holding the text.
            label_col: Name of the CSV column holding the labels.
        """
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.max_len = max_len
        self.feature_col = feature_col
        self.label_col = label_col
        self.history = None  # Set by train_model()

    def load_data(self, train_file_path: str, val_file_path: str) -> None:
        """Mount Google Drive and read the train/validation CSV files."""
        drive.mount('/content/gdrive')  # Mount Google Drive to access files
        self.train_df = pd.read_csv(train_file_path)  # Store data frames as instance variables
        self.val_df = pd.read_csv(val_file_path)

    def _texts(self, df):
        """Return the feature column of *df* as strings, missing values as ''."""
        # The tokenizer lower-cases every entry, so NaN (float) or other
        # non-string cells would otherwise raise inside fit/texts_to_sequences.
        return df[self.feature_col].fillna('').astype(str)

    def preprocess_data(self) -> None:
        """Tokenize and pad the texts and extract the labels.

        The tokenizer is fitted on the training texts only; the validation
        texts are encoded with that same vocabulary.
        """
        train_texts = self._texts(self.train_df)
        val_texts = self._texts(self.val_df)

        self.tokenizer = Tokenizer()  # Initializing Tokenizer
        self.tokenizer.fit_on_texts(train_texts)  # Fitting tokenizer on training text data

        # Converting text data to sequences
        train_sequences = self.tokenizer.texts_to_sequences(train_texts)
        val_sequences = self.tokenizer.texts_to_sequences(val_texts)

        # Padding sequences to a fixed length
        self.train_data = pad_sequences(train_sequences, maxlen=self.max_len, padding='post')
        self.val_data = pad_sequences(val_sequences, maxlen=self.max_len, padding='post')

        # Extracting labels
        self.train_labels = self.train_df[self.label_col].values
        self.val_labels = self.val_df[self.label_col].values

    def build_model(self) -> None:
        """Build and compile the Embedding -> LSTM -> Dense(64) -> Dense(1) model."""
        vocab_size = len(self.tokenizer.word_index) + 1  # +1: index 0 is reserved for padding

        self.model = Sequential()  # Initializing sequential model
        # mask_zero=True: sequences are post-padded with 0, so without a mask
        # the LSTM would keep stepping through the padding and its final state
        # would be computed after the padding steps rather than at the last
        # real token.
        self.model.add(Embedding(vocab_size, 128, input_length=self.max_len, mask_zero=True))  # Adding Embedding layer
        self.model.add(LSTM(128))  # Adding LSTM layer
        self.model.add(Dense(64, activation='relu'))  # Adding Dense layer
        self.model.add(Dense(1, activation='sigmoid'))  # Adding Output layer

        optimizer = Adam(learning_rate=self.learning_rate)  # Initializing Adam optimizer
        self.model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])  # Compiling model

    def train_model(self) -> None:
        """Fit the model on the training data, validating on the validation data."""
        self.history = self.model.fit(
            self.train_data,
            self.train_labels,
            epochs=self.epochs,
            batch_size=self.batch_size,
            validation_data=(self.val_data, self.val_labels),
        )  # Training the model

    def _history_metric(self, key: str):
        """Return ``self.history.history[key]``.

        Raises:
            RuntimeError: If ``train_model`` has not been called yet.
        """
        if self.history is None:
            raise RuntimeError("No training history available; call train_model() first.")
        return self.history.history[key]

    def get_training_loss(self):
        """Return the recorded 'loss' values from the last training run."""
        return self._history_metric('loss')

    def get_validation_loss(self):
        """Return the recorded 'val_loss' values from the last training run."""
        return self._history_metric('val_loss')

    def get_validation_accuracy(self):
        """Return the recorded 'val_accuracy' values from the last training run."""
        return self._history_metric('val_accuracy')

    def save_model(self, save_dir: str, model_name: str) -> None:
        """Save the model as ``<save_dir>/<model_name>.keras``, creating the directory if needed."""
        os.makedirs(save_dir, exist_ok=True)  # Creating directory if not exists
        self.model.save(os.path.join(save_dir, model_name + '.keras'))  # Saving the model with .keras extension

# Usage: train an LSTM end to end, save it, and report the metrics
start_time = time.time()
model = 'lstm'

## Hyperparameters
learning_rate, epochs, batch_size, max_len = 2e-5, 3, 6, 4096
optimizer = 'Adam' # TODO! Need to create the functionality to switch optimizers between Adam and AdamW

## Paths and filenames
absolute_path = '/content/gdrive/My Drive/EmailSpam/'
train_file_path = 'Datasets/train_set.csv'
val_file_path = 'Datasets/validation_set.csv'
save_dir = 'TrainedModels/'
# The file name encodes the run configuration so different runs do not overwrite each other
trained_model = ''.join([
    model,
    '_optimizer_', optimizer,
    '_lr_', str(learning_rate),
    '_epochs_', str(epochs),
    '_bs_', str(batch_size),
    '_maxlen_', str(max_len),
])
feature_col = 'Text'
label_col = 'Spam'

# Run the pipeline: load -> preprocess -> build -> train -> save
trainer = LSTMTraining(learning_rate, epochs, batch_size, max_len, feature_col, label_col)
trainer.load_data(
    os.path.join(absolute_path, train_file_path),
    os.path.join(absolute_path, val_file_path),
)
trainer.preprocess_data()
trainer.build_model()
trainer.train_model()
trainer.save_model(os.path.join(absolute_path, save_dir), trained_model)

# Report the per-epoch metrics and the total wall-clock time
print("Training Loss:", trainer.get_training_loss())
print("Validation Loss:", trainer.get_validation_loss())
print("Validation Accuracy:", trainer.get_validation_accuracy())
elapsed = time.time() - start_time
print("Training time: {:.2f} seconds".format(elapsed))

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 13819575463734692822
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 15510929408
locality {
  bus_id: 1
  links {
  }
}
incarnation: 10752727187895355201
physical_device_desc: "device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:04.0, compute capability: 7.0"
xla_global_id: 416903419
]
Mounted at /content/gdrive
Epoch 1/3
Epoch 2/3
Epoch 3/3


## Use the trained LSTM model to make predictions for a specific test set

In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from google.colab import drive

class LSTMPredictions:
    """Load a saved Keras model and write binary predictions for a test CSV."""

    def __init__(self, max_len, absolute_path, test_file_path, predictions_path,
                 trained_model, feature_col, prediction_col, train_file_path=None):
        """Store the configuration and mount Google Drive.

        Args:
            max_len: Length every token sequence is padded/truncated to.
            absolute_path: Base directory that all other paths are appended to.
            test_file_path: Test CSV path, relative to ``absolute_path``.
            predictions_path: Output CSV path, relative to ``absolute_path``.
            trained_model: Saved model path, relative to ``absolute_path``.
            feature_col: Name of the CSV column holding the text.
            prediction_col: Name of the column the predictions are written to.
            train_file_path: Optional training CSV path, relative to
                ``absolute_path``. When given, the tokenizer vocabulary is
                rebuilt from this file so that word indices match the ones
                the model was trained with. When omitted, the tokenizer is
                fitted on the test texts (legacy behaviour) and a warning
                is emitted.
        """
        self.max_len = max_len
        self.test_file_path = test_file_path
        self.predictions_path = predictions_path
        self.absolute_path = absolute_path
        self.trained_model = trained_model
        self.feature_col = feature_col
        self.prediction_col = prediction_col
        self.train_file_path = train_file_path
        drive.mount('/content/gdrive')  # Mount Google Drive to access files

    def _clean_texts(self, df):
        """Return the feature column of *df* as a list of strings, missing values as ''."""
        # The tokenizer lower-cases every entry, so NaN or non-string cells would raise.
        return df[self.feature_col].fillna('').astype(str).tolist()

    def _build_tokenizer(self, test_texts):
        """Return a Tokenizer whose vocabulary matches training whenever possible."""
        tokenizer = Tokenizer()  # Initialize a Tokenizer object
        if self.train_file_path is None:
            # Legacy fallback: a vocabulary fitted on the test set assigns
            # different integer ids to words than the training vocabulary did,
            # so the embedding layer looks up the wrong rows.
            import warnings
            warnings.warn(
                "No train_file_path given: fitting the tokenizer on the test texts. "
                "Word indices will not match the vocabulary the model was trained on.",
                stacklevel=3,
            )
            tokenizer.fit_on_texts(test_texts)
        else:
            # Refit on the training texts to reproduce the training vocabulary.
            train_df = pd.read_csv(self.absolute_path + self.train_file_path)
            tokenizer.fit_on_texts(self._clean_texts(train_df))
        return tokenizer

    def predict(self):
        """Predict a 0/1 label for each test row and save the result as CSV.

        Reads the test CSV, encodes and pads its texts, runs the saved model,
        thresholds the outputs at 0.5, stores them in ``prediction_col`` and
        writes the frame to ``predictions_path`` (without the index).
        """
        # Load test dataset
        test_df = pd.read_csv(self.absolute_path + self.test_file_path)
        test_texts = self._clean_texts(test_df)  # Extract the text data from the specified feature column

        # Encode the test texts with the training vocabulary (see _build_tokenizer)
        tokenizer = self._build_tokenizer(test_texts)
        test_sequences = tokenizer.texts_to_sequences(test_texts)  # Convert text data to sequences of integers

        test_data = pad_sequences(test_sequences, maxlen=self.max_len, padding='post')  # Pad sequences to the maximum length specified during training

        saved_model = load_model(self.absolute_path + self.trained_model)  # Load the trained LSTM model from the specified path

        # Make predictions on test data using the loaded LSTM model
        predictions = saved_model.predict(test_data)

        # Convert predictions to binary labels (0 or 1) based on a 0.5 threshold.
        # ravel() flattens the model output (presumably one column per sample,
        # given the single-unit output layer) into a plain 1-D column.
        binary_predictions = (predictions > 0.5).astype(int).ravel()

        # Add the binary predictions as a new column to the test dataframe
        test_df[self.prediction_col] = binary_predictions

        # Save predictions to CSV
        test_df.to_csv(self.absolute_path + self.predictions_path, index=False)

        print("Predictions done")

# Usage:
max_len = 4096
str_params = 'lstm_optimizer_Adam_lr_2e-05_epochs_3_bs_6_maxlen_4096'

## Paths and filenames
absolute_path = '/content/gdrive/My Drive/EmailSpam/'
train_file_path = 'Datasets/train_set.csv'  # Training CSV; must be the file the model was trained on
test_file_path = 'Datasets/test_set.csv'
predictions_path = 'Datasets/test_set_lstm.csv'
trained_model = 'TrainedModels/' + str_params + '.keras'
feature_col = 'Text'
prediction_col = str_params + '_prediction'

# Instantiate the LSTMPredictions class
lstm_predictions = LSTMPredictions(
    max_len,
    absolute_path,
    test_file_path,
    predictions_path,
    trained_model,
    feature_col,
    prediction_col,
    train_file_path=train_file_path,
)

# Perform predictions
lstm_predictions.predict()

Mounted at /content/gdrive
Predictions done
