In [1]:
!nvidia-smi

Thu Apr 27 11:23:48 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import tensorflow as tf
# Print the installed TensorFlow version so the runtime used for this run is
# recorded in the notebook output.
print(tf.__version__)

2.12.0


In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the working directory set in the next
# cell lives underneath this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os
# Work from the Drive root: every relative path used later in the notebook
# (input CSVs, pickles, ./models/...) is resolved against this directory.
path = "/content/drive/MyDrive/"
os.chdir(path)

In [5]:
# Standard library imports
import datetime
import email
import logging
import os
import pickle
import re
import string
import time

# Third-party imports
import numpy as np
import pandas as pd
import nltk
from bs4 import BeautifulSoup
from email.utils import parseaddr

from keras.callbacks import EarlyStopping
from keras.layers import LSTM, Dense, Activation, Embedding, SimpleRNN, Bidirectional, Dropout
from keras.models import Sequential
from keras import regularizers

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, auc, f1_score
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.naive_bayes import MultinomialNB

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from urllib.parse import urljoin

# Jupyter-specific imports
%matplotlib inline

# Download required NLTK resources: the stop-word corpus read by
# stopwords.words() below, and the 'punkt' tokenizer data (presumably what
# word_tokenize relies on — confirm for the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

# Suppress warnings.
# NOTE(review): with no category/module filter this hides every warning for the
# rest of the session, including deprecation warnings raised by libraries.
import warnings
warnings.filterwarnings("ignore")

# Module-level stemmer and English stop-word list shared by stem_tokenizer().
porter_stemmer = PorterStemmer()
stop_words = list(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Vocabulary cap: passed as Tokenizer(num_words=...) and as the Embedding input_dim.
max_vocab = 600_000
# Fixed sequence length: passed as pad_sequences(maxlen=...) and as the Embedding input_length.
max_len = 2_000

In [7]:
from keras.callbacks import Callback

class CustomEarlyStopping(Callback):
    """Stop training once the monitored value is low enough or has plateaued.

    Two independent conditions are evaluated at the end of every epoch:

    1. Threshold: the monitored value is below ``loss_threshold``.
    2. Plateau: the absolute change relative to the previous epoch has been
       below ``min_delta`` for ``patience`` consecutive epochs. Any epoch with
       a larger change (in either direction) resets the counter.

    Args:
        monitor: Key looked up in the ``logs`` dict handed to ``on_epoch_end``.
        min_delta: Epoch-to-epoch change below which an epoch counts as
            "no progress".
        patience: Number of consecutive no-progress epochs that triggers a stop.
        loss_threshold: Value below which training stops immediately.
        restore_best_weights: When True, the weights of the epoch with the
            lowest monitored value are put back on the model when the plateau
            condition stops training. (A threshold stop needs no restore: every
            earlier epoch was at or above the threshold, so the stopping epoch
            is itself the best one.)

    Attributes:
        wait: Current count of consecutive no-progress epochs.
        stopped_epoch: Zero-based index of the epoch at which this callback
            stopped training (0 if it has not).
        best_loss: Lowest monitored value seen in the current run.
        best_weights: Weights captured at ``best_loss``; only populated when
            ``restore_best_weights`` is True, to avoid copying the whole model
            on every improvement for no reason.
    """

    def __init__(self, monitor='val_loss', min_delta=0.01, patience=10, loss_threshold=0.01, restore_best_weights=False):
        super().__init__()
        self.monitor = monitor
        self.min_delta = min_delta
        self.patience = patience
        self.loss_threshold = loss_threshold
        self.restore_best_weights = restore_best_weights
        self._reset_state()

    def _reset_state(self):
        """Clear all per-run bookkeeping."""
        self.wait = 0
        self.stopped_epoch = 0
        self.last_loss = None
        self.best_weights = None
        self.best_loss = float('inf')
        self._plateau_stop = False

    def on_train_begin(self, logs=None):
        # Start every fit() from a clean slate so that a reused callback
        # instance does not carry counters or weights over from an earlier run.
        self._reset_state()

    def on_epoch_end(self, epoch, logs=None):
        current_loss = (logs or {}).get(self.monitor)
        if current_loss is None:
            # Nothing to compare against (e.g. no validation data was given for
            # a 'val_*' monitor). Say so instead of failing on None arithmetic.
            print("Epoch %05d: CustomEarlyStopping skipped, metric '%s' not found in logs" % (epoch + 1, self.monitor))
            return

        # Track the best epoch first, so that the very first epoch and the
        # stopping epoch itself are both eligible as "best".
        if current_loss < self.best_loss:
            self.best_loss = current_loss
            if self.restore_best_weights:
                self.best_weights = self.model.get_weights()

        # Condition 1: the monitored value is already below the threshold.
        if current_loss < self.loss_threshold:
            self.stopped_epoch = epoch
            self.model.stop_training = True
            print("Epoch %05d: early stopping due to loss below threshold" % (self.stopped_epoch + 1))
            return

        # Condition 2: plateau detection against the previous epoch's value.
        if self.last_loss is not None and abs(current_loss - self.last_loss) < self.min_delta:
            self.wait += 1
            if self.wait >= self.patience:
                self.stopped_epoch = epoch
                self._plateau_stop = True
                self.model.stop_training = True
                if self.restore_best_weights and self.best_weights is not None:
                    self.model.set_weights(self.best_weights)
        else:
            self.wait = 0
        self.last_loss = current_loss

    def on_train_end(self, logs=None):
        # A threshold stop has already printed its own message; only announce
        # the plateau stop here so the message is never duplicated.
        if self._plateau_stop:
            print("Epoch %05d: early stopping" % (self.stopped_epoch + 1))


In [8]:
def stem_tokenizer(text):
    """Lower-case, tokenize, remove English stop words, and Porter-stem ``text``.

    Stop words are filtered on the raw token *before* stemming as well as on
    the stem afterwards. Filtering only after stemming lets through stop words
    whose stem is not itself in the list (e.g. "this" is stemmed to "thi").

    Args:
        text: The string to process.

    Returns:
        str: The surviving stems joined by single spaces ("" if none survive).
    """
    # Build a set once per call for O(1) membership tests; the module-level
    # ``stop_words`` list itself is left untouched.
    stop_set = set(stop_words)
    stems = []
    for token in word_tokenize(text.lower()):
        if token in stop_set:
            continue
        stem = porter_stemmer.stem(token)
        # Second check keeps the original behaviour of dropping tokens whose
        # stem collapses onto a stop word.
        if stem not in stop_set:
            stems.append(stem)
    return " ".join(stems)

def load_data():
    """Load both cleaned e-mail CSVs, merge them, and stem every message body.

    Reads ``cleaned_email_full.csv`` and ``cleaned_email_07_full.csv`` from the
    current working directory, stacks them row-wise, and — as a side effect —
    writes the merged frame to ``email_0607_full.csv``.

    Returns:
        tuple: ``(messages, email_data)``. ``messages`` is a list holding the
        ``stem_tokenizer`` output for each entry of the ``body`` column, in row
        order; ``email_data`` is the merged DataFrame.
    """
    email06_data = pd.read_csv("cleaned_email_full.csv")
    email07_data = pd.read_csv("cleaned_email_07_full.csv")
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    # pd.concat is the direct replacement and, like append, keeps each frame's
    # own index labels.
    email_data = pd.concat([email06_data, email07_data])
    email_data.to_csv("email_0607_full.csv")
    # str() guards against non-string cells, such as the NaN that read_csv
    # produces for an empty field.
    messages = [stem_tokenizer(str(text)) for text in tqdm(email_data['body'])]
    return messages, email_data

In [9]:
def preprocess_data(messages):
    """Turn texts into fixed-length integer sequences and persist the tokenizer.

    A Keras ``Tokenizer`` limited to ``max_vocab`` words is fitted on
    ``messages``; the same texts are then converted to index sequences and
    passed through ``pad_sequences`` with ``maxlen=max_len``.

    Side effect: the fitted tokenizer is pickled to ``vectokenizer.p`` in the
    current working directory.

    NOTE(review): the vocabulary is fitted on whatever is passed in. The
    notebook calls this on the full corpus before the train/test split, so the
    vocabulary also reflects test-set text — confirm that is acceptable.

    Args:
        messages: Collection of text strings.

    Returns:
        The array returned by ``pad_sequences`` (one row per message).
    """
    tokenizer = Tokenizer(num_words=max_vocab)
    tokenizer.fit_on_texts(messages)
    data = pad_sequences(tokenizer.texts_to_sequences(messages), maxlen=max_len)

    # Saved so that the exact same word-to-index mapping can be reloaded later.
    with open("vectokenizer.p", "wb") as f:
        pickle.dump(tokenizer, f)

    return data

In [10]:
def split_data(data, labels):
    """Shuffle ``data``/``labels`` together and hold out 15% as the test part.

    The split is seeded (``random_state=42``), so repeated calls on the same
    inputs give the same partition.

    Returns:
        Whatever ``train_test_split`` returns for two input arrays; the
        notebook unpacks it as ``x_train, x_test, y_train, y_test``.
    """
    split_options = {"test_size": 0.15, "random_state": 42, "shuffle": True}
    return train_test_split(data, labels, **split_options)

In [11]:

def build_and_train_lstm_model(x_train, y_train, x_val, y_val, embedding_mat_columns, name, batchsize=128):
    """Build, train, and save a two-layer bidirectional LSTM classifier.

    Architecture: Embedding -> BiLSTM (returns sequences) -> Dropout(0.5)
    -> BiLSTM -> Dropout(0.5) -> Dense(2, softmax). Both LSTM layers carry L1
    regularisation (0.01) on their kernel and recurrent weights.

    Side effects: the trained model is saved to ``./models/LSTM_model_{name}``
    and the ``History.history`` dict is pickled to
    ``./models/LSTM_result_{name}.p``. Nothing is returned.

    Args:
        x_train, y_train: Training inputs and targets. The two-unit softmax
            head presumably expects two-column one-hot targets — confirm
            against the caller.
        x_val, y_val: Data used as ``validation_data``; its ``val_loss`` drives
            the early-stopping callback.
        embedding_mat_columns: Embedding output dimension. Each LSTM direction
            uses half as many units (integer division).
        name: Suffix used in the two output file names.
        batchsize: Mini-batch size passed to ``fit``.
    """
    lstm_units = embedding_mat_columns // 2

    model = Sequential()
    model.add(Embedding(input_dim=max_vocab, output_dim=embedding_mat_columns, input_length=max_len))
    model.add(Bidirectional(LSTM(units=lstm_units, return_sequences=True, kernel_regularizer=regularizers.l1(0.01), recurrent_regularizer=regularizers.l1(0.01))))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(units=lstm_units, kernel_regularizer=regularizers.l1(0.01), recurrent_regularizer=regularizers.l1(0.01))))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    # A softmax head with one-hot targets is a categorical problem, so use the
    # matching loss. For a two-way softmax this is the same quantity that
    # binary_crossentropy averaged over the two outputs (up to epsilon
    # handling); the change states the intent and lets 'acc' resolve to
    # categorical accuracy.
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

    early_stopping = CustomEarlyStopping(monitor='val_loss', min_delta=0.01, patience=10, loss_threshold=0.01, restore_best_weights=True)
    result = model.fit(x_train, y_train, epochs=100, batch_size=batchsize, validation_data=(x_val, y_val), callbacks=[early_stopping])

    # Make sure the output directory exists so that a long training run is not
    # lost to a FileNotFoundError when the results are written.
    os.makedirs("./models", exist_ok=True)
    model.save(f"./models/LSTM_model_{name}")

    with open(f"./models/LSTM_result_{name}.p", "wb") as f:
        pickle.dump(result.history, f)



In [12]:
# Read and merge the CSVs, then stem every message body. This is slow: the
# recorded run below took a little over seven minutes for 101,111 rows.
messages, email_data = load_data()

100%|██████████| 101111/101111 [07:14<00:00, 232.54it/s]


In [13]:
# Pickle the stemmed message bodies to disk, presumably so that the slow
# stemming pass does not have to be repeated in a later session.
with open("message_body_stem.pickle", "wb") as f:
    pickle.dump(messages, f)

In [14]:
# Write the merged frame to CSV.
# NOTE(review): load_data() already writes this same file, so this call looks
# redundant — confirm before removing.
email_data.to_csv("email_0607_full.csv")

In [15]:
# One-hot encode the "spam" column (one output column per distinct value).
# The dtype is pinned explicitly: pandas 2.0 changed the get_dummies default
# from uint8 to bool, and a numeric dtype keeps the targets directly usable by
# the loss function on any pandas version.
labels = pd.get_dummies(email_data["spam"], dtype="float32").values

In [16]:
# Tokenize and pad every stemmed message; this call also writes the fitted
# tokenizer to vectokenizer.p.
data = preprocess_data(messages)

In [17]:
# Shuffle with a fixed seed (42) and hold out 15% of the rows as the test set.
x_train, x_test, y_train, y_test = split_data(data, labels)

In [18]:
# from keras.wrappers.scikit_learn import KerasClassifier
# from sklearn.model_selection import GridSearchCV

# def build_lstm_model(embedding_mat_columns, l1_regularizer, max_vocab=max_vocab, max_len=max_len):
#     model = Sequential()
#     model.add(Embedding(input_dim=max_vocab, output_dim=embedding_mat_columns, input_length=max_len))
#     model.add(Bidirectional(LSTM(units=embedding_mat_columns//2, return_sequences=True, kernel_regularizer=regularizers.l1(l1_regularizer), recurrent_regularizer=regularizers.l1(l1_regularizer))))
#     model.add(Dropout(0.5))
#     model.add(Bidirectional(LSTM(units=embedding_mat_columns//2, kernel_regularizer=regularizers.l1(l1_regularizer), recurrent_regularizer=regularizers.l1(l1_regularizer))))
#     model.add(Dropout(0.5))
#     model.add(Dense(2, activation='softmax'))
#     model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
#     return model

# lstm_classifier = KerasClassifier(build_fn=build_lstm_model, epochs=100, batch_size=128, verbose=0)

# param_grid = {
#     'embedding_mat_columns': [100,128,200],
#     'l1_regularizer': [0.001,0.01,0.1]
# }

# grid = GridSearchCV(estimator=lstm_classifier, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
# grid_result = grid.fit(x_train, y_train)

# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))


In [19]:
# best_embedding_mat_columns = grid_result.best_params_['embedding_mat_columns']
# best_l1_regularizer = grid_result.best_params_['l1_regularizer']

# model = build_lstm_model(embedding_mat_columns=best_embedding_mat_columns, l1_regularizer=best_l1_regularizer)

# early_stopping = CustomEarlyStopping(monitor='val_loss', min_delta=0.01, patience=10, loss_threshold=0.01, restore_best_weights=True)

# result = model.fit(x_train, y_train, epochs=100, batch_size=128, validation_data=(x_test, y_test), callbacks=[early_stopping])

# model.save(f"./models/LSTM_model_optimized")

# with open(f"./models/LSTM_result_optimized.p", "wb") as f:
#     pickle.dump(result.history, f)


In [None]:
# Run configuration. `name` becomes the suffix of the saved model and history
# files; `embedding_mat_columns` is the embedding width (each LSTM direction
# gets half as many units).
name = "message_body"
embedding_mat_columns = 128
batchsize = 128
# NOTE(review): the held-out test split is passed as the validation data, so
# early stopping is driven by the test set — confirm this is intended.
build_and_train_lstm_model(x_train, y_train, x_test, y_test, embedding_mat_columns, name, batchsize)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

In [None]:
import matplotlib.pyplot as plt
name = "message_body"

# Load the per-epoch history dict written by build_and_train_lstm_model.
# NOTE: pickle.load can execute arbitrary code — only open history files that
# were produced by this notebook.
with open("./models/LSTM_result_{}.p".format(name),"rb") as f:
    result=pickle.load(f)

history = result

# Explicit figure/axes objects instead of the implicit pyplot state machine:
# left panel shows the loss curves, right panel the accuracy curves.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 6))

loss_ax.plot(history["loss"], label="Train loss")
loss_ax.plot(history["val_loss"], label="Test loss")
loss_ax.set(title='Loss', xlabel='epoch', ylabel='loss')
loss_ax.legend()

acc_ax.plot(history["acc"], label="Train accuracy")
acc_ax.plot(history["val_acc"], label="Test accuracy")
acc_ax.set(title='Accuracy', xlabel='epoch', ylabel='accuracy')
acc_ax.legend()

# Render the figure explicitly so the cell does not end on a Legend repr.
plt.show()

In [None]:
# from keras.layers import Bidirectional, Dropout

# def build_and_train_lstm_model(x_train, y_train, x_val, y_val, embedding_mat_columns, name, batchsize=128):
#     model = Sequential()
#     model.add(Embedding(input_dim=max_vocab, output_dim=embedding_mat_columns, input_length=max_len))
#     model.add(Bidirectional(LSTM(units=embedding_mat_columns, return_sequences=True)))
#     model.add(Dropout(0.2))
#     model.add(Bidirectional(LSTM(units=embedding_mat_columns, return_sequences=True)))
#     model.add(Dropout(0.2))
#     model.add(Bidirectional(LSTM(units=embedding_mat_columns)))
#     model.add(Dropout(0.2))
#     model.add(Dense(2, activation='softmax'))
#     model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

#     early_stopping = CustomEarlyStopping(monitor='val_loss', min_delta=0.01, patience=10, loss_threshold=0.01, restore_best_weights=True)
#     result = model.fit(x_train, y_train, epochs=100, batch_size=batchsize, validation_data=(x_val, y_val), callbacks=[early_stopping])

#     model.save(f"./models/LSTM_model_{name}")

#     with open(f"./models/LSTM_result_{name}.p", "wb") as f:
#         pickle.dump(result.history, f)
