In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Conv1D, MaxPooling1D, Dense, Dropout
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# scikeras supplies the scikit-learn compatible KerasClassifier wrapper (must be installed separately)
from scikeras.wrappers import KerasClassifier


# Function to clean text
def clean_text(text):
    """Normalise raw text before tokenisation.

    Every non-word character is replaced by a space, the text is lower-cased
    and each run of whitespace is collapsed to a single space. Leading and
    trailing spaces are not stripped.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.
            ``None`` and float NaN (how pandas represents a missing cell) are
            treated as empty text rather than becoming the tokens
            "none" / "nan".

    Returns:
        str: The cleaned text ('' for missing values).
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = re.sub(r'\W', ' ', str(text))
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    return text

# Function to predict spam/ham category
def predict_spam_ham(text, model, tokenizer, max_len, label_encoder):
    """Classify a single raw text and return its decoded label.

    Args:
        text: Raw input text; it is passed through ``clean_text`` first.
        model: Object whose ``predict`` returns one row of per-class scores
            for each input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_len: Length the token sequence is padded/truncated to.
        label_encoder: Fitted encoder used to map the winning class index
            back to the original label.

    Returns:
        The original label of the highest-scoring class.
    """
    token_ids = tokenizer.texts_to_sequences([clean_text(text)])
    model_input = pad_sequences(token_ids, maxlen=max_len)
    class_scores = model.predict(model_input)
    # One input row was supplied, so only the first decoded label is relevant
    winning_class = np.argmax(class_scores, axis=1)
    decoded = label_encoder.inverse_transform(winning_class)
    return decoded[0]

# Function to plot training metrics
def plot_training_metrics(history, img_prefix='training'):
    """Plot, save and show the accuracy and loss curves of a training run.

    Args:
        history: A Keras ``History`` object (anything exposing a ``history``
            dict) or such a dict itself. It must contain 'accuracy' and
            'loss'. 'val_accuracy' and 'val_loss' are drawn only when
            present, so runs trained without validation data are supported.
        img_prefix: Prefix of the written image files
            ``{img_prefix}_accuracy.png`` and ``{img_prefix}_loss.png``.

    Raises:
        KeyError: If 'accuracy' or 'loss' is missing from the history.
    """
    # Accept both the History wrapper and the bare metrics dict
    metrics = getattr(history, 'history', history)

    # Look up the mandatory series first so a bad history fails before any figure is drawn
    curves = [
        ('accuracy', 'Accuracy', metrics['accuracy'], metrics.get('val_accuracy')),
        ('loss', 'Loss', metrics['loss'], metrics.get('val_loss')),
    ]

    def _plot_curve(key, label, train_values, val_values):
        """Draw one metric (training plus optional validation) and save it."""
        epochs = range(1, len(train_values) + 1)
        plt.figure()
        plt.plot(epochs, train_values, 'bo-', label=f'Training {label}')
        if val_values is not None:
            plt.plot(epochs, val_values, 'ro-', label=f'Validation {label}')
            plt.title(f'Training and Validation {label}')
        else:
            plt.title(f'Training {label}')
        plt.xlabel('Epochs')
        plt.ylabel(label)
        plt.legend()
        plt.savefig(f'{img_prefix}_{key}.png')  # Save as image
        plt.show()

    for key, label, train_values, val_values in curves:
        _plot_curve(key, label, train_values, val_values)

# Load the dataset.
# The path uses a forward slash: a backslash in a normal string literal here
# forms the invalid escape sequence "\e" (SyntaxWarning) and would only
# resolve as a directory separator on Windows.
df = pd.read_csv('enron_05_17_2015_with_labels_v2.csv/enron_05_17_2015_with_labels_v2.csv')

# Display initial data information
print(df.head())

# Combine subject and body (missing values become empty strings), then clean
df['Text'] = df['Subject'].fillna('') + ' ' + df['content'].fillna('')
df['Text'] = df['Text'].apply(clean_text)

# Encode the 'labeled' column as integer class ids
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['labeled'])

# Tokenization and Padding
MAX_WORDS = 10000  # vocabulary size kept by the tokenizer
MAX_LEN = 100      # every sequence is padded/truncated to this many tokens
# NOTE(review): the tokenizer is fitted on all rows, including those that end
# up in the test split, so the vocabulary is informed by test data.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(df['Text'])
sequences = tokenizer.texts_to_sequences(df['Text'])
X = pad_sequences(sequences, maxlen=MAX_LEN)
y = df['Label']

# Split data into training and testing sets (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Building the model
def create_model(dropout_rate=0.0, optimizer='adam'):
    """Build and compile the Conv1D + LSTM text classifier.

    The network embeds token ids (vocabulary ``MAX_WORDS``, sequences of
    length ``MAX_LEN``), applies a 1-D convolution and max pooling, feeds the
    result through an LSTM and dropout, and ends in a two-unit softmax layer.

    Args:
        dropout_rate: Rate passed to the ``Dropout`` layer before the output.
        optimizer: Optimizer passed to ``compile``.

    Returns:
        The compiled ``Sequential`` model (sparse categorical cross-entropy
        loss, accuracy metric).
    """
    layers = [
        Embedding(MAX_WORDS, 128, input_length=MAX_LEN),
        Conv1D(64, 5, activation='relu'),
        MaxPooling1D(pool_size=4),
        LSTM(64),
        Dropout(dropout_rate),
        Dense(2, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Wrap the Keras builder in a scikit-learn compatible estimator.
# scikeras takes the builder through `model=` (`build_fn` is deprecated).
# validation_split holds out part of each training set so the fit history
# contains the val_accuracy / val_loss series that plot_training_metrics reads.
model = KerasClassifier(model=create_model, verbose=0, validation_split=0.1)

# define the grid search parameters
batch_size = [10, 20, 50]
epochs = [10, 50, 100]
dropout_rate = [0.1, 0.5, 0.9]
optimizer = ['adam', 'nadam', 'sgd']

# Arguments of create_model must be routed with the `model__` prefix; without
# it GridSearchCV rejects `dropout_rate` as an invalid estimator parameter.
param_grid = dict(
    batch_size=batch_size,
    epochs=epochs,
    model__dropout_rate=dropout_rate,
    model__optimizer=optimizer,
)
# NOTE(review): 81 parameter combinations x 3 folds with up to 100 epochs each
# is a very long run; consider shrinking the grid while iterating.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train, y_train)

# Summarize results
print(f"Best: {grid_result.best_score_:.2f} using {grid_result.best_params_}")

# GridSearchCV already refits the best configuration on the full training set
# (refit=True is the default), so no additional fit call is needed.
best_model = grid_result.best_estimator_

# Evaluate the model.
# `model_` is the fitted Keras network; `model` is only the builder function.
loss, accuracy = best_model.model_.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

# Plot training metrics
plot_training_metrics(best_model.model_.history, img_prefix='best_model')

# Example prediction
example_text = "Win a brand new car! Click here for details."
prediction = predict_spam_ham(example_text, best_model.model_, tokenizer, MAX_LEN, label_encoder)
print(f"Prediction: {prediction}")

# Additional Feature Engineering
# Missing subjects/bodies are NaN (floats) in the frame; replace them with ''
# so the length and count features do not raise TypeError.
content_text = df['content'].fillna('')
subject_text = df['Subject'].fillna('')
URL_PATTERN = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

df['text_length'] = content_text.str.len()
df['subject_length'] = subject_text.str.len()
df['count_exclamation'] = content_text.str.count('!')
df['count_links'] = content_text.apply(lambda x: len(re.findall(URL_PATTERN, x)))

# Combine and clean text data (same missing-value handling as the features above)
df['Text'] = subject_text + ' ' + content_text
df['Text'] = df['Text'].apply(clean_text)

# Preparing the text data
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Text'])

# Converting to DataFrame to concatenate with other features
# NOTE(review): toarray() materialises a dense rows x 10000 matrix; this can
# exhaust memory on a large corpus.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate TF-IDF features with engineered features
X = pd.concat([tfidf_df, df[['text_length', 'subject_length', 'count_exclamation', 'count_links']]], axis=1)
y = df['Label']

# RFE with RandomForestClassifier
# NOTE(review): step=1 removes one feature per refit, i.e. roughly 10,000
# forest fits to get down to 10 features; a larger step would be far cheaper.
forest = RandomForestClassifier()
rfe = RFE(estimator=forest, n_features_to_select=10, step=1)
rfe.fit(X, y)

# Transform X to the selected features
X_transformed = rfe.transform(X)

# Cross-validation score
# NOTE(review): the features were selected using every row, so these scores
# are optimistic; selecting inside each fold would avoid that.
scores = cross_val_score(forest, X_transformed, y, cv=5)
print(f"Mean cross-validation score: {scores.mean():.2f}")
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Conv1D, MaxPooling1D, Dense, Dropout
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scikeras.wrappers import KerasClassifier

# Function to clean text
def clean_text(text):
    """Normalise raw text before tokenisation.

    Every non-word character is replaced by a space, the text is lower-cased
    and each run of whitespace is collapsed to a single space. Leading and
    trailing spaces are not stripped.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.
            ``None`` and float NaN (how pandas represents a missing cell) are
            treated as empty text rather than becoming the tokens
            "none" / "nan".

    Returns:
        str: The cleaned text ('' for missing values).
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = re.sub(r'\W', ' ', str(text))
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    return text

# Function to predict spam/ham category
def predict_spam_ham(text, model, tokenizer, max_len, label_encoder):
    """Return the decoded label predicted for one raw text.

    Args:
        text: Raw input text; cleaned with ``clean_text`` before tokenising.
        model: Object whose ``predict`` yields per-class scores per input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_len: Length the token sequence is padded/truncated to.
        label_encoder: Fitted encoder that maps class indices back to labels.

    Returns:
        The original label of the class with the highest score.
    """
    batch = [clean_text(text)]  # the tokenizer expects a list of texts
    padded = pad_sequences(tokenizer.texts_to_sequences(batch), maxlen=max_len)
    best_class_per_row = np.argmax(model.predict(padded), axis=1)
    labels = label_encoder.inverse_transform(best_class_per_row)
    return labels[0]

# Function to plot training metrics
def plot_training_metrics(history, img_prefix='training'):
    """Plot, save and show the accuracy and loss curves of a training run.

    Args:
        history: A Keras ``History`` object (anything exposing a ``history``
            dict) or such a dict itself. It must contain 'accuracy' and
            'loss'. 'val_accuracy' and 'val_loss' are drawn only when
            present, so runs trained without validation data are supported.
        img_prefix: Prefix of the written image files
            ``{img_prefix}_accuracy.png`` and ``{img_prefix}_loss.png``.

    Raises:
        KeyError: If 'accuracy' or 'loss' is missing from the history.
    """
    # Accept both the History wrapper and the bare metrics dict
    metrics = getattr(history, 'history', history)

    # Look up the mandatory series first so a bad history fails before any figure is drawn
    curves = [
        ('accuracy', 'Accuracy', metrics['accuracy'], metrics.get('val_accuracy')),
        ('loss', 'Loss', metrics['loss'], metrics.get('val_loss')),
    ]

    def _plot_curve(key, label, train_values, val_values):
        """Draw one metric (training plus optional validation) and save it."""
        epochs = range(1, len(train_values) + 1)
        plt.figure()
        plt.plot(epochs, train_values, 'bo-', label=f'Training {label}')
        if val_values is not None:
            plt.plot(epochs, val_values, 'ro-', label=f'Validation {label}')
            plt.title(f'Training and Validation {label}')
        else:
            plt.title(f'Training {label}')
        plt.xlabel('Epochs')
        plt.ylabel(label)
        plt.legend()
        plt.savefig(f'{img_prefix}_{key}.png')  # Save as image
        plt.show()

    for key, label, train_values, val_values in curves:
        _plot_curve(key, label, train_values, val_values)

# Load the dataset.
# The path uses a forward slash: a backslash in a normal string literal here
# forms the invalid escape sequence "\e" (SyntaxWarning) and would only
# resolve as a directory separator on Windows.
df = pd.read_csv('enron_05_17_2015_with_labels_v2.csv/enron_05_17_2015_with_labels_v2.csv')

# Display initial data information
print(df.head())

# Combine subject and body (missing values become empty strings), then clean
df['Text'] = df['Subject'].fillna('') + ' ' + df['content'].fillna('')
df['Text'] = df['Text'].apply(clean_text)

# Encode the 'labeled' column as integer class ids
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['labeled'])

# Tokenization and Padding
MAX_WORDS = 10000  # vocabulary size kept by the tokenizer
MAX_LEN = 100      # every sequence is padded/truncated to this many tokens
# NOTE(review): the tokenizer is fitted on all rows, including those that end
# up in the test split, so the vocabulary is informed by test data.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(df['Text'])
sequences = tokenizer.texts_to_sequences(df['Text'])
X = pad_sequences(sequences, maxlen=MAX_LEN)
y = df['Label']

# Split data into training and testing sets (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  df = pd.read_csv('enron_05_17_2015_with_labels_v2.csv\enron_05_17_2015_with_labels_v2.csv')


   Unnamed: 0                                     Message-ID  \
0           0  <18782981.1075855378110.JavaMail.evans@thyme>   
1           1  <15464986.1075855378456.JavaMail.evans@thyme>   
2           2  <24216240.1075855687451.JavaMail.evans@thyme>   
3           3  <13505866.1075863688222.JavaMail.evans@thyme>   
4           4  <30922949.1075863688243.JavaMail.evans@thyme>   

                  Date                                    From  \
0  2001-05-14 23:39:00  frozenset({'phillip.allen@enron.com'})   
1  2001-05-04 20:51:00  frozenset({'phillip.allen@enron.com'})   
2  2000-10-18 10:00:00  frozenset({'phillip.allen@enron.com'})   
3  2000-10-23 13:13:00  frozenset({'phillip.allen@enron.com'})   
4  2000-08-31 12:07:00  frozenset({'phillip.allen@enron.com'})   

                                       To    Subject           X-From  \
0     frozenset({'tim.belden@enron.com'})        NaN  Phillip K Allen   
1  frozenset({'john.lavorato@enron.com'})        Re:  Phillip K Allen   

In [None]:

# Building the model
def create_model(dropout_rate=0.0, optimizer='adam'):
    """Assemble and compile the convolution + LSTM classifier.

    Layers, in order: token embedding (vocabulary ``MAX_WORDS``, input length
    ``MAX_LEN``), 1-D convolution, max pooling, LSTM, dropout and a two-unit
    softmax output.

    Args:
        dropout_rate: Rate given to the ``Dropout`` layer.
        optimizer: Optimizer handed to ``compile``.

    Returns:
        The compiled ``Sequential`` model, using sparse categorical
        cross-entropy as loss and accuracy as metric.
    """
    stack = (
        Embedding(MAX_WORDS, 128, input_length=MAX_LEN),
        Conv1D(64, 5, activation='relu'),
        MaxPooling1D(pool_size=4),
        LSTM(64),
        Dropout(dropout_rate),
        Dense(2, activation='softmax'),
    )
    classifier = Sequential()
    for layer in stack:
        classifier.add(layer)
    classifier.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    return classifier

# Wrap the Keras builder in a scikit-learn compatible estimator.
# validation_split holds out part of each training set so the fit history
# contains the val_accuracy / val_loss series that plot_training_metrics reads.
model = KerasClassifier(model=create_model, verbose=0, validation_split=0.1)

# Grid search parameters: `model__*` entries are forwarded to create_model,
# the others are fit parameters of the wrapper itself.
param_grid = {
    'model__dropout_rate': [0.1, 0.5, 0.9],
    'model__optimizer': ['adam', 'nadam', 'sgd'],
    'batch_size': [10, 20, 50],
    'epochs': [10, 50, 100]
}

# NOTE(review): 81 parameter combinations x 3 folds with up to 100 epochs each
# is a very long run; consider shrinking the grid while iterating.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train, y_train)

# Summarize results
print(f"Best: {grid_result.best_score_:.2f} using {grid_result.best_params_}")

# Best configuration, already refitted on the full training set by GridSearchCV
best_model = grid_result.best_estimator_

# Evaluate the model.
# `model_` is the fitted Keras network; `model` is only the builder function
# and has no evaluate/predict/save methods.
loss, accuracy = best_model.model_.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

# Plot training metrics
plot_training_metrics(best_model.model_.history, img_prefix='best_model')

# Example prediction
example_text = "Win a brand new car! Click here for details."
prediction = predict_spam_ham(example_text, best_model.model_, tokenizer, MAX_LEN, label_encoder)
print(f"Prediction: {prediction}")


In [None]:

# Additional Feature Engineering
# Missing subjects/bodies are NaN (floats) in the frame; replace them with ''
# so the length and count features do not raise TypeError.
content_text = df['content'].fillna('')
subject_text = df['Subject'].fillna('')
URL_PATTERN = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

df['text_length'] = content_text.str.len()
df['subject_length'] = subject_text.str.len()
df['count_exclamation'] = content_text.str.count('!')
df['count_links'] = content_text.apply(lambda x: len(re.findall(URL_PATTERN, x)))

# Combine and clean text data (same missing-value handling as the features above)
df['Text'] = subject_text + ' ' + content_text
df['Text'] = df['Text'].apply(clean_text)

# Preparing the text data
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Text'])

# Converting to DataFrame to concatenate with other features
# NOTE(review): toarray() materialises a dense rows x 10000 matrix; this can
# exhaust memory on a large corpus.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate TF-IDF features with engineered features
X = pd.concat([tfidf_df, df[['text_length', 'subject_length', 'count_exclamation', 'count_links']]], axis=1)
y = df['Label']

# RFE with RandomForestClassifier
# NOTE(review): step=1 removes one feature per refit, i.e. roughly 10,000
# forest fits to get down to 10 features; a larger step would be far cheaper.
forest = RandomForestClassifier()
rfe = RFE(estimator=forest, n_features_to_select=10, step=1)
rfe.fit(X, y)

# Transform X to the selected features
X_transformed = rfe.transform(X)

# Cross-validation score
# NOTE(review): the features were selected using every row, so these scores
# are optimistic; selecting inside each fold would avoid that.
scores = cross_val_score(forest, X_transformed, y, cv=5)
print(f"Mean cross-validation score: {scores.mean():.2f}")

# Error analysis with confusion matrix.
# X_test / y_test are still the padded-sequence split made for the neural
# network; `model_` is the fitted Keras network (`model` is the builder).
y_pred = best_model.model_.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
# The report and heatmap need string names; the classes may not be strings
class_names = [str(c) for c in label_encoder.classes_]
conf_matrix = confusion_matrix(y_test, y_pred_classes)
report = classification_report(y_test, y_pred_classes, target_names=class_names)
print(report)

# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('confusion_matrix.png')
plt.show()

# Feature Importance
# RFE fits clones of `forest`, so `forest` itself is never fitted. The forest
# trained on the selected features is `rfe.estimator_`, and its importances
# line up with the columns kept by `rfe.support_`.
selected_features = X.columns[rfe.support_]
importances = rfe.estimator_.feature_importances_
indices = np.argsort(importances)[-10:]  # top 10 features, ascending importance

plt.figure(figsize=(10, 7))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [selected_features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.savefig('feature_importance.png')
plt.show()


In [None]:

# SHAP values for model explainability
import shap

# RFE fits clones of `forest`, so `forest` itself is never fitted; the forest
# trained on the selected columns of X_transformed is `rfe.estimator_`.
explainer = shap.TreeExplainer(rfe.estimator_)
shap_values = explainer.shap_values(X_transformed)

# Names of the selected columns, in the same order as the columns of X_transformed
selected_feature_names = list(X.columns[rfe.support_])
shap.summary_plot(shap_values, X_transformed, feature_names=selected_feature_names)

# Save the fitted Keras network (`model_`; `model` is only the builder function)
best_model.model_.save('best_model.h5')

# Error analysis with confusion matrix
y_pred = best_model.model_.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
# The report and heatmap need string names; the classes may not be strings
class_names = [str(c) for c in label_encoder.classes_]
conf_matrix = confusion_matrix(y_test, y_pred_classes)
report = classification_report(y_test, y_pred_classes, target_names=class_names)
print(report)

# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('confusion_matrix.png')
plt.show()
