In [None]:
# Import necessary libraries
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Conv1D, Dense, Dropout, Embedding, Flatten, LSTM, MaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


In [None]:
def clean_text(text):
    """Normalize a piece of text for vectorization.

    The input is coerced with ``str()`` (so ``None`` becomes ``'None'``), every
    non-word character is replaced by a space, the result is lower-cased, and
    each run of whitespace is collapsed to a single space.

    Leading/trailing whitespace is collapsed but not stripped, so the result
    may start or end with one space.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The normalized string.
    """
    # Order matters: punctuation -> spaces, then lowercase, then squeeze spaces.
    without_punctuation = re.sub(r'\W', ' ', str(text))
    return re.sub(r'\s+', ' ', without_punctuation.lower())

def predict_spam_ham(text, model, tokenizer, max_len, label_encoder):
    """Classify a single text with a sequence-input model.

    The text is cleaned with ``clean_text``, converted to a token-id sequence
    by ``tokenizer``, padded to ``max_len`` and passed to ``model.predict``.
    The highest-scoring class index is mapped back to its original label.

    Args:
        text: Raw text to classify.
        model: Object exposing ``predict``; its output is arg-maxed along axis 1.
        tokenizer: Object exposing ``texts_to_sequences``.
        max_len: Length passed to ``pad_sequences`` as ``maxlen``.
        label_encoder: Object exposing ``inverse_transform``.

    Returns:
        The decoded label for ``text``.
    """
    token_ids = tokenizer.texts_to_sequences([clean_text(text)])
    model_input = pad_sequences(token_ids, maxlen=max_len)
    class_scores = model.predict(model_input)
    best_class = np.argmax(class_scores, axis=1)
    return label_encoder.inverse_transform(best_class)[0]

def _plot_metric_curves(epochs, train_values, val_values, metric_name, out_path):
    """Draw one training (and optional validation) curve, save it, then show it."""
    fig, ax = plt.subplots()
    ax.plot(epochs, train_values, 'bo-', label=f'Training {metric_name}')
    if val_values is not None:
        ax.plot(epochs, val_values, 'ro-', label=f'Validation {metric_name}')
        ax.set_title(f'Training and Validation {metric_name}')
    else:
        # No validation data was recorded for this run.
        ax.set_title(f'Training {metric_name}')
    ax.set_xlabel('Epochs')
    ax.set_ylabel(metric_name)
    ax.legend()
    # Save before showing so the file is written from the fully drawn figure.
    fig.savefig(out_path)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


def plot_training_metrics(history, img_prefix='training'):
    """Plot and save the accuracy and loss curves of a training run.

    Two figures are produced, in this order, and each is saved then shown:
    ``{img_prefix}_accuracy.png`` and ``{img_prefix}_loss.png``.

    Args:
        history: Object whose ``history`` attribute is a dict holding the
            per-epoch lists ``'accuracy'`` and ``'loss'``. If
            ``'val_accuracy'`` / ``'val_loss'`` are present they are drawn as
            a second curve; if absent, only the training curve is drawn.
        img_prefix: Filename prefix for the saved PNG files.

    Raises:
        KeyError: If ``'accuracy'`` or ``'loss'`` is missing from the history.
    """
    metrics = history.history
    epochs = range(1, len(metrics['accuracy']) + 1)

    for key, display_name in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
        _plot_metric_curves(
            epochs,
            metrics[key],
            metrics.get(f'val_{key}'),
            display_name,
            f'{img_prefix}_{key}.png',
        )


In [None]:

# Load the dataset.
# The path uses '/' as the separator: it works on every OS, whereas a backslash
# is Windows-only and '\e' inside a normal string is an invalid escape sequence.
DATA_PATH = 'enron_05_17_2015_with_labels_v2.csv/enron_05_17_2015_with_labels_v2.csv'
df = pd.read_csv(DATA_PATH)

# Quick look at what was loaded
print(df.shape)
print(df.head())

# Combine subject and body into one field (missing parts become ''), then clean it
df['Text'] = df['Subject'].fillna('') + ' ' + df['content'].fillna('')
df['Text'] = df['Text'].apply(clean_text)

# Encode labels as integers; label_encoder.classes_ maps them back
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['labeled'])

# Tokenization and padding
MAX_WORDS = 10000
MAX_LEN = 100

# num_words makes texts_to_sequences emit only the most frequent words
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(df['Text'])
sequences = tokenizer.texts_to_sequences(df['Text'])

# Sanity check: every emitted token index must be below MAX_WORDS.
# Empty sequences are skipped; default=0 covers the case where all are empty.
max_index = max((max(seq) for seq in sequences if seq), default=0)
print(f"Maximum token index: {max_index}")
assert max_index < MAX_WORDS, (
    f"token index {max_index} is out of range for MAX_WORDS={MAX_WORDS}"
)

# Pad/truncate every sequence to MAX_LEN.
# NOTE(review): X is reassigned by the TF-IDF feature cell further down, so these
# padded sequences are only useful to a sequence model trained before that cell runs.
X = pad_sequences(sequences, maxlen=MAX_LEN)

y = df['Label']


In [None]:
# Hand-crafted length features from the cleaned text (each text is split once)
text_words = df['Text'].apply(str.split)
df['Text_length'] = df['Text'].apply(len)
df['num_words'] = text_words.apply(len)
df['num_unique_words'] = text_words.apply(lambda words: len(set(words)))

# TF-IDF over the cleaned text, capped at the 10,000 most frequent terms
# NOTE(review): the vectorizer is fitted on all rows before the train/test split,
# so vocabulary and IDF weights also see the test rows — confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_tfidf = tfidf_vectorizer.fit_transform(df['Text'])

additional_features = df[['Text_length', 'num_words', 'num_unique_words']].values

# Keep the combined matrix sparse: densifying n_docs x 10,003 floats can exhaust
# memory on a large corpus. Column order is unchanged (TF-IDF first, then the
# three length features), so the fitted selector accepts dense rows built the same way.
X = sparse.hstack([X_tfidf, sparse.csr_matrix(additional_features)], format='csr')

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, df['Label'], test_size=0.2, random_state=42)

# chi2 selection is fitted on the training rows only; only the 10 selected
# columns are converted to dense arrays for the Keras model.
selector = SelectKBest(chi2, k=10)
X_train_selected = selector.fit_transform(X_train, y_train).toarray()
X_test_selected = selector.transform(X_test).toarray()


In [None]:
# Build a small feed-forward classifier.
# The inputs are fixed-size selected feature vectors, not token sequences,
# so plain Dense layers are used.
num_features = X_train_selected.shape[1]
# One output unit per encoded class, instead of a hard-coded 2, so the softmax
# always matches the integer labels used by sparse_categorical_crossentropy.
num_classes = len(label_encoder.classes_)

model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(num_features,)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split doubles as validation data here. That is harmless
# while nothing is tuned on it, but use a separate validation split before adding
# early stopping or hyper-parameter search.
history = model.fit(
    X_train_selected,
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_test_selected, y_test),
)






In [None]:
# Evaluate the model on the test split
loss, accuracy = model.evaluate(X_test_selected, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Get predictions on the test set (arg-max over the softmax outputs)
y_pred = model.predict(X_test_selected)
y_pred_classes = np.argmax(y_pred, axis=1)

# Plain integer arrays for the report
y_test_encoded = np.asarray(y_test, dtype=int)
y_pred_classes_encoded = np.asarray(y_pred_classes, dtype=int)

# classification_report needs string names; stringify whatever dtype the
# original labels have (bool, int, str, ...).
label_names = [str(name) for name in label_encoder.classes_]
# Pass the class ids explicitly so the report still lines up with label_names
# when a class happens to be absent from the test split.
class_ids = np.arange(len(label_names))

# Print classification report
print(classification_report(
    y_test_encoded,
    y_pred_classes_encoded,
    labels=class_ids,
    target_names=label_names,
))

# Example prediction: the text must go through the same steps as the training
# data (clean -> TF-IDF + length features -> feature selection).
example_text = "Win a brand new car! Click here for details."
cleaned_example_text = clean_text(example_text)
example_words = cleaned_example_text.split()

# Transform the text using the fitted TF-IDF vectorizer
example_tfidf = tfidf_vectorizer.transform([cleaned_example_text]).toarray()

# Same three length features, in the same order, as used for training
example_text_length = len(cleaned_example_text)
example_num_words = len(example_words)
example_num_unique_words = len(set(example_words))

# Combine TF-IDF features with the additional features
example_features = np.hstack(
    (example_tfidf, [[example_text_length, example_num_words, example_num_unique_words]])
)

# Keep only the columns chosen by the fitted selector
example_features_selected = selector.transform(example_features)

# Predict using the model and decode the class index back to its label
prediction = model.predict(example_features_selected)
predicted_label = label_encoder.inverse_transform(np.argmax(prediction, axis=1))

print(f"Prediction: {predicted_label[0]}")


In [None]:
# Draw, save (as enron_spam_detection_*.png) and show the curves recorded during training
plot_training_metrics(history=history, img_prefix='enron_spam_detection')
