In [None]:
# Import necessary libraries
import string
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
warnings.filterwarnings('ignore')

# Read the raw email table from disk
data = pd.read_csv('Emails.csv')

# Quick look at the first rows and the overall (rows, columns) size
print(data.head())
print(data.shape)

# Bar chart of how many rows carry each value of the 'spam' label
plt.figure(figsize=(8, 6))
sns.countplot(data=data, x='spam').set(
    title='Distribution of Spam and Non-Spam Emails',
    xlabel='Email Type',
    ylabel='Count',
)
plt.show()

# Downsample the majority class to balance the dataset
ham_emails = data[data['spam'] == 0]
spam_emails = data[data['spam'] == 1]

# Shrink whichever class is larger down to the size of the smaller one.
# Sampling ham unconditionally would raise ValueError (sample larger than
# population) if spam ever outnumbered ham. The fixed seed keeps the
# selection reproducible.
if len(ham_emails) >= len(spam_emails):
    ham_emails = ham_emails.sample(n=len(spam_emails), random_state=42)
else:
    spam_emails = spam_emails.sample(n=len(ham_emails), random_state=42)

# Stack both classes and renumber rows so the index is contiguous again
balanced_data = pd.concat([ham_emails, spam_emails]).reset_index(drop=True)

# Same bar chart as before, now drawn from the balanced table to confirm
# that both label values have equal counts
plt.figure(figsize=(8, 6))
sns.countplot(data=balanced_data, x='spam').set(
    title='Distribution of Spam and Non-Spam Emails after Downsampling',
    xlabel='Email Type',
    ylabel='Count',
)
plt.show()

# Preprocess the text data
# Build the lookup structures once: reloading the stop-word list for every
# single word (and scanning it linearly) dominates the runtime otherwise.
# NOTE: needs the NLTK 'stopwords' corpus to be available locally.
stop_words = set(stopwords.words('english'))
punctuation_table = str.maketrans('', '', string.punctuation)


def _clean_email_text(text):
    """Return *text* without 'Subject', punctuation, and English stop words.

    The steps run in this order: every occurrence of the literal 'Subject'
    is removed, then all ASCII punctuation characters are deleted, then the
    text is split on whitespace and words whose lowercase form is a stop
    word are dropped. Remaining words are joined with single spaces.
    """
    stripped = text.replace('Subject', '').translate(punctuation_table)
    return ' '.join(
        word for word in stripped.split() if word.lower() not in stop_words
    )


balanced_data['text'] = balanced_data['text'].apply(_clean_email_text)

# Hold out 20% of the balanced emails for testing; the fixed seed makes the
# partition reproducible between runs
train_text, test_text, train_labels, test_labels = train_test_split(
    balanced_data['text'],
    balanced_data['spam'],
    random_state=42,
    test_size=0.2,
)

# Learn the word -> integer vocabulary from the training texts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)

# Encode both splits as lists of integer word ids using that vocabulary
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (train_text, test_text)
)

# Pad/truncate every sequence to a common length so they stack into arrays
max_length = 100
padded_train, padded_test = (
    pad_sequences(sequences, maxlen=max_length)
    for sequences in (train_sequences, test_sequences)
)

# Build the classifier: embedding -> LSTM -> dense hidden layer -> sigmoid
# Keras word indices start at 1 and 0 is the padding value, hence the +1
vocab_size = len(tokenizer.word_index) + 1
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=32, input_length=max_length),
    LSTM(16),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output; track accuracy
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Define early stopping and learning rate reduction callbacks
# Stop after 3 epochs without a val_accuracy improvement and roll back to the
# best weights; halve the learning rate after 2 epochs of stalled val_loss.
early_stopping = EarlyStopping(patience=3, monitor='val_accuracy', restore_best_weights=True)
learning_rate_reduction = ReduceLROnPlateau(patience=2, monitor='val_loss', factor=0.5, verbose=0)

# Train the model
# Validate on a slice of the TRAINING data rather than on the test set: both
# callbacks select weights / learning rates from the validation metrics, so
# validating on the test set would leak it into model selection and make the
# later test-set evaluation optimistically biased.
# np.asarray ensures the labels are a plain array, which validation_split
# can slice alongside padded_train.
history = model.fit(
    padded_train,
    np.asarray(train_labels),
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping, learning_rate_reduction],
)

# Score the trained model on the held-out test arrays and report both the
# loss and the accuracy metric to three decimals
test_loss, test_accuracy = model.evaluate(x=padded_test, y=test_labels)
print(f'Test Loss: {test_loss:.3f}')
print(f'Test Accuracy: {test_accuracy:.3f}')

# Draw the per-epoch accuracy curves recorded during training
accuracy_axes = plt.gca()
accuracy_axes.plot(history.history['accuracy'], label='Training Accuracy')
accuracy_axes.plot(history.history['val_accuracy'], label='Validation Accuracy')
accuracy_axes.set(title='Model Accuracy', ylabel='Accuracy', xlabel='Epoch')
accuracy_axes.legend()
plt.show()