In [12]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
import matplotlib.pyplot as plt

# Load and prepare the dataset
# The CSV lives in a directory that has the same name as the file itself.
# A raw string is used because the Windows-style separator is followed by
# the letter 'e', and '\e' is an invalid escape sequence in a normal string
# literal. The resulting path value is unchanged.
df = pd.read_csv(r'enron_05_17_2015_with_labels_v2.csv\enron_05_17_2015_with_labels_v2.csv')

# Merge subject and body into one text field; missing values become ''.
df['Text'] = df['Subject'].fillna('') + ' ' + df['content'].fillna('')

# Normalise the text: lower-case it, replace every non-word character with a
# space, and trim leading/trailing whitespace. The pattern is compiled once
# rather than being looked up again for every row.
_NON_WORD = re.compile(r'\W')
df['Text'] = df['Text'].apply(lambda text: _NON_WORD.sub(' ', text.lower()).strip())

# Encode the values of the 'labeled' column as integer class ids.
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['labeled'])

# Tokenization and Padding
MAX_WORDS = 10000  # passed to the tokenizer as num_words (vocabulary cap)
MAX_LEN = 100      # every sequence is padded/truncated to this length

# Targets are the integer-encoded labels.
y = df['Label']

# Fit the vocabulary on the cleaned text column, then convert each document
# into a list of word indices.
corpus = df['Text']
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)

# Feature matrix with one fixed-length row per document.
X = pad_sequences(sequences, maxlen=MAX_LEN)

# Split data into training and testing sets
# 20% of the rows are held out; the fixed seed makes the split reproducible.
_split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = _split_parts

# Building the CNN model
# Size the softmax output from the fitted label encoder instead of assuming
# exactly two classes, so the model still matches the targets if the
# 'labeled' column holds more than two distinct values. The floor of 2 keeps
# the previous layer width whenever there are two classes or fewer.
NUM_CLASSES = max(len(label_encoder.classes_), 2)

# Architecture: embedding -> 1D convolution -> max pooling -> flatten ->
# dropout -> softmax classifier.
model_cnn = Sequential([
    Embedding(MAX_WORDS, 128, input_length=MAX_LEN),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=4),
    Flatten(),
    Dropout(0.5),
    Dense(NUM_CLASSES, activation='softmax'),
])

# The sparse loss is used because the targets are integer class ids rather
# than one-hot vectors.
model_cnn.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train the CNN model
# Note: the held-out split is also passed as the per-epoch validation data,
# so the validation metrics and the final evaluation use the same samples.
history_cnn = model_cnn.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test, y_test),
)

# Evaluate the CNN model
# evaluate() returns the loss followed by the compiled metric (accuracy).
loss_cnn, accuracy_cnn = model_cnn.evaluate(x=X_test, y=y_test)
_report = f'CNN Test Accuracy: {accuracy_cnn * 100:.2f}%'
print(_report)


  df = pd.read_csv('enron_05_17_2015_with_labels_v2.csv\enron_05_17_2015_with_labels_v2.csv')


Epoch 1/5
12935/12935 ━━━━━━━━━━━━━━━━━━━━ 272s 21ms/step - accuracy: 0.9962 - loss: 0.0253 - val_accuracy: 0.9969 - val_loss: 0.0208
Epoch 2/5
12935/12935 ━━━━━━━━━━━━━━━━━━━━ 298s 23ms/step - accuracy: 0.9964 - loss: 0.0186 - val_accuracy: 0.9969 - val_loss: 0.0207
Epoch 3/5
12935/12935 ━━━━━━━━━━━━━━━━━━━━ 265s 19ms/step - accuracy: 0.9968 - loss: 0.0168 - val_accuracy: 0.9968 - val_loss: 0.0210
Epoch 4/5
12935/12935 ━━━━━━━━━━━━━━━━━━━━ 250s 19ms/step - accuracy: 0.9964 - loss: 0.0183 - val_accuracy: 0.9969 - val_loss: 0.0284
Epoch 5/5
12935/12935 ━━━━━━━━━━━━━━━━━━━━ 252s 19ms/step - accuracy: 0.9969 - loss: 0.0167 - val_accuracy: 0.9969 - val_loss: 0.0292
3234/3234 ━━━━━━━━━━━━━━━━━━━━ 8s 3ms/step - accuracy: 0.9970 - loss: 0.0281
CNN Test Accuracy: 99.69%
