In [1]:
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, classification_report, confusion_matrix

In [2]:
# Load and combine training datasets
train_data_part1 = pd.read_csv('TrainData01.csv')
train_data_part2 = pd.read_csv('TrainData02.csv')
train_data = pd.concat([train_data_part1, train_data_part2], ignore_index=True)

In [3]:
# Load test dataset
test_data = pd.read_csv('TestData.csv')

In [4]:
# Separate features and labels for train and test data
X_train = train_data[['subject', 'cleanMessage']]
y_train = train_data['label']
X_test = test_data[['subject', 'cleanMessage']]
y_test = test_data['label']

In [6]:
# Combine text fields (subject, email_to, email_from, cleanMessage) into a single field for vectorization
X_train.loc[:, 'text'] = X_train['subject'] + ' ' + X_train['cleanMessage']
X_test.loc[:, 'text'] = X_test['subject'] + ' ' + X_test['cleanMessage']

In [None]:
# # Combine text fields (subject, email_to, email_from, cleanMessage) into a single field for vectorization
# X_train.loc[:, 'text'] = X_train['subject'] + ' ' + X_train['email_to'] + ' ' + X_train['email_from'] + ' ' + X_train['cleanMessage']
# X_test.loc[:, 'text'] = X_test['subject'] + ' ' + X_test['email_to'] + ' ' + X_test['email_from'] + ' ' + X_test['cleanMessage']

In [10]:
# Replace NaN values with an empty string using .loc
X_train.loc[:, 'text'] = X_train['text'].fillna('')
X_test.loc[:, 'text'] = X_test['text'].fillna('')

In [12]:
# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train['text'])
X_test_tfidf = vectorizer.transform(X_test['text'])

In [13]:
# Dictionary to store results
results = {}

In [14]:
# Function to evaluate model
def evaluate_model(model, X_train, y_train, X_test, y_test):
    start_time = time.time()
    model.fit(X_train, y_train)
    end_time = time.time()
    training_time = end_time - start_time
    
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    
    return training_time, accuracy, recall, precision, y_pred

In [15]:
# Naive Bayes
nb_model = MultinomialNB()
nb_results = evaluate_model(nb_model, X_train_tfidf, y_train, X_test_tfidf, y_test)
results['Naive Bayes'] = nb_results

In [16]:
# Decision Tree
dt_model = DecisionTreeClassifier()
dt_results = evaluate_model(dt_model, X_train_tfidf, y_train, X_test_tfidf, y_test)
results['Decision Tree'] = dt_results

In [17]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_results = evaluate_model(rf_model, X_train_tfidf, y_train, X_test_tfidf, y_test)
results['Random Forest'] = rf_results

In [18]:
# Print results
for model_name, result in results.items():
    print(f"Model: {model_name}")
    print(f"Training Time: {result[0]:.4f} seconds")
    print(f"Accuracy: {result[1]:.4f}")
    print(f"Recall: {result[2]:.4f}")
    print(f"Precision: {result[3]:.4f}")
    print(f"Classification Report:\n{classification_report(y_test, result[4])}")
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, result[4])}\n")

Model: Naive Bayes
Training Time: 0.1210 seconds
Accuracy: 0.9463
Recall: 0.9005
Precision: 0.9914
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.99      0.95      7566
           1       0.99      0.90      0.94      7566

    accuracy                           0.95     15132
   macro avg       0.95      0.95      0.95     15132
weighted avg       0.95      0.95      0.95     15132

Confusion Matrix:
[[7507   59]
 [ 753 6813]]

Model: Decision Tree
Training Time: 35.0011 seconds
Accuracy: 0.9837
Recall: 0.9836
Precision: 0.9837
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      7566
           1       0.98      0.98      0.98      7566

    accuracy                           0.98     15132
   macro avg       0.98      0.98      0.98     15132
weighted avg       0.98      0.98      0.98     15132

Confusion Matrix:
[[7443  123]
 [ 124 7442]]

Model: 