In [22]:
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, classification_report, confusion_matrix

In [23]:
# Dataset locations, resolved relative to the notebook's working directory.
TRAIN_FEATURES_CSV = 'Training_Set.csv'
TRAIN_LABELS_CSV = 'Training_Labels.csv'
TEST_FEATURES_CSV = 'Test_Set.csv'
TEST_LABELS_CSV = 'Test_Labels.csv'

# Features and labels live in separate CSV files for each split; read all four.
X_train = pd.read_csv(TRAIN_FEATURES_CSV)
y_train = pd.read_csv(TRAIN_LABELS_CSV)
X_test = pd.read_csv(TEST_FEATURES_CSV)
y_test = pd.read_csv(TEST_LABELS_CSV)

In [24]:
# Combine text fields into a single field for vectorization.
# pd.read_csv loads empty cells as NaN, and NaN + str stays NaN; a NaN document would
# then make TfidfVectorizer raise. Treat a missing subject/body as an empty string so
# rows with only one of the two fields still contribute their text.
for frame in (X_train, X_test):
    frame['text'] = frame['subject'].fillna('') + ' ' + frame['body'].fillna('')

In [25]:
# Turn the combined text column into TF-IDF features.
# The vectorizer is fitted on the training text only; the test text is transformed
# with that already-fitted vectorizer.
TFIDF_MAX_FEATURES = 5000  # value passed as the vectorizer's max_features
vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)

train_text = X_train['text']
X_train_tfidf = vectorizer.fit_transform(train_text)

test_text = X_test['text']
X_test_tfidf = vectorizer.transform(test_text)

In [26]:
# Collects one entry per model: display name -> tuple returned by evaluate_model.
results = dict()

In [27]:
# Function to evaluate model
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test
        Feature matrices passed unchanged to ``fit`` and ``predict``.
    y_train
        Training labels; must expose ``.values`` (it is flattened with
        ``y_train.values.ravel()`` before fitting).
    y_test
        True test labels, passed unchanged to the metric functions.

    Returns
    -------
    tuple
        ``(training_time, accuracy, recall, precision, y_pred)`` where
        ``training_time`` is the duration of the ``fit`` call in seconds and
        ``y_pred`` is the output of ``model.predict(X_test)``. Recall and
        precision are computed with scikit-learn's default arguments.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock time that can be coarse or jump
    # (e.g. on a system clock adjustment) and so can misreport short fits.
    start_time = time.perf_counter()
    model.fit(X_train, y_train.values.ravel())
    training_time = time.perf_counter() - start_time

    # Only fitting is timed; prediction and scoring happen outside the timed span.
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)

    return training_time, accuracy, recall, precision, y_pred

In [28]:
# Naive Bayes: MultinomialNB with default settings, trained and scored on the TF-IDF features.
nb_model = MultinomialNB()
nb_results = evaluate_model(
    model=nb_model,
    X_train=X_train_tfidf,
    y_train=y_train,
    X_test=X_test_tfidf,
    y_test=y_test,
)
results['Naive Bayes'] = nb_results

In [29]:
# Decision Tree
# Seed the tree so its metrics are reproducible across Restart & Run All; the value
# matches the random_state used for the Random Forest in this notebook.
dt_model = DecisionTreeClassifier(random_state=42)
dt_results = evaluate_model(dt_model, X_train_tfidf, y_train, X_test_tfidf, y_test)
results['Decision Tree'] = dt_results

In [30]:
# Random Forest: 100 trees with a fixed seed, trained and scored on the TF-IDF features.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf_results = evaluate_model(
    model=rf_model,
    X_train=X_train_tfidf,
    y_train=y_train,
    X_test=X_test_tfidf,
    y_test=y_test,
)
results['Random Forest'] = rf_results

In [31]:
# Report every stored model: timing, headline metrics, then the per-class report and
# confusion matrix computed from the stored predictions against y_test.
for model_name, result in results.items():
    # Each entry is the 5-tuple stored for that model.
    training_time, accuracy, recall, precision, y_pred = result

    print(f"Model: {model_name}")
    print(f"Training Time: {training_time:.4f} seconds")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Precision: {precision:.4f}")

    report = classification_report(y_test, y_pred)
    print(f"Classification Report:\n{report}")

    matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n{matrix}\n")

Model: Naive Bayes
Training Time: 0.0050 seconds
Accuracy: 0.9909
Recall: 0.9949
Precision: 0.9915
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       288
           1       0.99      0.99      0.99       587

    accuracy                           0.99       875
   macro avg       0.99      0.99      0.99       875
weighted avg       0.99      0.99      0.99       875

Confusion Matrix:
[[283   5]
 [  3 584]]

Model: Decision Tree
Training Time: 1.3800 seconds
Accuracy: 0.9634
Recall: 0.9761
Precision: 0.9695
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.94       288
           1       0.97      0.98      0.97       587

    accuracy                           0.96       875
   macro avg       0.96      0.96      0.96       875
weighted avg       0.96      0.96      0.96       875

Confusion Matrix:
[[270  18]
 [ 14 573]]

Model: Random Fo