In [1]:
# Importing necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, roc_auc_score
import joblib

In [2]:
# Load the tab-separated training and validation splits
train_data = pd.read_csv('processed_data/train.csv', sep='\t')
val_data = pd.read_csv('processed_data/validation.csv', sep='\t')

# Features are the preprocessed message text, with missing values replaced
# by empty strings before vectorizing; targets come from the 'label' column
X_train = train_data['preprocessed_message'].fillna('')
y_train = train_data['label']

X_val = val_data['preprocessed_message'].fillna('')
y_val = val_data['label']

# Fit the TF-IDF vocabulary/weights on the training text only, then apply
# the already-fitted vectorizer to the validation text
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

In [3]:
# Build each classifier and fit it on the training TF-IDF matrix in a single
# expression (fit() returns the estimator itself). random_state=42 pins the
# seed of the SVM and the forest.
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train)
svm_classifier = LinearSVC(random_state=42).fit(X_train_tfidf, y_train)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)

# Label predictions of every fitted model on the validation features
nb_pred, svm_pred, rf_pred = (
    model.predict(X_val_tfidf)
    for model in (nb_classifier, svm_classifier, rf_classifier)
)

In [4]:
# Function to calculate specificity
def specificity(y_true, y_pred):
    """Return the true-negative rate TN / (TN + FP) of a binary prediction.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        The specificity as a float, or ``nan`` when ``y_true`` holds no
        negative samples (TN + FP == 0), in which case the rate is undefined.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` together do not contain
            exactly two classes, so the confusion matrix is not 2x2.
    """
    matrix = confusion_matrix(y_true, y_pred)
    if matrix.shape != (2, 2):
        # Fail with an explicit message instead of a tuple-unpacking error
        raise ValueError(
            "specificity() needs exactly two classes across y_true and y_pred, "
            f"got a confusion matrix of shape {matrix.shape}"
        )

    # Rows are true labels and columns are predicted labels, both in sorted
    # label order, so the first row is (tn, fp) for the smaller (negative) label.
    tn, fp = matrix[0]
    negatives = tn + fp
    if negatives == 0:
        # No negative samples: avoid the 0/0 RuntimeWarning, report undefined
        return float('nan')
    return tn / negatives

# Display names and validation label predictions of each classifier
classifiers = ['Naive Bayes', 'Linear SVM', 'Random Forest']
predictions = [nb_pred, svm_pred, rf_pred]

# Continuous scores for ROC AUC. AUC is a ranking metric: fed thresholded
# labels it collapses to (recall + specificity) / 2, so use the probability
# of the greater-labelled class (column 1) or the decision-function margin.
scores = [
    nb_classifier.predict_proba(X_val_tfidf)[:, 1],
    svm_classifier.decision_function(X_val_tfidf),  # LinearSVC has no predict_proba
    rf_classifier.predict_proba(X_val_tfidf)[:, 1],
]

# Printing model performance metrics
print("Model Performance Metrics:\n")
print("Classifier\t\t\tAccuracy\tF1 Score\tRecall\t\tPrecision\tSpecificity\tAUC")
print("-" * 120)

# One table row per classifier: threshold metrics use the label predictions,
# AUC uses the continuous scores
for clf, pred, score in zip(classifiers, predictions, scores):
    acc = accuracy_score(y_val, pred)
    f1 = f1_score(y_val, pred)
    r = recall_score(y_val, pred)
    p = precision_score(y_val, pred)
    auc = roc_auc_score(y_val, score)
    s = specificity(y_val, pred)
    print(f"{clf:<20}\t\t{acc:.4f}\t\t{f1:.4f}\t\t{r:.4f}\t\t{p:.4f}\t\t{s:.4f}\t\t{auc:.4f}")

In [5]:
# Load the tab-separated test split
test_data = pd.read_csv('processed_data/test.csv', sep='\t')

# Message text (missing values replaced by '') and the matching labels
X_test = test_data['preprocessed_message'].fillna('')
y_test = test_data['label']

# Apply the vectorizer fitted on the training text; it is only transformed
# here, never refit on test data
X_test_tfidf = tfidf.transform(X_test)

In [6]:
# Label predictions of each fitted model on the test features.
# NOTE: this rebinds nb_pred / svm_pred / rf_pred, which held the validation
# predictions until now; re-running the validation metrics cell after this
# one would therefore pair test predictions with validation labels.
nb_pred, svm_pred, rf_pred = (
    model.predict(X_test_tfidf)
    for model in (nb_classifier, svm_classifier, rf_classifier)
)

In [7]:
# Display names and test-set label predictions of each classifier
classifiers = ['Naive Bayes', 'Linear SVM', 'Random Forest']
predictions = [nb_pred, svm_pred, rf_pred]

# Continuous scores for ROC AUC. AUC is a ranking metric: fed thresholded
# labels it collapses to (recall + specificity) / 2, so use the probability
# of the greater-labelled class (column 1) or the decision-function margin.
scores = [
    nb_classifier.predict_proba(X_test_tfidf)[:, 1],
    svm_classifier.decision_function(X_test_tfidf),  # LinearSVC has no predict_proba
    rf_classifier.predict_proba(X_test_tfidf)[:, 1],
]

print("Model Performance Metrics:\n")
print("Classifier\t\t\tAccuracy\tF1 Score\tRecall\t\tPrecision\tSpecificity\tAUC")
print("-" * 120)

# One table row per classifier: threshold metrics use the label predictions,
# AUC uses the continuous scores
for clf, pred, score in zip(classifiers, predictions, scores):
    acc = accuracy_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    r = recall_score(y_test, pred)
    p = precision_score(y_test, pred)
    auc = roc_auc_score(y_test, score)
    s = specificity(y_test, pred)
    print(f"{clf:<20}\t\t{acc:.4f}\t\t{f1:.4f}\t\t{r:.4f}\t\t{p:.4f}\t\t{s:.4f}\t\t{auc:.4f}")

In [8]:
# Persist the deployed model together with the fitted TF-IDF vectorizer, so
# new text can be transformed with the same vocabulary before prediction.
# The Random Forest is selected by hand here, not picked from the metrics.
best_model = rf_classifier
joblib.dump(value=best_model, filename='best_model.pkl')
joblib.dump(value=tfidf, filename='vectorizer.pkl')