In [9]:
import pandas as pd
import tensorflow as tf
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_class_weight
import numpy as np


from backend.TextPreprocessingUtils import vectorize_text_data

# Unpack the four values returned by the project helper for the given CSV
# (presumably train/test features and train/test labels — confirm in
# backend.TextPreprocessingUtils).
# NOTE(review): the recorded run of this cell ended with FileNotFoundError for
# 'backend/tfidf_vectorizer.pkl', which suggests the helper resolves that path
# relative to the working directory — confirm where the notebook must be started.
xtrain, xtest, ytrain, ytest = vectorize_text_data("data/balanced_raw_data.csv")

Loading data...
Preprocessing text data...
Vectorizing text data with TF-IDF...


FileNotFoundError: [Errno 2] No such file or directory: 'backend/tfidf_vectorizer.pkl'

In [31]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import time

# Define a fast and simple parameter grid (no gamma or kernel needed)
param_grid = {
    'C': [0.1, 1, 10]  # I can expand later if needed
}

# Use LinearSVC: much faster for text data.
# class_weight='balanced' reweights the classes inside the estimator, and
# scoring='f1' ranks the candidates by F1 rather than by accuracy.
grid_search = GridSearchCV(
    estimator=LinearSVC(class_weight='balanced', max_iter=10000),
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=2,
    n_jobs=-1
)

# Time the grid search: wall-clock time around fit() only, measured with a
# monotonic clock so system clock adjustments cannot skew the result.
print("Starting Grid Search with LinearSVC...")
search_start = time.perf_counter()

grid_search.fit(xtrain.to_numpy(), ytrain.to_numpy())

search_seconds = time.perf_counter() - search_start
print(f"Grid Search finished in {search_seconds:.1f} s")

# Output best model and evaluate
print("Best Parameters:", grid_search.best_params_)
print("Best Estimator:", grid_search.best_estimator_)

# Evaluate the selected estimator on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(xtest.to_numpy())

print("\nClassification Report (Test Set):")
print(classification_report(ytest, y_pred))


Starting Grid Search with LinearSVC...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Parameters: {'C': 0.1}
Best Estimator: LinearSVC(C=0.1, class_weight='balanced', max_iter=10000)

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.92      0.67      0.78    119756
           1       0.31      0.73      0.44     24244

    accuracy                           0.68    144000
   macro avg       0.62      0.70      0.61    144000
weighted avg       0.82      0.68      0.72    144000



In [32]:
# Distinct label values present in the training targets.
classes = np.unique(ytrain)
print(classes)
# 'balanced' class weights for those labels, computed from ytrain.
# They are only printed here; nothing in this cell passes them to a model.
weights = compute_class_weight(class_weight='balanced', classes=classes, y=ytrain)
print(weights)

[0 1]
[0.59969515 3.00764443]


In [33]:
# Re-bind the estimator selected by the grid search.
best_model = grid_search.best_estimator_

# Last expression of the cell, so its value is the cell output: the estimator's
# score() on the held-out split (mean accuracy for scikit-learn classifiers).
best_model.score(xtest.to_numpy(), ytest.to_numpy())

0.6823541666666667

In [None]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

# Fit a default multinomial Naive Bayes classifier; fit() returns the estimator
# itself, so construction and training can be chained.
nb = MultinomialNB().fit(xtrain, ytrain)

# Label predictions for the held-out split.
y_pred_nb = nb.predict(xtest)

# Header followed by the per-class report, each on its own line.
print(
    "Naive Bayes Classification Report:",
    classification_report(ytest, y_pred_nb),
    sep="\n",
)


Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.84      1.00      0.91    119756
           1       0.69      0.05      0.09     24244

    accuracy                           0.84    144000
   macro avg       0.76      0.52      0.50    144000
weighted avg       0.81      0.84      0.77    144000



In [23]:
import joblib

# Serialize both estimators to files in the current working directory.
# NOTE(review): best_model and nb are created in other cells; both must have
# been run in this kernel session before this cell.
joblib.dump(best_model, 'svm.pkl')
# Last expression of the cell, so its return value is what the cell displays.
joblib.dump(nb, 'nb.pkl')

['nb.pkl']

In [2]:
import joblib
from tensorflow.keras.models import load_model

# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load artifacts produced by this project.
svm_model = joblib.load("svm.pkl")
#cnn_model = joblib.load("cnn.pkl")
# nb_model = joblib.load("nb.pkl")

#good_cnn_model = load_model('cnn.keras')


# NOTE(review): only the SVM is actually loaded above; the remaining loads are
# commented out, so the plural message overstates what happened.
print("Models loaded successfully!")

Models loaded successfully!


In [3]:
# Predict labels with the reloaded SVM.
# NOTE(review): xtest is not created in this cell; the recorded run raised
# NameError, so the vectorization cell has to be executed first in the same kernel.
y_pred_svm = svm_model.predict(xtest.to_numpy())
#y_pred_cnn = cnn_model.predict(xtest.to_numpy())
#y_pred_nb = nb_model.predict(xtest.to_numpy())

# Predict probabilities
#y_prob_good_cnn = good_cnn_model.predict(xtest.to_numpy())

# Apply threshold to convert probabilities to binary class labels
#threshold = 0.6  # Adjust freely
#y_pred_good_cnn = (y_prob_good_cnn > threshold).astype(int)


NameError: name 'xtest' is not defined

In [60]:
# Binarize the CNN outputs at a fixed 0.5 cut-off (>= 0.5 becomes 1, otherwise 0).
# NOTE(review): the only visible assignment of y_pred_cnn is commented out in
# another cell, so this relies on a value left over in the kernel — confirm its origin.
# NOTE(review): the cut-off is hard-coded here rather than taken from the
# `threshold` value loaded with the CNN model info — confirm which is intended.
y_pred_cnn_labels = (y_pred_cnn >= 0.5).astype(int)
#y_pred_good_cnn_labels = (y_pred_normal >= 0.5).astype(int)

In [None]:
from tensorflow.keras.models import load_model
import joblib

# Define the file paths (relative to the notebook's working directory)
model_path = '../model/cnn_model.keras'
info_path = '../model/cnn_model_info.pkl'

# Load the Keras model
cnn_model = load_model(model_path)

# Load tokenizer and threshold
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load artifacts produced by this project.
model_info = joblib.load(info_path)
tokenizer = model_info["tokenizer"]
# NOTE(review): threshold is read here but not used anywhere in this cell.
threshold = model_info["threshold"]

print("CNN model, tokenizer, and threshold loaded successfully.")

from backend.TextPreprocessingUtils import preprocess_for_cnn

# NOTE(review): df is not defined in any visible cell, so this line depends on
# a DataFrame with a "message" column left in the kernel — confirm its source.
# X_test is also not consumed by any visible cell.
X_test = preprocess_for_cnn(df["message"], tokenizer)

  saveable.load_own_variables(weights_store.get(inner_path))


CNN model, tokenizer, and threshold loaded successfully.


In [72]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def _binary_scores(y_true, y_pred):
    """Return (accuracy, precision, recall, f1) for one set of predicted labels."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )


def _print_scores(label, scores):
    """Print one model's four metrics under its label, followed by a blank line."""
    accuracy, precision, recall, f1 = scores
    print(f"{label}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("")


# The individual *_accuracy / *_precision / *_recall / *_f1 names are kept so
# that any other cell reading them keeps working.
svm_accuracy, svm_precision, svm_recall, svm_f1 = _binary_scores(ytest, y_pred_svm)
cnn_accuracy, cnn_precision, cnn_recall, cnn_f1 = _binary_scores(ytest, y_pred_cnn_labels)
good_cnn_accuracy, good_cnn_precision, good_cnn_recall, good_cnn_f1 = _binary_scores(
    ytest, y_pred_good_cnn
)
nb_accuracy, nb_precision, nb_recall, nb_f1 = _binary_scores(ytest, y_pred_nb)

# Display Results
# Each model gets its own header. The y_pred_good_cnn section used to be printed
# under a second "CNN:" header with no blank line after the NB section, which
# made the two CNN variants indistinguishable in the output.
_print_scores("SVM", (svm_accuracy, svm_precision, svm_recall, svm_f1))
_print_scores("CNN", (cnn_accuracy, cnn_precision, cnn_recall, cnn_f1))
_print_scores("NB", (nb_accuracy, nb_precision, nb_recall, nb_f1))
_print_scores(
    "Good CNN",
    (good_cnn_accuracy, good_cnn_precision, good_cnn_recall, good_cnn_f1),
)




SVM:
Accuracy: 0.6824
Precision: 0.3109
Recall: 0.7291
F1 Score: 0.4359

CNN:
Accuracy: 0.6922
Precision: 0.3183
Recall: 0.7250
F1 Score: 0.4423

NB:
Accuracy: 0.8359
Precision: 0.6892
Recall: 0.0458
F1 Score: 0.0859
CNN:
Accuracy: 0.8316
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Class-weighted logistic regression; fit() returns the estimator itself, so
# construction and training are chained.
logreg = LogisticRegression(class_weight='balanced', max_iter=1000).fit(xtrain, ytrain)

# Label predictions for the held-out split.
y_pred_log = logreg.predict(xtest)

# Compute the four summary metrics in the same order as before.
log_accuracy, log_precision, log_recall, log_f1 = (
    metric(ytest, y_pred_log)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# One line per metric, each formatted to four decimals.
print(
    "Logistic Regression:",
    f"Accuracy: {log_accuracy:.4f}",
    f"Precision: {log_precision:.4f}",
    f"Recall: {log_recall:.4f}",
    f"F1 Score: {log_f1:.4f}",
    sep="\n",
)


📊 Logistic Regression:
Accuracy: 0.6822
Precision: 0.3108
Recall: 0.7290
F1 Score: 0.4358
