In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Load CSV and preprocess text
df = pd.read_csv("data/raw_data.csv", header=None)
df.columns = ['message', 'label']

from backend.TextPreprocessingUtils import preprocess_text
df['processed_message'] = df['message'].apply(preprocess_text)

# Tokenize and pad
vocab_size = 10000
maxlen = 100

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(df['processed_message'])
sequences = tokenizer.texts_to_sequences(df['processed_message'])
padded = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    padded, df['label'], test_size=0.2, random_state=42
)

# Compute class weights to handle imbalance
cw = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(cw))

# Define CNN model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=maxlen),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=32,
    class_weight=class_weights,
    callbacks=[early_stop],
    verbose=2
)

# Predict with threshold tuning
y_probs = model.predict(X_test)
threshold = 0.6  
y_pred = (y_probs > threshold).astype(int)

# Evaluate
print(" CNN (Tokenized + Embedding) Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")

import joblib

# Save the Keras model (HDF5 format)
model.save("../model/cnn_model_precision.h5")

# Save the tokenizer and threshold as a .pkl file
model_info = {
    "tokenizer": tokenizer,
    "threshold": threshold
}
joblib.dump(model_info, "../model/cnn_model_info:precision.pkl")

print("CNN model saved as 'cnn_model_precision.h5'")
print("Tokenizer and threshold saved as 'cnn_model_info_precision.pkl'")





Epoch 1/10
14400/14400 - 345s - 24ms/step - accuracy: 0.7861 - loss: 0.4964 - val_accuracy: 0.8097 - val_loss: 0.4409
Epoch 2/10
14400/14400 - 357s - 25ms/step - accuracy: 0.8173 - loss: 0.4486 - val_accuracy: 0.8164 - val_loss: 0.4542
Epoch 3/10
14400/14400 - 352s - 24ms/step - accuracy: 0.8321 - loss: 0.4241 - val_accuracy: 0.8068 - val_loss: 0.4748
Epoch 4/10
14400/14400 - 341s - 24ms/step - accuracy: 0.8439 - loss: 0.4036 - val_accuracy: 0.8383 - val_loss: 0.4095
Epoch 5/10
14400/14400 - 336s - 23ms/step - accuracy: 0.8516 - loss: 0.3846 - val_accuracy: 0.8029 - val_loss: 0.4735
Epoch 6/10
14400/14400 - 336s - 23ms/step - accuracy: 0.8600 - loss: 0.3683 - val_accuracy: 0.8184 - val_loss: 0.4399
Epoch 7/10
14400/14400 - 336s - 23ms/step - accuracy: 0.8651 - loss: 0.3528 - val_accuracy: 0.8119 - val_loss: 0.4675
[1m4500/4500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 5ms/step




📊 CNN (Tokenized + Embedding) Results:
Accuracy: 0.8636
Precision: 0.6000
Recall: 0.5694
F1 Score: 0.5843
CNN model saved as 'cnn_model_precision.h5'
Tokenizer and threshold saved as 'cnn_model_info_precision.pkl'


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

svm_accuracy = accuracy_score(ytest, y_pred_svm)
svm_precision = precision_score(ytest, y_pred_svm)
svm_recall = recall_score(ytest, y_pred_svm)
svm_f1 = f1_score(ytest, y_pred_svm)

cnn_accuracy = accuracy_score(ytest, y_pred_cnn_labels)
cnn_precision = precision_score(ytest, y_pred_cnn_labels)
cnn_recall = recall_score(ytest, y_pred_cnn_labels)
cnn_f1 = f1_score(ytest, y_pred_cnn_labels)

nb_accuracy = accuracy_score(ytest, y_pred_nb)
nb_precision = precision_score(ytest, y_pred_nb)
nb_recall = recall_score(ytest, y_pred_nb)
nb_f1 = f1_score(ytest, y_pred_nb)

# Display Results
print("SVM:")
print(f"Accuracy: {svm_accuracy:.4f}")
print(f"Precision: {svm_precision:.4f}")
print(f"Recall: {svm_recall:.4f}")
print(f"F1 Score: {svm_f1:.4f}")
print("")
print("CNN:")
print(f"Accuracy: {cnn_accuracy:.4f}")
print(f"Precision: {cnn_precision:.4f}")
print(f"Recall: {cnn_recall:.4f}")
print(f"F1 Score: {cnn_f1:.4f}")
print("")
print("NB:")
print(f"Accuracy: {nb_accuracy:.4f}")
print(f"Precision: {nb_precision:.4f}")
print(f"Recall: {nb_recall:.4f}")
print(f"F1 Score: {nb_f1:.4f}")




SVM:
Accuracy: 0.6824
Precision: 0.3109
Recall: 0.7291
F1 Score: 0.4359

CNN:
Accuracy: 0.6922
Precision: 0.3183
Recall: 0.7250
F1 Score: 0.4423

NB:
Accuracy: 0.8359
Precision: 0.6892
Recall: 0.0458
F1 Score: 0.0859


In [3]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Predict probabilities once
y_probs = model.predict(X_test)

# Test multiple thresholds
thresholds = np.arange(0.3, 0.91, 0.05)
results = []

print("Threshold sweep:")
for t in thresholds:
    y_pred = (y_probs > t).astype(int)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append((t, precision, recall, f1))
    print(f"Threshold: {t:.2f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")

# Sort by highest precision
sorted_by_precision = sorted(results, key=lambda x: x[1], reverse=True)

print("\nTop thresholds by precision:")
for t, precision, recall, f1 in sorted_by_precision[:5]:
    print(f"Threshold: {t:.2f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")


[1m4500/4500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 7ms/step
Threshold sweep:
Threshold: 0.30 | Precision: 0.2781 | Recall: 0.9275 | F1: 0.4279
Threshold: 0.35 | Precision: 0.3131 | Recall: 0.8842 | F1: 0.4624
Threshold: 0.40 | Precision: 0.3646 | Recall: 0.8256 | F1: 0.5059
Threshold: 0.45 | Precision: 0.4399 | Recall: 0.7462 | F1: 0.5535
Threshold: 0.50 | Precision: 0.4942 | Recall: 0.6848 | F1: 0.5741
Threshold: 0.55 | Precision: 0.5425 | Recall: 0.6350 | F1: 0.5851
Threshold: 0.60 | Precision: 0.5861 | Recall: 0.5884 | F1: 0.5873
Threshold: 0.65 | Precision: 0.6318 | Recall: 0.5427 | F1: 0.5839
Threshold: 0.70 | Precision: 0.6781 | Recall: 0.4950 | F1: 0.5723
Threshold: 0.75 | Precision: 0.7271 | Recall: 0.4496 | F1: 0.5557
Threshold: 0.80 | Precision: 0.7759 | Recall: 0.4029 | F1: 0.5304
Threshold: 0.85 | Precision: 0.8271 | Recall: 0.3479 | F1: 0.4898
Threshold: 0.90 | Precision: 0.8821 | Recall: 0.2842 | F1: 0.4299

Top thresholds by precision:
Threshold: 0.90 |

In [7]:
# Save the entire model to a file
model.save('cnn.keras')  # The file extension should be .keras
