In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import joblib  # For saving the model


In [None]:
# Load the (preprocessed) training data via train_to_df from the
# data_reprocessing module.
# NOTE(review): train_to_df is presumably returning a pandas DataFrame -- the
# cells below index it by column name and call .head() on it; confirm.
from data_reprocessing import train_to_df

train_file = "balanced_train.csv"
dataset = train_to_df(train_file)

# Preview the leading rows as a quick sanity check on what was loaded.
dataset.head()

In [None]:
# Tally how many rows carry each value of the 'is_attributed' label.
# NOTE(review): the messages below treat is_attributed == 1 as "fraudulent"
# and 0 as "non-fraudulent" -- confirm this matches the dataset's label
# definition.
fraud_counts = dataset['is_attributed'].value_counts()
print(f"Fraudulent clicks: {fraud_counts[1]}")
print(f"Non-fraudulent clicks: {fraud_counts[0]}")

# Separate the label column (y) from every other column (X).
y = dataset['is_attributed']
X = dataset.drop(columns='is_attributed')

# Hold out 20% of the rows for validation. The split is stratified on y so
# both partitions keep the same class proportions, and the fixed random_state
# makes it repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the (rows, columns) size of each partition.
print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")


In [None]:
# Fit a Gaussian Naive Bayes classifier (default settings) on the training
# partition, bracketing the fit with progress messages.
print("Starting model training...")

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

print("Model training completed.")


In [None]:
# Persist the fitted estimator object with joblib. The path is relative, so
# the file is written to the current working directory.
joblib.dump(value=nb_model, filename='naive_bayes_model.joblib')

print("Complete model saved as 'naive_bayes_model.joblib'.")


In [None]:
# Score the validation partition with the fitted classifier.
y_pred = nb_model.predict(X_val)  # hard class labels
# Column 1 of predict_proba is used as the positive-class probability.
# NOTE(review): assumes the positive label is the second entry of
# nb_model.classes_ (true for sorted 0/1 labels) -- confirm.
y_pred_proba = nb_model.predict_proba(X_val)[:, 1]

# Print each metric under its own heading; sep="\n" puts the heading and the
# metric body on separate lines.
print("\nConfusion Matrix:", confusion_matrix(y_val, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_val, y_pred), sep="\n")

# ROC AUC is computed from the predicted probabilities rather than the hard
# labels, then shown rounded to two decimals.
roc_auc = roc_auc_score(y_val, y_pred_proba)
print(f"\nROC AUC Score: {roc_auc:.2f}")
