In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import joblib  # For saving the model


In [2]:
# Load the training data via the project-local preprocessing helper.
# train_to_df is imported from data_reprocessing; its internals are not visible
# in this notebook, so the exact cleaning/feature steps it applies should be
# checked in that module.
from data_reprocessing import train_to_df

# File name handed to train_to_df.
# NOTE(review): despite the "balanced" name, the class counts recorded later in
# this notebook are roughly 1 : 3.6 (365541 vs 1328573) - confirm this is the
# intended input file.
train_file = 'balanced_train.csv'  
dataset = train_to_df(train_file)
# Show the first rows as the cell output to eyeball the result of preprocessing.
# The recorded output lists ip, app, device, os, channel, is_attributed and the
# time parts day/hour/minute/second.
dataset.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour,minute,second
0,50512,15,1,19,245,0,8,21,16,35
1,80510,3,1,25,489,0,9,5,24,16
2,1768,1,1,19,439,0,8,13,46,1
3,30587,15,1,31,386,0,6,18,19,13
4,73487,3,1,8,153,0,8,3,44,35


In [3]:
# Report how many rows carry each value of the target column.
# NOTE(review): the messages below label is_attributed == 1 as "fraudulent".
# The column name suggests 1 might instead mean an attributed (legitimate)
# click - confirm against the dataset's label definition before relying on
# this wording.
fraud_counts = dataset['is_attributed'].value_counts()
# .get(label, 0) reports 0 for a class that is absent from the data; plain
# indexing (fraud_counts[1]) would raise KeyError in that case.
print(f"Fraudulent clicks: {fraud_counts.get(1, 0)}")
print(f"Non-fraudulent clicks: {fraud_counts.get(0, 0)}")

# Separate the predictors (every column except the target) from the target.
X = dataset.drop('is_attributed', axis=1)  # Features
y = dataset['is_attributed']  # Target variable

# Hold out 20% of the rows for validation.
# stratify=y keeps the class proportions the same in both parts, and the fixed
# random_state makes the shuffle (and therefore the split) repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Cheap invariants: the split must neither lose nor duplicate rows, and
# features/targets must stay aligned.
assert len(X_train) + len(X_val) == len(dataset), "split lost or duplicated rows"
assert len(X_train) == len(y_train) and len(X_val) == len(y_val), "X/y length mismatch"

# Display the shape of the training and validation sets
print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")


Fraudulent clicks: 365541
Non-fraudulent clicks: 1328573
Training set shape: (1355291, 9)
Validation set shape: (338823, 9)


In [4]:
print("Starting model training...")

# Gaussian Naive Bayes with default hyperparameters, fitted on the training
# split only (the validation rows are never seen here).
# NOTE(review): the recorded head() output suggests ip/app/device/os/channel
# are integer identifiers; GaussianNB models every feature as a continuous
# normal variable - confirm that treatment is intended.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

print("Model training completed.")


Starting model training...
Model training completed.


In [5]:
# Persist the fitted estimator object so it can be reloaded later with
# joblib.load without retraining. The file is written to the current working
# directory and is overwritten if it already exists.
# Only load such files from trusted sources: joblib files are pickle-based.
joblib.dump(value=nb_model, filename='naive_bayes_model.joblib')

print("Complete model saved as 'naive_bayes_model.joblib'.")


Complete model saved as 'naive_bayes_model.joblib'.


In [6]:
# Score the held-out validation rows.
# y_pred holds hard class labels; y_pred_proba holds the probability of the
# second class in nb_model.classes_ (label 1 for the 0/1 target used here),
# which is what the ROC AUC computation needs.
y_pred = nb_model.predict(X_val)
y_pred_proba = nb_model.predict_proba(X_val)[:, 1]

# Confusion matrix: rows are true labels, columns are predicted labels.
print("\nConfusion Matrix:")
print(confusion_matrix(y_true=y_val, y_pred=y_pred))

# Per-class precision / recall / F1 plus overall accuracy.
print("\nClassification Report:")
print(classification_report(y_true=y_val, y_pred=y_pred))

# Threshold-independent ranking quality, computed from the class-1
# probabilities rather than from the hard labels.
roc_auc = roc_auc_score(y_true=y_val, y_score=y_pred_proba)
print(f"\nROC AUC Score: {roc_auc:.2f}")



Confusion Matrix:
[[251560  14155]
 [ 45255  27853]]

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.95      0.89    265715
           1       0.66      0.38      0.48     73108

    accuracy                           0.82    338823
   macro avg       0.76      0.66      0.69    338823
weighted avg       0.81      0.82      0.81    338823


ROC AUC Score: 0.81
