In [7]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import joblib  # For saving the model


In [8]:
# Build the working DataFrame with train_to_df from the data_reprocessing module
from data_reprocessing import train_to_df

train_file = "new_train.csv"
dataset = train_to_df(train_file)
# Preview the first five rows as a quick sanity check on the loaded frame
dataset.head(5)

Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour,minute,second
0,122044,20,1,13,478,0,7,13,51,27
1,191847,26,1,11,477,0,9,1,10,13
2,29785,3,1,19,466,0,7,7,10,3
3,194772,18,1,19,107,0,6,16,28,59
4,119369,2,1,19,469,0,9,10,12,23


In [9]:
# Count and display the class distribution of the target column.
# value_counts() only lists labels that actually occur, so each label is looked
# up with .get(label, 0): a frame that contains no rows of one class reports 0
# instead of raising KeyError.
# NOTE(review): label 1 of `is_attributed` is reported as "fraudulent" below --
# confirm this matches the meaning of the column in the source data.
fraud_counts = dataset['is_attributed'].value_counts()
print(f"Fraudulent clicks: {fraud_counts.get(1, 0)}")
print(f"Non-fraudulent clicks: {fraud_counts.get(0, 0)}")

# Separate the features (every column except the target) from the target itself
X = dataset.drop(columns='is_attributed')  # Features
y = dataset['is_attributed']  # Target variable

# Hold out 20% of the rows for validation. stratify=y keeps the class ratio the
# same in both parts; random_state pins the shuffle so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Report (rows, columns) of each part
print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")


Fraudulent clicks: 365541
Non-fraudulent clicks: 147557571
Training set shape: (118338489, 9)
Validation set shape: (29584623, 9)


In [10]:
# Gaussian Naive Bayes estimator, constructed with its default arguments
nb_model = GaussianNB()

print("Starting model training...")

# Fit on the training split only; the validation split stays unseen
nb_model.fit(X=X_train, y=y_train)

print("Model training completed.")


Starting model training...
Model training completed.


In [11]:
# Persist the fitted estimator with joblib. The target path is declared once so
# the file written and the confirmation message cannot drift apart.
model_path = 'naive_bayes_imbalanced_model.joblib'
joblib.dump(nb_model, model_path)

print(f"Complete model saved as '{model_path}'.")


Complete model saved as 'naive_bayes_imbalanced_model.joblib'.


In [12]:
# Score the validation split: hard class labels, plus the probability assigned
# to the positive class (second column of predict_proba's output)
y_pred = nb_model.predict(X_val)
y_pred_proba = nb_model.predict_proba(X_val)[:, 1]

# Confusion matrix of true vs. predicted labels
print("\nConfusion Matrix:")
val_confusion = confusion_matrix(y_true=y_val, y_pred=y_pred)
print(val_confusion)

# Per-class precision / recall / F1 and the averaged summaries
print("\nClassification Report:")
val_report = classification_report(y_val, y_pred)
print(val_report)

# ROC AUC is computed from the positive-class probabilities, not the hard labels
roc_auc = roc_auc_score(y_true=y_val, y_score=y_pred_proba)
print("\nROC AUC Score: {:.2f}".format(roc_auc))



Confusion Matrix:
[[29350433   161082]
 [   65795     7313]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00  29511515
           1       0.04      0.10      0.06     73108

    accuracy                           0.99  29584623
   macro avg       0.52      0.55      0.53  29584623
weighted avg       1.00      0.99      0.99  29584623


ROC AUC Score: 0.81
