<a href="https://colab.research.google.com/github/2025ab05264-web/ML_Assignment2/blob/main/Naive_Bayes_Classifier_Gaussian.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import numpy as np
import gdown
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, matthews_corrcoef)

# 1. READ DATASET FROM GOOGLE DRIVE


In [14]:
# Download the dataset CSV from Google Drive into the working directory, then load it.
file_id = '1Xg8fbG1L39PeVXMuYiPK8CHyFNIBV6dQ'
url = f'https://drive.google.com/uc?id={file_id}'
datasetName = 'dataset.csv'
gdown.download(url, datasetName, quiet=False)


df = pd.read_csv(datasetName)
print(f"Original Data. Total Records : {len(df)}")
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
df.info()
# A bare expression is only rendered when it is the last statement of a cell,
# so the preview goes last (a mid-cell `df.sample(5)` is silently discarded).
# The fixed seed makes the preview identical on every Restart & Run All.
df.sample(5, random_state=42)

Downloading...
From: https://drive.google.com/uc?id=1Xg8fbG1L39PeVXMuYiPK8CHyFNIBV6dQ
To: /content/dataset.csv
100%|██████████| 4.99M/4.99M [00:00<00:00, 226MB/s]

Original Data. Total Records : 45222
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45222 entries, 0 to 45221
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              45222 non-null  int64 
 1   workclass        45222 non-null  object
 2   fnlwgt           45222 non-null  int64 
 3   education        45222 non-null  object
 4   educational-num  45222 non-null  int64 
 5   marital-status   45222 non-null  object
 6   occupation       45222 non-null  object
 7   relationship     45222 non-null  object
 8   race             45222 non-null  object
 9   gender           45222 non-null  object
 10  capital-gain     45222 non-null  int64 
 11  capital-loss     45222 non-null  int64 
 12  hours-per-week   45222 non-null  int64 
 13  native-country   45222 non-null  object
 14  income           45222 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.2+ MB
None





# 2. DATA CLEANING: Remove all records with missing values
# Handling common missing value placeholders like '?' found in Adult datasets

In [15]:
# Turn the '?' placeholder into NaN, then discard every row that contains a NaN.
# Both steps modify `df` in place.
placeholder = '?'
df.replace(to_replace=placeholder, value=np.nan, inplace=True)
df.dropna(axis=0, how='any', inplace=True)
records_left = len(df)
print(f"Data cleaned. Records remaining: {records_left}")

Data cleaned. Records remaining: 45222


# FEATURE ENGINEERING
# Label Encoding for categorical columns

In [16]:
# Integer-encode every object-dtype column of `df` (the target column included,
# if it is of object dtype).
# A separate LabelEncoder is fitted per column and kept in `label_encoders`
# (column name -> fitted encoder) so each mapping can be inspected or reversed
# later with `label_encoders[col].inverse_transform(...)`. Re-fitting a single
# shared encoder would keep only the mapping of the last column processed.
# After the loop `le` still refers to the encoder of the last encoded column.
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Feature and Target Split

In [17]:
# The last column of `df` becomes the target `y`; every column before it is a feature in `X`.
last_col = df.shape[1] - 1
X = df.iloc[:, :last_col]
y = df.iloc[:, last_col]

# DATA SPLIT into Training and Test Sets

In [18]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Scaling (essential for Logistic Regression and KNN; Gaussian Naive Bayes does not strictly require it, but the same standardized features are used here)

In [19]:
# Learn the standardization statistics from the training split only, then apply
# that same fitted transform to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Human-readable label for this experiment. NOTE(review): despite its name this
# is a plain string, not an estimator, and no cell visible here reads it.
model = "Naive Bayes Classifier - Gaussian Metrics"


In [21]:
#  Naive Bayes Classifier - Gaussian Metrics
log_reg = GaussianNB()
log_reg.fit(X_train_scaled, y_train)




In [22]:
# Predictions
y_pred = log_reg.predict(X_test_scaled)
y_proba = log_reg.predict_proba(X_test_scaled)[:, 1]

Metrics evaluation

In [23]:
# 6. EVALUATION METRICS
# Collected in display order. Every score compares the test labels with the
# hard predictions in y_pred, except AUC, which is computed from y_proba.
metrics = {}
metrics["Accuracy"] = accuracy_score(y_test, y_pred)
metrics["AUC"] = roc_auc_score(y_test, y_proba)
metrics["Precision"] = precision_score(y_test, y_pred)
metrics["Recall"] = recall_score(y_test, y_pred)
metrics["F1"] = f1_score(y_test, y_pred)
metrics["MCC"] = matthews_corrcoef(y_test, y_pred)

In [24]:
# Output Results
print("\n--- #  Naive Bayes Classifier - Gaussian Metrics ---")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")


--- #  Naive Bayes Classifier - Gaussian Metrics ---
Accuracy: 0.7989
AUC: 0.8522
Precision: 0.6784
Recall: 0.3314
F1: 0.4453
MCC: 0.3723
