In [26]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score, classification_report, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.pipeline import Pipeline
import os 
import warnings
warnings.filterwarnings('ignore')

# On CancerDataSet

In [27]:
cancer = pd.read_csv('Cancer.csv',index_col=0)

In [28]:
X = cancer.drop(columns='Class')
y = cancer.Class

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=24,test_size=0.3, stratify=y)


In [30]:
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore').set_output(transform='pandas')
# BernoulliNB is used for the All categorical data and in this dataset all are categorical data
nb = BernoulliNB()
pipe = Pipeline([('OHE',ohe), ('NB', nb)])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))

                      precision    recall  f1-score   support

no-recurrence-events       0.80      0.78      0.79        60
   recurrence-events       0.52      0.54      0.53        26

            accuracy                           0.71        86
           macro avg       0.66      0.66      0.66        86
        weighted avg       0.71      0.71      0.71        86



In [14]:
y_pred_prob = pipe.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, y_pred_prob))

0.75


## Using K fold with ROC_AUC

In [22]:
kfold = KFold(n_splits=5, random_state=24, 
                        shuffle=True)
params = {'NB__alpha': np.linspace(0.001,3,10)}

gcv = GridSearchCV(pipe, param_grid=params,
                   scoring='roc_auc',          
                   cv=kfold, verbose=3)
# With GridSearchCV we not need train_test_split so fit on the X and y
gcv.fit(X,y)
print("Best Parameters: ",gcv.best_params_)
print("Best Score: ",gcv.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ...................NB__alpha=0.001;, score=0.737 total time=   0.0s
[CV 2/5] END ...................NB__alpha=0.001;, score=0.668 total time=   0.0s
[CV 3/5] END ...................NB__alpha=0.001;, score=0.714 total time=   0.0s
[CV 4/5] END ...................NB__alpha=0.001;, score=0.574 total time=   0.0s
[CV 5/5] END ...................NB__alpha=0.001;, score=0.716 total time=   0.0s
[CV 1/5] END .....NB__alpha=0.33422222222222225;, score=0.735 total time=   0.0s
[CV 2/5] END .....NB__alpha=0.33422222222222225;, score=0.656 total time=   0.0s
[CV 3/5] END .....NB__alpha=0.33422222222222225;, score=0.782 total time=   0.0s
[CV 4/5] END .....NB__alpha=0.33422222222222225;, score=0.617 total time=   0.0s
[CV 5/5] END .....NB__alpha=0.33422222222222225;, score=0.718 total time=   0.0s
[CV 1/5] END ......NB__alpha=0.6674444444444445;, score=0.730 total time=   0.0s
[CV 2/5] END ......NB__alpha=0.6674444444444445;

In [49]:
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.52      0.68      3429
           1       0.39      0.97      0.55      1070

    accuracy                           0.63      4499
   macro avg       0.68      0.74      0.62      4499
weighted avg       0.84      0.63      0.65      4499



# On HR DataSet

In [39]:
hr = pd.read_csv('HR_comma_sep.csv',index_col=0)

In [40]:
X = hr.drop(columns='left')
y = hr.left

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=24,test_size=0.3, stratify=y)


In [42]:
# Here we are using the discrete naive bayes' type


In [46]:
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore').set_output(transform='pandas')
# GaussianNB is used for the All Numerical data 
gnb = GaussianNB()
pipe = Pipeline([('OHE',ohe), ('GNB', gnb)])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))
pipe.get_params()

              precision    recall  f1-score   support

           0       0.98      0.52      0.68      3429
           1       0.39      0.97      0.55      1070

    accuracy                           0.63      4499
   macro avg       0.68      0.74      0.62      4499
weighted avg       0.84      0.63      0.65      4499



{'memory': None,
 'steps': [('OHE',
   OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
  ('GNB', GaussianNB())],
 'verbose': False,
 'OHE': OneHotEncoder(handle_unknown='ignore', sparse_output=False),
 'GNB': GaussianNB(),
 'OHE__categories': 'auto',
 'OHE__drop': None,
 'OHE__dtype': numpy.float64,
 'OHE__feature_name_combiner': 'concat',
 'OHE__handle_unknown': 'ignore',
 'OHE__max_categories': None,
 'OHE__min_frequency': None,
 'OHE__sparse_output': False,
 'GNB__priors': None,
 'GNB__var_smoothing': 1e-09}

In [47]:
y_pred_prob = pipe.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, y_pred_prob))

0.7478715082733038


## Using K fold with ROC_AUC

In [48]:
kfold = KFold(n_splits=5, random_state=24, 
                        shuffle=True)
params = {'GNB__var_smoothing': np.linspace(0.001,3,10)}

gcv = GridSearchCV(pipe, param_grid=params,
                   scoring='roc_auc',          
                   cv=kfold, verbose=3)
# With GridSearchCV we not need train_test_split so fit on the X and y
gcv.fit(X,y)
print("Best Parameters: ",gcv.best_params_)
print("Best Score: ",gcv.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ..........GNB__var_smoothing=0.001;, score=0.835 total time=   0.1s
[CV 2/5] END ..........GNB__var_smoothing=0.001;, score=0.845 total time=   0.0s
[CV 3/5] END ..........GNB__var_smoothing=0.001;, score=0.836 total time=   0.0s
[CV 4/5] END ..........GNB__var_smoothing=0.001;, score=0.843 total time=   0.0s
[CV 5/5] END ..........GNB__var_smoothing=0.001;, score=0.836 total time=   0.0s
[CV 1/5] END GNB__var_smoothing=0.33422222222222225;, score=0.935 total time=   0.0s
[CV 2/5] END GNB__var_smoothing=0.33422222222222225;, score=0.944 total time=   0.0s
[CV 3/5] END GNB__var_smoothing=0.33422222222222225;, score=0.944 total time=   0.0s
[CV 4/5] END GNB__var_smoothing=0.33422222222222225;, score=0.946 total time=   0.0s
[CV 5/5] END GNB__var_smoothing=0.33422222222222225;, score=0.938 total time=   0.0s
[CV 1/5] END GNB__var_smoothing=0.6674444444444445;, score=0.926 total time=   0.0s
[CV 2/5] END GNB__var_smo