In [57]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error, confusion_matrix, accuracy_score, roc_curve, roc_auc_score,log_loss,classification_report
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder, OneHotEncoder,MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet, LogisticRegression, LinearRegression
from sklearn.compose import make_column_transformer, make_column_selector
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB, GaussianNB

In [27]:
df = pd.read_csv("Cancer.csv",index_col = 0)

In [29]:
df.head()


Unnamed: 0_level_0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
subjid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,40-49,premeno,15 to 19,0 to 2,yes,three,right,left_up,no,recurrence-events
2,50-59,ge40,15 to 19,0 to 2,no,one,right,central,no,no-recurrence-events
3,50-59,ge40,35 to 39,0 to 2,no,two,left,left_low,no,recurrence-events
4,40-49,premeno,35 to 39,0 to 2,yes,three,right,left_low,yes,no-recurrence-events
5,40-49,premeno,30 to 34,3 to 5,yes,two,left,right_up,no,recurrence-events


In [31]:
X = df.drop("Class", axis = 1)
y = df["Class"]

In [33]:
X_train,X_test, y_train,y_test = train_test_split(X, y, random_state = 24, test_size = 0.3, stratify = y)

In [35]:
ohe = OneHotEncoder(sparse_output = False,handle_unknown = 'ignore').set_output(transform='pandas')


In [37]:
nb = BernoulliNB()
pipe = Pipeline([('OHE',ohe),('NB',nb)])
pipe.fit(X_train,y_train)

In [39]:
y_pred = pipe.predict(X_test)

In [41]:
print(classification_report(y_test, y_pred))

                      precision    recall  f1-score   support

no-recurrence-events       0.80      0.78      0.79        60
   recurrence-events       0.52      0.54      0.53        26

            accuracy                           0.71        86
           macro avg       0.66      0.66      0.66        86
        weighted avg       0.71      0.71      0.71        86



In [45]:
y_pred_prob = pipe.predict_proba(X_test)[:,1]

In [49]:
print(roc_auc_score(y_test,y_pred_prob))

0.75


In [None]:
############################### Gridsearch

In [51]:
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 24)
#alpha = np.linspace(0.001,3,10)
params = {'NB__alpha':np.linspace(0.001,3,10)}
gcv = GridSearchCV(pipe,param_grid = params,verbose = 3,scoring = 'roc_auc', cv = kfold) 
gcv.fit(X,y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ...................NB__alpha=0.001;, score=0.769 total time=   0.0s
[CV 2/5] END ...................NB__alpha=0.001;, score=0.701 total time=   0.0s
[CV 3/5] END ...................NB__alpha=0.001;, score=0.656 total time=   0.0s
[CV 4/5] END ...................NB__alpha=0.001;, score=0.710 total time=   0.0s
[CV 5/5] END ...................NB__alpha=0.001;, score=0.629 total time=   0.0s
[CV 1/5] END .....NB__alpha=0.33422222222222225;, score=0.788 total time=   0.0s
[CV 2/5] END .....NB__alpha=0.33422222222222225;, score=0.704 total time=   0.0s
[CV 3/5] END .....NB__alpha=0.33422222222222225;, score=0.688 total time=   0.0s
[CV 4/5] END .....NB__alpha=0.33422222222222225;, score=0.754 total time=   0.0s
[CV 5/5] END .....NB__alpha=0.33422222222222225;, score=0.643 total time=   0.0s
[CV 1/5] END ......NB__alpha=0.6674444444444445;, score=0.795 total time=   0.0s
[CV 2/5] END ......NB__alpha=0.6674444444444445;

In [53]:
print(gcv.best_params_)
print(gcv.best_score_)

{'NB__alpha': 2.666777777777778}
0.7194010043041608


In [55]:
hr_df = pd.read_csv("HR_comma_sep.csv")

In [59]:
X = hr_df.drop('left',axis = 1)
y = hr_df['left']

In [61]:
X_train,X_test, y_train,y_test = train_test_split(X, y, random_state = 24, test_size = 0.3, stratify = y)

In [63]:
ohe = OneHotEncoder(sparse_output = False,handle_unknown = 'ignore').set_output(transform='pandas')


In [65]:
gb = GaussianNB()
pipe = Pipeline([('OHE',ohe),('GB',gb)])
pipe.fit(X_train,y_train)

In [75]:
##################

kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 24)
params = {'GB__var_smoothing':np.linspace(1e-15,1,10)}

gcv = GridSearchCV(pipe,param_grid = params,
                  scoring = 'roc_auc', cv = kfold)



In [77]:
gcv.fit(X,y)

In [78]:
print(gcv.best_params_)
print(gcv.best_score_)

{'GB__var_smoothing': 0.111111111111112}
0.9654732081687083
