In [24]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score, classification_report, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression, ElasticNet, LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.pipeline import Pipeline
import os 
import warnings
warnings.filterwarnings('ignore')

# On Kaggle competition Dataset 
 - Binary Prediction of Smoker Status using Bio-Signals


In [25]:
smoke = pd.read_csv(r'playground-series-s3e24\train.csv',index_col=0)
smoke

Unnamed: 0_level_0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,55,165,60,81.0,0.5,0.6,1,1,135,87,...,40,75,16.5,1,1.0,22,25,27,0,1
1,70,165,65,89.0,0.6,0.7,2,2,146,83,...,57,126,16.2,1,1.1,27,23,37,1,0
2,20,170,75,81.0,0.4,0.5,1,1,118,75,...,45,93,17.4,1,0.8,27,31,53,0,1
3,35,180,95,105.0,1.5,1.2,1,1,131,88,...,38,102,15.9,1,1.0,20,27,30,1,0
4,30,165,60,80.5,1.5,1.0,1,1,121,76,...,44,93,15.4,1,0.8,19,13,17,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159251,40,155,45,69.0,1.5,2.0,1,1,127,80,...,72,159,14.5,1,0.8,25,26,13,0,0
159252,50,155,75,82.0,1.0,1.0,1,1,120,80,...,64,108,14.5,1,0.6,21,20,18,0,0
159253,40,160,50,66.0,1.5,1.0,1,1,114,70,...,87,93,10.9,1,0.6,15,9,12,0,0
159254,50,165,75,92.0,1.2,1.0,1,1,121,90,...,55,80,14.4,1,1.1,22,17,37,0,1


In [26]:
smoke.info()

<class 'pandas.core.frame.DataFrame'>
Index: 159256 entries, 0 to 159255
Data columns (total 23 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   age                  159256 non-null  int64  
 1   height(cm)           159256 non-null  int64  
 2   weight(kg)           159256 non-null  int64  
 3   waist(cm)            159256 non-null  float64
 4   eyesight(left)       159256 non-null  float64
 5   eyesight(right)      159256 non-null  float64
 6   hearing(left)        159256 non-null  int64  
 7   hearing(right)       159256 non-null  int64  
 8   systolic             159256 non-null  int64  
 9   relaxation           159256 non-null  int64  
 10  fasting blood sugar  159256 non-null  int64  
 11  Cholesterol          159256 non-null  int64  
 12  triglyceride         159256 non-null  int64  
 13  HDL                  159256 non-null  int64  
 14  LDL                  159256 non-null  int64  
 15  hemoglobin           1

In [27]:
smoke.isna().sum()

age                    0
height(cm)             0
weight(kg)             0
waist(cm)              0
eyesight(left)         0
eyesight(right)        0
hearing(left)          0
hearing(right)         0
systolic               0
relaxation             0
fasting blood sugar    0
Cholesterol            0
triglyceride           0
HDL                    0
LDL                    0
hemoglobin             0
Urine protein          0
serum creatinine       0
AST                    0
ALT                    0
Gtp                    0
dental caries          0
smoking                0
dtype: int64

In [28]:
X_train = smoke.drop(columns='smoking')
y_train = smoke.smoking

In [None]:
## Using Logistic Regression

In [18]:
lr = LogisticRegression(random_state=24)

kfold = KFold(random_state=24, n_splits=5, shuffle=True)
params = {'solver': ['lbfgs', 'liblinear','newton-cholesky','sag','saga'], 'C': np.linspace(0.001,10,5)}

gcv = GridSearchCV(lr, param_grid=params,
                   scoring='roc_auc',          
                   cv=kfold, verbose=3)

gcv.fit(X_train, y_train)

print("Best Parameters: ",gcv.best_params_)
print("Best Score: ",gcv.best_score_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .............C=0.001, solver=lbfgs;, score=0.821 total time=   1.0s
[CV 2/5] END .............C=0.001, solver=lbfgs;, score=0.815 total time=   1.2s
[CV 3/5] END .............C=0.001, solver=lbfgs;, score=0.815 total time=   1.1s
[CV 4/5] END .............C=0.001, solver=lbfgs;, score=0.817 total time=   1.0s
[CV 5/5] END .............C=0.001, solver=lbfgs;, score=0.812 total time=   1.1s
[CV 1/5] END .........C=0.001, solver=liblinear;, score=0.825 total time=   6.2s
[CV 2/5] END .........C=0.001, solver=liblinear;, score=0.821 total time=   5.8s
[CV 3/5] END .........C=0.001, solver=liblinear;, score=0.820 total time=   4.4s
[CV 4/5] END .........C=0.001, solver=liblinear;, score=0.820 total time=   3.2s
[CV 5/5] END .........C=0.001, solver=liblinear;, score=0.817 total time=   5.4s
[CV 1/5] END ...C=0.001, solver=newton-cholesky;, score=0.838 total time=   0.2s
[CV 2/5] END ...C=0.001, solver=newton-cholesky

In [19]:
pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)

(25, 15)


In [22]:
# Inferencing
best_model = gcv.best_estimator_
test = pd.read_csv(r'playground-series-s3e24\test.csv',index_col=0)

y_pred_prob = best_model.predict_proba(test)[:,1]   # because we need probability of 1

submit = pd.read_csv(r'playground-series-s3e24/sample_submission.csv')
submit['smoking'] = y_pred_prob
submit.to_csv('playground-series-s3e24/logisticR_submission.csv', index=False)

## Using GausianNB

In [30]:
gnb = GaussianNB()

kfold = KFold(n_splits=5, random_state=24, shuffle=True)
params = {'var_smoothing': np.linspace(0.001,3,10)}

gcv = GridSearchCV(gnb, param_grid=params, scoring='roc_auc', cv=kfold, verbose=3)
# With GridSearchCV we not need train_test_split so fit on the X and y
gcv.fit(X_train,y_train)
print("Best Parameters: ",gcv.best_params_)
print("Best Score: ",gcv.best_score_)
# gnb.get_params()

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ...............var_smoothing=0.001;, score=0.796 total time=   0.0s
[CV 2/5] END ...............var_smoothing=0.001;, score=0.792 total time=   0.0s
[CV 3/5] END ...............var_smoothing=0.001;, score=0.791 total time=   0.0s
[CV 4/5] END ...............var_smoothing=0.001;, score=0.794 total time=   0.0s
[CV 5/5] END ...............var_smoothing=0.001;, score=0.789 total time=   0.0s
[CV 1/5] END .var_smoothing=0.33422222222222225;, score=0.776 total time=   0.0s
[CV 2/5] END .var_smoothing=0.33422222222222225;, score=0.769 total time=   0.0s
[CV 3/5] END .var_smoothing=0.33422222222222225;, score=0.770 total time=   0.0s
[CV 4/5] END .var_smoothing=0.33422222222222225;, score=0.772 total time=   0.0s
[CV 5/5] END .var_smoothing=0.33422222222222225;, score=0.768 total time=   0.0s
[CV 1/5] END ..var_smoothing=0.6674444444444445;, score=0.767 total time=   0.0s
[CV 2/5] END ..var_smoothing=0.6674444444444445;

In [31]:
pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)

(10, 14)


In [33]:
# Inferencing
best_model = gcv.best_estimator_
test = pd.read_csv(r'playground-series-s3e24\test.csv',index_col=0)

y_pred_prob = best_model.predict_proba(test)[:,1]   # because we need probability of 1

submit = pd.read_csv(r'playground-series-s3e24/sample_submission.csv')
submit['smoking'] = y_pred_prob
submit.to_csv('playground-series-s3e24/GaussianNB_submission.csv', index=False)
y_pred_prob

array([7.20231012e-01, 1.86914357e-02, 9.55862648e-01, ...,
       8.31303580e-01, 4.49885924e-02, 9.34501644e-04])