In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data=pd.read_csv("smoking.csv")

In [3]:
data.head(4)

Unnamed: 0,ID,gender,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),...,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,oral,dental caries,tartar,smoking
0,0,F,40,155,60,81.3,1.2,1.0,1.0,1.0,...,12.9,1.0,0.7,18.0,19.0,27.0,Y,0,Y,0
1,1,F,40,160,60,81.0,0.8,0.6,1.0,1.0,...,12.7,1.0,0.6,22.0,19.0,18.0,Y,0,Y,0
2,2,M,55,170,60,80.0,0.8,0.8,1.0,1.0,...,15.8,1.0,1.0,21.0,16.0,22.0,Y,0,N,1
3,3,M,40,165,70,88.0,1.5,1.5,1.0,1.0,...,14.7,1.0,1.0,19.0,26.0,18.0,Y,0,Y,0


In [4]:
data.duplicated().sum()

0

In [5]:
data.drop(columns=["ID","waist(cm)"],axis=1,inplace=True)

**Data Cleaning** 

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55692 entries, 0 to 55691
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               55692 non-null  object 
 1   age                  55692 non-null  int64  
 2   height(cm)           55692 non-null  int64  
 3   weight(kg)           55692 non-null  int64  
 4   eyesight(left)       55692 non-null  float64
 5   eyesight(right)      55692 non-null  float64
 6   hearing(left)        55692 non-null  float64
 7   hearing(right)       55692 non-null  float64
 8   systolic             55692 non-null  float64
 9   relaxation           55692 non-null  float64
 10  fasting blood sugar  55692 non-null  float64
 11  Cholesterol          55692 non-null  float64
 12  triglyceride         55692 non-null  float64
 13  HDL                  55692 non-null  float64
 14  LDL                  55692 non-null  float64
 15  hemoglobin           55692 non-null 

In [7]:
data.isnull().sum()

gender                 0
age                    0
height(cm)             0
weight(kg)             0
eyesight(left)         0
eyesight(right)        0
hearing(left)          0
hearing(right)         0
systolic               0
relaxation             0
fasting blood sugar    0
Cholesterol            0
triglyceride           0
HDL                    0
LDL                    0
hemoglobin             0
Urine protein          0
serum creatinine       0
AST                    0
ALT                    0
Gtp                    0
oral                   0
dental caries          0
tartar                 0
smoking                0
dtype: int64

In [8]:
for i in [1,2,3,4,5,8,9,10,11,12,13,14,15,17,18,19,20]:
    q1=np.percentile(data.iloc[:,i],25)
    q3=np.percentile(data.iloc[:,i],75)
    IQR=q3-q1
    min=q1-(1.5*IQR)
    max=q3+(1.5*IQR)
    data.iloc[:,i]=np.where(data.iloc[:,i]>max,max,np.where(data.iloc[:,i]<min,min,data.iloc[:,i]))

**Feature Engineering**

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [10]:
data.head(2)

Unnamed: 0,gender,age,height(cm),weight(kg),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,oral,dental caries,tartar,smoking
0,F,40.0,155,60,1.2,1.0,1.0,1.0,114.0,73.0,...,12.9,1.0,0.7,18.0,19.0,27.0,Y,0,Y,0
1,F,40.0,160,60,0.8,0.6,1.0,1.0,119.0,70.0,...,12.7,1.0,0.6,22.0,19.0,18.0,Y,0,Y,0


In [11]:
ohe=ColumnTransformer(transformers=[
    ("ohe",OneHotEncoder(drop="first",dtype=np.int32,sparse=False),[0,21,23])
],remainder="passthrough")

In [12]:
scl=ColumnTransformer(transformers=[
    ("scale",StandardScaler(),slice(0,23))
],remainder="passthrough")

In [13]:
pca=ColumnTransformer(transformers=[
    ("pca",PCA(n_components=5),slice(0,23))
])

In [14]:
pipe=Pipeline([
    ("onehotencoding",ohe),
    ("pca",pca),
    ("scaling",scl)
])

In [15]:
pipe

In [16]:
X=data.drop(columns=["smoking"])

In [17]:
Y=data["smoking"]

In [18]:
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=100)

In [19]:
X_train.shape

(44553, 24)

In [20]:
X_train=pipe.fit_transform(X_train)



In [21]:
X_train.shape

(44553, 5)

In [22]:
X_test=pipe.transform(X_test)

In [23]:
X_test.shape

(11139, 5)

# **Model Creation**

**Logistic Regression**

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [25]:
lgr=LogisticRegression()

In [26]:
lgr.fit(X_train,y_train)

In [27]:
X1=np.concatenate((X_train,X_test))

In [28]:
cross_val_score(lgr,X1,Y).mean()*100

63.27120584721466

**Decision Tree**

In [29]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [30]:
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=100)

In [31]:
dt=DecisionTreeClassifier()

In [32]:
pipe=Pipeline([
    ("onehotencoding",ohe),
    ("pca",pca),
    ("scaling",scl),
    ("dt",dt)
])

In [33]:
param_grid={"dt__max_depth":[3,5,7,9,10],
           "dt__max_leaf_nodes":[5,7,9,12,13],
           "dt__min_samples_leaf":[10,15,25,30]}

In [34]:
dt_cv=GridSearchCV(estimator=pipe,
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2,
                   scoring="accuracy")

In [35]:
dt_cv.fit(X_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits




In [36]:
y_pred=dt_cv.predict(X_test)

In [37]:
accuracy_score(y_pred,y_test)*100

67.3489541251459

In [38]:
##Without PCA:
pipe_npc=Pipeline([
    ("onehotencoding",ohe),
    ("scaling",scl),
    ("dt",dt)
])
param_grid={"dt__max_depth":[3,5,7,9,10],
           "dt__max_leaf_nodes":[5,7,9,12,13],
           "dt__min_samples_leaf":[10,15,25,30]}
dt_cv_npc=GridSearchCV(estimator=pipe_npc,
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2,
                   scoring="accuracy")
dt_cv_npc.fit(X_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits




In [39]:
y_pred=dt_cv_npc.predict(X_test)

In [40]:
accuracy_score(y_pred,y_test)*100

73.93841457940569

**RandomForest Classifier**

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
rf=RandomForestClassifier()

In [43]:
pipe=Pipeline([
    ("onehotencoding",ohe),
    ("pca",pca),
    ("scaling",scl),
    ("rf",rf)
])

In [44]:
param_grid={"rf__max_depth":[3,5,7,9,10,None],
           "rf__max_leaf_nodes":[5,7,9,12,17,None],
           "rf__min_samples_leaf":[1,2,3]}

In [45]:
rf_cv=GridSearchCV(estimator=pipe,
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2,
                   scoring="accuracy")

In [46]:
rf_cv.fit(X_train,y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits




In [47]:
rf_cv.best_params_

{'rf__max_depth': None, 'rf__max_leaf_nodes': None, 'rf__min_samples_leaf': 1}

In [48]:
y_pred=rf_cv.predict(X_test)

In [49]:
accuracy_score(y_pred,y_test)*100

77.76281533351288

In [50]:
#Without PCA
pipe_npc=Pipeline([
    ("onehotencoding",ohe),
    ("scaling",scl),
    ("rf",rf)
])
param_grid={"rf__max_depth":[3,5,7,9,10,None],
           "rf__max_leaf_nodes":[5,7,9,12,17,None],
           "rf__min_samples_leaf":[1,2,3]}
rf_cv_npc=GridSearchCV(estimator=pipe_npc,
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2,
                   scoring="accuracy")
rf_cv_npc.fit(X_train,y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits




In [51]:
y_pred=rf_cv_npc.predict(X_test)

In [52]:
accuracy_score(y_pred,y_test)*100

82.9966783373732

In [53]:
rf_cv_npc.best_params_

{'rf__max_depth': None, 'rf__max_leaf_nodes': None, 'rf__min_samples_leaf': 1}

**GradientBoosting Classifier**

In [54]:
from sklearn.ensemble import GradientBoostingClassifier

In [55]:
gbc=GradientBoostingClassifier()
param_grid={"gbc__max_depth":[10,15,30],
            "gbc__learning_rate":[0.1,0.2,0.01,0.7],
            "gbc__n_estimators":[18,28,35,40]}
pipe=Pipeline([
    ("onehotencoding",ohe),
    ("pca",pca),
    ("scaling",scl),
    ("gbc",gbc)
])
gbc_cv=GridSearchCV(estimator=pipe,
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2)

In [56]:
gbc_cv.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits




In [57]:
y_pred=gbc_cv.predict(X_test)

In [58]:
gbc_cv.best_params_

{'gbc__learning_rate': 0.1, 'gbc__max_depth': 15, 'gbc__n_estimators': 40}

In [59]:
accuracy_score(y_pred,y_test)*100

77.58326600233414

In [60]:
#Without PCA
pipe_npc=Pipeline([
    ("onehotencoding",ohe),
    ("scaling",scl),
    ("gbc",gbc)
])
param_grid={"gbc__max_depth":[10,15,30],
            "gbc__learning_rate":[0.1,0.2,0.01,0.7],
            "gbc__n_estimators":[18,28,35,40]}
gbc_cv_npc=GridSearchCV(estimator=pipe_npc,
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2,
                   scoring="accuracy")
gbc_cv_npc.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits




In [61]:
y_pred=gbc_cv_npc.predict(X_test)

In [62]:
accuracy_score(y_pred,y_test)*100

82.31439087889397

**Adaboost Classifier**

In [63]:
from sklearn.ensemble import AdaBoostClassifier
adc=AdaBoostClassifier()
pipe=Pipeline([
    ("onehotencoding",ohe),
    ("pca",pca),
    ("scaling",scl),
    ("adc",adc)
])
param_grid={"adc__n_estimators":[30,60,80]}
adc_cv=GridSearchCV(estimator=pipe,
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2)

In [64]:
adc_cv.fit(X_train,y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits




In [65]:
y_pred=adc_cv.predict(X_test)

In [66]:
adc_cv.best_params_

{'adc__n_estimators': 60}

In [67]:
accuracy_score(y_pred,y_test)*100

69.18035730316905

In [68]:
#Without PCA
pipe_npc=Pipeline([
    ("onehotencoding",ohe),
    ("scaling",scl),
    ("adc",adc)
])
param_grid={"adc__n_estimators":[30,60,80]}
adc_cv_npc=GridSearchCV(estimator=pipe_npc,
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2,
                   scoring="accuracy")
adc_cv_npc.fit(X_train,y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits




In [69]:
y_pred=adc_cv_npc.predict(X_test)

In [70]:
accuracy_score(y_pred,y_test)*100

76.10198402010953

**KNearestNeighbors**

In [71]:
from sklearn.neighbors import KNeighborsClassifier

In [72]:
knn=KNeighborsClassifier()
pipe=Pipeline([
    ("onehotencoding",ohe),
    ("pca",pca),
    ("scaling",scl),
    ("knn",knn)
])
param_grid={"knn__n_neighbors":[5,10,15,20]}
knn_cv=GridSearchCV(estimator=pipe,
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2)

In [73]:
knn_cv.fit(X_train,y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




In [74]:
y_pred=knn_cv.predict(X_test)

In [75]:
knn_cv.best_params_

{'knn__n_neighbors': 20}

In [76]:
accuracy_score(y_pred,y_test)*100

68.89307837328306

In [77]:
#Without PCA
pipe_npc=Pipeline([
    ("onehotencoding",ohe),
    ("scaling",scl),
    ("knn",knn)
])
param_grid={"knn__n_neighbors":[5,10,15,20]}
knn_cv_npc=GridSearchCV(estimator=pipe_npc,
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2,
                   scoring="accuracy")
knn_cv_npc.fit(X_train,y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




In [78]:
y_pred=knn_cv_npc.predict(X_test)

In [79]:
accuracy_score(y_pred,y_test)*100

73.83966244725738

**SVM**

In [80]:
from sklearn.svm import SVC

In [81]:
svc=SVC()
pipe=Pipeline([
    ("onehotencoding",ohe),
    ("pca",pca),
    ("scaling",scl),
    ("svc",svc)
])
param_grid={"svc__kernel":["linear","rbf"],
            "svc__gamma":[0.2,0.5,0.7,1.3]}
svc_cv=GridSearchCV(estimator=pipe,
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2)

In [82]:
svc_cv.fit(X_train,y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits




In [83]:
y_pred=svc_cv.predict(X_test)

In [84]:
accuracy_score(y_pred,y_test)*100

69.88059969476613

In [85]:
#Without PCA
pipe_npc=Pipeline([
    ("onehotencoding",ohe),
    ("scaling",scl),
    ("svc",svc)
])
param_grid={"svc__kernel":["linear","rbf"],
            "svc__gamma":[0.2,0.5,0.7,1.3]}
svc_cv_npc=GridSearchCV(estimator=pipe_npc,
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2,
                   scoring="accuracy")
svc_cv_npc.fit(X_train,y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits




In [86]:
y_pred=svc_cv_npc.predict(X_test)

In [87]:
accuracy_score(y_pred,y_test)*100

79.32489451476793

**XBGClassifier**

In [88]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

In [89]:
xgb=XGBClassifier()
pipe=Pipeline([
    ("onehotencoding",ohe),
    ("pca",pca),
    ("scaling",scl),
    ("xgb",xgb)])
param_grid={"xgb__max_depth":[10,15,30],
            "xgb__gamma":[0.2,0.5,0.7,1.3],
            "xgb__learning_rate":[0.2,0.5,0.9,1]}
xgb_cv=GridSearchCV(estimator=pipe,
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2)

In [90]:
xgb_cv.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits




In [91]:
y_pred=xgb_cv.predict(X_test)

In [92]:
xgb_cv.best_params_

{'xgb__gamma': 0.2, 'xgb__learning_rate': 0.2, 'xgb__max_depth': 15}

In [93]:
accuracy_score(y_pred,y_test)*100

76.64063201364574

In [94]:
#Without PCA
pipe_npc=Pipeline([
    ("onehotencoding",ohe),
    ("scaling",scl),
    ("xgb",xgb)
])
param_grid={"xgb__max_depth":[10,15,30],
            "xgb__gamma":[0.2,0.5,0.7,1.3],
            "xgb__learning_rate":[0.2,0.5,0.9,1]}
xgb_cv_npc=GridSearchCV(estimator=pipe_npc,
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2,
                   scoring="accuracy")
xgb_cv_npc.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits




In [95]:
y_pred=xgb_cv_npc.predict(X_test)

In [96]:
accuracy_score(y_pred,y_test)*100

82.62860220845677

**NaiveBayes**

In [97]:
from sklearn.naive_bayes import GaussianNB

In [98]:
nv=GaussianNB()
pipe=Pipeline([
    ("onehotencoding",ohe),
    ("pca",pca),
    ("scaling",scl),
    ("nv",nv)
])
nv_cv=GridSearchCV(estimator=pipe,
                   param_grid={},
                   cv=5,
                   n_jobs=-1,
                   verbose=2)

In [99]:
nv_cv.fit(X_train,y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits




In [100]:
y_pred=nv_cv.predict(X_test)

In [101]:
accuracy_score(y_pred,y_test)*100

68.29158811383428

In [102]:
#Without PCA
pipe_npc=Pipeline([
    ("onehotencoding",ohe),
    ("scaling",scl),
    ("nv",nv)
])
param_grid={}
nv_cv_npc=GridSearchCV(estimator=pipe_npc,
                   param_grid=param_grid,
                   cv=5,
                   n_jobs=-1,
                   verbose=2,
                   scoring="accuracy")
nv_cv_npc.fit(X_train,y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits




In [103]:
y_pred=nv_cv_npc.predict(X_test)

In [104]:
accuracy_score(y_pred,y_test)*100

70.38333782206662

**With PCA**

*XGBClassifier : 76.64*                          

*SVC : 69.88*

*KNN : 68.89* 

*AdaBoostClassifier : 69.18*

*GradientboostingClassifier : 77.59* 
 
*RandomForestClassifier : 77.76*  

*DecisionTreeClassifier : 67.34*  

*LogisticRegressor : 63.27* 

*Naive Bayes : 68.29*

**Without PCA**

*XGBClassifier : 82.62* 

*SVC : 79.32* 

*KNN : 73.83* 

*AdaBoostClassifier : 76.10* 

*GradientboostingClassifier : 82.31*

*RandomForestClassifier : 82.99* 

*DecisionTreeClassifier : 73.93* 

*LogisticRegressor : 63.27* 

*Naive Bayes : 70.38* 

In [105]:
## FINAL MODEL:
rff=RandomForestClassifier(max_depth=None, max_leaf_nodes=None, min_samples_leaf=1)
pipef=Pipeline([
    ("onehotencoding",ohe),
    ("scaling",scl),
    ("rff",rff)
])
pipef.fit(X_train,y_train)



In [106]:
import pickle

In [107]:
pickle.dump(pipef,open("model.pkl","wb"))