In [None]:
import pandas as pd 
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the diabetes dataset from the CSV file next to the notebook,
# then preview its first two rows.
df = pd.read_csv(filepath_or_buffer='diabetes_prediction_dataset.csv')
df.head(n=2)

In [None]:
# List the dtype pandas inferred for every column of the frame.
df.dtypes

In [None]:
# Count how many rows fall into each class (0 / 1) of the target column `diabetes`.
# The resulting Series `f` is reused by the plotting cell to position the y-axis ticks.
f=df['diabetes'].value_counts()
print(f)

In [None]:
# Bar chart of the `diabetes` classes; the y ticks are placed exactly at the
# class counts held in `f`, so each bar height can be read off the axis.
sb.countplot(x='diabetes', data=df)
plt.yticks(ticks=f)
plt.show()

In [None]:
# Rows per class (0 / 1) of the `heart_disease` column, printed as a table ...
h = df['heart_disease'].value_counts()
print(h)

# ... and drawn as a bar chart whose y ticks sit exactly at those counts.
sb.countplot(x='heart_disease', data=df)
plt.yticks(ticks=h)
plt.show()

In [None]:
# Rows per class (0 / 1) of the `hypertension` column, printed as a table ...
h = df['hypertension'].value_counts()
print(h)

# ... and drawn as a bar chart whose y ticks sit exactly at those counts.
sb.countplot(x='hypertension', data=df)
plt.yticks(ticks=h)
plt.show()

In [None]:
# Replace the text categories of `smoking_history` with integer codes.
# The column is overwritten in `df`; the fitted encoder `le` keeps the mapping.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['smoking_history'] = le.fit_transform(df['smoking_history'])

# Show the dtypes again to confirm the column is now numeric.
df.dtypes

In [None]:
# Row count per encoded `smoking_history` category.
df['smoking_history'].value_counts()

In [None]:
# Rows per encoded category of the `smoking_history` column, printed as a table ...
s = df['smoking_history'].value_counts()
print(s)

# ... and drawn as a bar chart whose y ticks sit exactly at those counts.
sb.countplot(x='smoking_history', data=df)
plt.yticks(ticks=s)
plt.show()

In [None]:
# Count the null (missing) values in each column.
df.isnull().sum()

In [None]:
# Visualise the boolean null mask of the frame as a heatmap
# (one cell per row/column pair, coloured by whether the value is null).
sb.heatmap(df.isnull())
plt.show()

In [None]:
# Separate the model inputs from the target.
# Inputs: every column except `gender` and the target itself.
X = df.drop(columns=['gender', 'diabetes'])
# Target: the `diabetes` label column.
Y = df['diabetes']

In [None]:
# Preview the first rows of the input columns.
X.head()

In [None]:
# Preview the first values of the target.
Y.head()

In [None]:
# Class balance of the target before any resampling.
Y.value_counts()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

(X_train, X_test,
 Y_train, Y_test) = train_test_split(X, Y, random_state=1, test_size=0.3)

In [None]:
# Shapes of the four pieces produced by the split (train/test inputs, train/test targets).
X_train.shape,X_test.shape,Y_train.shape,Y_test.shape

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# the same fitted scaler is then applied to both the training and the test split.
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [None]:
# Display the training inputs after scaling.
X_train

In [None]:
#calling confusion_matrix
from sklearn.metrics import confusion_matrix
#calling classification report
from sklearn.metrics import classification_report

In [None]:
#This is a classification dataset.
#Every classification algorithm is used through the same steps:
#1. fit() trains the model on the 70% training split (inputs and outputs)
#2. predict() runs the model on the 30% test inputs and returns the predicted outputs
#3. print the classification report and the confusion matrix

#Create a user-defined function that takes the object of a classification algorithm
def create_model(model):
    """Train ``model`` on the training split and report how it does on the test split.

    Reads the notebook-level ``X_train``, ``Y_train``, ``X_test`` and ``Y_test``.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        The classifier to train; it is fitted in place.

    Returns
    -------
    The same ``model`` object that was passed in, after fitting.
    """
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)

    # Confusion matrix first, then the per-class precision/recall/F1 report.
    print(confusion_matrix(Y_test, predictions))
    print(classification_report(Y_test, predictions))
    return model

In [None]:
#perform model with LogisticRegression
from sklearn.linear_model import LogisticRegression

In [None]:
#Create a LogisticRegression object with its default settings
lr=LogisticRegression()

In [None]:
#Train and evaluate the model; create_model prints the metrics and returns the fitted model
lr=create_model(lr)

In [None]:
#Here recall = 0.63 (63%); we can try for a better score

In [None]:
#Balance the target variable diabetes.
#Use the RandomOverSampler class, defined in the over_sampling module of the imblearn package.
#Install it on your system the first time:
#!pip install imblearn

In [None]:
#call RandomOverSampler class
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Redo the 70/30 split from the unscaled X and Y (same seed as before), so the
# resampling section starts again from fresh train/test pieces.
(X_train, X_test,
 Y_train, Y_test) = train_test_split(X, Y, random_state=1, test_size=0.3)

In [None]:
# Class balance of the training targets before oversampling.
Y_train.value_counts()

In [None]:
# Class balance of the test targets.
Y_test.value_counts()

In [None]:
# Standardise the fresh split with a new scaler: fitted on the training inputs
# only, then applied to both the training and the test inputs.
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [None]:
#Create the RandomOverSampler object; random_state=1 fixes the seed it uses
ros=RandomOverSampler(random_state=1)

In [None]:
# Oversample the training split only: fit_resample() returns the resampled
# inputs and targets, stored here as X_train_ros / Y_train_ros.
X_train_ros, Y_train_ros = ros.fit_resample(X=X_train, y=Y_train)

In [None]:
# Class balance of the training targets after oversampling.
Y_train_ros.value_counts()

In [None]:
#Apply RandomOverSampler to the test inputs and outputs (X_test, Y_test) with fit_resample()
#NOTE(review): this resamples the held-out split as well, so any metric computed on
#X_test_ros / Y_test_ros is measured on a rebalanced class mix rather than on the
#class mix of X_test / Y_test -- confirm this is intended.
X_test_ros,Y_test_ros=ros.fit_resample(X_test,Y_test)

In [None]:
# Class balance of the oversampled test targets.
Y_test_ros.value_counts()

In [None]:
#Run the models again after applying the sampling technique
#Create a user-defined function that takes the object of a classification algorithm
def create_model(model):
    """Train ``model`` on the oversampled training data and report held-out metrics.

    The model is fitted on the notebook-level ``X_train_ros`` / ``Y_train_ros``
    and scored on the untouched ``X_test`` / ``Y_test``. Oversampling is a
    training-time remedy for class imbalance; scoring on an oversampled test set
    would measure the model on duplicated minority rows and an artificial class
    mix, so the printed precision/recall/accuracy would not describe performance
    on the real data distribution.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        The classifier to train; it is fitted in place.

    Returns
    -------
    The same ``model`` object that was passed in, after fitting.
    """
    model.fit(X_train_ros, Y_train_ros)

    # Evaluate on the original (non-resampled) test split.
    Y_pred = model.predict(X_test)

    # Confusion matrix first, then the per-class precision/recall/F1 report.
    print(confusion_matrix(Y_test, Y_pred))
    print(classification_report(Y_test, Y_pred))
    return model

In [None]:
#Create a fresh LogisticRegression object (default settings) for the run after resampling
lr=LogisticRegression()

In [None]:
#Call the function with the LogisticRegression object; it prints the metrics and returns the fitted model
lr=create_model(lr)

In [None]:
#Limitation of logistic regression: it does not select features automatically
#Now we use the second classification algorithm: decision tree classifier

#calling DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
#Create the DecisionTreeClassifier object.
#random_state is fixed, as for every other model in this notebook, so that the
#fitted tree -- and therefore the printed scores and the feature importances
#read from it later -- are the same on every run.
dt=DecisionTreeClassifier(random_state=1)

In [None]:
#Train and evaluate the decision tree; create_model prints the metrics and returns the fitted model
dt=create_model(dt)

In [None]:
#Here the recall score = 0.74 (74%); it is good, but we can make it much better

In [None]:
# Feature importances of the fitted tree `dt`, one value per input column of X.
IG = dt.feature_importances_

# Named `importance_columns` rather than `dict`, so the built-in dict type is
# not shadowed for the rest of the notebook session.
importance_columns = {'Input columns': X.columns, 'Information Gain': IG}
df1 = pd.DataFrame(importance_columns)

# Rank the input columns from most to least important.
df1.sort_values('Information Gain', ascending=False)

In [None]:
#The recall score changed from 63% to 74%
#Now try the decision tree classifier with the entropy split criterion (seed fixed for reproducibility)
dt1=DecisionTreeClassifier(criterion='entropy',random_state=1)

In [None]:
#Call the function: train and evaluate the entropy-based tree, keeping the fitted model
dt1=create_model(dt1)

In [None]:
# Feature importances of the entropy-based tree `dt1`, one value per input column of X.
IG = dt1.feature_importances_

# Named `importance_columns` rather than `dict`, so the built-in dict type is
# not shadowed for the rest of the notebook session.
importance_columns = {'Input columns': X.columns, 'Information Gain': IG}
df2 = pd.DataFrame(importance_columns)

# Rank the input columns from most to least important.
df2.sort_values('Information Gain', ascending=False)

In [None]:
#Limitation of DecisionTreeClassifier: the model tends to overfit
#To reduce overfitting, use a pruning technique
#Two pruning techniques are used here:
#1. max_depth
#2. min_samples_leaf

In [None]:
# Disabled sweep over max_depth = 1..8, kept as a string literal so it does not run.
# Remove the surrounding triple quotes to run it.
'''for i in range(1,9):
    #create decisionTreeclassifier class object
    dt2=DecisionTreeClassifier(max_depth=i,random_state=1)
    print('max_depth',i)
    #call the function
    dt2=create_model(dt2)'''

In [None]:
# Pruned tree with the depth capped at 9: build it, then train and evaluate it
# through create_model, keeping the fitted model in `dt2`.
dt2 = create_model(DecisionTreeClassifier(random_state=1, max_depth=9))

In [None]:
# Feature importances of the depth-limited tree `dt2`, one value per input column of X.
IG = dt2.feature_importances_

# Named `importance_columns` rather than `dict`, so the built-in dict type is
# not shadowed for the rest of the notebook session.
importance_columns = {'Input columns': X.columns, 'Information Gain': IG}
df2 = pd.DataFrame(importance_columns)

# Rank the input columns from most to least important.
df2.sort_values('Information Gain', ascending=False)

In [None]:
#Apply the second pruning technique: min_samples_leaf
#Values of min_samples_leaf from 50 to 100 (inclusive) are tried
#The sweep is kept as a string literal so it does not run; remove the triple quotes to run it
'''for i in range(50,101):
    #create object
    dt2=DecisionTreeClassifier(min_samples_leaf=i,random_state=1)
    print('min_sample',i)
    #call function
    dt2=create_model(dt2)'''

In [None]:
# Pruned tree requiring at least 56 samples per leaf: build it, then train and
# evaluate it through create_model, keeping the fitted model in `dt2`.
dt2 = create_model(DecisionTreeClassifier(random_state=1, min_samples_leaf=56))

In [None]:
#Clearly,
#the recall score with min_samples_leaf=50 (gini) is good compared to max_depth=5 (gini)
#NOTE: the cells above actually fit min_samples_leaf=56 and max_depth=9

In [None]:
#Ensembling technique
#Bootstrapping ensemble technique
#Use RandomForestClassifier: it works on the bagging technique
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Disabled sweep over n_estimators = 10..100 for the random forest, kept as a
# string literal so it does not run. Remove the surrounding triple quotes to run it.
'''for i in range(10,101):
    #create object of RandomForestClassifier class
    rfc=RandomForestClassifier(random_state=1,n_estimators=i)
    print('n_estimators',i)
    #call function 
    rfc=create_model(rfc)'''

In [None]:
# Random forest with 11 trees and a fixed seed: build it, then train and
# evaluate it through create_model, keeping the fitted model in `rfc`.
rfc = create_model(RandomForestClassifier(n_estimators=11, random_state=1))

In [None]:
#RandomForestClassifier: recall = 71% and accuracy = 97%

In [None]:
#Next Ensembling Technique :AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

In [None]:
X.shape    #(rows, columns) of the input frame; the original note records 7 columns
#NOTE(review): the original note derives a range of 1 (minimum) to 7 (maximum) decision
#stumps from the column count -- confirm that n_estimators is meant to be bounded by it

In [None]:
# Disabled sweep over n_estimators = 1..7 for AdaBoost, kept as a string literal
# so it does not run. Remove the surrounding triple quotes to run it.
'''for i in range(1,8):
    #create object of AdaBoostClassifier class
    abc=AdaBoostClassifier(random_state=1,n_estimators=i,)
    print('n_estimators',i)
    #call function 
    abc=create_model(abc)'''

In [None]:
# AdaBoost with 3 estimators and a fixed seed: build it, then train and
# evaluate it through create_model, keeping the fitted model in `abc`.
abc = create_model(AdaBoostClassifier(n_estimators=3, random_state=1))

In [None]:
# Feature importances of the fitted AdaBoost model `abc`, one value per input column of X.
IG = abc.feature_importances_

# Named `importance_columns` rather than `dict`, so the built-in dict type is
# not shadowed for the rest of the notebook session.
importance_columns = {'input column': X.columns, 'Information Gain': IG}
df1 = pd.DataFrame(importance_columns)

# Rank the input columns from most to least important.
df1.sort_values('Information Gain', ascending=False)

In [None]:
#now GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
#min decision tree =10, max=100

In [None]:
# Disabled sweep over n_estimators = 10..100 for gradient boosting, kept as a
# string literal so it does not run. Remove the surrounding triple quotes to run it.
'''for i in range(10,101):
    #create object of GradientBoostingClassifier
    gbc=GradientBoostingClassifier(random_state=1,n_estimators=i)
    print('n_estimators',i)
    #call function 
    gbc=create_model(gbc)'''

In [None]:
# Gradient boosting with 97 estimators and a fixed seed: build it, then train
# and evaluate it through create_model, keeping the fitted model in `gbc`.
gbc = create_model(GradientBoostingClassifier(n_estimators=97, random_state=1))

In [None]:
#recall score = 69% and accuracy = 97%

In [None]:
#3. Extreme Gradient Boosting (XGBoost): similar to gradient boosting
#It is the 3rd boosting ensemble technique
#XGBoost is an optimised implementation of gradient boosting

In [None]:
#!pip install xgBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Disabled sweep over n_estimators = 10..100 for XGBoost, kept as a string
# literal so it does not run. Remove the surrounding triple quotes to run it.
'''for i in range(10,101):
    #create object of import XGBClassifier class
    xgb=XGBClassifier(random_state=1,n_estimators=i,reg_alpha=1)     #1 means True
    print('n_estimators',i)
    #call function
    xgb=create_model(xgb)'''

In [None]:
#Create the XGBClassifier object with 74 estimators and a fixed seed
#NOTE(review): reg_alpha is a numeric regularisation weight (XGBoost's L1 term), not an on/off flag -- the value 1 is a strength, not "True"
xgb=XGBClassifier(random_state=1,n_estimators=74,reg_alpha=1)
#Call the function: train and evaluate, keeping the fitted model
xgb=create_model(xgb)

In [None]:
#recall score = 70% and accuracy = 97%

In [None]:
#now svm : support vector machine
#use inner class LinearSVC of the outer class svm
from sklearn.svm import LinearSVC

In [None]:
#Create the LinearSVC object with its default regularisation
#NOTE(review): the original note called the default a hard margin; scikit-learn's LinearSVC defaults to C=1.0, which is a soft margin -- confirm
svc=LinearSVC(random_state=1)
#Call the function: train and evaluate, keeping the fitted model
svc=create_model(svc)

In [None]:
#Lower recall score = 61% and lower accuracy = 96%. Suspected reason for the lower score:
#outliers found at training time. To tolerate some training error, use a
#soft margin:
#create the LinearSVC object and pass the error parameter C
#NOTE(review): the original note restricts C to the range 0-1; scikit-learn accepts any
#positive C, and C=0.99 is almost identical to the default C=1.0 -- confirm the intended value
svc1=LinearSVC(random_state=1,C=0.99)
#Call the function: train and evaluate, keeping the fitted model
svc1=create_model(svc1)

In [None]:
#Clearly, the recall score and accuracy do not change when the error term C is added during training,
#which suggests the data is not linearly separable.
#So use a kernel function:
#Polynomial kernel function: increases the dimension

from sklearn.svm import SVC

In [None]:
# SVC with the polynomial kernel and a fixed seed: build it, then train and
# evaluate it through create_model, keeping the fitted model in `p_svc`.
p_svc = create_model(SVC(kernel='poly', random_state=1))

In [None]:
#lower recall score = 60%

In [None]:
#Create the SVC object with the radial basis function (RBF) kernel
r_svc=SVC(random_state=1,kernel='rbf')
#The train/evaluate call below is left disabled, so this cell only creates the object
#r_svc=create_model(r_svc)