## Diabetes Dataset

### Ensemble Learning: Bagging


We will use pima indian diabetes dataset to predict if a person has a diabetes or not based on certain features such as blood pressure, skin thickness, age etc. We will train a standalone model first and then use bagging ensemble technique to check how it can improve the performance of the model

dataset credit: https://www.kaggle.com/gargmanas/pima-indians-diabetes

In [4]:
import pandas as pd

columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
           'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

df= pd.read_csv('/content/pima-indians-diabetes.csv', names= columns, header= 0)
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0
...,...,...,...,...,...,...,...,...,...
762,10,101,76,48,180,32.9,0.171,63,0
763,2,122,70,27,0,36.8,0.340,27,0
764,5,121,72,23,112,26.2,0.245,30,0
765,1,126,60,0,0,30.1,0.349,47,1


In [5]:
df.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [6]:
df.Outcome.value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,267


In [7]:
X= df.drop('Outcome', axis= 1)
y= df.Outcome

In [8]:
from sklearn.preprocessing import StandardScaler

scaler= StandardScaler()
X_scaled= scaler.fit_transform(X)
X_scaled[:3]

array([[-0.84372629, -1.12208597, -0.16024856,  0.53202348, -0.69355921,
        -0.68372895, -0.36426474, -0.18894038],
       [ 1.23423997,  1.94447577, -0.26357823, -1.28688187, -0.69355921,
        -1.10230105,  0.60470064, -0.1037951 ],
       [-0.84372629, -0.99692019, -0.16024856,  0.15569823,  0.12235685,
        -0.49346891, -0.91968415, -1.0403932 ]])

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split(X_scaled, y, random_state=10)

In [10]:
X_train.shape

(575, 8)

In [11]:
X_test.shape

(192, 8)

In [12]:
y_test.value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,128
1,64


### Train using stand alone model

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

scores= cross_val_score(DecisionTreeClassifier(), X, y, cv=5) # KFold
scores

array([0.71428571, 0.65584416, 0.73856209, 0.76470588, 0.73202614])

In [14]:
scores.mean()

np.float64(0.7210847975553858)

### Train using Bagging

In [15]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_model= BaggingClassifier(
    estimator= DecisionTreeClassifier(), # the model will be used
    n_estimators= 100, # divide the data to how many part
    max_samples= 0.8, # the percentage of the data would be in each part
    oob_score=True, # the data that will not exist in all parts would be used as test data
    random_state=0
)

bag_model.fit(X_train, y_train)
bag_model.oob_score_

0.7704347826086957

In [16]:
bag_model.score(X_test, y_test)

0.734375

In [17]:
bag_model= BaggingClassifier(
    estimator= DecisionTreeClassifier(), # the model will be used
    n_estimators= 100, # divide the data to how many part
    max_samples= 0.8, # the percentage of the data would be in each part
    oob_score=True, # the data that will not exist in all parts would be used as test data
    random_state=0
)

scores= cross_val_score(bag_model, X, y, cv=5)
scores.mean()

np.float64(0.7588405058993294)

## Heart Failure Prediction Dataset

This dataset was created by combining different datasets already available independently but not combined before. In this dataset, 5 heart datasets are combined over 11 common features which makes it the largest heart disease dataset available so far for research purposes.

Data link:  https://www.kaggle.com/fedesoriano/heart-failure-prediction

In [35]:
df= pd.read_csv('/content/heart.csv')
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [36]:
import numpy as np
from scipy import stats


##Remove outliers using Z score. Usual guideline is to remove anything that has Z score > 3 formula or Z score < -3
# Compute the Z-scores
z_scores= np.abs(stats.zscore(df.select_dtypes(include=[np.number])))

# Filter rows where all z-scores are < 3
df_no_outliers= df[(z_scores < 3).all(axis=1)]

# Show result
print("Original shape:", df.shape)
print("Shape after removing outliers:", df_no_outliers.shape)

Original shape: (918, 12)
Shape after removing outliers: (899, 12)


In [37]:
df_no_outliers.isnull().sum()

Unnamed: 0,0
Age,0
Sex,0
ChestPainType,0
RestingBP,0
Cholesterol,0
FastingBS,0
RestingECG,0
MaxHR,0
ExerciseAngina,0
Oldpeak,0


In [38]:
df= df_no_outliers
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [39]:
from sklearn.preprocessing import LabelEncoder

le= LabelEncoder()
df['ST_Slope_Label']= le.fit_transform(df['ST_Slope'])
df['RestingECG_Label']= le.fit_transform(df['RestingECG'])

In [40]:
df= pd.get_dummies(df, columns= ['Sex', 'ChestPainType', 'ExerciseAngina'])
df

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,Oldpeak,ST_Slope,HeartDisease,ST_Slope_Label,RestingECG_Label,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,ExerciseAngina_N,ExerciseAngina_Y
0,40,140,289,0,Normal,172,0.0,Up,0,2,1,False,True,False,True,False,False,True,False
1,49,160,180,0,Normal,156,1.0,Flat,1,1,1,True,False,False,False,True,False,True,False
2,37,130,283,0,ST,98,0.0,Up,0,2,2,False,True,False,True,False,False,True,False
3,48,138,214,0,Normal,108,1.5,Flat,1,1,1,True,False,True,False,False,False,False,True
4,54,150,195,0,Normal,122,0.0,Up,0,2,1,False,True,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,Normal,132,1.2,Flat,1,1,1,False,True,False,False,False,True,True,False
914,68,144,193,1,Normal,141,3.4,Flat,1,1,1,False,True,True,False,False,False,True,False
915,57,130,131,0,Normal,115,1.2,Flat,1,1,1,False,True,True,False,False,False,False,True
916,57,130,236,0,LVH,174,0.0,Flat,1,1,0,True,False,False,True,False,False,True,False


In [42]:
df.drop(['RestingECG', 'ST_Slope'], axis=1, inplace=True)

In [43]:
from sklearn.preprocessing import StandardScaler

scaler= StandardScaler()
scaled_values= scaler.fit_transform(df)
df = pd.DataFrame(scaled_values, columns=df.columns)
df

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,ST_Slope_Label,RestingECG_Label,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,ExerciseAngina_N,ExerciseAngina_Y
0,-1.428154,0.465900,0.849636,-0.550362,1.384320,-0.855469,-1.099475,1.042496,0.014143,-0.515943,0.515943,-1.077524,2.063325,-0.534905,-0.229550,0.822945,-0.822945
1,-0.475855,1.634714,-0.168122,-0.550362,0.752973,0.137516,0.909525,-0.622165,0.014143,1.938199,-1.938199,-1.077524,-0.484655,1.869492,-0.229550,0.822945,-0.822945
2,-1.745588,-0.118507,0.793612,-0.550362,-1.535661,-0.855469,-1.099475,1.042496,1.603415,-0.515943,0.515943,-1.077524,2.063325,-0.534905,-0.229550,0.822945,-0.822945
3,-0.581666,0.349019,0.149344,-0.550362,-1.141069,0.634008,0.909525,-0.622165,0.014143,1.938199,-1.938199,0.928054,-0.484655,-0.534905,-0.229550,-1.215148,1.215148
4,0.053200,1.050307,-0.028064,-0.550362,-0.588640,-0.855469,-1.099475,1.042496,0.014143,-0.515943,0.515943,-1.077524,-0.484655,1.869492,-0.229550,0.822945,-0.822945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,-0.899099,-1.287320,0.616205,-0.550362,-0.194048,0.336112,0.909525,-0.622165,0.014143,-0.515943,0.515943,-1.077524,-0.484655,-0.534905,4.356349,0.822945,-0.822945
895,1.534554,0.699663,-0.046738,1.816985,0.161085,2.520678,0.909525,-0.622165,0.014143,-0.515943,0.515943,0.928054,-0.484655,-0.534905,-0.229550,0.822945,-0.822945
896,0.370633,-0.118507,-0.625646,-0.550362,-0.864854,0.336112,0.909525,-0.622165,0.014143,-0.515943,0.515943,0.928054,-0.484655,-0.534905,-0.229550,-1.215148,1.215148
897,0.370633,-0.118507,0.354763,-0.550362,1.463238,-0.855469,0.909525,-0.622165,-1.575130,1.938199,-1.938199,-1.077524,2.063325,-0.534905,-0.229550,0.822945,-0.822945


In [44]:
from sklearn.model_selection import train_test_split

X= df.drop('HeartDisease', axis=1)
y= df.HeartDisease

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=10)

In [58]:
y_train = y_train.astype(int)
y_test = y_test.astype(int)
y = y.astype(int)


In [54]:
from sklearn.tree import DecisionTreeClassifier

dtc= DecisionTreeClassifier()
dtc.fit(X_train, y_train)
dtc.score(X_test, y_test)

0.8277777777777777

In [64]:
from sklearn.model_selection import cross_val_score

scores= cross_val_score(DecisionTreeClassifier(), X, y, cv=5)
scores.mean()

np.float64(0.758504034761018)

In [60]:
zzbag_model= BaggingClassifier(
    estimator= DecisionTreeClassifier(), # the model will be used
    n_estimators= 100, # divide the data to how many part
    max_samples= 0.8, # the percentage of the data would be in each part
    oob_score=True, # the data that will not exist in all parts would be used as test data
    random_state=0
)
bag_model.fit(X_train, y_train)
bag_model.oob_score_

0.8539638386648123