In [1]:
import numpy as np
import pandas as pd

In [57]:
df = pd.read_csv('diabetes.csv')

In [58]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [59]:
df.shape

(768, 9)

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [61]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

# X & Y split

In [62]:
X = df.drop('Outcome', axis=1)
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [63]:
Y = df['Outcome']
Y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [64]:
X.shape, Y.shape

((768, 8), (768,))

In [65]:
X.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
dtype: object

In [66]:
X_ohe = pd.get_dummies(X)

In [67]:
X_ohe.shape

(768, 8)

In [68]:
X_ohe.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
dtype: object

# Format the target Y

In [69]:
Y.dtypes

dtype('int64')

In [70]:
Y.value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [71]:
Y.unique()

array([1, 0], dtype=int64)

# train test split

In [74]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_ohe, Y, test_size=0.3, random_state=7, stratify=Y)

In [75]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((537, 8), (231, 8), (537,), (231,))

# Feature selection

# using PCA

In [76]:
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel

In [77]:
pca = PCA(n_components = 0.95, random_state=7)

In [78]:
pca.fit(X_train)
# Y_train is not included bcoz it's unsupervised learning

In [79]:
pca.n_components_

2

In [80]:
pca.components_

array([[-2.18407478e-03,  1.02021040e-01,  1.44827924e-02,
         5.78502009e-02,  9.92914668e-01,  1.16538589e-02,
         6.06594145e-04, -3.84993102e-03],
       [-2.05145978e-02, -9.74492787e-01, -1.32288431e-01,
         4.60628284e-02,  9.94237939e-02, -5.20192437e-02,
        -1.00721730e-03, -1.33149689e-01]])

In [81]:
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

# decision tree

## using gini

In [82]:
dt = DecisionTreeClassifier(criterion='gini', random_state=7, class_weight='balanced')

In [83]:
dt.fit(X_train, Y_train)
# Y_train is included here bcoz it's supervised learning

In [84]:
dt.feature_importances_

array([0.05979953, 0.35506138, 0.11415279, 0.08361145, 0.01169348,
       0.10234483, 0.17095692, 0.10237962])

In [85]:
feature_tup = tuple(zip(X_train.columns, dt.feature_importances_))

In [86]:
sorted(feature_tup, key = lambda x : -x[1])

[('Glucose', 0.3550613818324099),
 ('DiabetesPedigreeFunction', 0.17095691987451025),
 ('BloodPressure', 0.11415279428370856),
 ('Age', 0.10237962394626178),
 ('BMI', 0.10234483055778776),
 ('SkinThickness', 0.08361144559337587),
 ('Pregnancies', 0.05979952778193624),
 ('Insulin', 0.011693476130009737)]

In [87]:
sel = SelectFromModel(DecisionTreeClassifier(criterion='gini', random_state=7, class_weight='balanced'),threshold='mean')

In [88]:
sel.fit(X_train, Y_train)

In [89]:
sel.get_feature_names_out

<bound method SelectorMixin.get_feature_names_out of SelectFromModel(estimator=DecisionTreeClassifier(class_weight='balanced',
                                                 random_state=7),
                threshold='mean')>

In [90]:
X_train_DT = sel.transform(X_train)

In [91]:
X_test_DT = sel.transform(X_test)

In [92]:
sel.get_feature_names_out(X_train.columns)

array(['Glucose', 'DiabetesPedigreeFunction'], dtype=object)

In [93]:
sel.estimator_.feature_importances_

array([0.05979953, 0.35506138, 0.11415279, 0.08361145, 0.01169348,
       0.10234483, 0.17095692, 0.10237962])

In [94]:
X_train.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

# RFE Recursive Feature Selection

In [95]:
from sklearn.feature_selection import RFE

In [96]:
rfe = RFE(DecisionTreeClassifier(criterion='gini',random_state=7,class_weight='balanced'),
          n_features_to_select =0.5)

In [97]:
rfe.fit(X_train,Y_train)

In [98]:
rfe.ranking_

array([4, 1, 2, 3, 5, 1, 1, 1])

## using entropy

In [99]:
dt = DecisionTreeClassifier(criterion='entropy', random_state=7, class_weight='balanced')

In [100]:
dt.fit(X_train,Y_train)

In [101]:
dt.feature_importances_

array([0.10322608, 0.28613871, 0.08363414, 0.04225284, 0.05087119,
       0.16427504, 0.16229953, 0.10730246])

In [102]:
feature_tup = tuple(zip(X_train.columns, dt.feature_importances_))

In [103]:
sorted(feature_tup,key=lambda x:-x[1])

[('Glucose', 0.28613871046800726),
 ('BMI', 0.16427504382377736),
 ('DiabetesPedigreeFunction', 0.16229953243474524),
 ('Age', 0.10730246055199924),
 ('Pregnancies', 0.10322608377655687),
 ('BloodPressure', 0.0836341442501153),
 ('Insulin', 0.05087118645836312),
 ('SkinThickness', 0.04225283823643569)]

In [106]:
sel = SelectFromModel(DecisionTreeClassifier(criterion='entropy',random_state=7,class_weight='balanced' ),
                      threshold = "mean")

In [107]:
sel.fit(X_train,Y_train)

In [108]:
sel.get_feature_names_out

<bound method SelectorMixin.get_feature_names_out of SelectFromModel(estimator=DecisionTreeClassifier(class_weight='balanced',
                                                 criterion='entropy',
                                                 random_state=7),
                threshold='mean')>

In [109]:
X_train_DT = sel.transform(X_train)

In [110]:
X_test_DT = sel.transform(X_test)

In [111]:
sel.get_feature_names_out(X_train.columns)

array(['Glucose', 'BMI', 'DiabetesPedigreeFunction'], dtype=object)

In [112]:
sel.estimator_.feature_importances_

array([0.10322608, 0.28613871, 0.08363414, 0.04225284, 0.05087119,
       0.16427504, 0.16229953, 0.10730246])

In [113]:
X_train.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [114]:
# RFE Recursive Feature Selection

In [115]:
from sklearn.feature_selection import RFE

In [116]:
rfe = RFE(DecisionTreeClassifier(criterion='entropy',random_state=7,class_weight='balanced'),
          n_features_to_select =0.5)

In [117]:
rfe.fit(X_train,Y_train)

In [118]:
rfe.ranking_

array([2, 1, 3, 5, 4, 1, 1, 1])