# feature selection

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/digipodium/Datasets/main/diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
df.rename({'DiabetesPedigreeFunction':'pedigree'},axis=1,inplace=True)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,pedigree,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [8]:
featSelector = SelectKBest(chi2,k=5)

In [9]:
X=df.iloc[:,:-1]
y=df['Outcome']
print(X.shape, y.shape)

(768, 8) (768,)


In [10]:
featSelector.fit(X,y)

SelectKBest(k=5, score_func=<function chi2 at 0x00000083447E03A0>)

In [11]:
import numpy as np
np.set_printoptions(precision=2)

In [12]:
print(featSelector.scores_)

[ 111.52 1411.89   17.61   53.11 2175.57  127.67    5.39  181.3 ]


In [13]:
print(f'COLS {featSelector.feature_names_in_}\nSEL {featSelector.get_feature_names_out()}')

COLS ['Pregnancies' 'Glucose' 'BloodPressure' 'SkinThickness' 'Insulin' 'BMI'
 'pedigree' 'Age']
SEL ['Pregnancies' 'Glucose' 'Insulin' 'BMI' 'Age']


In [14]:
features = featSelector.transform(X)
print(features.shape)

(768, 5)


In [15]:
features

array([[  6. , 148. ,   0. ,  33.6,  50. ],
       [  1. ,  85. ,   0. ,  26.6,  31. ],
       [  8. , 183. ,   0. ,  23.3,  32. ],
       ...,
       [  5. , 121. , 112. ,  26.2,  30. ],
       [  1. , 126. ,   0. ,  30.1,  47. ],
       [  1. ,  93. ,   0. ,  30.4,  23. ]])

In [16]:
scaler = StandardScaler()
scaledX = scaler.fit_transform(features)
xtrain, xtest, ytrain, ytest = train_test_split(scaledX,y,test_size=.2,random_state=0)
m = KNeighborsClassifier(n_neighbors=9)
m.fit(xtrain, ytrain)
ypred = m.predict(xtest)
cm = confusion_matrix(ytest, ypred)
print(cm)
print(classification_report(ytest,ypred))

[[93 14]
 [16 31]]
              precision    recall  f1-score   support

           0       0.85      0.87      0.86       107
           1       0.69      0.66      0.67        47

    accuracy                           0.81       154
   macro avg       0.77      0.76      0.77       154
weighted avg       0.80      0.81      0.80       154



# wrapper method implementation

In [17]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [18]:
clf = LogisticRegression(solver='liblinear')
rfe = RFE(clf)
rfe.fit(X,y)

RFE(estimator=LogisticRegression(solver='liblinear'))

In [19]:
print('Features selected',rfe.n_features_)

Features selected 4


In [20]:
rfe.support_

array([ True,  True, False, False, False,  True,  True, False])

In [21]:
rfe.ranking_

array([1, 1, 2, 4, 5, 1, 1, 3])

In [22]:
X[rfe.get_feature_names_out()]

Unnamed: 0,Pregnancies,Glucose,BMI,pedigree
0,6,148,33.6,0.627
1,1,85,26.6,0.351
2,8,183,23.3,0.672
3,1,89,28.1,0.167
4,0,137,43.1,2.288
...,...,...,...,...
763,10,101,32.9,0.171
764,2,122,36.8,0.340
765,5,121,26.2,0.245
766,1,126,30.1,0.349


In [23]:
features = X[rfe.get_feature_names_out()]
scaler = StandardScaler()
scaledX = scaler.fit_transform(features)
xtrain, xtest, ytrain, ytest = train_test_split(scaledX,y,test_size=.2,random_state=0)
m = KNeighborsClassifier(n_neighbors=9)
m.fit(xtrain, ytrain)
ypred = m.predict(xtest)
cm = confusion_matrix(ytest, ypred)
print(cm)
print(classification_report(ytest,ypred))

[[89 18]
 [16 31]]
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       107
           1       0.63      0.66      0.65        47

    accuracy                           0.78       154
   macro avg       0.74      0.75      0.74       154
weighted avg       0.78      0.78      0.78       154



In [24]:
from joblib import dump

In [25]:
model_dict = {
    'title':'classification model for diabetes prediction',
    'classifier':m,
    'scaler':scaler,
}
dump(model_dict,'diabetes.pkl')

['diabetes.pkl']