In [1]:
import numpy as np
import pandas as pd


In [2]:
df=pd.read_csv('diabetes.csv')

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


1--> have Diabetes
0--> doesn't have Diabetes

In [4]:
df.shape

(768, 9)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


### converting values into array for faster computation

In [6]:
arr=df.values
x=arr[:,0:8]
y=arr[:,8]


### Chi Square test

In [7]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [8]:
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(x, y)

In [9]:
# Summarize scores

np.set_printoptions(precision=3)
print(fit.scores_)



[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]


In [10]:
features = fit.transform(x)

# Summarize selected features

print(features[0:5,:])

[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]
 [137.  168.   43.1  33. ]]


### Recursive feature elimination

In [11]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [12]:
# feature extraction

model = LogisticRegression()
rfe = RFE(estimator=model, n_features_to_select=3)  # Provide named arguments
fit = rfe.fit(x, y)

print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))



Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 4 5 6 1 1 3]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Applying Ridge regression 

In [13]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0)
ridge.fit(x, y)
ridge

In [14]:
# A helper method for pretty-printing the coefficients
def pretty_print_coefs(coefs, names = None, sort = False):
    if names == None:
        names = ["X%s" % x for x in range(len(coefs))]
    lst = zip(coefs, names)
    if sort:
        lst = sorted(lst,  key = lambda x:-np.abs(x[0]))
    return " + ".join("%s * %s" % (round(coef, 3), name)
                                   for coef, name in lst)

print ("Ridge model:", pretty_print_coefs(ridge.coef_))


Ridge model: 0.021 * X0 + 0.006 * X1 + -0.002 * X2 + 0.0 * X3 + -0.0 * X4 + 0.013 * X5 + 0.145 * X6 + 0.003 * X7


### Variance Threshold

In [15]:
from sklearn.feature_selection import VarianceThreshold
thresh = VarianceThreshold(threshold=0.1)
var = thresh.fit_transform(x,y)

In [16]:
var[0]

array([  6.   , 148.   ,  72.   ,  35.   ,   0.   ,  33.6  ,   0.627,
        50.   ])

#### Selecting features based on importance

In [17]:
from sklearn.feature_selection import SelectFromModel


sfm = SelectFromModel(estimator=ridge,threshold=0.1).fit(x, y)

selected_feature_indices = sfm.get_support(indices=True)


selected_features = df.columns[selected_feature_indices]

print(f"Features selected by SelectFromModel: {selected_features} ")


Features selected by SelectFromModel: Index(['DiabetesPedigreeFunction'], dtype='object') 


#### convert your NumPy array to a Pandas DataFrame before calculating the correlation. 

In [18]:
X_df = pd.DataFrame(x, columns=df.columns[:-1])
X_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0
...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0
764,2.0,122.0,70.0,27.0,0.0,36.8,0.340,27.0
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0
766,1.0,126.0,60.0,0.0,0.0,30.1,0.349,47.0


## correlation based

In [19]:
correlation_matrix=X_df.corr()

threshold = 0.5

highly_correlated_features = set()

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > threshold:
            colname = correlation_matrix.columns[i]
            highly_correlated_features.add(colname)

            
# Print highly correlated features
print("Highly correlated features:")
print(highly_correlated_features)

Highly correlated features:
{'Age'}


# The End