In [30]:
#Importar bibliotecas
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler

In [4]:
#Remover o limite de colunas vizualizadas no dataset
pd.options.display.max_columns = None

In [5]:
#Ler o dataset
data = pd.read_csv("./datasets/diabetes.csv")

In [23]:
#Exibir
data.corr()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [6]:
y = data['Outcome']
x = data.drop(['Outcome'], axis = 1)

In [53]:
def feature_selections_pearson(x, y, num_features):   
    cor_list = []
    
    feature_names = x.columns.tolist()
    
    for feature in feature_names:
        cor = np.corrcoef(x[feature],y)[0,1]
        cor_list.append(cor)    
    
    cor_list = [ 0 if np.isnan(x) else x for x in cor_list]
    
    cor_feature = x.iloc[:,np.argsort(np.abs(cor_list))[-num_features:]].columns.tolist()
    
    cor_support = [True if i in cor_feature else False for i in feature_names]
    
    return cor_support, cor_feature

cor_support ,cor_feature = feature_selections_pearson(x,y,5)

print("Features selecionadas:",cor_feature)

Features selecionadas: ['DiabetesPedigreeFunction', 'Pregnancies', 'Age', 'BMI', 'Glucose']


In [41]:
chi_selector = SelectKBest(chi2, k= 5)

chi_selector.fit(x,y)

chi_support = chi_selector.get_support()

chi_feature = x.loc[:,chi_support].columns.tolist()

print(chi_feature)

['Pregnancies', 'Glucose', 'Insulin', 'BMI', 'Age']


In [14]:
rfe_selector = RFE(estimator=LogisticRegression(max_iter=200),n_features_to_select=5,step=1)

rfe_selector.fit(x,y)

rfe_support = rfe_selector.get_support()

rfe_feature = x.loc[:,rfe_support].columns.tolist()

print(rfe_feature)

['Pregnancies', 'Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']


In [31]:
lasso_selector = SelectFromModel(LogisticRegression(solver="liblinear",penalty="l1",max_iter=200), max_features=5)

lasso_selector.fit(x,y)

lasso_support = lasso_selector.get_support()

lasso_feature = x.loc[:,lasso_support].columns.tolist()

print(lasso_feature)

['Pregnancies', 'Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction']


In [49]:
rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=10),max_features=5)

rf_selector.fit(x,y)

rf_support = rf_selector.get_support()

rf_feature = x.loc[:,rf_support].columns.tolist()

print(rf_feature)

['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']


In [66]:
feature_names = x.columns.tolist()

ft_df = pd.DataFrame({"Feature":feature_names, "Pearson":cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':lasso_support,
                    'Random Forest':rf_support})

ft_df["total"] = np.sum(ft_df, axis=1)

ft_df = ft_df.sort_values(["total","Feature"], ascending=False)
ft_df.index = range(1,len(ft_df)+1)

ft_df.head(10)

Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,total
1,Glucose,True,True,True,True,True,5
2,BMI,True,True,True,True,True,5
3,Pregnancies,True,True,True,True,False,4
4,DiabetesPedigreeFunction,True,False,True,True,True,4
5,Age,True,True,True,False,True,4
6,Insulin,False,True,False,False,False,1
7,BloodPressure,False,False,False,True,False,1
8,SkinThickness,False,False,False,False,False,0


In [67]:
feature_selections = ft_df["Feature"][:5].values

print(feature_selections)

['Glucose' 'BMI' 'Pregnancies' 'DiabetesPedigreeFunction' 'Age']
