#### 1. Importing all the libraries

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

#### 2. Define the parameters 

In [10]:
# Directory that contains the dataset CSV files.
location = 'C:/Users/mmoran1/Documents/TAU/Thesis/Datasets'

# Base name of the dataset to load.
# Other base names noted by the author: adult, diabetic_data, Census_Income_KDD
file = 'diabetic_data'

# Name of the CSV file that is read: "<base name>_int.csv".
filename = '{}_int.csv'.format(file)

#### 3. Reading the files

In [11]:
# Load "<location>/<filename>"; the first CSV column becomes the row index.
data = pd.read_csv('{}/{}'.format(location, filename), index_col=0)

#### 4. Separating features and label (need to change it to parameters)

In [12]:
# The last column is the label; every column before it is a feature.
y = data.iloc[:, -1]
X = data.iloc[:, :-1]

# `Columns` is a second name for the feature frame (same object as X, not a copy).
Columns = X

# Preview the first two rows of the full frame (features and label).
print(data.head(2))

       race  gender  age  admission_type_id  discharge_disposition_id  \
25415     3       0    8                  0                         5   
54687     3       1    5                  4                         0   

       admission_source_id  time_in_hospital  num_lab_procedures  \
25415                    6                 4                  60   
54687                    0                 1                  63   

       num_procedures  num_medications     ...      citoglipton  insulin  \
25415               0               11     ...                0        1   
54687               0               22     ...                0        3   

       glyburide-metformin  glipizide-metformin  glimepiride-pioglitazone  \
25415                    1                    0                         0   
54687                    1                    0                         0   

       metformin-rosiglitazone  metformin-pioglitazone  change  diabetesMed  \
25415                        0     

## Feature selection methods

In [13]:
# Collect every column name of the loaded frame into a one-column DataFrame.
a = data.columns.values
b = pd.DataFrame(data=a)

# Save the list as "<filename>_features.csv" (path is relative to the working
# directory) and show it.
b.to_csv('{}_features.csv'.format(filename))
print(b)

                           0
0                       race
1                     gender
2                        age
3          admission_type_id
4   discharge_disposition_id
5        admission_source_id
6           time_in_hospital
7         num_lab_procedures
8             num_procedures
9            num_medications
10         number_outpatient
11          number_emergency
12          number_inpatient
13                    diag_1
14                    diag_2
15                    diag_3
16          number_diagnoses
17             max_glu_serum
18                 A1Cresult
19                 metformin
20               repaglinide
21               nateglinide
22            chlorpropamide
23               glimepiride
24             acetohexamide
25                 glipizide
26                 glyburide
27               tolbutamide
28              pioglitazone
29             rosiglitazone
30                  acarbose
31                  miglitol
32              troglitazone
33            

##### 1. Removing features with low variance

VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance doesn’t meet some threshold. By default, it removes all zero-variance features, i.e. features that have the same value in all samples.
As an example, suppose that we have a dataset with boolean features, and we want to remove all features that are either one or zero (on or off) in more than 80% of the samples. Boolean features are Bernoulli random variables, and the variance of such variables is given by
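$$\mathrm{Var}[X] = p(1 - p)$$
so the threshold for that example is $0.8 \times (1 - 0.8) = 0.16$. The cell below uses $p = 0.6$, i.e. a threshold of $0.6 \times (1 - 0.6) = 0.24$.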

In [6]:
from sklearn.feature_selection import VarianceThreshold

# Keep only the columns whose variance meets the threshold p * (1 - p) with
# p = 0.6, i.e. 0.24.
# NOTE(review): the selector is fitted on `data`, the full frame as loaded, not
# on a features-only frame -- confirm that this is intended.
columns = data.columns
sel = VarianceThreshold(threshold=(.6 * (1 - .6)))
# Only the fitted selector is needed; the transformed array was never used.
sel.fit(data)

# 0-based positions of the columns that were kept.
features_id = sel.get_support(indices=True).tolist()

# Translate positions into names through the column Index. Indexing the frame
# itself with the integer list (data[features_id]) is a label lookup and raises
# KeyError when the column labels are strings.
features_name = columns[features_id].tolist()

print(features_id)
print(features_name)

[0, 2, 10, 11]
['Age', 'DayOfTheWeek', 'Sms_Reminder', 'AwaitingTime']


#####  2. Univariate feature selection

Univariate feature selection works by selecting the best features based on univariate statistical tests. It can be seen as a preprocessing step to an estimator. Scikit-learn exposes feature selection routines as objects that implement the transform method:
* <b>SelectKBest</b> removes all but the k highest scoring features
* <b>SelectPercentile</b> removes all but a user-specified highest scoring percentage of features


In [13]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2

# Score every feature in X against the label y with the chi-squared statistic
# and keep the top 80 percent of features by score.
# Alternative kept from the original cell (fixed number of features instead of
# a percentage):
#   ch2 = SelectKBest(chi2, k = 20).fit(X, y)
ch2 = SelectPercentile(chi2, percentile=80).fit(X, y)

# 0-based positions of the features that were kept.
features_id = ch2.get_support(indices=True).tolist()

# Translate positions into names through the column Index. Indexing the frame
# itself with the integer list (X[features_id]) is a label lookup and raises
# KeyError when the column labels are strings.
features_name = X.columns[features_id].tolist()

print(features_id)
print(features_name)

[0, 1, 2, 3, 4, 5, 7, 8, 11]
['Age', 'Gender', 'DayOfTheWeek', 'Diabetes', 'Alcoolism', 'HiperTension', 'Smokes', 'Scholarship', 'AwaitingTime']


More Statistical tests:
* <b>f_classif</b>
    ANOVA F-value between label/feature for classification tasks.
* <b>mutual_info_classif</b>
    Mutual information for a discrete target.
* <b>chi2</b>
    Chi-squared stats of non-negative features for classification tasks.
* <b>f_regression</b>
    F-value between label/feature for regression tasks.
* <b>mutual_info_regression</b>
    Mutual information for a continuous target.
* <b>SelectPercentile</b>
    Select features based on percentile of the highest scores.
* <b>SelectFpr</b>
    Select features based on a false positive rate test.
* <b>SelectFdr</b>
    Select features based on an estimated false discovery rate.
* <b>SelectFwe</b>
    Select features based on family-wise error rate.
* <b>GenericUnivariateSelect</b>
    Univariate feature selector with configurable mode.

##### 3. Tree-based feature selection

In [12]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split

# Hold out 20 percent of the rows; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Seed the tree as well: without random_state the fitted tree, and therefore
# the importances below, can change from one run to the next.
Learner = tree.DecisionTreeClassifier(random_state=4)
Learner = Learner.fit(X_train, y_train)

# One row per feature, in the column order of the training frame, with the
# fitted importance rounded to three decimals.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': np.round(Learner.feature_importances_, 3),
})
# Optional: rank the features by importance instead of keeping column order.
#importances = importances.sort_values('importance',ascending=False).set_index('feature')
print(importances)

         feature  importance
0            Age       0.331
1         Gender       0.066
2   DayOfTheWeek       0.142
3       Diabetes       0.035
4      Alcoolism       0.010
5   HiperTension       0.051
6        Handcap       0.022
7         Smokes       0.017
8    Scholarship       0.026
9   Tuberculosis       0.001
10  Sms_Reminder       0.012
11  AwaitingTime       0.288
