# Machine Learning for Medicine
# Feature Selection
> * **Merrouche Aymen**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs
from sklearn.datasets import make_moons
from sklearn import linear_model, datasets

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Data

## Import medical data :

### Breast cancer data set : 
> Describes wheather a breast mas is malignant or not based cell nuclei characteristics.

In [3]:
# import the data
breast_cancer = pd.read_csv("data/Breast.txt",sep=" ")
breast_cancer_y = breast_cancer.values[:,30] # Classes
breast_cancer_X = breast_cancer.values[:,0:29] # Observations

In [4]:
breast_cancer_X

array([[ 1.82821197, -0.35332152,  1.68447255, ..., -0.14661996,
         1.08612862, -0.24367526],
       [ 1.5784992 ,  0.45578591,  1.56512598, ...,  0.85422232,
         1.95328166,  1.15124203],
       [-0.76823332,  0.25350905, -0.59216612, ...,  1.98783917,
         2.17387323,  6.04072615],
       ...,
       [ 0.70166686,  2.04377549,  0.67208442, ...,  0.32647934,
         0.41370467, -1.10357792],
       [ 1.83672491,  2.33440316,  1.98078127, ...,  3.1947936 ,
         2.28797231,  1.9173959 ],
       [-1.80681144,  1.22071793, -1.81279344, ..., -1.30468267,
        -1.7435287 , -0.04809589]])

In [5]:
breast_cancer_y

array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1.,  1.,  1.,  1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,  1., -1., -1.,
       -1., -1., -1., -1., -1., -1.,  1., -1.,  1.,  1.,  1.,  1.,  1.,
       -1., -1.,  1., -1., -1.,  1.,  1.,  1.,  1., -1.,  1., -1., -1.,
        1.,  1.,  1.,  1., -1.,  1., -1., -1.,  1., -1.,  1., -1., -1.,
        1.,  1.,  1., -1., -1.,  1., -1., -1., -1.,  1.,  1.,  1., -1.,
        1.,  1., -1., -1.,  1.,  1.,  1., -1., -1.,  1.,  1.,  1.,  1.,
       -1.,  1.,  1., -1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1., -1.,
       -1., -1.,  1., -1., -1.,  1.,  1.,  1., -1., -1.,  1., -1.,  1.,
       -1., -1.,  1., -1., -1.,  1.,  1., -1.,  1.,  1., -1.,  1.,  1.,
        1.,  1., -1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1., -1.,
        1.,  1.,  1.,  1., -1., -1.,  1., -1.,  1.,  1., -1., -1.,  1.,
        1., -1., -1.,  1.,  1.,  1.,  1., -1.,  1.,  1., -1., -1

In [6]:
print("Number of examples :", breast_cancer_X.shape[0])
print("Dimension of the problem :", breast_cancer_X.shape[1])
print("Number of features :", np.unique(breast_cancer_y).shape[0])

Number of examples : 568
Dimension of the problem : 29
Number of features : 2


### For the Golub et al. 1999 dataset :
> Add description.

In [7]:
golub_X = pd.read_csv('data/Golub_X',sep=' ') # Observations
golub_y = pd.read_csv('data/Golub_y',sep=' ') # Classes

In [8]:
print("Number of examples :", golub_X.shape[0])
print("Dimension of the problem :", golub_X.shape[1])
print("Number of features :", np.unique(golub_y).shape[0])

Number of examples : 71
Dimension of the problem : 3562
Number of features : 2


In [9]:
medical_data = {"breast" : (pd.DataFrame(breast_cancer_X), breast_cancer_y), "golub" : (golub_X, golub_y.values)}

In [10]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
# classifiers to test
classifiers = {"dt" : DecisionTreeClassifier, "svm": SVC, "gb" : GradientBoostingClassifier}

# 1 - Feature Selection by Low Variance Feature Deletion :
> In this simple heaursitc approach, we delete features (variables) which have a variance that is lower than a threshold (we expect that variables with a low variance doesn't encompass discriminative information). We test this method on our two medical datasets for different values of the threshold. Furthermore, we compare the results of the different values by feedinf the reduced data matrix to three different classifiers : SVM classifier, Gradient boosting classifierand a decision tree classifier :

### To select what threshold to use, we display the varaince of each feature in our two medical datasets :

In [11]:
for data in medical_data :
    print("--------",data,"----------")
    print(medical_data[data][0].var())

-------- breast ----------
0     0.999641
1     0.994182
2     0.998919
3     1.000055
4     0.997425
5     0.982749
6     0.989351
7     0.990453
8     0.993091
9     0.992789
10    0.990831
11    1.001200
12    0.987608
13    0.990850
14    1.001683
15    0.998705
16    1.000839
17    1.000994
18    0.999436
19    1.000313
20    0.995486
21    0.998505
22    0.992405
23    0.994700
24    0.998748
25    0.989688
26    0.993915
27    0.992466
28    0.988420
dtype: float64
-------- golub ----------
0.708070978820836    0.026607
0.928074245939675    0.041344
0.553591160220995    0.039379
0.449211908931699    0.043680
0.36376604850214     0.044423
                       ...   
0.268886043533931    0.038709
0.161897295178361    0.022331
0.322953736654804    0.040869
0.754658385093168    0.019602
0.57089552238806     0.022804
Length: 3562, dtype: float64


In [12]:
thresholds = {"breast" : [0, 0.6, 1], "golub" : [0, 0.05, 0.04]}

In [13]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
for data in medical_data:
    print("--------",data,"----------")
    X_train, X_test, y_train, y_test = train_test_split(medical_data[data][0], medical_data[data][1], test_size=0.33, random_state=42)
    print(" Original Dimension : ",X_train.shape[1])
    for threshold in thresholds[data] :
        print("\t --------- threshold = ",threshold,"---------")
        sel = VarianceThreshold(threshold=threshold)
        sel.fit(X_train)
        X_train_t = sel.transform(X_train)
        X_test_t = sel.transform(X_test)
        print("\t Reduced Dimension : ",X_train_t.shape[1])
        for classifier in classifiers:
            print("\t\t --------",classifier,"----------")
            clf = classifiers[classifier]().fit(X_train_t, y_train.ravel())
            print("\t\t score after reduction : ",clf.score(X_test_t, y_test))

-------- breast ----------
 Original Dimension :  29
	 --------- threshold =  0 ---------
	 Reduced Dimension :  29
		 -------- dt ----------
		 score after reduction :  0.9042553191489362
		 -------- svm ----------
		 score after reduction :  0.973404255319149
		 -------- gb ----------
		 score after reduction :  0.9627659574468085
	 --------- threshold =  0.6 ---------
	 Reduced Dimension :  28
		 -------- dt ----------
		 score after reduction :  0.9148936170212766
		 -------- svm ----------
		 score after reduction :  0.973404255319149
		 -------- gb ----------
		 score after reduction :  0.9574468085106383
	 --------- threshold =  1 ---------
	 Reduced Dimension :  13
		 -------- dt ----------
		 score after reduction :  0.898936170212766
		 -------- svm ----------
		 score after reduction :  0.9574468085106383
		 -------- gb ----------
		 score after reduction :  0.9468085106382979
-------- golub ----------
 Original Dimension :  3562
	 --------- threshold =  0 ---------
	 Reduce

# 2 - Univariate feature selection with statistical tests :

> we test the effectiveness of this method by feeding the reduced data matrix to three different classifiers : SVM classifier, Gradient boosting classifierand a decision tree classifier :

In [14]:
from sklearn.feature_selection import SelectFdr, chi2
from sklearn.model_selection import train_test_split
for data in medical_data:
    print("--------",data,"----------")
    X_train, X_test, y_train, y_test = train_test_split(medical_data[data][0], medical_data[data][1], test_size=0.33, random_state=42)
    print(" Original Dimension : ",X_train.shape[1])
    sel = SelectFdr(alpha=0.01)
    sel.fit(X_train, y_train)
    X_train_t = sel.transform(X_train)
    X_test_t = sel.transform(X_test)
    print("\t Reduced Dimension : ",X_train_t.shape[1])
    for classifier in classifiers:
        print("\t\t --------",classifier,"----------")
        print("\t\t score with no reduction : ",classifiers[classifier]().fit(X_train, y_train.ravel()).score(X_test, y_test))
        clf = classifiers[classifier]().fit(X_train_t, y_train.ravel())
        print("\t\t score after reduction : ",clf.score(X_test_t, y_test))

-------- breast ----------
 Original Dimension :  29
	 Reduced Dimension :  24
		 -------- dt ----------
		 score with no reduction :  0.925531914893617
		 score after reduction :  0.9202127659574468
		 -------- svm ----------
		 score with no reduction :  0.973404255319149
		 score after reduction :  0.9787234042553191
		 -------- gb ----------
		 score with no reduction :  0.9521276595744681
		 score after reduction :  0.973404255319149
-------- golub ----------
 Original Dimension :  3562
	 Reduced Dimension :  94
		 -------- dt ----------
		 score with no reduction :  0.9166666666666666
		 score after reduction :  0.875
		 -------- svm ----------
		 score with no reduction :  0.875
		 score after reduction :  0.9583333333333334
		 -------- gb ----------
		 score with no reduction :  0.8333333333333334
		 score after reduction :  0.8333333333333334


# 3 - L1 based feature selection :

In [15]:
from sklearn.feature_selection import SelectFromModel

# 3 - 1 - Logistic regression penalized by the L1 penalty term :

In [16]:
from sklearn import linear_model
for alpha in [0.1, 0.01, 0.03, 0.05, 0.001]:
    print("++ alpha = ", alpha)
    lr = linear_model.Lasso(alpha)
    print("---------------------- Logistic Regression penalized by the L1 penalty term ---------------------")
    for data in medical_data:
        print("--------",data,"----------")
        X_train, X_test, y_train, y_test = train_test_split(medical_data[data][0], medical_data[data][1], test_size=0.33, random_state=42)
        print(" Original Dimension : ",X_train.shape[1])

        lr.fit(X_train, y_train)
        model = SelectFromModel(lr, prefit=True)
        X_train_t = model.transform(X_train)
        X_test_t = model.transform(X_test)
        print("\t Reduced Dimension : ",X_train_t.shape[1])
        print("\t\t Accuracy score : ",lr.score(X_test, y_test))

++ alpha =  0.1
---------------------- Logistic Regression penalized by the L1 penalty term ---------------------
-------- breast ----------
 Original Dimension :  29
	 Reduced Dimension :  5
		 Accuracy score :  0.6914601676351175
-------- golub ----------
 Original Dimension :  3562
	 Reduced Dimension :  0
		 Accuracy score :  -0.005100347065036814
++ alpha =  0.01
---------------------- Logistic Regression penalized by the L1 penalty term ---------------------
-------- breast ----------
 Original Dimension :  29
	 Reduced Dimension :  14
		 Accuracy score :  0.702081907541241
-------- golub ----------
 Original Dimension :  3562
	 Reduced Dimension :  27
		 Accuracy score :  0.7430387022168904
++ alpha =  0.03
---------------------- Logistic Regression penalized by the L1 penalty term ---------------------
-------- breast ----------
 Original Dimension :  29
	 Reduced Dimension :  11
		 Accuracy score :  0.7066900748272553
-------- golub ----------
 Original Dimension :  3562
	 Red

# 3 - 2 - support vector machine penalized by the L1 penalty term :

In [17]:
from sklearn.svm import LinearSVC
for C in [0.1, 0.3 ,0.5 , 0.01, 0.001] :
    print("++ C = ", C)
    lsvc = LinearSVC(C=C, penalty="l1", dual=False)
    print("---------------------- Logistic Regression penalized by the L1 penalty term ---------------------")

    for data in medical_data:
        print("--------",data,"----------")
        X_train, X_test, y_train, y_test = train_test_split(medical_data[data][0], medical_data[data][1], test_size=0.33, random_state=42)
        print(" Original Dimension : ",X_train.shape[1])

        lsvc.fit(X_train, y_train)
        model = SelectFromModel(lsvc, prefit=True)
        X_train_t = model.transform(X_train)
        X_test_t = model.transform(X_test)
        print("\t Reduced Dimension : ",X_train_t.shape[1])
        print("\t\t Accuracy score : ",lsvc.score(X_test, y_test))

++ C =  0.1
---------------------- Logistic Regression penalized by the L1 penalty term ---------------------
-------- breast ----------
 Original Dimension :  29
	 Reduced Dimension :  14
		 Accuracy score :  0.973404255319149
-------- golub ----------
 Original Dimension :  3562
	 Reduced Dimension :  4
		 Accuracy score :  0.8333333333333334
++ C =  0.3
---------------------- Logistic Regression penalized by the L1 penalty term ---------------------
-------- breast ----------
 Original Dimension :  29
	 Reduced Dimension :  15
		 Accuracy score :  0.973404255319149
-------- golub ----------
 Original Dimension :  3562
	 Reduced Dimension :  17
		 Accuracy score :  0.9166666666666666
++ C =  0.5
---------------------- Logistic Regression penalized by the L1 penalty term ---------------------
-------- breast ----------
 Original Dimension :  29
	 Reduced Dimension :  18
		 Accuracy score :  0.973404255319149
-------- golub ----------
 Original Dimension :  3562
	 Reduced Dimension :  

# 3 - 1 - Elastic Net, a compromise between the L1 and L2 penalty terms :

In [18]:
from sklearn.linear_model import ElasticNet
for alpha in [0.1, 0.01, 0.03, 0.05, 0.001]:
    print("++ alpha = ", alpha)
    eln = ElasticNet(alpha, l1_ratio=0.7)

    for data in medical_data:
        print("--------",data,"----------")
        X_train, X_test, y_train, y_test = train_test_split(medical_data[data][0], medical_data[data][1], test_size=0.33, random_state=42)
        print(" Original Dimension : ",X_train.shape[1])

        eln.fit(X_train, y_train)
        model = SelectFromModel(eln, prefit=True)
        X_train_t = model.transform(X_train)
        X_test_t = model.transform(X_test)
        print("\t Reduced Dimension : ",X_train_t.shape[1])
        print("\t\t Accuracy score : ",eln.score(X_test, y_test))

++ alpha =  0.1
-------- breast ----------
 Original Dimension :  29
	 Reduced Dimension :  8
		 Accuracy score :  0.6971187769445253
-------- golub ----------
 Original Dimension :  3562
	 Reduced Dimension :  2
		 Accuracy score :  0.11559885941836068
++ alpha =  0.01
-------- breast ----------
 Original Dimension :  29
	 Reduced Dimension :  12
		 Accuracy score :  0.7046129669750714
-------- golub ----------
 Original Dimension :  3562
	 Reduced Dimension :  39
		 Accuracy score :  0.7494702118910411
++ alpha =  0.03
-------- breast ----------
 Original Dimension :  29
	 Reduced Dimension :  9
		 Accuracy score :  0.7054564989723608
-------- golub ----------
 Original Dimension :  3562
	 Reduced Dimension :  18
		 Accuracy score :  0.7311857493837736
++ alpha =  0.05
-------- breast ----------
 Original Dimension :  29
	 Reduced Dimension :  8
		 Accuracy score :  0.7052644168506177
-------- golub ----------
 Original Dimension :  3562
	 Reduced Dimension :  12
		 Accuracy score : 

# 4 - Comparing the results :

> **Golub Dataset :**
> * The best classifier for the Golub dataset is the linear support vector classifier for $C \in \{0.3, 0.5\}$, the number of selected variable for this classifer was 17/3562. The score on the test set was $0.916$. Followed by the logistic regression classifier for $\alpha = 0.001$, the number of selected variable for this classifer was 46/3562. The score on the test set was $0.774$. Finally, the less performant classifier was the ElasticNet classifier for $\alpha = 0.001$, the number of selected variable for this classifer was 51/3562. The score on the test set was $0.753$.

> **Breast dataset :**
> * The best classifier for the Breast dataset is the linear support vector classifier for $C = 0.01$, the number of selected variable for this classifer was 5/29. The score on the test set was $0.957$. Followed by the ElasticNet classifier for $\alpha = 0.001$, the number of selected variable for this classifer was 7/29. The score on the test set was $0.758$  Finally, the less performant classifier was the logistic regression classifier for $\alpha = 0.001$, the number of selected variable for this classifer was 24/29. The score on the test set was $0.757$.