## 지도학습 feature selection
## 분류 분석 데이터 (유방암 양성/악성 예측)
### 다중 t-검정 기반 feature selection 

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway

In [None]:
# Load the dataset from 'wdbc.csv' (path is resolved against the current
# working directory) and print a summary of the resulting frame.
df = pd.read_csv('wdbc.csv')
df.info()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Total number of missing cells over the whole frame (column sums, then summed).
df.isna().sum().sum()

In [None]:
# Display the loaded frame for a visual check.
df

In [None]:
# Predictors: every column except the label ('diagnosis') and the 'ID' column.
X = df.drop(columns=['diagnosis', 'ID'])
# Binary target as a NumPy array: 1 where diagnosis equals "M", 0 otherwise.
y = np.where(df['diagnosis'] == "M", 1, 0)
# Candidate feature names, reused by the cells that follow.
xvar = X.columns
xvar

In [None]:
# Rows of df split by encoded label: X1 holds label 0, X2 holds label 1.
X1, X2 = (df[y == label] for label in (0, 1))

In [None]:
# Per-feature comparison between the two classes (X1 = label 0, X2 = label 1).
# f_oneway is a one-way ANOVA; with exactly two groups its p-value matches the
# pooled-variance two-sample t-test, hence the "t-test" column label.
# Every column in xvar is tested, so the table has one row for each feature
# that the other selection methods in this notebook can pick from X.
var_select = pd.DataFrame(
    [(name, f_oneway(X1[name], X2[name]).pvalue) for name in xvar],
    columns=['Feature', 'P-value'],
)
# NOTE: the 0.05 cut-off is applied per feature, without any
# multiple-comparison correction.
var_select['t-test Result'] = var_select['P-value'] < 0.05

In [None]:
# Show only the features flagged by the test, rounded to 4 decimals for
# display; var_select itself is left unrounded.
var_select[var_select['t-test Result']].round(4)

### Wrapper 방법을 이용한 feature selection

In [None]:
import feature_selection as fsel

#### 후진제거법

In [None]:
# Backward elimination over the predictors in X against y, requesting a
# logistic model and the AIC criterion. The exact semantics of these options
# are defined by the project-local feature_selection module.
result_back = fsel.backwardSelection(
    X,
    y,
    model_type="logistic",
    elimination_criteria="aic",
)

In [None]:
# Display the backward-selection result (presumably the retained feature
# names; confirm against feature_selection.backwardSelection).
result_back

In [None]:
# Flag each tested feature by whether it appears in the backward-selection result.
var_select['Backward_Result'] = [
    feature in result_back for feature in var_select['Feature']
]
var_select

#### 전진선택법

In [None]:
# Forward selection over the predictors in X against y, requesting a logistic
# model and the AIC criterion. The exact semantics of these options are
# defined by the project-local feature_selection module.
result_forward = fsel.forwardSelection(
    X,
    y,
    model_type="logistic",
    elimination_criteria="aic",
)

In [None]:
# Display the forward-selection result (presumably the chosen feature names;
# confirm against feature_selection.forwardSelection).
result_forward

In [None]:
# Flag each tested feature by whether it appears in the forward-selection result.
var_select['Forward_Result'] = [
    feature in result_forward for feature in var_select['Feature']
]
var_select

#### Genetic 알고리즘

In [None]:
import genetic_algorithm as genetic

In [None]:
# Name of the label column handed to the genetic search.
target = 'diagnosis'
# Encode the label in place as 1 ("M") / 0 (anything else). The guard keeps
# this cell safe to re-run: once the column is numeric, comparing it with "M"
# again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df[target]):
    df[target] = np.where(df[target] == "M", 1, 0)

In [None]:
# Execute Genetic Algorithm to obtain Important Feature
# Seed NumPy's global RNG before the search (presumably genetic.ga draws from
# it, which would make the run repeatable; confirm in genetic_algorithm).
np.random.seed(0)
# Alternative call kept for reference (last argument 1000 instead of 100):
#feature_set, acc_score = genetic.ga(df, xvar, target, 10, 1000)
# NOTE(review): the meaning of the trailing 10 and 100 is defined by
# genetic.ga and is not visible in this notebook.
feature_set, acc_score = genetic.ga(df, xvar, target, 10, 100)

In [None]:
# Report the columns whose entry in feature_set equals 1.
chosen = X.columns[feature_set == 1]
# NOTE(review): the reciprocal of acc_score is printed as the AIC, which
# presumably means genetic.ga returns 1/AIC as its score (confirm).
print('Optimal Feature Set\n', chosen, '\nAIC =', round(1 / acc_score))

In [None]:
# Columns picked by the genetic search, computed once instead of per feature.
ga_features = X.columns[feature_set == 1]
# Flag each tested feature by whether the genetic search selected it.
var_select['Genetic_Result'] = [
    feature in ga_features for feature in var_select['Feature']
]
var_select

#### Decision Tree 방법

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on all predictors with a fixed random_state, then show
# the per-feature importances it assigns.
clf = DecisionTreeClassifier(random_state=0).fit(X, y)
clf.feature_importances_

In [None]:
# Keep the features whose tree importance is above 0.001.
importance_mask = clf.feature_importances_ > 0.001
result_tree = X.columns[importance_mask]

In [None]:
# Flag each tested feature by whether the decision tree kept it.
var_select['Tree_Result'] = [
    feature in result_tree for feature in var_select['Feature']
]
# Final comparison table without the raw p-values (display only; the column
# stays in var_select).
var_select.drop(columns=['P-value'])