# 지도학습 feature selection
## 분류 분석 데이터 (불량/정상 제품)
### 다중 t-검정 기반 feature selection 

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway

In [None]:
# Load the product data from product.csv (path relative to the working
# directory) and print a summary of the resulting frame.
df = pd.read_csv('product.csv')
df.info()

In [None]:
# Size of the data set as (rows, columns).
print(df.shape)
# Total number of missing cells: per-column null counts summed over all columns.
print(df.isnull().sum().sum())

In [None]:
# Binary response: 1 where the 'Defect' column equals "NG", 0 for everything else.
y = np.where(df['Defect'] == "NG", 1, 0)
# Predictor matrix: every column except the response.
X = df.drop(columns='Defect')
# Keep the predictor names for later lookups; shown as the cell output.
xvar = X.columns
xvar

In [None]:
# Partition the predictors by label: X1 holds the y == 0 rows, X2 the y == 1 rows.
X1 = X.loc[y == 0]
X2 = X.loc[y == 1]

In [None]:
# Screen every predictor with a two-group one-way ANOVA comparing the
# y == 0 rows (X1) against the y == 1 rows (X2). With exactly two groups the
# ANOVA F-test gives the same p-value as the pooled-variance two-sample
# t-test, which is why this section is titled as a t-test screen.
# Iterating over xvar directly (instead of a hard-coded range(0, 71)) keeps
# the cell correct when the number of columns in the data changes.
p_values = []
for name in xvar:
    _, p = f_oneway(X1[name], X2[name])
    p_values.append((name, p))

# One row per predictor, flagged at the 5% and 1% significance levels.
# NOTE(review): the p-values are not adjusted for multiple testing.
var_select = pd.DataFrame(p_values, columns=['Para', 'P-value'])
var_select['Result0.05'] = var_select['P-value'] < 0.05
var_select['Result0.01'] = var_select['P-value'] < 0.01

In [None]:
# Show the predictors significant at the 5% level, rounded to 4 decimals.
var_select[var_select['Result0.05']].round(4)

In [None]:
# Number of predictors flagged at each significance level (True counts as 1).
var_select[['Result0.05','Result0.01']].sum()

### Wrapper 방법을 이용한 feature selection

In [None]:
import feature_selection as fsel

#### 후진제거법

In [None]:
# Backward elimination via the project-local feature_selection module,
# requesting a logistic model with AIC as the elimination criterion.
# NOTE(review): the structure of the returned value is not visible here —
# check feature_selection.backwardSelection.
result_back = fsel.backwardSelection(X, y, model_type ="logistic", elimination_criteria = "aic")

#### 전진선택법

In [None]:
# Forward selection via the project-local feature_selection module,
# requesting a logistic model with AIC as the criterion.
# NOTE(review): the structure of the returned value is not visible here —
# check feature_selection.forwardSelection.
result_forward = fsel.forwardSelection(X, y, model_type ="logistic", elimination_criteria = "aic")

####  sklearn 패키지의 RFE 함수 (비추천)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [None]:
# Standardization must be performed first.
scaler = StandardScaler()
# fit_transform returns a plain array, so wrap it back into a DataFrame
# carrying the original predictor names.
X_std = pd.DataFrame(scaler.fit_transform(X), columns=xvar)

In [None]:
# Recursive feature elimination around a liblinear logistic regression.
model = LogisticRegression(solver="liblinear")
# The number of X variables to keep has to be specified up front.
rfe = RFE(estimator=model, n_features_to_select=4).fit(X_std, y)

# Report how many features were kept and which columns they are.
print("Num Features: ", rfe.n_features_)
print("Selected Features: ", xvar[rfe.support_])

#### Genetic 알고리즘

In [None]:
import genetic_algorithm as genetic

In [None]:
# Name of the response column handed to the genetic algorithm.
target = 'Defect'
# Reuse the 0/1 labels computed earlier (1 = "NG", 0 = otherwise) instead of
# re-deriving them from df: comparing the column against "NG" in place is not
# idempotent, so re-running this cell would overwrite every label with 0.
df[target] = y

In [None]:
# Execute the genetic algorithm to obtain an important feature subset.
# The NumPy global seed is fixed first; presumably genetic.ga draws from
# np.random, which would make the run repeatable — confirm in genetic_algorithm.
np.random.seed(0)
# NOTE(review): the meaning of the last two positional arguments (10, 100) is
# defined in the project-local genetic_algorithm module — confirm there.
# A run with 1000 as the last argument was used previously:
#   genetic.ga(df, xvar, target, 10, 1000)
feature_set, acc_score = genetic.ga(df, xvar, target, 10, 100)

In [None]:
# Print the columns whose entry in feature_set equals 1, followed by the
# rounded reciprocal of acc_score, which is labelled as AIC.
# NOTE(review): presumably genetic.ga returns a 0/1 mask array and a score
# equal to 1/AIC — confirm against genetic_algorithm.
print('Optimal Feature Set\n',X.columns[feature_set==1],'\nAIC =', round(1/acc_score))

### Embedding 방법을 이용한 feature selection

#### Decision Tree 방법

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree on the unscaled predictors with a fixed random_state.
clf = DecisionTreeClassifier(random_state=0).fit(X, y)
# Importance of each predictor, in column order; shown as the cell output.
clf.feature_importances_

In [None]:
# Keep the predictors whose tree importance exceeds the 0.001 cut-off.
X.columns[clf.feature_importances_>0.001]