# 지도학습 feature selection
## 분류 분석 데이터 (불량/정상 제품)
### 다중 t-검정 기반 feature selection 

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway

In [2]:
# Load the product dataset and print a per-column dtype / non-null summary.
df = pd.read_csv('product.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 893 entries, 0 to 892
Data columns (total 72 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Defect  893 non-null    object 
 1   Para01  893 non-null    float64
 2   Para02  893 non-null    float64
 3   Para03  893 non-null    float64
 4   Para04  893 non-null    float64
 5   Para05  893 non-null    float64
 6   Para06  893 non-null    float64
 7   Para07  893 non-null    float64
 8   Para08  893 non-null    float64
 9   Para09  893 non-null    float64
 10  Para10  893 non-null    float64
 11  Para11  893 non-null    float64
 12  Para12  893 non-null    float64
 13  Para13  893 non-null    float64
 14  Para14  893 non-null    float64
 15  Para15  893 non-null    float64
 16  Para16  893 non-null    float64
 17  Para17  893 non-null    float64
 18  Para18  893 non-null    float64
 19  Para19  893 non-null    float64
 20  Para20  893 non-null    float64
 21  Para21  893 non-null    float64
 22  Pa

In [3]:
# Dataset dimensions as (rows, columns).
print(df.shape)
# Total missing values: the first sum() counts per column, the second adds those counts up.
total_missing = df.isna().sum().sum()
print(total_missing)

(893, 72)
0


In [4]:
# Features: every column except the label.
X = df.drop(columns='Defect')
# Binary target: "NG" (not good) becomes 1, anything else becomes 0.
y = np.where(df['Defect'] == "NG", 1, 0)
# Feature names, shown as the cell output.
xvar = X.columns
xvar

Index(['Para01', 'Para02', 'Para03', 'Para04', 'Para05', 'Para06', 'Para07',
       'Para08', 'Para09', 'Para10', 'Para11', 'Para12', 'Para13', 'Para14',
       'Para15', 'Para16', 'Para17', 'Para18', 'Para19', 'Para20', 'Para21',
       'Para22', 'Para23', 'Para24', 'Para25', 'Para26', 'Para27', 'Para28',
       'Para29', 'Para30', 'Para31', 'Para32', 'Para33', 'Para34', 'Para35',
       'Para36', 'Para37', 'Para38', 'Para39', 'Para40', 'Para41', 'Para42',
       'Para43', 'Para44', 'Para45', 'Para46', 'Para47', 'Para48', 'Para49',
       'Para50', 'Para51', 'Para52', 'Para53', 'Para54', 'Para55', 'Para56',
       'Para57', 'Para58', 'Para59', 'Para60', 'Para61', 'Para62', 'Para63',
       'Para64', 'Para65', 'Para66', 'Para67', 'Para68', 'Para69', 'Para70',
       'Para71'],
      dtype='object')

In [5]:
# Split the feature rows by class so each feature can be compared between the two groups.
normal_mask = y == 0
defect_mask = y == 1
X1 = X[normal_mask]  # normal products
X2 = X[defect_mask]  # defective products

In [6]:
# Per-feature one-way ANOVA between the normal (X1) and defective (X2) groups.
# With exactly two groups this is equivalent to the equal-variance two-sample t-test (F = t**2).
var_select = []
for name in xvar:  # iterate the actual feature columns rather than a hard-coded range(0, 71)
    _, p_value = f_oneway(X1[name], X2[name])
    var_select.append((name, p_value))  # keep (feature name, p-value) for every feature

var_select = pd.DataFrame(var_select, columns=['Para', 'P-value'])
# Flag the features that are significant at the 5% and 1% levels.
var_select['Result0.05'] = var_select['P-value'] < 0.05
var_select['Result0.01'] = var_select['P-value'] < 0.01

In [9]:
# Keep the features whose p-value is below 0.05 and show the values rounded to 4 decimal places.
var_select[var_select['Result0.05']].round(4)

Unnamed: 0,Para,P-value,Result0.05,Result0.01
0,Para01,0.0181,True,False
5,Para06,0.0,True,True
8,Para09,0.0,True,True
11,Para12,0.0,True,True
14,Para15,0.0,True,True
17,Para18,0.0,True,True
23,Para24,0.0,True,True
26,Para27,0.0,True,True
31,Para32,0.0253,True,False
32,Para33,0.0004,True,True


In [8]:
# Number of features flagged at each significance level (sum of the boolean flag columns).
flag_columns = ['Result0.05', 'Result0.01']
var_select[flag_columns].sum()

Result0.05    21
Result0.01    18
dtype: int64

### Wrapper 방법을 이용한 feature selection

In [11]:
import feature_selection as fsel # 패키지가 없어서 누군가 코드를 개발해 놓은 것이 있음 (파이썬 코드를 짜놓았음, 폴더에 넣어져 있음)

#### 후진제거법

In [12]:
# Backward elimination with a logistic model; "aic" is passed as elimination_criteria.
# NOTE(review): fsel is a project-local module — confirm what result_back contains.
result_back = fsel.backwardSelection(X, y, model_type ="logistic", elimination_criteria = "aic")

Character Variables (Dummies Generated, First Dummies Dropped): []
Optimization terminated successfully.
         Current function value: 0.076216
         Iterations 22
Eliminated : Para05
Optimization terminated successfully.
         Current function value: 0.076216
         Iterations 22
Eliminated : Para42
Optimization terminated successfully.
         Current function value: 0.076217
         Iterations 22
Eliminated : Para45
Optimization terminated successfully.
         Current function value: 0.076221
         Iterations 22
Eliminated : Para18
Optimization terminated successfully.
         Current function value: 0.076227
         Iterations 21
Eliminated : Para31
Optimization terminated successfully.
         Current function value: 0.076248
         Iterations 21
Eliminated : Para69
Optimization terminated successfully.
         Current function value: 0.076276
         Iterations 20
Eliminated : Para40
Optimization terminated successfully.
         Current function value: 0

#### 전진선택법

In [13]:
# Forward selection with a logistic model; "aic" is passed as elimination_criteria.
# NOTE(review): fsel is a project-local module — confirm what result_forward contains.
result_forward = fsel.forwardSelection(X, y, model_type ="logistic", elimination_criteria = "aic")

Character Variables (Dummies Generated, First Dummies Dropped): []
Optimization terminated successfully.
         Current function value: 0.168939
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.165699
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.168745
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.168938
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.168815
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.168922
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.147576
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.167590
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.168791
         Iterations 7
Optimization te

Optimization terminated successfully.
         Current function value: 0.132659
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.134094
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.133846
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.133491
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.131772
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.134095
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.132902
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.134086
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.133785
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.133904
  

Optimization terminated successfully.
         Current function value: 0.128027
         Iterations 12
Optimization terminated successfully.
         Current function value: 0.129038
         Iterations 13
Optimization terminated successfully.
         Current function value: 0.128293
         Iterations 12
Optimization terminated successfully.
         Current function value: 0.128340
         Iterations 12
Optimization terminated successfully.
         Current function value: 0.128887
         Iterations 12
Optimization terminated successfully.
         Current function value: 0.129074
         Iterations 12
Optimization terminated successfully.
         Current function value: 0.128827
         Iterations 13
Optimization terminated successfully.
         Current function value: 0.128661
         Iterations 12
Optimization terminated successfully.
         Current function value: 0.129009
         Iterations 13
Optimization terminated successfully.
         Current function value: 0.

Optimization terminated successfully.
         Current function value: 0.123649
         Iterations 13
Optimization terminated successfully.
         Current function value: 0.123663
         Iterations 13
Optimization terminated successfully.
         Current function value: 0.122569
         Iterations 12
Optimization terminated successfully.
         Current function value: 0.123671
         Iterations 13
Optimization terminated successfully.
         Current function value: 0.121779
         Iterations 13
Optimization terminated successfully.
         Current function value: 0.122588
         Iterations 12
Optimization terminated successfully.
         Current function value: 0.123666
         Iterations 13
Optimization terminated successfully.
         Current function value: 0.123651
         Iterations 13
Optimization terminated successfully.
         Current function value: 0.122839
         Iterations 12
Optimization terminated successfully.
         Current function value: 0.

####  sklearn 패키지의 RFE 함수 (비추천)

In [14]:
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [15]:
# Standardize every feature before running RFE
# (these notes list the mandatory scaling step as drawback 1 of RFE).
scaler = StandardScaler()
X_std = pd.DataFrame(scaler.fit_transform(X), columns=xvar)

In [16]:
# Recursive feature elimination wrapped around a liblinear logistic regression.
# The number of features to keep (4) must be fixed in advance
# (these notes list that as drawback 2 of RFE).
model = LogisticRegression(solver="liblinear")
rfe = RFE(estimator=model, n_features_to_select=4).fit(X_std, y)

selected_features = xvar[rfe.support_]
print("Num Features: ", rfe.n_features_)
print("Selected Features: ", selected_features)

Num Features:  4
Selected Features:  Index(['Para09', 'Para12', 'Para24', 'Para66'], dtype='object')


#### Genetic 알고리즘

In [None]:
import genetic_algorithm as genetic

In [None]:
# Name of the target column.
target = 'Defect'
# Encode "NG" as 1 and everything else as 0, but only while the column still holds the raw
# labels: re-running this cell on the already-encoded column would map every row to 0.
if not pd.api.types.is_numeric_dtype(df[target]):
    df[target] = np.where(df[target] == "NG", 1, 0)

In [None]:
# Run the genetic algorithm to pick a feature subset.
# Seed NumPy's global RNG first — assumes genetic.ga draws from np.random; TODO confirm.
np.random.seed(0)
# An earlier variant of this call passed 1000 instead of 100 as the last argument.
# NOTE(review): the meaning of the positional arguments 10 and 100 is defined in
# genetic_algorithm.ga — confirm against that module.
feature_set, acc_score = genetic.ga(df, xvar, target, 10, 100)

In [None]:
# Report the columns whose entry in feature_set equals 1, plus the score shown as "AIC".
# NOTE(review): the AIC label assumes acc_score is the reciprocal of the AIC — confirm in genetic_algorithm.
chosen_columns = X.columns[feature_set == 1]
aic_value = round(1 / acc_score)
print('Optimal Feature Set\n', chosen_columns, '\nAIC =', aic_value)

### Embedded 방법을 이용한 feature selection

#### Decision Tree 방법

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on all features (fixed random_state) and show the per-feature importances.
clf = DecisionTreeClassifier(random_state=0).fit(X, y)
clf.feature_importances_

In [None]:
# Columns whose tree importance exceeds 0.001.
importance_mask = clf.feature_importances_ > 0.001
X.columns[importance_mask]