In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score

In [3]:
def grid(estimator, scoring, cv, X, y):
    return GridSearchCV(
    estimator = estimator,
    param_grid = params,
    cv=cv,
    verbose=True,
    n_jobs=-3
    ).fit(X, y)

In [4]:
params = {
    "n_estimators": [100, 200, 300, 400, 500],
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 4, 5, 6, 7],
    "max_features": ["auto", "sqrt", "log2"],
    "bootstrap": [True, False],
    "warm_start": [True, False]
  },

# 카테고리 1, 4, 5, 6, 8, 9, 11, 12번째 feature

In [5]:
cat_features = ['0', '3', '4', '5', '7', '8', '10', '11']
len(cat_features)

8

In [6]:
data = pd.read_csv('dataset/australian/data.csv')
data.dropna(axis=0, inplace=True)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1,22.08,11.46,2,4,4,1.585,0,0,0,1,2,100,1213,0
1,0,22.67,7.0,2,8,4,0.165,0,0,0,0,2,160,1,0
2,0,29.58,1.75,1,4,4,1.25,0,0,0,1,2,280,1,0
3,0,21.67,11.5,1,5,3,0.0,1,1,11,1,2,0,1,1
4,1,20.17,8.17,2,6,4,1.96,1,1,14,0,2,60,159,1


In [7]:
data.shape

(690, 15)

# 1. discretize each features
# 2. fast correlation based filter
# 3. check performance
- raw vs mdlp = 변화없음
- fcbf raw vs fcbf mdlp = ?

In [8]:
X = data.iloc[:, :14]
y = data['14']

In [9]:
cat_X = data.loc[:, cat_features]
cat_X.head()

Unnamed: 0,0,3,4,5,7,8,10,11
0,1,2,4,4,0,0,1,2
1,0,2,8,4,0,0,0,2
2,0,1,4,4,0,0,1,2
3,0,1,5,3,1,1,1,2
4,1,2,6,4,1,1,0,2


In [10]:
con_features = set(X.columns) - set(cat_features)
con_features

{'1', '12', '13', '2', '6', '9'}

In [11]:
con_X = data.loc[:, con_features]
con_X.head()

Unnamed: 0,12,2,1,6,13,9
0,100,11.46,22.08,1.585,1213,0
1,160,7.0,22.67,0.165,1,0
2,280,1.75,29.58,1.25,1,0
3,0,11.5,21.67,0.0,1,11
4,60,8.17,20.17,1.96,159,14


In [12]:
from discretization.mdlp import *
mdlp = MDLP(con_features=con_features, base=2, max_cutpoints=5)

In [13]:
con_X_dis = mdlp.fit_transform(con_X, y)

In [14]:
X_dis = pd.concat([cat_X, con_X_dis], axis=1)

In [17]:
estimator = RandomForestClassifier()

In [18]:
grid_search = grid(estimator, 'accuracy', 10, X, y)

Fitting 10 folds for each of 600 candidates, totalling 6000 fits


[Parallel(n_jobs=-3)]: Done  60 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-3)]: Done 378 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-3)]: Done 628 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-3)]: Done 978 tasks      | elapsed:   37.6s
[Parallel(n_jobs=-3)]: Done 1428 tasks      | elapsed:   55.8s
[Parallel(n_jobs=-3)]: Done 1978 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-3)]: Done 2628 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-3)]: Done 3378 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-3)]: Done 4228 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-3)]: Done 5178 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-3)]: Done 6000 out of 6000 | elapsed:  4.3min finished


In [19]:
grid_search.best_score_

0.86956521739130432

In [20]:
grid_search = grid(estimator, 'f1', 10, X, y)

Fitting 10 folds for each of 600 candidates, totalling 6000 fits


[Parallel(n_jobs=-3)]: Done  60 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-3)]: Done 310 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-3)]: Done 560 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-3)]: Done 910 tasks      | elapsed:   34.8s
[Parallel(n_jobs=-3)]: Done 1360 tasks      | elapsed:   53.2s
[Parallel(n_jobs=-3)]: Done 1910 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-3)]: Done 2560 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-3)]: Done 3310 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-3)]: Done 4160 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-3)]: Done 5110 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-3)]: Done 6000 out of 6000 | elapsed:  4.3min finished


In [21]:
grid_search.best_score_

0.86956521739130432

In [22]:
grid_search = grid(estimator, 'accuracy', 10, X_dis, y)

Fitting 10 folds for each of 600 candidates, totalling 6000 fits


[Parallel(n_jobs=-3)]: Done  60 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-3)]: Done 311 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-3)]: Done 561 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-3)]: Done 911 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-3)]: Done 1361 tasks      | elapsed:   50.4s
[Parallel(n_jobs=-3)]: Done 1911 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-3)]: Done 2561 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-3)]: Done 3311 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-3)]: Done 4161 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-3)]: Done 5111 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-3)]: Done 6000 out of 6000 | elapsed:  3.8min finished


In [23]:
grid_search.best_score_

0.86811594202898545

In [24]:
grid_search = grid(estimator, 'f1', 10, X_dis, y)

Fitting 10 folds for each of 600 candidates, totalling 6000 fits


[Parallel(n_jobs=-3)]: Done  60 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-3)]: Done 324 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-3)]: Done 574 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-3)]: Done 924 tasks      | elapsed:   33.7s
[Parallel(n_jobs=-3)]: Done 1374 tasks      | elapsed:   52.4s
[Parallel(n_jobs=-3)]: Done 1924 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-3)]: Done 2574 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-3)]: Done 3324 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-3)]: Done 4174 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-3)]: Done 5124 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-3)]: Done 6000 out of 6000 | elapsed:  3.9min finished


In [25]:
grid_search.best_score_

0.86956521739130432

In [26]:
from fcbf.feature_selection import *
selected = fcbf(X_dis, y, base=2)

In [27]:
selected_features = selected[0]
selected_features

[('7', 0.42791772566979708),
 ('9', 0.17871573608333077),
 ('13', 0.12403498302183787),
 ('6', 0.11060370163391238),
 ('4', 0.048648934269122118),
 ('12', 0.039026327090565603),
 ('3', 0.032757270846763803)]

In [28]:
remove_history = selected[1]
remove_history

{'12': [],
 '13': [],
 '4': [],
 '6': [],
 '7': [{'2': 0.043777054762339319},
  {'5': 0.03966521895353995},
  {'1': 0.033779660239261343},
  {'11': 0.025721196772722429},
  {'10': 0.0060414211863744635}],
 '9': [{'8': 0.82653374883879427}, {'0': 0.0025244982201832471}]}

In [29]:
best_features = ['7', '9', '13', '6']

In [30]:
grid_search = grid(estimator, 'accuracy', 10, X.loc[:,best_features], y)

Fitting 10 folds for each of 600 candidates, totalling 6000 fits


[Parallel(n_jobs=-3)]: Done  60 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-3)]: Done 306 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-3)]: Done 556 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-3)]: Done 906 tasks      | elapsed:   35.0s
[Parallel(n_jobs=-3)]: Done 1356 tasks      | elapsed:   53.1s
[Parallel(n_jobs=-3)]: Done 1906 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-3)]: Done 2556 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-3)]: Done 3306 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-3)]: Done 4156 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-3)]: Done 5106 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-3)]: Done 6000 out of 6000 | elapsed:  4.2min finished


In [31]:
grid_search.best_score_

0.85797101449275359

In [32]:
grid_search = grid(estimator, 'f1', 10, X.loc[:,best_features], y)

Fitting 10 folds for each of 600 candidates, totalling 6000 fits


[Parallel(n_jobs=-3)]: Done  40 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-3)]: Done 249 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-3)]: Done 499 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-3)]: Done 849 tasks      | elapsed:   31.7s
[Parallel(n_jobs=-3)]: Done 1299 tasks      | elapsed:   50.4s
[Parallel(n_jobs=-3)]: Done 1849 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-3)]: Done 2499 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-3)]: Done 3249 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-3)]: Done 4099 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-3)]: Done 5049 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-3)]: Done 6000 out of 6000 | elapsed:  4.1min finished


In [33]:
grid_search.best_score_

0.85652173913043483

In [34]:
grid_search = grid(estimator, 'accuracy', 10, X_dis.loc[:, best_features], y)

Fitting 10 folds for each of 600 candidates, totalling 6000 fits


[Parallel(n_jobs=-3)]: Done  60 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-3)]: Done 321 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-3)]: Done 571 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-3)]: Done 921 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-3)]: Done 1371 tasks      | elapsed:   49.2s
[Parallel(n_jobs=-3)]: Done 1921 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-3)]: Done 2571 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-3)]: Done 3321 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-3)]: Done 4171 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-3)]: Done 5121 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-3)]: Done 6000 out of 6000 | elapsed:  3.6min finished


In [35]:
grid_search.best_score_

0.8623188405797102

In [36]:
grid_search = grid(estimator, 'f1', 10, X_dis.loc[:, best_features], y)

Fitting 10 folds for each of 600 candidates, totalling 6000 fits


[Parallel(n_jobs=-3)]: Done  60 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-3)]: Done 306 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-3)]: Done 556 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-3)]: Done 906 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-3)]: Done 1356 tasks      | elapsed:   49.9s
[Parallel(n_jobs=-3)]: Done 1906 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-3)]: Done 2556 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-3)]: Done 3306 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-3)]: Done 4156 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-3)]: Done 5106 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-3)]: Done 6000 out of 6000 | elapsed:  3.7min finished


In [37]:
grid_search.best_score_

0.86376811594202896

# whether to use mdlp or not
> 성능에 그렇게 큰 차이는 없다

# whether to use correlation based filter feature selection
> 전부를 쓰던 추출한 일부를 쓰던 성능에 그렇게 큰 차이는 없다