In [19]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score

In [21]:
def grid(estimator, scoring, cv, X, y):
    return GridSearchCV(
    estimator = estimator,
    param_grid = params,
    cv=cv,
    verbose=True,
    n_jobs=-3
    ).fit(X, y)

In [22]:
params = {
    "n_estimators": [100, 200, 300, 400, 500],
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 4, 5, 6, 7],
    "max_features": ["auto", "sqrt", "log2"],
    "bootstrap": [True, False],
    "warm_start": [True, False]
  },

In [23]:
data = pd.read_csv('dataset/ionosphere/data.csv')
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,g
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,b
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,g
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,b
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,g


In [24]:
data['1'].value_counts()

0    351
Name: 1, dtype: int64

In [25]:
data.drop(['1'], axis=1, inplace=True)

In [26]:
X = data.drop('34', axis=1)
y = data['34']

In [27]:
len(X.columns)

33

In [28]:
X.head()

Unnamed: 0,0,2,3,4,5,6,7,8,9,10,...,24,25,26,27,28,29,30,31,32,33
0,1,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,0.85243,...,0.56811,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453
1,1,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,0.50874,...,-0.20332,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447
2,1,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,0.73082,...,0.57528,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238
3,1,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,0.0,...,1.0,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0
4,1,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,0.52798,...,0.03286,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697


In [29]:
from discretization.mdlp import *
mdlp = MDLP(con_features=X.columns, base=2, max_cutpoints=5, n_jobs=4)

In [31]:
%%time
X_dis = mdlp.fit_transform(X, y)

CPU times: user 14.2 ms, sys: 16.7 ms, total: 30.8 ms
Wall time: 22.1 s


In [15]:
from fcbf.feature_selection import *
result = fcbf(X_dis, y, threshold=0, base=2)

In [16]:
result[0]

[('4', 0.34477893686864958),
 ('27', 0.29022156419603506),
 ('26', 0.2624786660949005),
 ('8', 0.2373995901098461),
 ('13', 0.2163907153256526),
 ('3', 0.2110259601955207)]

In [21]:
best_features = ['4', '27', '26', '8', '13', '3']

In [22]:
estimator = RandomForestClassifier()

In [23]:
y.value_counts()

g    225
b    126
Name: 34, dtype: int64

In [25]:
grid_search = grid(estimator, 'f1', 10, X, y)
grid_search.best_score_

Fitting 10 folds for each of 600 candidates, totalling 6000 fits


[Parallel(n_jobs=-3)]: Done  40 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-3)]: Done 249 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-3)]: Done 499 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-3)]: Done 849 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-3)]: Done 1299 tasks      | elapsed:   51.6s
[Parallel(n_jobs=-3)]: Done 1849 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-3)]: Done 2499 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-3)]: Done 3249 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-3)]: Done 4099 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-3)]: Done 5049 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-3)]: Done 6000 out of 6000 | elapsed:  5.0min finished


0.94017094017094016

In [26]:
grid_search = grid(estimator, 'f1', 10, X.loc[:, best_features], y)
grid_search.best_score_

Fitting 10 folds for each of 600 candidates, totalling 6000 fits


[Parallel(n_jobs=-3)]: Done  60 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-3)]: Done 311 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-3)]: Done 561 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-3)]: Done 911 tasks      | elapsed:   32.6s
[Parallel(n_jobs=-3)]: Done 1361 tasks      | elapsed:   49.5s
[Parallel(n_jobs=-3)]: Done 1911 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-3)]: Done 2561 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-3)]: Done 3311 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-3)]: Done 4161 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-3)]: Done 5111 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-3)]: Done 6000 out of 6000 | elapsed:  4.0min finished


0.92592592592592593

In [27]:
grid_search = grid(estimator, 'f1', 10, X_dis, y)
grid_search.best_score_

Fitting 10 folds for each of 600 candidates, totalling 6000 fits


[Parallel(n_jobs=-3)]: Done  40 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-3)]: Done 340 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-3)]: Done 840 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-3)]: Done 1301 tasks      | elapsed:   46.1s
[Parallel(n_jobs=-3)]: Done 1751 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-3)]: Done 2301 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-3)]: Done 2951 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-3)]: Done 3701 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-3)]: Done 4551 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-3)]: Done 5501 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-3)]: Done 6000 out of 6000 | elapsed:  3.6min finished


0.95441595441595439

In [28]:
grid_serach = grid(estimator, 'f1', 10, X_dis.loc[:, best_features], y)
grid_serach.best_score_

Fitting 10 folds for each of 600 candidates, totalling 6000 fits


[Parallel(n_jobs=-3)]: Done  60 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-3)]: Done 305 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-3)]: Done 555 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-3)]: Done 905 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-3)]: Done 1355 tasks      | elapsed:   45.6s
[Parallel(n_jobs=-3)]: Done 1905 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-3)]: Done 2555 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-3)]: Done 3305 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-3)]: Done 4155 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-3)]: Done 5105 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-3)]: Done 6000 out of 6000 | elapsed:  3.4min finished


0.93162393162393164