In [1]:
import sklearn
import sklearn.datasets
import pandas as pd
from sklearn import linear_model
import operator
import numpy as np
import itertools
from sklearn import metrics
from scipy.optimize import minimize 
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression

from fangorn.files_prep import get_data, data_to_pandas
from fangorn.preprocessing import splitting, feature_selection
from fangorn.training import classifiers

from category_encoders import OneHotEncoder

In [2]:
# Arcene train/validation features and labels: space-separated text files
# without a header row, read in the same order as before.
X_train, y_train, X_valid, y_valid = (
    pd.read_csv(path, sep=' ', header=None)
    for path in (
        'arcene_train.data',
        'arcene_train.labels',
        'arcene_valid.data',
        'arcene_valid.labels',
    )
)

In [3]:
def numpy_discretize(X_train, X_test, max_gran=10, min_gran=10):
    """
    Multi-granularity discretization of the numeric columns of two frames.

    Instead of committing to one fine-tuned granularity, every numeric
    feature is replaced by several categorical features, one for each
    granularity (number of equal-width bins) in
    ``range(min_gran, max_gran + 1)``. Bin edges come from ``np.histogram``
    on the training column only and are applied to both frames with
    ``np.digitize``.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames sharing the same columns. Neither frame is modified; new
        frames are returned.
    max_gran : int, default 10
        Largest number of bins to generate (inclusive).
    min_gran : int, default 10
        Smallest number of bins to generate (inclusive). With the defaults a
        single 10-bin encoding is produced per feature.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test)`` where the non-numeric columns come first in
        their original order, followed by one float column named
        ``"{feat}_{gran}"`` per numeric feature and granularity. Codes follow
        ``np.digitize(..., right=False)``: training values land in
        ``1 .. gran + 1`` (the training maximum gets ``gran + 1``), and test
        values below the training minimum get ``0``.

    Notes
    -----
    When the bin edges cannot be computed for a feature (``np.histogram``
    raises ``ValueError``, e.g. for an all-NaN column) a message is printed,
    the remaining granularities of that feature are skipped, and the original
    feature is still dropped from the output.
    """

    # Numeric dtypes that get binned; every other column passes through as is.
    is_numeric = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_features = X_train.select_dtypes(include=is_numeric)
    print(f"Discretizing {len(numeric_features.columns)} features...")

    # Collect the new columns and attach them once at the end: inserting and
    # dropping columns one by one copies the whole frame for every feature.
    train_codes = {}
    test_codes = {}
    for feat_count, feat in enumerate(numeric_features):
        if feat_count % 50 == 0:
            print(f" Working in {feat}")
        train_values = X_train[[feat]].to_numpy()[:, 0]
        test_values = X_test[[feat]].to_numpy()[:, 0]
        for gran in range(min_gran, max_gran + 1):
            try:
                # Edges are learned on the training data only.
                _, bin_edges = np.histogram(train_values, bins=gran)
                train_binned = np.digitize(train_values, bin_edges, right=False)
                test_binned = np.digitize(test_values, bin_edges, right=False)
            except ValueError:
                print(f"Not possible to correct work on cut {feat} > {gran}")
                break
            # Stored as float to keep the dtype the columns always had.
            train_codes[f"{feat}_{gran}"] = train_binned.astype(float)
            test_codes[f"{feat}_{gran}"] = test_binned.astype(float)

    numeric_names = list(numeric_features.columns)
    X_train_out = pd.concat(
        [X_train.drop(columns=numeric_names),
         pd.DataFrame(train_codes, index=X_train.index)],
        axis=1,
    )
    X_test_out = pd.concat(
        [X_test.drop(columns=numeric_names),
         pd.DataFrame(test_codes, index=X_test.index)],
        axis=1,
    )
    return X_train_out, X_test_out

In [4]:
# Discretize copies of the raw frames so X_train / X_valid stay untouched.
X_train_discrete, X_test_discrete = numpy_discretize(
    X_train.copy(),
    X_valid.copy(),
)

Discretizing 10001 features...
 Working in 0
 Working in 50
 Working in 100
 Working in 150
 Working in 200
 Working in 250
 Working in 300
 Working in 350
 Working in 400
 Working in 450
 Working in 500
 Working in 550
 Working in 600
 Working in 650
 Working in 700
 Working in 750
 Working in 800
 Working in 850
 Working in 900
 Working in 950
 Working in 1000
 Working in 1050
 Working in 1100
 Working in 1150
 Working in 1200
 Working in 1250
 Working in 1300
 Working in 1350
 Working in 1400
 Working in 1450
 Working in 1500
 Working in 1550
 Working in 1600
 Working in 1650
 Working in 1700
 Working in 1750
 Working in 1800
 Working in 1850
 Working in 1900
 Working in 1950
 Working in 2000
 Working in 2050
 Working in 2100
 Working in 2150
 Working in 2200
 Working in 2250
 Working in 2300
 Working in 2350
 Working in 2400
 Working in 2450
 Working in 2500
 Working in 2550
 Working in 2600
 Working in 2650
 Working in 2700
 Working in 2750
 Working in 2800
 Working in 2850
 Worki

In [5]:
import pymit

def hjmi_selector(X, y, bins, max_features, min_gain=0.03):
    """
    Greedy forward feature selection based on a cumulative joint mutual
    information score computed with ``pymit``.

    Every column of ``X`` is first discretized into ``bins`` equal-width bins.
    In each round every not-yet-selected feature ``k`` is scored as::

        score(k) = previous_best + I(k; Y) - mean_j [ I(k; j) - I(k; j | Y) ]

    where ``j`` runs over the features selected so far. The best-scoring
    feature is accepted while the relative improvement over the previous
    accepted score exceeds ``min_gain``; the first feature is always accepted.
    One line per accepted feature is printed (rank, column position, score,
    relative gain).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix with numeric columns.
    y : pandas.DataFrame or pandas.Series
        Labels; flattened to 1-D. The label axis is histogrammed with 2 bins,
        so binary labels are assumed.
    bins : int
        Number of bins used both for the discretization and for the mutual
        information estimates.
    max_features : int
        Upper bound on the number of features to select.
    min_gain : float, default 0.03
        Minimum relative score improvement required to accept another
        feature.

    Returns
    -------
    list of int
        Column positions of the selected features, in selection order.

    Notes
    -----
    Relies on ``pymit`` being imported in the enclosing namespace.
    """
    X = X.to_numpy()
    Y = y.to_numpy().ravel()

    n_samples, features = X.shape
    if features == 0 or max_features <= 0:
        return []

    # Discretize each column with the requested number of bins.
    D = np.zeros([n_samples, features])
    for i in range(features):
        _, edges = np.histogram(X[:, i], bins=bins)
        D[:, i] = np.digitize(X[:, i], edges, right=False)

    # I(k; Y) never changes between rounds, so compute it once.
    relevance = np.array(
        [pymit.I(D[:, k], Y, bins=[bins, 2]) for k in range(features)],
        dtype=float,
    )
    # Running sum over selected j of I(k; j) - I(k; j | Y), updated
    # incrementally in selection order instead of being rebuilt every round.
    redundancy = np.zeros(features, dtype=float)
    is_selected = np.zeros(features, dtype=bool)

    selected_features = []
    j_h = 0
    hjmi = None

    for _ in range(max_features):
        if len(selected_features) == features:
            break  # nothing left to pick

        if selected_features:
            newest = selected_features[-1]
            for k in range(features):
                if is_selected[k]:
                    continue
                shared = pymit.I(D[:, k], D[:, newest], bins=[bins, bins])
                shared_given_y = pymit.I_cond(
                    D[:, k], D[:, newest], Y, bins=[bins, bins, 2]
                )
                redundancy[k] += shared - shared_given_y
            scores = j_h + relevance - redundancy / len(selected_features)
            # Already-selected features must never win the argmax.
            scores[is_selected] = -np.inf
        else:
            scores = j_h + relevance

        f = int(scores.argmax())
        j_h = scores[f]

        if hjmi is None:
            r = 0
        else:
            if hjmi != 0:
                r = (j_h - hjmi) / hjmi
            else:
                # Avoid 0/0: any positive score is an unbounded improvement.
                r = np.inf if j_h > 0 else 0.0
            if not r > min_gain:
                break

        hjmi = j_h
        selected_features.append(f)
        is_selected[f] = True
        print("{:0>3d} {:>3d} {} - {}".format(len(selected_features), f, j_h, r))

    return selected_features

In [6]:
# Run the selector on copies of the discretized training data and labels.
selected_features = hjmi_selector(
    X_train_discrete.copy(),
    y_train.copy(),
    bins=10,
    max_features=99,
)

001 7647 0.34408924535842605 - 0
002 3920 0.728588866520957 - 1.1174415543328324
003 2146 1.1466533521469633 - 0.573800266290483
004 4367 1.6147014196891383 - 0.40818619390578176
005 131 2.096194570800886 - 0.298193303876852
006 5573 2.54426955247536 - 0.2137563887990034
007 829 2.9948120955917648 - 0.17708129340229095
008 5872 3.4476955788856 - 0.15122267068456824
009 5231 3.895558287992757 - 0.1299020458331533
010 3780 4.336451342146714 - 0.1131784000031311
011 6845 4.780944406909994 - 0.10250156860822474
012 4465 5.215701414840886 - 0.09093538241158583
013 2097 5.667638058867056 - 0.08664925540795305
014 7563 6.111956398123292 - 0.07839567993603555
015 4240 6.543160249579703 - 0.07055087166341935
016 8825 6.972529246750594 - 0.06562104255332449
017 6984 7.402789314584174 - 0.061707890007645955
018 731 7.833103224711253 - 0.05812861771971825
019 9407 8.258159770742989 - 0.054264131831022126
020 8359 8.682040943611577 - 0.051328768713135686
021 9465 9.110715668400303 - 0.0493748794290

In [8]:
# selected_features holds column positions; translate them to labels per frame
# and keep only those columns.
filtered_train = X_train_discrete.loc[:, X_train_discrete.columns.take(selected_features)]
filtered_test = X_test_discrete.loc[:, X_test_discrete.columns.take(selected_features)]

In [None]:
# Preview only the first rows instead of rendering the whole frame.
filtered_train.head()

In [9]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
# y_train was read as a one-column DataFrame; scikit-learn expects 1-D labels
# and emits a DataConversionWarning for an (n, 1) column vector, so flatten it.
neigh.fit(filtered_train, y_train.to_numpy().ravel())


  This is separate from the ipykernel package so we can avoid doing imports until


KNeighborsClassifier(n_neighbors=3)

In [10]:
from sklearn.metrics import accuracy_score
# Share of validation samples whose predicted label matches the true one.
accuracy_score(
    y_true=y_valid,
    y_pred=neigh.predict(filtered_test),
)

0.78

In [36]:
MADELON_TRAIN = "madelon_train.data"
MADELON_TRAIN_LABELS = "madelon_train.labels"

# The np.float alias was removed in NumPy 1.24; the builtin float selects the
# same float64 dtype.
data_raw = np.loadtxt(MADELON_TRAIN, dtype=float)
labels = np.loadtxt(MADELON_TRAIN_LABELS, dtype=float)

X = data_raw
Y = labels
[tmp, features] = X.shape
D = np.zeros([tmp, features])
for i in range(features):
    # Equal-width bin edges per column, then map every value to its bin index.
    N, E = np.histogram(X[:,i], bins=10)
    D[:,i] = np.digitize(X[:,i], E, right=False)

In [20]:
import numpy
import pymit

MADELON_TRAIN = "madelon_train.data"
MADELON_TRAIN_LABELS = "madelon_train.labels"

# The numpy.float alias was removed in NumPy 1.24; the builtin float selects
# the same float64 dtype.
data_raw = numpy.loadtxt(MADELON_TRAIN, dtype=float)
labels = numpy.loadtxt(MADELON_TRAIN_LABELS, dtype=float)

X = data_raw
Y = labels
bins = 10

[tmp, features] = X.shape
D = numpy.zeros([tmp, features])

for i in range(features):
    # Equal-width bin edges per column, then map every value to its bin index.
    N, E = numpy.histogram(X[:,i], bins=bins)
    D[:,i] = numpy.digitize(X[:,i], E, right=False)

# max_features = 200
# selected_features = []
# j_h = 0
# hjmi = None

# for i in range(0,max_features):
#     JMI = numpy.zeros([features], dtype=numpy.float)
#     for X_k in range(features):
#         if X_k in selected_features:
#             continue
#         jmi_1 = pymit.I(D[:,X_k], Y, bins=[bins,2])
#         jmi_2 = 0
#         for X_j in selected_features:
#             tmp1 = pymit.I(D[:,X_k], D[:,X_j], bins=[bins,bins])
#             tmp2 = pymit.I_cond(D[:,X_k], D[:,X_j], Y, bins=[bins,bins,2])
#             jmi_2 += tmp1 - tmp2
#         if len(selected_features) == 0:
#             JMI[X_k] += j_h + jmi_1
#         else:
#             JMI[X_k] += j_h + jmi_1 - jmi_2/len(selected_features)
#     f = JMI.argmax()
#     j_h = JMI[f]
#     if (hjmi == None) or ((j_h - hjmi)/hjmi > 0.03):
#         r = 0
#         if hjmi != None:
#             r = ((j_h - hjmi)/hjmi) 
        
#         hjmi = j_h
#         selected_features.append(f)
#         print("{:0>3d} {:>3d} {} - {}".format(len(selected_features), f, j_h, r))
#     else:
#         break    

# expected_features=[241, 338, 378, 105, 472, 475, 433, 64, 128, 442, 453, 336, 48, 493, 281, 318, 153, 28, 451, 455]
# assert(expected_features == selected_features)

NameError: name 'asd' is not defined