In [1]:
# import linecache
# from collections import Counter
# import os
# import tracemalloc
#
# def display_top(snapshot, key_type='lineno', limit=3):
#     snapshot = snapshot.filter_traces((
#         tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
#         tracemalloc.Filter(False, "<unknown>"),
#     ))
#     top_stats = snapshot.statistics(key_type)
#
#     print("Top %s lines" % limit)
#     for index, stat in enumerate(top_stats[:limit], 1):
#         frame = stat.traceback[0]
#         # replace "/path/to/module/file.py" with "module/file.py"
#         filename = os.sep.join(frame.filename.split(os.sep)[-2:])
#         print("#%s: %s:%s: %.1f KiB"
#               % (index, filename, frame.lineno, stat.size / 1024))
#         line = linecache.getline(frame.filename, frame.lineno).strip()
#         if line:
#             print('    %s' % line)
#
#     other = top_stats[limit:]
#     if other:
#         size = sum(stat.size for stat in other)
#         print("%s other: %.1f KiB" % (len(other), size / 1024))
#     total = sum(stat.size for stat in top_stats)
#     print("Total allocated size: %.1f KiB" % (total / 1024))

# tracemalloc.start()
# counts = Counter()
# experimentation(classification_dataset_names[0])
# snapshot = tracemalloc.take_snapshot()
# display_top(snapshot)

In [2]:
from ex_func import *
from experiment_functions import *
import pandas as pd
from pmlb import fetch_data, classification_dataset_names
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import pickle



In [3]:
def get_feature_type(x, include_binary=False):
    """Classify a feature column as 'continuous', 'binary' or 'categorical'.

    Parameters
    ----------
    x : pandas.Series
        Feature values; missing values are ignored for the classification.
    include_binary : bool, default False
        When True, an all-integer column with exactly two distinct values is
        reported as 'binary' instead of 'categorical'.

    Returns
    -------
    str
        'continuous' if any value is non-integer or there are more than 10
        distinct values, otherwise 'binary' (see ``include_binary``) or
        'categorical'.
    """
    # Work on a NaN-free copy: dropna(inplace=True) would modify the Series
    # object handed in by the caller.
    values = x.dropna()
    if not check_if_all_integers(values):
        return 'continuous'

    n_unique = values.nunique()
    # More than 10 distinct integer values is treated as a numeric scale.
    if n_unique > 10:
        return 'continuous'
    if include_binary and n_unique == 2:
        return 'binary'
    return 'categorical'

def get_target_type(x, include_binary=False):
    """Classify a target column by its dtype.

    Parameters
    ----------
    x : pandas.Series
        Target values; missing values are dropped on a copy before checking.
    include_binary : bool, default False
        When True, an integer target with exactly two distinct values is
        reported as 'binary' instead of 'categorical'.

    Returns
    -------
    str
        'continuous' for floating-point targets, 'binary' or 'categorical'
        for integer targets.

    Raises
    ------
    ValueError
        If the dtype is neither floating-point nor integer.
    """
    # Non-mutating dropna: the caller's Series is left untouched.
    values = x.dropna()
    # Accept every float / integer width (float32, int32, ...), not only the
    # 64-bit variants.
    if pd.api.types.is_float_dtype(values.dtype):
        return 'continuous'
    if pd.api.types.is_integer_dtype(values.dtype):
        if include_binary and values.nunique() == 2:
            return 'binary'
        return 'categorical'
    raise ValueError("Error getting type")

def check_if_all_integers(x):
    """Return True when every distinct value of the Series is integer-valued.

    Each unique value is converted with ``float`` and tested with
    ``float.is_integer``; an empty Series yields True.
    """
    for value in x.unique():
        if not float(value).is_integer():
            return False
    return True
def corr_data_for(df):
    """One-hot encode the categorical features of ``df`` and split off the target.

    Feature columns (everything except 'target') are typed with
    ``get_feature_type``; the categorical ones are expanded with
    ``pd.get_dummies``. The dataset is only accepted when it stays small:
    fewer added dummy columns than 30% of the rows, fewer encoded columns
    than rows, fewer than 10000 rows and fewer than 100 original columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing a 'target' column.

    Returns
    -------
    (y, X)
        ``y`` is the 'target' column and ``X`` the encoded feature frame.
        When the dataset is rejected, two empty ``pd.DataFrame`` instances
        are returned so callers can test ``X.empty``.
    """
    TARGET_NAME = 'target'
    feat_names = [name for name in df.columns if name != TARGET_NAME]
    types = [get_feature_type(df[name], include_binary=True) for name in feat_names]

    # Group the feature names by detected type.
    num_col = [name for name, kind in zip(feat_names, types) if kind == 'continuous']
    bin_col = [name for name, kind in zip(feat_names, types) if kind == 'binary']
    cat_col = [name for name, kind in zip(feat_names, types) if kind == 'categorical']

    dummy_col = pd.get_dummies(data=df, columns=cat_col)
    n_rows, n_cols = df.shape
    add_col = dummy_col.shape[1] - n_cols

    small_enough = (
        add_col < n_rows * 0.3
        and dummy_col.shape[1] < n_rows
        and n_rows < 10000
        and n_cols < 100
    )
    if not small_enough:
        # Return real (empty) frames, not the DataFrame class itself.
        return pd.DataFrame(), pd.DataFrame()

    encoded = dummy_col
    # regex=False: '.' must be replaced literally. As a regular expression it
    # matches every character and would wipe all column names (incl. 'target').
    encoded.columns = encoded.columns.str.replace('.', '_', regex=False)
    y = encoded[TARGET_NAME]
    X = encoded.loc[:, encoded.columns != TARGET_NAME]

    rows_data, columns_data = X.shape
    print('Dataset Information')
    print('Rows:',rows_data,)
    print('Columns:',columns_data)
    print('Number of classes:',y.nunique())
    print('Continous columns:', len(num_col))
    print('Binary columns:', len(bin_col))
    print('Categorical columns:',len(cat_col))
    print('-------------------------------------------------')
    return y, X

In [4]:
def split_function(y,X,it):
    """Create a stratified 80/20 train/test split and standardise the features.

    Parameters
    ----------
    y : target values, also used as the stratification labels.
    X : pandas.DataFrame of features.
    it : value passed as ``random_state`` to ``train_test_split``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The feature frames are rebuilt from the scaler output with the
        original column names. The scaler is fitted on the training part only.

    Notes
    -----
    The feature frames are rebuilt without an ``index`` argument, so they do
    not carry over the row labels that ``y_train`` / ``y_test`` keep.
    NOTE(review): confirm downstream code never aligns X and y on the index.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=it, stratify=y
    )
    feature_names = X_train.columns

    # The former `X_train.name = ...` / `X_test.name = ...` assignments were
    # dropped: the frames are rebound right below, and on a dataset with a
    # feature column called "name" they overwrote that column with a string.
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_names)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_names)
    return X_train, X_test, y_train, y_test

In [5]:
# classification_dataset_names = classification_dataset_names[30:33]

In [6]:
def experimentation(classification_dataset, iters=5,
                    factors=(0.5, 1, 1.2, 1.4, 1.6, 1.8, 2, 2.5, 3),
                    sub_paths=True, depth_grid=range(2, 3)):
    """Run the rule-feature benchmark for one PMLB classification dataset.

    For every split seed ``it`` in ``range(iters)`` the function
      1. splits and scales the data with ``split_function``,
      2. fits the tree models via ``generate_tree`` and stores their mean
         performance,
      3. turns the usable tree models into rule features with
         ``gen_train_and_test_features``,
      4. scores each downstream pipeline (LN, SVM, NB, KNN) on the rule-only
         and the rules+features frames, first with all columns and then with
         ``SelectKBest`` subsets sized ``round(n_original_features * factor)``,
      5. scores each pipeline on the plain scaled features as a baseline.

    Parameters
    ----------
    classification_dataset : str
        PMLB dataset name passed to ``fetch_data``.
    iters : int, default 5
        Number of train/test splits; the split index is used as random seed.
    factors : iterable of numbers
        Multipliers of the original feature count used for feature selection.
    sub_paths : bool, default True
        Forwarded to ``generate_tree``.
    depth_grid : iterable of int, default ``range(2, 3)``
        Forwarded to ``generate_tree``.

    Returns
    -------
    dict
        Maps ``(dataset, method label, iteration, factor)`` to the value
        returned by the tree evaluation / pipeline (presumably an accuracy
        score -- confirm against ``experiment_functions``). Empty when
        ``corr_data_for`` rejects the dataset.
    """
    res_rul = {}
    names = ['Reg-CART','CART','ORT','OCT','ORT-H','OCT-H','ORT+ORT-H','OCT+OCT-H']
    algorithms = ['LN','SVM','NB','KNN']
    pipelines = [LN_pipeline,SVM_pipeline,NB_pipeline,KNN_pipeline]

    df = fetch_data(classification_dataset)
    print('Numer of NANs: ',df.isna().sum().sum())
    y, X = corr_data_for(df)
    del df

    # corr_data_for signals "dataset rejected" with an empty result; the
    # isinstance guard also covers a non-instance sentinel being returned.
    if not isinstance(X, pd.DataFrame) or X.empty:
        return {}

    print(color.BOLD + '\n\n    ----------------------------------------- {} -----------------------------------------'.format(classification_dataset) + color.END)
    for it in range(iters):
        X_train, X_test, y_train, y_test = split_function(y,X,it)
        col_len = len(X_train.columns)

        models, performance = generate_tree(
            X_train, y_train, X_test, y_test,
            n_num=1, feat_size=len(X.columns), max_iter_hy=2,
            sub_paths=sub_paths, depth_grid=depth_grid, depth_grid_hy=range(1,3),
            complexity_bi=0.001, complexity_hy=0.001,
            Reg_CART=False, ORT=False, ORT_H=False,
            Clas_CART=True, OCT=True, OCT_H=False,
        )

        # Tree baselines: store the mean of the reported scores, skipping
        # model types that produced nothing.
        for perf, name in zip(performance, names):
            if perf:
                res_rul[(classification_dataset,name,it,1)] = sum(perf) / len(perf)

        # Keep only model types that returned a non-empty, fully populated
        # collection of rules.
        act_name = []
        act_rules = []
        for model, name in zip(models, names):
            if all(model) and bool(model) and None not in model:
                act_name.append(name)
                act_rules.append(model)

        datasets = gen_train_and_test_features(act_rules ,act_name , X_train, X_test)

        for model in datasets.keys():
            X_train_rules_and_features, X_test_rules_and_features = datasets[model][0]
            X_train_only_rules, X_test_only_rules = datasets[model][1]

            # Unrestricted run: every rule (and feature) column is used.
            for algorithm, pipeline in zip(algorithms, pipelines):
                res_rul[(classification_dataset,model + f'_{algorithm}_rules',it,'all')] = pipeline(X_train_only_rules, X_test_only_rules, y_train, y_test)
                res_rul[(classification_dataset,model + f'_{algorithm}_rules_and_features',it,'all')] = pipeline(X_train_rules_and_features, X_test_rules_and_features, y_train, y_test)

            n_combined = len(X_train_rules_and_features.columns)
            n_rules = len(X_train_only_rules.columns)

            for fact in factors:
                min_feat_rule = round(col_len*fact)
                # Skip factors that would need more columns than exist or
                # more (scaled) columns than training rows.
                if round(n_combined*fact) > X_train.shape[0] or min_feat_rule > n_combined:
                    continue

                if min_feat_rule > n_rules and fact != 0.5:
                    # Not enough rules for this factor: use all of them and
                    # file the result under factor 1.
                    # NOTE(review): this overwrites the factor-1 rules-only
                    # entry written for the same model/algorithm -- confirm
                    # that is intended.
                    len_rule = 1
                    min_rule = n_rules
                else:
                    len_rule = fact
                    min_rule = min(min_feat_rule, n_rules)

                cols = SelectKBest(k=min_feat_rule).fit(X_train_rules_and_features,y_train).get_feature_names_out()
                cols_rule = SelectKBest(k=min_rule).fit(X_train_only_rules,y_train).get_feature_names_out()

                for algorithm, pipeline in zip(algorithms, pipelines):
                    res_rul[(classification_dataset,model + f'_{algorithm}_rules',it,len_rule)] = pipeline(X_train_only_rules[cols_rule], X_test_only_rules[cols_rule], y_train, y_test)
                    res_rul[(classification_dataset,model + f'_{algorithm}_rules_and_features',it,fact)] = pipeline(X_train_rules_and_features[cols], X_test_rules_and_features[cols], y_train, y_test)

        # Baseline: the same pipelines on the scaled original features.
        for algorithm, pipeline in zip(algorithms, pipelines):
            res_rul[(classification_dataset,algorithm,it,1)] = pipeline(X_train, X_test, y_train, y_test)
        del X_train, X_test
    return res_rul

In [7]:
# for data in classification_dataset_names:
#     data = fetch_data(data)
#     print(data.shape)

In [8]:
# orig = {}
# for classification_dataset in classification_dataset_names:
#     res_rul = experimentation(classification_dataset,1)
#     orig.update(res_rul)

In [9]:
from joblib import delayed
from tqdm import tqdm
# Schedule one `experimentation` call per PMLB classification dataset and
# collect the returned result dicts in `res_rul` (one entry per dataset).
# `ProgressParallel` is not defined in this notebook; it presumably comes from
# one of the star imports and wraps joblib.Parallel with a progress bar --
# TODO confirm. n_jobs=15 is hard-coded here.
res_rul = ProgressParallel(n_jobs=15)(delayed(experimentation)(data) for data in classification_dataset_names)

0it [00:00, ?it/s]

ValueError: at least one array or dtype is required

In [24]:
# Flatten the per-dataset result dictionaries into one mapping; when a key
# occurs more than once, the dictionary that comes later in `res_rul` wins.
result = dict()
for d in res_rul:
    for key, value in d.items():
        result[key] = value

In [44]:
# import os
# files = os.listdir('C:/Users/paulr/PycharmProjects/pythonProject/ORRFA-2/')

In [45]:
# list_of_dfs = []
# for file in files:
#     if file.endswith('pickle'):
#         with open(file, 'rb') as handle:
#             b = pickle.load(handle)
#             # df = pd.concat({k:json_normalize(v, 'scores', ['best']) for k,v in d.items()})
#             # df = df.reset_index(level=1, drop=True).rename_axis('names').reset_index()
# #             list_of_dfs.append(b)
# big_df = pd.concat(list_of_dfs, ignore_index=True)#ignore_index to reset index of big_df
# big_df.head()

In [25]:
# One-row frame of all scores: every key of `result` becomes a column
# (the 4-tuple keys presumably yield a 4-level column MultiIndex -- confirm).
k = pd.DataFrame(result,index=[0])

In [31]:
# Write the current contents of `k` to a CSV file in the working directory.
k.to_csv('results_gridsearch.csv')

In [26]:
# Rebuild the one-row results frame and move two column levels into the row
# index (level 2 each time, sorting after each move), then swap the two
# remaining column levels and drop the outermost row level (the constant 0).
k = (
    pd.DataFrame(result, index=[0])
    .stack(level=2)
    .sort_index()
    .stack(level=2)
    .sort_index()
    .swaplevel(axis=1)
    .droplevel(0)
)

In [27]:
# Display the results table.
k

Unnamed: 0_level_0,Unnamed: 1_level_0,CART,CART_KNN_rules,CART_KNN_rules_and_features,CART_LN_rules,CART_LN_rules_and_features,CART_NB_rules,CART_NB_rules_and_features,CART_SVM_rules,CART_SVM_rules_and_features,KNN,...,OCT_LN_rules,OCT_LN_rules_and_features,OCT_NB_rules,OCT_NB_rules_and_features,OCT_SVM_rules,OCT_SVM_rules_and_features,ORT,ORT-H,Reg-CART,SVM
Unnamed: 0_level_1,Unnamed: 1_level_1,GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1,GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1,GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1,GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1,GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1,GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1,GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1,GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1,GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1,GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1,...,yeast,yeast,yeast,yeast,yeast,yeast,yeast,yeast,yeast,yeast
0,0.5,,0.528125,0.503125,0.521875,0.48125,0.496875,0.496875,0.521875,0.50625,,...,0.503378,0.445946,0.162162,0.418919,0.503378,0.422297,,,,
0,1,0.521875,0.528125,0.484375,0.521875,0.490625,0.496875,0.496875,0.521875,0.503125,0.5,...,0.574324,0.523649,0.199324,0.216216,0.574324,0.540541,,,,0.577703
0,1.2,,,0.50625,,0.496875,,0.496875,,0.5375,,...,0.560811,0.581081,0.199324,0.334459,0.574324,0.597973,,,,
0,1.4,,,,,,,,,,,...,0.574324,0.587838,0.199324,0.337838,0.574324,0.594595,,,,
0,1.6,,,,,,,,,,,...,,0.577703,,0.266892,,0.60473,,,,
0,1.8,,,,,,,,,,,...,,0.587838,,0.219595,,0.594595,,,,
0,2,,,,,,,,,,,...,,0.581081,,0.206081,,0.601351,,,,
0,2.5,,,,,,,,,,,...,,,,,,,,,,
0,3,,,,,,,,,,,...,,,,,,,,,,
0,all,,0.528125,0.5,0.521875,0.496875,0.496875,0.496875,0.521875,0.553125,,...,0.574324,0.587838,0.199324,0.216216,0.574324,0.60473,,,,


In [28]:
# k.to_csv('result_girdsearch.csv')

In [29]:
# Ranking of the methods when every generated column is used (factor 'all'):
# average over iterations per factor, then over datasets per method.
# groupby(level=...) replaces DataFrame.mean(level=...), which pandas
# deprecated and later removed. The row is selected by its label instead of
# position 9, so the result does not depend on how many factors are present
# or on their order.
(
    k.swaplevel(axis=0)
    .groupby(level=0).mean()          # mean over iterations, per factor
    .T.groupby(level=0).mean().T      # mean over datasets, per method
    .loc['all']
    .sort_values(ascending=False)
)

  k.swaplevel(axis=0).mean(level=0).mean(level=0,axis=1).iloc[9].sort_values(ascending=False)
  k.swaplevel(axis=0).mean(level=0).mean(level=0,axis=1).iloc[9].sort_values(ascending=False)


OCT_LN_rules_and_features           0.814747
OCT+OCT-H_LN_rules_and_features     0.814747
OCT_SVM_rules_and_features          0.810290
OCT+OCT-H_SVM_rules_and_features    0.810290
CART_SVM_rules_and_features         0.809512
CART_LN_rules_and_features          0.806706
OCT_LN_rules                        0.788219
OCT+OCT-H_LN_rules                  0.788219
OCT_SVM_rules                       0.788065
OCT+OCT-H_SVM_rules                 0.788065
CART_KNN_rules_and_features         0.784306
OCT_KNN_rules_and_features          0.783579
OCT+OCT-H_KNN_rules_and_features    0.783579
OCT+OCT-H_KNN_rules                 0.771865
OCT_KNN_rules                       0.771865
CART_LN_rules                       0.770939
CART_SVM_rules                      0.770553
CART_KNN_rules                      0.758693
OCT+OCT-H_NB_rules_and_features     0.706582
OCT_NB_rules_and_features           0.706582
CART_NB_rules_and_features          0.696269
OCT_NB_rules                        0.689191
OCT+OCT-H_

In [30]:
# Ranking of the methods at factor 1 (as many selected columns as original
# features; the tree and plain-pipeline baselines are stored under this
# factor as well): average over iterations, then over datasets.
# groupby(level=...) replaces DataFrame.mean(level=...), which pandas
# deprecated and later removed; the row is selected by label instead of
# position 1.
(
    k.swaplevel(axis=0)
    .groupby(level=0).mean()          # mean over iterations, per factor
    .T.groupby(level=0).mean().T      # mean over datasets, per method
    .loc[1]
    .sort_values(ascending=False)
)

  k.swaplevel(axis=0).mean(level=0).mean(level=0,axis=1).iloc[1].sort_values(ascending=False)
  k.swaplevel(axis=0).mean(level=0).mean(level=0,axis=1).iloc[1].sort_values(ascending=False)


OCT_LN_rules_and_features           0.807594
OCT+OCT-H_LN_rules_and_features     0.807594
OCT_SVM_rules_and_features          0.806406
OCT+OCT-H_SVM_rules_and_features    0.806406
CART_SVM_rules_and_features         0.802285
CART_LN_rules_and_features          0.797856
SVM                                 0.796436
OCT_LN_rules                        0.791212
OCT+OCT-H_LN_rules                  0.791212
OCT+OCT-H_SVM_rules                 0.791018
OCT_SVM_rules                       0.791018
OCT+OCT-H_KNN_rules_and_features    0.787205
OCT_KNN_rules_and_features          0.787205
OCT                                 0.786857
CART_KNN_rules_and_features         0.783092
LN                                  0.780143
OCT_KNN_rules                       0.774815
OCT+OCT-H_KNN_rules                 0.774815
CART_LN_rules                       0.769131
CART_SVM_rules                      0.768811
CART                                0.767891
KNN                                 0.762949
CART_KNN_r

In [33]:
# Overall mean score per method: average across the columns that share the
# same outer column label, then across all rows, and rank.
# groupby on the transposed frame replaces mean(level=0, axis=1), which
# pandas deprecated and later removed.
t = k.T.groupby(level=0).mean().T
t = t.mean(axis=0)
t.sort_values(ascending = False)

  t=k.mean(level=0,axis=1)


Logistic_Regression                 0.790850
OCT                                 0.777091
OCT_LG_rules_and_features           0.770963
OCT+OCT-H_LG_rules_and_features     0.770963
CART_LG_rules_and_features          0.769029
Support Vector Machine              0.768332
OCT_SVM_rules_and_features          0.756744
OCT+OCT-H_SVM_rules_and_features    0.756744
CART_SVM_rules_and_features         0.754992
CART                                0.748986
K-Nearest-Neighbor                  0.740759
OCT_KNN_rules_and_features          0.731947
OCT+OCT-H_KNN_rules_and_features    0.731947
CART_KNN_rules_and_features         0.730521
OCT_LG_rules                        0.715176
OCT+OCT-H_LG_rules                  0.715176
OCT+OCT-H_NB_rules_and_features     0.708839
OCT_NB_rules_and_features           0.708839
OCT+OCT-H_SVM_rules                 0.707052
OCT_SVM_rules                       0.707052
Naive Bayes                         0.706945
CART_NB_rules_and_features          0.705503
OCT_KNN_ru

In [None]:
# Keep the outer column groups (after swapping the two column levels) whose
# scores vary little: per row, take the variance across the columns of each
# group, average over rows, and keep groups below the 0.01 threshold.
# groupby on the transposed frame replaces var(level=0, axis=1), which pandas
# deprecated and later removed (both use the sample variance, ddof=1).
# NOTE(review): `y` is also the name of the target vector in other cells.
y = k.swaplevel(axis=1)
y = y.T.groupby(level=0).var().T
y = y.mean(axis=0)
good_tests = y[y < 0.01].index
good = list(good_tests)

In [None]:
# Restrict `k` to the columns whose second-level label is listed in `good`,
# then average per outer column label and finally over all rows.
# groupby on the transposed frame replaces mean(level=0, axis=1), which
# pandas deprecated and later removed.
vaild_results = k.iloc[:, k.columns.isin(good, level=1)]
vaild_results = vaild_results.T.groupby(level=0).mean().T
vaild_results.mean(axis=0)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 5 x 4 grid of box plots, one panel for each of the first 20 dataset names.
fig, ax = plt.subplots(nrows = 5, ncols = 4, gridspec_kw = {"hspace": 0.25})
fig.set_size_inches(30, 25)
iteration = 0

for m in range(5):
    for j in range(4):

        dataset = classification_dataset_names[:20][iteration]

        # Columns of `k` are tuples; keep those that contain this dataset name.
        columns = [i for i in k.columns if dataset in i]
        # Pass the frame as `data=` explicitly: the meaning of the first
        # positional argument of sns.boxplot differs between seaborn versions.
        sns.boxplot(data = k[columns], ax = ax[m, j])

        ax[m, j].set_title(dataset)

        # NOTE(review): five fixed labels are applied regardless of how many
        # columns were plotted -- confirm they still match the columns of `k`.
        ax[m, j].set_xticklabels(['CART Rules', "OCT Rules", "Logistic Regression", "RuleFit", "ORRFA"])

        iteration += 1



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Single box plot with one box per column of `performance_by_iter`, saved as a
# PNG in the working directory.
# NOTE(review): `performance_by_iter` is not defined anywhere in the visible
# notebook cells, so this cell relies on state created elsewhere -- confirm.
fig, ax = plt.subplots()
sns.boxplot(data = performance_by_iter)
fig.set_size_inches(20, 10)
# Label the boxes with the frame's column names.
ax.set_xticklabels(performance_by_iter.columns.values)
# ax.set_ylim(0.93, 0.995)
ax.tick_params(rotation = 0, labelsize = 14)
ax.set_ylabel("Accuracy", fontsize = 14)
ax.set_title("Accuracy of Logistic Regression, RuleFit and ORRFA", fontsize = 15)
# ax.set_ylabel()
plt.savefig('Benchmark ORRFA.png')

In [74]:
from pmlb import fetch_data, classification_dataset_names

In [10]:
# Sequential variant of the benchmark: one split (iters = 1), deeper CART/OCT
# depth grid and sub_paths disabled.
# `res_rul` is reset for every dataset, so on its own it only ever holds the
# results of the dataset processed last; `res_rul_all` therefore accumulates
# the results of all datasets.
res_rul_all = {}
names = ['Reg-CART','CART','ORT','OCT','ORT-H','OCT-H','ORT+ORT-H','OCT+OCT-H']
algorithms = ['LN','SVM','NB','KNN']
pipelines = [LN_pipeline,SVM_pipeline,NB_pipeline,KNN_pipeline]
iters = 1
factors = [0.5,1,1.2,1.4,1.6,1.8,2,2.5,3]

for classification_dataset in classification_dataset_names:
    res_rul = {}
    df = fetch_data(classification_dataset)
    print('Numer of NANs: ',df.isna().sum().sum())
    y, X = corr_data_for(df)

    del df
    # corr_data_for signals "dataset rejected" with an empty result; the
    # isinstance guard also covers a non-instance sentinel being returned.
    if not isinstance(X, pd.DataFrame) or X.empty:
        continue
    print(color.BOLD + '\n\n    ----------------------------------------- {} -----------------------------------------'.format(classification_dataset) + color.END)
    for it in range(iters):
        X_train, X_test, y_train, y_test = split_function(y,X,it)
        col_len = len(X_train.columns)

        models, performance = generate_tree(
            X_train, y_train, X_test, y_test,
            n_num=1, feat_size=len(X.columns), max_iter_hy=2,
            sub_paths=False, depth_grid=range(1,6), depth_grid_hy=range(1,3),
            complexity_bi=0.001, complexity_hy=0.001,
            Reg_CART=False, ORT=False, ORT_H=False,
            Clas_CART=True, OCT=True, OCT_H=False,
        )

        # Tree baselines: mean of the reported scores per model type.
        for perf, name in zip(performance, names):
            if perf:
                res_rul[(classification_dataset,name,it,1)] = sum(perf) / len(perf)

        # Keep only model types that returned a non-empty, fully populated
        # collection of rules.
        act_name = []
        act_rules = []
        for model, name in zip(models, names):
            if all(model) and bool(model) and None not in model:
                act_name.append(name)
                act_rules.append(model)

        datasets = gen_train_and_test_features(act_rules ,act_name , X_train, X_test)

        for model in datasets.keys():
            X_train_rules_and_features, X_test_rules_and_features = datasets[model][0]
            X_train_only_rules, X_test_only_rules = datasets[model][1]

            # Unrestricted run: every rule (and feature) column is used.
            for algorithm, pipeline in zip(algorithms, pipelines):
                res_rul[(classification_dataset,model + f'_{algorithm}_rules',it,'all')] = pipeline(X_train_only_rules, X_test_only_rules, y_train, y_test)
                res_rul[(classification_dataset,model + f'_{algorithm}_rules_and_features',it,'all')] = pipeline(X_train_rules_and_features, X_test_rules_and_features, y_train, y_test)

            n_combined = len(X_train_rules_and_features.columns)
            n_rules = len(X_train_only_rules.columns)

            for fact in factors:
                min_feat_rule = round(col_len*fact)
                # Skip factors that would need more columns than exist or
                # more (scaled) columns than training rows.
                if round(n_combined*fact) > X_train.shape[0] or min_feat_rule > n_combined:
                    continue

                if min_feat_rule > n_rules and fact != 0.5:
                    # Not enough rules for this factor: use all of them and
                    # file the result under factor 1.
                    len_rule = 1
                    min_rule = n_rules
                else:
                    len_rule = fact
                    min_rule = min(min_feat_rule, n_rules)

                cols = SelectKBest(k=min_feat_rule).fit(X_train_rules_and_features,y_train).get_feature_names_out()
                cols_rule = SelectKBest(k=min_rule).fit(X_train_only_rules,y_train).get_feature_names_out()

                for algorithm, pipeline in zip(algorithms, pipelines):
                    res_rul[(classification_dataset,model + f'_{algorithm}_rules',it,len_rule)] = pipeline(X_train_only_rules[cols_rule], X_test_only_rules[cols_rule], y_train, y_test)
                    res_rul[(classification_dataset,model + f'_{algorithm}_rules_and_features',it,fact)] = pipeline(X_train_rules_and_features[cols], X_test_rules_and_features[cols], y_train, y_test)

        # Baseline: the same pipelines on the scaled original features.
        for algorithm, pipeline in zip(algorithms, pipelines):
            res_rul[(classification_dataset,algorithm,it,1)] = pipeline(X_train, X_test, y_train, y_test)
        del X_train, X_test

    # Preserve this dataset's results before `res_rul` is reset.
    res_rul_all.update(res_rul)

Numer of NANs:  0
Numer of NANs:  0
Dataset Information
Rows: 1600
Columns: 60
Number of classes: 2
Continous columns: 0
Binary columns: 0
Categorical columns: 20
-------------------------------------------------
[1m

    ----------------------------------------- GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1 -----------------------------------------[0m
Classification CART mean performance:  0.51875


Classification OCT performance:  0.64375


Numer of NANs:  0
Dataset Information
Rows: 1600
Columns: 56
Number of classes: 2
Continous columns: 0
Binary columns: 2
Categorical columns: 18
-------------------------------------------------
[1m

    ----------------------------------------- GAMETES_Epistasis_2_Way_20atts_0.4H_EDM_1_1 -----------------------------------------[0m
Classification CART mean performance:  0.50625


Classification OCT performance:  0.8125


Numer of NANs:  0
Dataset Information
Rows: 1600
Columns: 60
Number of classes: 2
Continous columns: 0
Binary columns: 0
Cate

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Numer of NANs:  0
Dataset Information
Rows: 72
Columns: 3
Number of classes: 6
Continous columns: 3
Binary columns: 0
Categorical columns: 0
-------------------------------------------------
[1m

    ----------------------------------------- confidence -----------------------------------------[0m
Classification CART mean performance:  0.6666666666666667


Classification OCT performance:  0.8




  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Numer of NANs:  0
Numer of NANs:  0
Dataset Information
Rows: 1473
Columns: 21
Number of classes: 3
Continous columns: 2
Binary columns: 3
Categorical columns: 4
-------------------------------------------------
[1m

    ----------------------------------------- contraceptive -----------------------------------------[0m
Classification CART mean performance:  0.5694915254237288


Classification OCT performance:  0.5457627118644068


Numer of NANs:  0
Dataset Information
Rows: 160
Columns: 6
Number of classes: 2
Continous columns: 0
Binary columns: 6
Categorical columns: 0
-------------------------------------------------
[1m

    ----------------------------------------- corral -----------------------------------------[0m
Classification CART mean performance:  1.0


Classification OCT performance:  1.0


Numer of NANs:  0
Dataset Information
Rows: 690
Columns: 34
Number of classes: 2
Continous columns: 7
Binary columns: 3
Categorical columns: 5
--------------------------------------

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Numer of NANs:  0
Dataset Information
Rows: 768
Columns: 8
Number of classes: 2
Continous columns: 8
Binary columns: 0
Categorical columns: 0
-------------------------------------------------
[1m

    ----------------------------------------- diabetes -----------------------------------------[0m
Classification CART mean performance:  0.7077922077922079


Classification OCT performance:  0.7207792207792207


Numer of NANs:  0
Dataset Information
Rows: 3772
Columns: 35
Number of classes: 2
Continous columns: 6
Binary columns: 19
Categorical columns: 4
-------------------------------------------------
[1m

    ----------------------------------------- dis -----------------------------------------[0m
Classification CART mean performance:  0.9920529801324504


Fitted OptimalTreeClassifier:
  1) Predict: 1 (98.48%), [46,2971], 3017 points, error 0.01525 is of depth 1 - cannot give out rules
Classification OCT performance:  0.9841059602649007


Numer of NANs:  0
Numer of NANs:  0
Dataset 

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Numer of NANs:  0
Dataset Information
Rows: 163
Columns: 9
Number of classes: 2
Continous columns: 9
Binary columns: 0
Categorical columns: 0
-------------------------------------------------
[1m

    ----------------------------------------- glass2 -----------------------------------------[0m
Classification CART mean performance:  0.8484848484848485


Classification OCT performance:  0.8181818181818181


Numer of NANs:  0
Dataset Information
Rows: 306
Columns: 3
Number of classes: 2
Continous columns: 3
Binary columns: 0
Categorical columns: 0
-------------------------------------------------
[1m

    ----------------------------------------- haberman -----------------------------------------[0m
Classification CART mean performance:  0.7419354838709677


Fitted OptimalTreeClassifier:
  1) Predict: 1 (73.36%), [179,65], 244 points, error 0.2664 is of depth 1 - cannot give out rules
Classification OCT performance:  0.7419354838709677


Numer of NANs:  0
Dataset Information
Rows: 160

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Numer of NANs:  0
Dataset Information
Rows: 500
Columns: 7
Number of classes: 2
Continous columns: 3
Binary columns: 1
Categorical columns: 1
-------------------------------------------------
[1m

    ----------------------------------------- irish -----------------------------------------[0m
Classification CART mean performance:  1.0


Classification OCT performance:  1.0




  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Numer of NANs:  0
Numer of NANs:  0
Dataset Information
Rows: 3196
Columns: 38
Number of classes: 2
Continous columns: 0
Binary columns: 35
Categorical columns: 1
-------------------------------------------------
[1m

    ----------------------------------------- kr_vs_kp -----------------------------------------[0m
Classification CART mean performance:  0.9515625


Classification OCT performance:  0.96875


Numer of NANs:  0
Numer of NANs:  0
Numer of NANs:  0
Dataset Information
Rows: 3200
Columns: 24
Number of classes: 10
Continous columns: 0
Binary columns: 24
Categorical columns: 0
-------------------------------------------------
[1m

    ----------------------------------------- led24 -----------------------------------------[0m
Classification CART mean performance:  0.684375


Classification OCT performance:  0.6953125


Numer of NANs:  0
Dataset Information
Rows: 3200
Columns: 7
Number of classes: 10
Continous columns: 0
Binary columns: 7
Categorical columns: 0
-----------

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Numer of NANs:  0
Numer of NANs:  0
Numer of NANs:  0
Dataset Information
Rows: 2000
Columns: 76
Number of classes: 10
Continous columns: 76
Binary columns: 0
Categorical columns: 0
-------------------------------------------------
[1m

    ----------------------------------------- mfeat_fourier -----------------------------------------[0m
Classification CART mean performance:  0.595


Classification OCT performance:  0.7424999999999999


Numer of NANs:  0
Dataset Information
Rows: 2000
Columns: 64
Number of classes: 10
Continous columns: 64
Binary columns: 0
Categorical columns: 0
-------------------------------------------------
[1m

    ----------------------------------------- mfeat_karhunen -----------------------------------------[0m
Classification CART mean performance:  0.6925


Classification OCT performance:  0.7475


Numer of NANs:  0
Dataset Information
Rows: 2000
Columns: 19
Number of classes: 10
Continous columns: 3
Binary columns: 0
Categorical columns: 3
-----------

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Numer of NANs:  0
Dataset Information
Rows: 5404
Columns: 5
Number of classes: 2
Continous columns: 5
Binary columns: 0
Categorical columns: 0
-------------------------------------------------
[1m

    ----------------------------------------- phoneme -----------------------------------------[0m
Classification CART mean performance:  0.8362627197039778


Classification OCT performance:  0.8344125809435707


Numer of NANs:  0
Dataset Information
Rows: 768
Columns: 8
Number of classes: 2
Continous columns: 8
Binary columns: 0
Categorical columns: 0
-------------------------------------------------
[1m

    ----------------------------------------- pima -----------------------------------------[0m
Classification CART mean performance:  0.7272727272727273


Classification OCT performance:  0.7532467532467533


Numer of NANs:  0
Numer of NANs:  0
Dataset Information
Rows: 88
Columns: 22
Number of classes: 2
Continous columns: 0
Binary columns: 2
Categorical columns: 6
------------------

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Numer of NANs:  0
Numer of NANs:  0
Numer of NANs:  0
Dataset Information
Rows: 315
Columns: 27
Number of classes: 5
Continous columns: 0
Binary columns: 7
Categorical columns: 5
-------------------------------------------------
[1m

    ----------------------------------------- solar_flare_1 -----------------------------------------[0m
Classification CART mean performance:  0.7936507936507937


Classification OCT performance:  0.7142857142857143




  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Numer of NANs:  0
Dataset Information
Rows: 1066
Columns: 38
Number of classes: 6
Continous columns: 0
Binary columns: 4
Categorical columns: 8
-------------------------------------------------
[1m

    ----------------------------------------- solar_flare_2 -----------------------------------------[0m
Classification CART mean performance:  0.7383177570093458


Classification OCT performance:  0.7476635514018692




  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Numer of NANs:  0
Dataset Information
Rows: 208
Columns: 60
Number of classes: 2
Continous columns: 60
Binary columns: 0
Categorical columns: 0
-------------------------------------------------
[1m

    ----------------------------------------- sonar -----------------------------------------[0m
Classification CART mean performance:  0.7380952380952381


Classification OCT performance:  0.6666666666666667


Numer of NANs:  0
Dataset Information
Rows: 675
Columns: 132
Number of classes: 18
Continous columns: 0
Binary columns: 1
Categorical columns: 34
-------------------------------------------------
[1m

    ----------------------------------------- soybean -----------------------------------------[0m
Classification CART mean performance:  0.6370370370370371


Classification OCT performance:  0.8148148148148149




  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Numer of NANs:  0
Dataset Information
Rows: 4601
Columns: 57
Number of classes: 2
Continous columns: 57
Binary columns: 0
Categorical columns: 0
-------------------------------------------------
[1m

    ----------------------------------------- spambase -----------------------------------------[0m
Classification CART mean performance:  0.9163952225841476


Classification OCT performance:  0.9229098805646037


Numer of NANs:  0
Dataset Information
Rows: 267
Columns: 22
Number of classes: 2
Continous columns: 0
Binary columns: 22
Categorical columns: 0
-------------------------------------------------
[1m

    ----------------------------------------- spect -----------------------------------------[0m
Classification CART mean performance:  0.8703703703703703


Classification OCT performance:  0.8518518518518519


Numer of NANs:  0
Dataset Information
Rows: 349
Columns: 44
Number of classes: 2
Continous columns: 44
Binary columns: 0
Categorical columns: 0
----------------------------

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Numer of NANs:  0
Dataset Information
Rows: 512
Columns: 9
Number of classes: 2
Continous columns: 0
Binary columns: 9
Categorical columns: 0
-------------------------------------------------
[1m

    ----------------------------------------- threeOf9 -----------------------------------------[0m
Classification CART mean performance:  0.9611650485436893


Classification OCT performance:  0.9029126213592233


Numer of NANs:  0
Dataset Information
Rows: 958
Columns: 27
Number of classes: 2
Continous columns: 0
Binary columns: 0
Categorical columns: 9
-------------------------------------------------
[1m

    ----------------------------------------- tic_tac_toe -----------------------------------------[0m
Classification CART mean performance:  0.9010416666666666


Classification OCT performance:  0.875


Numer of NANs:  0
Dataset Information
Rows: 959
Columns: 44
Number of classes: 2
Continous columns: 42
Binary columns: 0
Categorical columns: 2
---------------------------------------

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Numer of NANs:  0
Dataset Information
Rows: 973
Columns: 9
Number of classes: 2
Continous columns: 0
Binary columns: 9
Categorical columns: 0
-------------------------------------------------
[1m

    ----------------------------------------- xd6 -----------------------------------------[0m
Classification CART mean performance:  0.9076923076923077


Classification OCT performance:  0.8769230769230769


Numer of NANs:  0
Dataset Information
Rows: 1479
Columns: 8
Number of classes: 9
Continous columns: 8
Binary columns: 0
Categorical columns: 0
-------------------------------------------------
[1m

    ----------------------------------------- yeast -----------------------------------------[0m
Classification CART mean performance:  0.5810810810810811


Classification OCT performance:  0.5777027027027026




In [11]:
# Inspect the collected results. The output shown below is a dict keyed by
# 4-tuples whose first two entries are a dataset name and a model label.
# NOTE(review): the meaning of the last two tuple entries is not visible here — confirm.
res_rul

{('yeast', 'Reg-CART', 0, 1): nan,
 ('yeast', 'CART', 0, 1): 0.5810810810810811,
 ('yeast', 'ORT', 0, 1): nan,
 ('yeast', 'OCT', 0, 1): 0.5777027027027026,
 ('yeast', 'ORT-H', 0, 1): nan,
 ('yeast', 'OCT-H', 0, 1): nan,
 ('yeast', 'CART_LN_rules', 0, 'all'): 0.581081081081081,
 ('yeast', 'CART_LN_rules_and_features', 0, 'all'): 0.6114864864864865,
 ('yeast', 'CART_SVM_rules', 0, 'all'): 0.581081081081081,
 ('yeast', 'CART_SVM_rules_and_features', 0, 'all'): 0.6081081081081081,
 ('yeast', 'CART_NB_rules', 0, 'all'): 0.2668918918918919,
 ('yeast', 'CART_NB_rules_and_features', 0, 'all'): 0.27702702702702703,
 ('yeast', 'CART_KNN_rules', 0, 'all'): 0.5540540540540541,
 ('yeast', 'CART_KNN_rules_and_features', 0, 'all'): 0.5743243243243243,
 ('yeast', 'CART_LN_rules', 0, 0.5): 0.46621621621621623,
 ('yeast', 'CART_LN_rules_and_features', 0, 0.5): 0.4155405405405405,
 ('yeast', 'CART_SVM_rules', 0, 0.5): 0.46621621621621623,
 ('yeast', 'CART_SVM_rules_and_features', 0, 0.5): 0.4121621621621

In [81]:
# Display the currently loaded frame (the rendered output is truncated by pandas).
# NOTE(review): `df` is not assigned in any cell visible in this part of the
# notebook — confirm it is still defined after Restart Kernel -> Run All.
df

Unnamed: 0,N0,N1,N2,N3,N4,N5,N6,N7,N8,N9,...,N11,N12,N13,N14,N15,N16,N17,P1,P2,target
0,0,0,2,1,1,0,0,2,0,1,...,2,1,0,0,0,0,0,0,1,1
1,0,0,1,0,0,1,1,0,0,1,...,0,1,0,0,0,0,0,0,0,1
2,0,0,0,1,0,2,0,0,0,0,...,1,1,0,0,2,2,0,0,0,1
3,0,1,0,2,0,1,0,2,0,0,...,1,1,0,1,1,0,1,1,0,1
4,0,0,1,1,0,0,0,1,1,0,...,1,0,0,1,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,0,1,1,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,1,1,0
1596,0,1,1,2,0,1,1,1,1,1,...,1,1,0,0,1,0,2,1,1,0
1597,0,0,1,2,1,1,0,0,0,0,...,0,0,0,0,2,0,1,0,0,0
1598,0,1,0,1,0,0,0,1,0,1,...,0,1,0,0,0,1,0,1,1,0


In [86]:
# Prepare the loaded frame `df` for modelling: detect the type of every feature
# column, one-hot encode the categorical ones, and split the result into the
# target `y` and the feature matrix `X`.
#
# This cell was pasted from a function body. The original had an `else:` nested
# inside the `if` suite and `return` statements at cell level, which is a
# SyntaxError in a notebook cell. The returns are expressed here as assignments
# to `y` and `X`.
TARGET_NAME = 'target'

# Size limits a dataset must satisfy to be kept after one-hot encoding.
MAX_ADDED_COL_RATIO = 0.3  # dummy columns added, as a fraction of the row count
MAX_ROWS = 10000
MAX_COLUMNS = 100

feat_names = [name for name in df.columns if name != TARGET_NAME]
types = [get_feature_type(df[name], include_binary=True) for name in feat_names]

# One row per feature name, indexed by its detected type, so that features can
# be selected by type through the index.
col = pd.DataFrame(feat_names, types)
num_col = col[col.index == 'continuous']
bin_col = col[col.index == 'binary']
cat_col = col[col.index == 'categorical']
cat_col = cat_col[0].tolist()

dummy_col = pd.get_dummies(data=df, columns=cat_col)
add_col = dummy_col.shape[1] - df.shape[1]

n_rows, n_cols = df.shape
keep_dataset = (
    add_col < n_rows * MAX_ADDED_COL_RATIO
    and dummy_col.shape[1] < n_rows
    and n_rows < MAX_ROWS
    and n_cols < MAX_COLUMNS
)

if keep_dataset:
    df = dummy_col
    # regex=False replaces literal dots only. As a regular expression '.'
    # matches every character; pandas 2.x applies it as such, which would turn
    # every column name (including the target) into underscores.
    df.columns = df.columns.str.replace('.', '_', regex=False)
    y = df[TARGET_NAME]
    X = df.loc[:, df.columns != TARGET_NAME]
    rows_data, columns_data = X.shape
    print('Dataset Information')
    print('Rows:',rows_data,)
    print('Columns:',columns_data)
    print('Number of classes:',y.nunique())
    print('Continous columns:', len(num_col))
    print('Binary columns:', len(bin_col))
    print('Categorical columns:',len(cat_col))
    print('-------------------------------------------------')
else:
    # Dataset rejected by the size limits: empty placeholders stand in for the
    # target and the features. The original yielded the `pd.DataFrame` class
    # itself; instances are used so `.empty` and `.shape` behave as expected.
    y, X = pd.DataFrame(), pd.DataFrame()

# Drop the notebook-level reference to the frame, as the original did on both
# branches. Re-running this cell therefore requires reloading `df` first.
del df

Dataset Information
Rows: 1600
Columns: 56
Number of classes: 2
Continous columns: 0
Binary columns: 2
Categorical columns: 18
-------------------------------------------------
