In [1]:
import pandas as pd
import math
import numpy as np
import pickle as pkl
import sys
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from collections import Counter

# Notebook-wide configuration.
# Never truncate columns when a DataFrame is rendered.
pd.options.display.max_columns = None
# Make the parent directory importable for project-local packages.
sys.path.append('../')

# NOTE(review): K is not referenced in the cells visible here — presumably a
# neighbourhood size; confirm against the cells that use it.
K = 5

In [2]:
# Load the pickled collection of datasets from the working directory.
# NOTE: unpickling executes arbitrary code embedded in the file, so only
# load 'datasets.pkl' when it comes from a trusted source.
with open('datasets.pkl', mode='rb') as pickle_file:
    datasets = pkl.load(pickle_file)

# Preview the element at position 1 of the first entry (the feature table).
datasets[0][1]

Unnamed: 0,0,1,2,3,4,5,6,7
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550
...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960


In [3]:
# Reference S/B/R/O percentages, one row per dataset; the comparison cell
# subtracts these from the computed values.
# NOTE(review): the origin of these figures is not recorded here — confirm.
_reference_rows = {
    #                  S      B      R      O
    'breast_w':       [91.29, 7.88,  0.0,   0.83],
    'new_thyroid':    [68.57, 31.43, 0.0,   0.0],
    'nursery':        [82.0,  17.0,  1.0,   0.0],
    'car':            [47.83, 39.13, 8.7,   4.35],
    'credit_g':       [9.33,  63.67, 10.33, 16.67],
    'ecoli':          [28.57, 54.29, 2.86,  14.29],
    'hepatitis':      [15.63, 62.5,  6.25,  15.63],
    'haberman':       [4.94,  61.73, 18.52, 14.81],
    'cmc':            [17.72, 44.44, 18.32, 19.52],
    'glass':          [0.0,   35.29, 35.29, 29.41],
    'abalone':        [8.36,  20.6,  20.6,  50.45],
    'post_operative': [0.0,   41.67, 29.17, 29.17],
    'solar_flare':    [0.0,   48.84, 11.63, 39.53],
    'yeast':          [5.88,  47.06, 7.84,  39.22],
    'balance_scale':  [0.0,   0.0,   8.16,  91.84],
}
df_example = pd.DataFrame.from_dict(
    _reference_rows,
    orient='index',
    columns=['S', 'B', 'R', 'O'],
)
df_example

Unnamed: 0,S,B,R,O
breast_w,91.29,7.88,0.0,0.83
new_thyroid,68.57,31.43,0.0,0.0
nursery,82.0,17.0,1.0,0.0
car,47.83,39.13,8.7,4.35
credit_g,9.33,63.67,10.33,16.67
ecoli,28.57,54.29,2.86,14.29
hepatitis,15.63,62.5,6.25,15.63
haberman,4.94,61.73,18.52,14.81
cmc,17.72,44.44,18.32,19.52
glass,0.0,35.29,35.29,29.41


In [4]:
import sys
sys.path.append('../')
from taxonomy.distance import HVDM
from taxonomy.classification import Taxonomy

# For every dataset that is not skipped: encode the target, turn the
# non-numeric feature columns into integer codes, build the HVDM distance
# matrix, fit the Taxonomy on it and store its 'percentage' result as one row
# of df_classification. The last expression compares those rows with
# df_example.
df_classification = pd.DataFrame(columns=['S','B','R','O'])

# Datasets left out of this run.
# NOTE(review): the reason for excluding them is not recorded in this cell — confirm.
SKIPPED_DATASETS = ("nursery",'car','cmc','abalone','credit_g','solar_flare','hepatitis','post_operative')

# Columns of any other dtype are treated as nominal (categorical) features.
NUMERIC_DTYPES = ['int64','float64']

# A computed value counts as matching the reference when the absolute
# difference is below this threshold.
TOLERANCE = 1

for (d, df, target, name) in datasets:
    if name in SKIPPED_DATASETS:
        continue
    print("Computing... ", name)

    # Encode target
    y = LabelEncoder().fit_transform(target)

    # Categorical features to numeric: each distinct value gets an integer code
    X = df.copy()
    cat_columns = df.select_dtypes(exclude=NUMERIC_DTYPES).columns
    X[cat_columns] = df[cat_columns].apply(lambda x: pd.factorize(x)[0])
    X = X.to_numpy()

    # Per-column flag telling HVDM which features are nominal; it follows the
    # same dtype rule as cat_columns above so the two always agree.
    nominal = [t not in NUMERIC_DTYPES for t in df.dtypes]

    # compute HVDM dist matrix
    hvdm = HVDM()
    dist_matrix = hvdm.fit(X,y,nominal)

    t = Taxonomy()
    tax = t.fit(dist_matrix,y)
    df_classification.loc[name] = tax['percentage']

# Difference to the reference table; rows missing on either side become NaN
# after the subtraction and are dropped.
diff = (df_classification - df_example).dropna().round(2)

# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; prefer the new name and fall back on older pandas.
_elementwise = diff.map if hasattr(pd.DataFrame, "map") else diff.applymap
_elementwise(lambda x: f"{x:.2f}✅" if abs(x) < TOLERANCE else f"{x:.2f}❌")

Computing...  abalone
Computing...  balance_scale
Computing...  breast_w
Computing...  car
Computing...  cmc
Computing...  credit_g
Computing...  ecoli
Computing...  glass
Computing...  haberman
Computing...  hepatitis
Computing...  new_thyroid
Computing...  nursery
Computing...  post_operative
Computing...  solar_flare
Computing...  yeast


Unnamed: 0,S,B,R,O
balance_scale,0.00✅,0.00✅,0.00✅,-0.00✅
breast_w,-0.00✅,0.00✅,0.00✅,-0.00✅
ecoli,0.00✅,-0.00✅,-0.00✅,-0.00✅
glass,0.00✅,0.00✅,0.00✅,0.00✅
haberman,-0.00✅,-0.00✅,-0.00✅,0.00✅
new_thyroid,0.00✅,-0.00✅,0.00✅,0.00✅
yeast,0.00✅,-0.00✅,0.00✅,-0.00✅
