In [2]:
import warnings
# Silence every warning for the rest of the session. This also hides library
# deprecation and copy warnings, so narrow or remove the filter when debugging.
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
import sklearn as sk
import math

In [4]:
# Load the raw dataset; the file uses ';' as its field separator.
dist = pd.read_csv('../dist_data.csv', sep=';')

In [5]:
# Keep only the rows whose 'dist' text is not the empty placeholder "[[], []]".
dist_clean = dist.loc[dist['dist'] != "[[], []]"]

In [6]:
# Features: every column except the last one, as a NumPy array.
x = dist_clean.iloc[:, :-1].values
# Labels: the column at position 1.
# NOTE(review): this is the last column only if the CSV has exactly two columns;
# with more columns, x would still contain this label column — confirm the layout.
y = dist_clean.iloc[:, 1].values

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. The fixed random_state makes the split,
# and every distance/prediction derived from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state=42)

In [8]:
# Clean one row of the dist dataset: takes x[i] and returns a dict.

def clean_sample(sample):
    """Parse one serialized row into a ``{first-list item: second-list item}`` dict.

    ``sample[0]`` must be a string holding two bracketed, comma-separated lists
    one after the other, e.g. ``"[[443, 22], [0.8, 0.2]]"``.

    Returns:
        dict mapping each entry of the first list to the entry at the same
        position in the second list. Keys and values stay strings with spaces
        and square brackets removed; nothing is converted to a number here.
        Extra entries in the second list are ignored.

    Raises:
        IndexError: if the text has no ``", ["`` boundary between the lists, or
            if the second list is shorter than the first.
    """
    def _tokens(chunk):
        # Split on commas, then strip spaces and square brackets from each piece.
        cleaned = []
        for piece in chunk.split(','):
            for unwanted in (' ', '[', ']'):
                piece = piece.replace(unwanted, '')
            cleaned.append(piece)
        return cleaned

    # Mark the boundary between the two lists with ';' so it can be told apart
    # from the commas inside each list.
    halves = sample[0].replace(', [', '; [').split(';')
    first_half, second_half = halves[0], halves[1]

    keys = _tokens(first_half)
    probabilities = _tokens(second_half)

    return {keys[pos]: probabilities[pos] for pos in range(len(keys))}


In [9]:
def aggregate_distro(distro1, distro2):
    """Merge two ``{x: probability}`` dicts into ``{x: [P(x), Q(x)]}``.

    ``P`` comes from ``distro1`` and ``Q`` from ``distro2``. A key missing from
    one side (or mapped to ``None`` there) gets ``0.0`` for that side. Entries
    whose key or value is the empty string are skipped on the side where they
    occur. Probabilities are stored as given, without numeric conversion.

    Returns:
        dict such as ``{'443': [P443, Q443], '22': [P22, Q22], ...}``, with the
        keys of ``distro1`` first, followed by keys found only in ``distro2``.
    """
    merged = {}

    # Pass 1: every usable key of distro1, paired with its distro2 counterpart.
    for key, p_value in distro1.items():
        if key != "" and p_value != "":
            q_value = distro2.get(key)
            merged[key] = [p_value, 0.0 if q_value is None else q_value]

    # Pass 2: keys that distro1 does not provide appear with P = 0.0.
    for key, q_value in distro2.items():
        if key == "" or q_value == "":
            continue
        if distro1.get(key) is None:
            merged[key] = [0.0, q_value]

    return merged


In [10]:
def calculate_distance(func_distance, total_distro):
    """Apply ``func_distance`` to ``total_distro`` and return its result.

    Args:
        func_distance: callable taking one aggregated distribution.
        total_distro: aggregated distribution, e.g.
            ``{'443': [P443, Q443], '22': [P22, Q22]}``.
    """
    return func_distance(total_distro)

In [18]:
# Kullback-Leibler

#dict_distro : {'443': [P443, Q443], '22': [P22, Q22]}
def kl(dict_distro):
    """Kullback-Leibler based score between the two sides of an aggregated distribution.

    Args:
        dict_distro: ``{'443': [P443, Q443], '22': [P22, Q22]}``; each value is
            passed through ``float()``, so numeric strings are accepted.

    Two directed sums of ``a * log2(a / b)`` are computed, P against Q and Q
    against P. A term is left out whenever either of its two values is zero.

    Returns:
        * the smaller sum when both are positive;
        * the positive sum when the other one is negative;
        * ``9999`` when both sums are negative;
        * ``-1`` in every other case (for example when a sum is exactly zero,
          which is what identical distributions or an empty dict produce).
    """
    def _directed_sum(source_idx, target_idx):
        total = 0
        for pair in dict_distro.values():
            source = float(pair[source_idx])
            target = float(pair[target_idx])
            # A zero on either side contributes nothing to the sum.
            if source == 0.0 or target == 0.0:
                continue
            total += source * math.log2(source / target)
        return total

    sum1 = _directed_sum(0, 1)
    sum2 = _directed_sum(1, 0)

    first_positive, second_positive = sum1 > 0, sum2 > 0
    first_negative, second_negative = sum1 < 0, sum2 < 0

    if first_positive and second_positive:
        return min(sum1, sum2)
    if first_positive and second_negative:
        return sum1
    if first_negative and second_positive:
        return sum2
    if first_negative and second_negative:
        return 9999
    return -1

In [12]:
# Jensen–Shannon 

def js(dict_distro):
    """Jensen-Shannon style divergence between the two sides of an aggregated distribution.

    Args:
        dict_distro: ``{'443': [P443, Q443], '22': [P22, Q22]}``; each value is
            passed through ``float()``.

    For every key the midpoint ``M = (P + Q) / 2`` is formed. The result is the
    mean of ``kl_for_js`` applied to the (P, M) pairs and to the (Q, M) pairs.
    """
    pairs = [(float(entry[0]), float(entry[1])) for entry in dict_distro.values()]

    p_against_mid = [[p, (p + q) / 2] for p, q in pairs]
    q_against_mid = [[q, (p + q) / 2] for p, q in pairs]

    return (kl_for_js(p_against_mid) + kl_for_js(q_against_mid)) / 2

def kl_for_js(PQx_list):
    """Sum ``P * log2(P / Q)`` over a list of ``[P, Q]`` pairs.

    Both members of a pair are passed through ``float()``. Pairs whose ``P`` is
    zero contribute ``0.0`` and their ``Q`` is never read.

    Raises:
        ZeroDivisionError: if a pair has a non-zero ``P`` and a zero ``Q``.
    """
    total = 0
    for pair in PQx_list:
        p_value = float(pair[0])
        if p_value == 0.0:
            # Adding a float zero keeps the running total a float, just as a
            # real term would.
            total += 0.0
            continue
        total += p_value * math.log2(p_value / float(pair[1]))

    return total

In [16]:
# Me distance (Euclidean distance between the P and Q values)
from math import sqrt

def me(dict_distro):
    """Euclidean distance between the two sides of an aggregated distribution.

    Args:
        dict_distro: ``{'443': [P443, Q443], '22': [P22, Q22]}``. Each value is
            passed through ``float()``, so the string probabilities produced by
            ``clean_sample`` are accepted here just as they are by ``kl`` and
            ``js``. Without the conversion, subtracting two strings raises
            ``TypeError``.

    Returns:
        ``sqrt(sum((Q - P) ** 2))`` over all keys; ``0.0`` for an empty dict.

    Raises:
        ValueError: if a value cannot be converted to ``float``.
    """
    squared_total = 0.0
    for pair in dict_distro.values():
        difference = float(pair[1]) - float(pair[0])
        squared_total += difference ** 2

    return sqrt(squared_total)

In [14]:
# choose best label per threshold

def Predict(label_distance, threshold):
    """Pick one label per sample from its list of ``[label, distance]`` candidates.

    Args:
        label_distance: indexable by ``0 .. len(label_distance) - 1``; each entry
            is a list of ``[label, distance]`` rows for one sample.
        threshold: only candidates with a distance strictly below this value
            are considered.

    Returns:
        A list with one prediction per entry of ``label_distance``. The winner
        is the label with the most candidates under the threshold, and ties are
        broken by the smallest distance. ``0`` is returned for a sample with no
        candidate under the threshold.

    The loop is bounded by ``label_distance`` itself rather than by a notebook
    global, so the function works on a fresh kernel and with any input length.
    """
    y_pred = []
    for i in range(len(label_distance)):
        scored = pd.DataFrame(label_distance[i], columns=['Label', 'Kl Result'])
        close = scored[scored['Kl Result'] < threshold]

        # Nothing under the threshold: emit the "no match" label before any
        # grouping is attempted on an empty frame.
        if close.empty:
            y_pred.append(0)
            continue

        # assign() builds a new frame instead of writing a column into a
        # filtered slice, so no chained-assignment warning is raised.
        ranked = (
            close
            .assign(Count=close.groupby('Label')['Label'].transform('size'))
            .sort_values(by=['Count', 'Kl Result'], ascending=[False, True])
        )
        y_pred.append(ranked['Label'].iloc[0])
    return y_pred

In [41]:
# Alias the split: the held-out rows are "distro1", the training rows "distro2".
distro1, distro1_label = x_test, y_test
distro2, distro2_label = x_train, y_train

# Hand-written toy distributions for checking the distance functions.
distro = [
    {'443': 0.8, '22': 0.1, '80': 0.1},
    {'443': 0.3, '8080': 0.3, '22': 0.3, '80': 0.1},
    {'443': 1.0},
    {'22': 0.9, '8080': 0.1},
    {'9090': 0.5, '21': 0.5},
]

# Aggregate the first two toy distributions and show the merged form.
total_distro = aggregate_distro(distro[0], distro[1])
print(total_distro)

# Run each metric on the same aggregated pair, in the order kl, js, me.
for caption, metric in (
    ("Kl distance is: ", kl),
    ("js distance is: ", js),
    ("me distance is: ", me),
):
    distance = calculate_distance(metric, total_distro)
    print(caption, distance)

{'443': [0.8, 0.3], '22': [0.1, 0.3], '80': [0.1, 0.1], '8080': [0.0, 0.3]}
Kl distance is:  0.05097750043269361
js distance is:  0.2728013599658333
me distance is:  0.6164414002968976


In [40]:
# Hand-written toy distributions for checking the distance functions.
distro = [
    {'443': 0.8, '22': 0.1, '80': 0.1},
    {'443': 0.3, '8080': 0.3, '22': 0.3, '80': 0.1},
    {'443': 1.0},
    {'22': 0.9, '8080': 0.1},
    {'9090': 0.5, '21': 0.5},
]

# Aggregate every ordered pair, including each distribution with itself.
# Iterating over `distro` keeps this correct if distributions are added or removed.
total_distro = [aggregate_distro(left, right) for left in distro for right in distro]

# "me" distance of every pair, stored next to the aggregated pair it belongs to.
# The loop variable is not called `iter`, so the builtin stays usable in later cells.
distances = [[calculate_distance(me, pair_distro), pair_distro] for pair_distro in total_distro]
max_distance = max([0] + [entry[0] for entry in distances])

# Scale into [0, 1] by the largest distance. If every distance is zero there is
# nothing to scale, so report 0.0 instead of dividing by zero.
normalized_distances = [
    [entry[0] / max_distance if max_distance else 0.0, entry[1]]
    for entry in distances
]

for normalized_distance, pair_distro in normalized_distances:
    print("distance is: ", round(normalized_distance, 2), "distro is: ", pair_distro)

distance is:  0.0 distro is:  {'443': [0.8, 0.8], '22': [0.1, 0.1], '80': [0.1, 0.1]}
distance is:  0.46 distro is:  {'443': [0.8, 0.3], '22': [0.1, 0.3], '80': [0.1, 0.1], '8080': [0.0, 0.3]}
distance is:  0.18 distro is:  {'443': [0.8, 1.0], '22': [0.1, 0.0], '80': [0.1, 0.0]}
distance is:  0.85 distro is:  {'443': [0.8, 0.0], '22': [0.1, 0.9], '80': [0.1, 0.0], '8080': [0.0, 0.1]}
distance is:  0.8 distro is:  {'443': [0.8, 0.0], '22': [0.1, 0.0], '80': [0.1, 0.0], '9090': [0.0, 0.5], '21': [0.0, 0.5]}
distance is:  0.46 distro is:  {'443': [0.3, 0.8], '8080': [0.3, 0.0], '22': [0.3, 0.1], '80': [0.1, 0.1]}
distance is:  0.0 distro is:  {'443': [0.3, 0.3], '8080': [0.3, 0.3], '22': [0.3, 0.3], '80': [0.1, 0.1]}
distance is:  0.61 distro is:  {'443': [0.3, 1.0], '8080': [0.3, 0.0], '22': [0.3, 0.0], '80': [0.1, 0.0]}
distance is:  0.52 distro is:  {'443': [0.3, 0.0], '8080': [0.3, 0.1], '22': [0.3, 0.9], '80': [0.1, 0.0]}
distance is:  0.65 distro is:  {'443': [0.3, 0.0], '8080': [0.