In [1]:
import warnings
# NOTE(review): with no category/module argument this suppresses every warning
# raised through the warnings module for the rest of the session, including
# diagnostics from pandas and scikit-learn. Consider a narrower filter.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import sklearn as sk
import math

In [3]:
# Load the raw table from the ';'-delimited file at the relative path
# '../dist_data.csv' (resolved against the kernel's working directory).
dist = pd.read_csv('../dist_data.csv', sep=';')

In [4]:
# Keep only the rows whose 'dist' cell is not the empty-distribution literal "[[], []]".
dist_clean = dist.loc[dist['dist'].ne("[[], []]")]

In [5]:
# x: every column except the last one, as an array with one row per sample.
# y: the column at position 1.
# NOTE(review): these two selections only describe disjoint "features / label"
# columns when the frame has exactly two columns; with more columns, position 1
# would also be part of x. Confirm against the CSV layout.
x = dist_clean.iloc[:, :-1].values
y = dist_clean.iloc[:, 1].values

In [6]:
from sklearn.model_selection import train_test_split

# Fixed seed: without it the 70/30 split is reshuffled on every run, so the
# distances, predictions and scores computed below would not be reproducible.
RANDOM_STATE = 42
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=RANDOM_STATE
)

In [8]:
# Helpers for cleaning the dist data: clean_sample takes one sample x[i] and returns it as a dict

def clean_sample(sample):
    """Parse one raw sample into a dict that maps value tokens to probability tokens.

    ``sample[0]`` must be a string shaped like ``"[[x1, x2], [p1, p2]]"``. The
    string is split into its two bracketed lists at ``", ["``; each list is
    split on commas and every token has spaces and square brackets removed.
    Keys and values stay strings (no numeric conversion happens here).

    Returns:
        dict: ``{x_token: p_token}`` paired by position. A repeated key keeps
        the probability of its last occurrence.

    Raises:
        IndexError: if the string has no second list, or the second list has
            fewer tokens than the first.
    """
    def _tokens(chunk):
        # Comma-separated items with spaces and square brackets stripped out.
        return [
            item.replace(' ', '').replace('[', '').replace(']', '')
            for item in chunk.split(',')
        ]

    halves = sample[0].replace(', [', '; [').split(';')
    raw_values, raw_probabilities = halves[0], halves[1]

    values = _tokens(raw_values)
    probabilities = _tokens(raw_probabilities)

    # Pair by index (not zip) so a too-short probability list still raises
    # IndexError and surplus probabilities are ignored.
    return {values[pos]: probabilities[pos] for pos in range(len(values))}

def preprocess_distros(distro1, distro2, labels=None):
    """Compute, for every sample in ``distro1``, its distance to every sample in ``distro2``.

    Args:
        distro1: iterable of raw samples (each accepted by ``clean_sample``).
        distro2: iterable of raw reference samples (same format).
        labels: sequence giving the label of each ``distro2`` sample by position.
            When omitted, the notebook-level ``distro2_label`` is used so that
            existing two-argument calls keep working.

    Returns:
        list: one entry per ``distro1`` sample; each entry is a list of
        ``[label, distance]`` pairs, one per ``distro2`` sample, in ``distro2``
        order. The distance is ``me(aggregate_distro(sample1, sample2))``.
    """
    if labels is None:
        # Backward-compatible fallback to the global the original relied on.
        labels = distro2_label

    # Parse every reference sample once instead of once per distro1 sample.
    reference_samples = [clean_sample(raw) for raw in distro2]

    label_distance = []
    for raw_query in distro1:
        query = clean_sample(raw_query)
        label_distance.append([
            [labels[pos], me(aggregate_distro(query, reference))]
            for pos, reference in enumerate(reference_samples)
        ])
    return label_distance

def aggregate_distro(distro1, distro2):
    """Align two distributions on the union of their keys.

    Args:
        distro1: dict mapping a key to its probability in the first distribution.
        distro2: dict mapping a key to its probability in the second distribution.

    Returns:
        dict: ``{key: [P, Q]}`` where ``P`` comes from ``distro1`` and ``Q`` from
        ``distro2``, e.g. ``{'443': [P443, Q443], '22': [P22, Q22]}``. A side
        that has no usable probability for a key contributes ``0.0``. Values
        are passed through unconverted.

    An empty-string key is ignored. An empty-string probability is treated as
    "not present" on whichever side it appears: previously this held only for
    the side being iterated, so an empty value on the other side either ended
    up in the result as ``''`` or caused the key to be dropped.
    """
    def _present(probability):
        # A probability counts only if the key exists and its value is not blank.
        return probability is not None and probability != ""

    merged = {}

    # Keys present in distro1; Q defaults to 0.0 when distro2 has no usable value.
    for key, p_value in distro1.items():
        if key == "" or p_value == "":
            continue
        q_value = distro2.get(key)
        merged[key] = [p_value, q_value if _present(q_value) else 0.0]

    # Keys that only distro2 can supply; those handled above are left untouched.
    for key, q_value in distro2.items():
        if key == "" or q_value == "":
            continue
        if not _present(distro1.get(key)):
            merged[key] = [0.0, q_value]

    return merged


In [16]:
# Me distance: Euclidean distance between the paired values of two distributions
from math import sqrt

def me(dict_distro):
    """Return the Euclidean distance between the paired values in ``dict_distro``.

    Args:
        dict_distro: mapping whose values are pairs, e.g.
            ``{'443': [P443, Q443], '22': [P22, Q22]}``. Both items of each
            pair are converted with ``float`` before use.

    Returns:
        float: square root of the summed squared differences (``0.0`` for an
        empty mapping).
    """
    squared_total = 0
    # Explicit accumulation keeps the left-to-right float addition order.
    for pair in dict_distro.values():
        first, second = float(pair[0]), float(pair[1])
        squared_total += (second - first) ** 2

    return sqrt(squared_total)

In [17]:
# choose best label per threshold

def Predict(label_distance, threshold, default_label=0):
    """Pick one label per sample from its ``[label, distance]`` candidates.

    For each sample, only candidates with distance strictly below ``threshold``
    are kept. Among those, the label that occurs most often wins; ties are
    broken by the smallest distance.

    Args:
        label_distance: list with one entry per sample, each entry a list of
            ``[label, distance]`` pairs.
        threshold: exclusive upper bound on the distance of usable candidates.
        default_label: value emitted when no candidate is below the threshold
            (``0``, as before, unless overridden).

    Returns:
        list: the chosen label for every entry of ``label_distance``, in order.
    """
    y_pred = []
    # Iterate over the argument itself; the previous version sized its loop
    # from the notebook global ``final_distro``.
    for candidates in label_distance:
        # 'Kl Result' is the historical column name; it holds the distance.
        table = pd.DataFrame(candidates, columns=['Label', 'Kl Result'])
        # .copy() so the column added below goes to an independent frame
        # rather than a slice of ``table``.
        within = table[table['Kl Result'] < threshold].copy()

        if within.empty:
            y_pred.append(default_label)
            continue

        # Number of in-threshold candidates sharing each row's label.
        within['Count'] = within.groupby('Label')['Label'].transform('size')
        ranked = within.sort_values(by=['Count', 'Kl Result'], ascending=[False, True])
        y_pred.append(ranked['Label'].values[0])
    return y_pred

In [18]:
# Test samples are measured against the training samples; the training labels
# are the candidate labels for prediction.
distro1, distro1_label = x_test, y_test
distro2, distro2_label = x_train, y_train

# distro2_label has to be bound before this call: preprocess_distros looks it
# up in the notebook namespace when no labels are handed to it.
final_distro = preprocess_distros(distro1, distro2)


In [None]:
# Largest distance found anywhere in final_distro (0 if there is none).
# Uses dedicated names: the previous version rebound the builtins `max` and
# `iter` in the notebook namespace, breaking any later call to them.
max_distance = 0
for label_distances in final_distro:
    for label_and_distance in label_distances:
        if label_and_distance[1] > max_distance:
            max_distance = label_and_distance[1]

print(max_distance)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

# Sweep the distance threshold from 1.5 downwards in steps of 0.05 (the stop
# value 0.5 is exclusive) and record, per threshold rounded to 2 decimals:
# [accuracy, weighted F1, micro F1, macro F1].
prediction = {}
for threshold in np.arange(1.5, 0.5, -0.05):
    y_pred = Predict(final_distro, threshold)
    prediction[round(threshold, 2)] = [
        accuracy_score(y_test, y_pred),
        f1_score(y_test, y_pred, average="weighted"),
        f1_score(y_test, y_pred, average="micro"),
        f1_score(y_test, y_pred, average="macro"),
    ]


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One row per threshold, one column per metric.
data = pd.DataFrame.from_dict(prediction, orient='index', columns=['Accuracy', 'F1_weighted', 'F1_micro', 'F1_macro'])

fig, axx = plt.subplots(2, 2, figsize=(20,15))

# One panel per metric, filled row by row: (0, 0), (0, 1), (1, 0), (1, 1).
for metric, ax in zip(data.columns, axx.flat):
    ax.set_title(metric)
    sns.lineplot(y=data[metric].values, x=data.index, ax=ax)
    # Report the best score and the first threshold (in index order) reaching it.
    best_score = data[metric].max()
    print(f"Max {metric} is: {best_score} and threshold is: {data[data[metric] == best_score].index[0]}")

plt.tight_layout()

In [40]:
# distro = []
# distro.append({'443' : 0.8, '22'  : 0.1, '80'  : 0.1})
# distro.append({'443' : 0.3, '8080' : 0.3, '22'  : 0.3, '80'  : 0.1})
# distro.append({'443' : 1.0})
# distro.append({'22' : 0.9, '8080' : 0.1})
# distro.append({'9090' : 0.5, '21' : 0.5})

# total_distro = []
# for i in range(5):
#     for j in range(5):
#         total_distro.append(aggregate_distro(distro[i], distro[j]))

# max_distance = 0
# distances = []
# for iter in total_distro:
#     distance = calculate_distance(me, iter)
#     distances.append([distance, iter])
#     if distance > max_distance:
#         max_distance = distance
    
# normalized_distances = []
# for distance_iter in distances:
#     distance = distance_iter[0]
#     iter = distance_iter[1]
#     normalized_distance = distance / max_distance
#     normalized_distances.append([normalized_distance, iter])

# for normal_iter in normalized_distances:
#     print("distance is: ", round(normal_iter[0],2), "distro is: ", normal_iter[1])

distance is:  0.0 distro is:  {'443': [0.8, 0.8], '22': [0.1, 0.1], '80': [0.1, 0.1]}
distance is:  0.46 distro is:  {'443': [0.8, 0.3], '22': [0.1, 0.3], '80': [0.1, 0.1], '8080': [0.0, 0.3]}
distance is:  0.18 distro is:  {'443': [0.8, 1.0], '22': [0.1, 0.0], '80': [0.1, 0.0]}
distance is:  0.85 distro is:  {'443': [0.8, 0.0], '22': [0.1, 0.9], '80': [0.1, 0.0], '8080': [0.0, 0.1]}
distance is:  0.8 distro is:  {'443': [0.8, 0.0], '22': [0.1, 0.0], '80': [0.1, 0.0], '9090': [0.0, 0.5], '21': [0.0, 0.5]}
distance is:  0.46 distro is:  {'443': [0.3, 0.8], '8080': [0.3, 0.0], '22': [0.3, 0.1], '80': [0.1, 0.1]}
distance is:  0.0 distro is:  {'443': [0.3, 0.3], '8080': [0.3, 0.3], '22': [0.3, 0.3], '80': [0.1, 0.1]}
distance is:  0.61 distro is:  {'443': [0.3, 1.0], '8080': [0.3, 0.0], '22': [0.3, 0.0], '80': [0.1, 0.0]}
distance is:  0.52 distro is:  {'443': [0.3, 0.0], '8080': [0.3, 0.1], '22': [0.3, 0.9], '80': [0.1, 0.0]}
distance is:  0.65 distro is:  {'443': [0.3, 0.0], '8080': [0.