In [1]:
import pandas as pd
import numpy as np
from pybnesian import hc, CLGNetworkType, SemiparametricBNType
#from drawdata import draw_scatter
import matplotlib.pyplot as plt
import math

from pymoo.core.problem import ElementwiseProblem
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.optimize import minimize

from bayesace.algorithms.face import face_algorithm
from bayesace.utils import *

from sklearn.preprocessing import OneHotEncoder, StandardScaler
import time


In [2]:
# Load the toy data set; the label column "z" is re-exposed as a categorical
# column named "class" and the original "z" column is removed.
df = pd.read_csv("toy-3class.csv")
df = df.assign(**{"class": df["z"].astype("category")}).drop(columns="z")

# Standardise every column except the label.
feature_columns = [name for name in df.columns if name != "class"]
df[feature_columns] = StandardScaler().fit_transform(df[feature_columns].to_numpy())

# Structure search with hc (arc operators only, CLGNetworkType), then fit the
# parameters on the same frame.
learned = hc(
    df,
    bn_type=CLGNetworkType(),
    operators=["arcs"],
    score="validated-lik",
)
learned.num_arcs()  # result is discarded: this is not the cell's last expression
learned.fit(df)

In [6]:
# Second model: the same hc call, but with SemiparametricBNType and the extra
# "node_type" operator; the result is fitted on the same frame.
learned_kde = hc(df, bn_type = SemiparametricBNType(), operators = ["arcs", "node_type"], score = "validated-lik")
learned_kde.fit(df)


In [7]:
# Display the fitted conditional distribution of node "x" in the semiparametric model.
learned_kde.cpd("x")

[HCKDE] P(x | y, class)
+-------+-------------------------------------------+
|       |                   x | y                   |
+-------+-------------------------------------------+
| class |                                           |
+-------+-------------------------------------------+
|   a   | [CKDE] P(x | y) = CKDE with 201 instances |
|   b   | [CKDE] P(x | y) = CKDE with 145 instances |
|   c   | [CKDE] P(x | y) = CKDE with 74 instances  |
+-------+-------------------------------------------+

In [8]:
# Display the node names of the CLG model.
learned.nodes()

['x', 'y', 'class']

In [9]:
# Feature-only view of the data (label column removed).
df_nc = df.drop("class", axis = 1)
# Path between rows 0 and 202.
# NOTE(review): on a fresh top-to-bottom run no `straight_path` has been defined
# in this notebook yet, so the name must come from the `bayesace.utils` star
# import - TODO confirm that version accepts `chunks`.
# NOTE(review): this binds the name `path` to the returned object; a later cell
# defines a function that is also called `path`.
path = straight_path(df_nc.iloc[[0]], df_nc.iloc[[202]], chunks = 10)
#sep = euclidean_distance(path.iloc[0],path.iloc[1])
#np.sum(-np.log(likelihood(path,learned)) *sep)
#path

In [None]:
# Wall-clock start; the elapsed time is printed at the end of this cell.
t0 = time.time()
def f_tilde(x) :
    """Return the absolute value of the natural logarithm of ``x``.

    The logarithm is raised to a fixed exponent (currently 1) before its sign
    is flipped and the absolute value is taken. Works element-wise on arrays.
    """
    exponent = 1
    log_term = np.log(x) ** exponent
    return np.abs(-log_term)
# One-hot encode the "class" column; the dense array is handed to face_algorithm as `y_pred`.
y_pred = OneHotEncoder().fit_transform(df.loc[:,["class"]].values).toarray()
# FACE query for instance 0 with graph_type "kde"; note that `bn` is the CLG
# model (`learned`), not `learned_kde`.
res = face_algorithm(dataset = df.drop("class",axis = 1), y_pred = y_pred, instance = 0, y_instance = 0, graph_type= "kde", distance_threshold=200, bn=learned, chunks = 2, f_tilde = f_tilde)
# Elapsed seconds since t0, then the raw result.
print(time.time() - t0)
print(res)

In [None]:
# Same FACE query as the previous cell but with graph_type "integral".
# `f_tilde` is the function defined in the previous cell, so that cell must run first.
t0 = time.time()
y_pred = OneHotEncoder().fit_transform(df.loc[:,["class"]].values).toarray()
res=face_algorithm(dataset = df.drop("class",axis = 1), y_pred = y_pred, instance = 0, y_instance = 0, graph_type= "integral", distance_threshold=200, bn=learned, chunks = 2, f_tilde = f_tilde)
# Elapsed seconds since t0, then the raw result.
print(time.time() - t0)
print(res)

In [None]:
# Quick look at the (standardised) feature cloud, without class colouring.
plt.scatter(df["x"],df["y"])

In [None]:
# Display row 202, the row used as path end point in an earlier cell.
df.iloc[202]

In [None]:
# Scratch check of the return type of separate_dataset_and_class (not defined in
# this notebook; presumably provided by the bayesace.utils star import).
type(separate_dataset_and_class(df.iloc[0]))

In [None]:
# Draw 1000 synthetic rows from the CLG model and convert them to a pandas frame.
# NOTE(review): no seed is passed here, so the rows picked by position in later
# cells may differ between runs - confirm whether that matters.
synt = learned.sample(1000).to_pandas()

In [None]:
def euclidean_distance(x_cfx, x_og) :
    """Euclidean (L2) distance between the values of two pandas objects.

    The comparison is positional: it uses ``.values`` of both arguments and
    does not align labels, so both must list their attributes in the same
    order.
    """
    offsets = x_cfx.values - x_og.values
    squared_norm = (offsets ** 2).sum()
    return np.sqrt(squared_norm)

def delta_distance(x_cfx, x_og, eps = 0.1) :
    """Count the attributes of the first row that differ by more than ``eps``.

    Only row 0 of the element-wise absolute difference is inspected.
    """
    gaps = abs(x_cfx.values - x_og.values)
    first_row = gaps[0]
    return sum(gap > eps for gap in first_row)

def likelihood(x_cfx,bn) :
    """Likelihood of each row of ``x_cfx`` with the class summed out.

    For every value reported by ``bn.cpd("class").variable_values()`` a copy of
    ``x_cfx`` gets a constant "class" column with that value, and
    ``e ** bn.logl(copy)`` is accumulated. ``x_cfx`` itself is not modified.
    Returns 0 when the class has no values.
    """
    labels = bn.cpd("class").variable_values()
    n_rows = x_cfx.shape[0]
    augmented = x_cfx.copy()

    def joint_density(label) :
        # Overwrite the label column in place, then score the whole frame.
        augmented["class"] = pd.Categorical([label] * n_rows, categories = labels)
        return math.e ** bn.logl(augmented)

    return sum(joint_density(label) for label in labels)

def log_likelihood(x_cfx,bn) :
    """Natural logarithm of ``likelihood(x_cfx, bn)``."""
    marginal = likelihood(x_cfx, bn)
    return np.log(marginal)

def accuracy(x_cfx, y_og : str | list, bn) :
    class_cpd = bn.cpd("class")
    class_values = class_cpd.variable_values()
    cfx = x_cfx.copy()
    if isinstance(y_og,str) :
        cfx["class"] = pd.Categorical([y_og], categories = class_values)
    else :
        cfx["class"] = pd.Categorical(y_og, categories = class_values)
    prob = math.e**bn.logl(cfx)
    ll = likelihood(x_cfx,bn)
    if ll>0 :
        return prob/ll
    else :
        return 1


def straight_path(x_1, x_2, chunks = None) :
    """Evenly spaced points on the straight segment from ``x_1`` to ``x_2``.

    Parameters
    ----------
    x_1, x_2 : pandas.DataFrame
        Start and end point. Only the first row of each is read; every column
        of ``x_2`` must also exist in ``x_1``.
    chunks : int, optional
        Number of points to generate. When omitted, one point is used per 10
        units of Euclidean distance between the end points.

    Returns
    -------
    pandas.DataFrame
        Float frame with the columns of ``x_2`` and a 0..n-1 index. The first
        row equals ``x_1`` and the last row equals ``x_2``; at least two points
        are always returned.
    """
    if chunks is None :
        chunks = int(euclidean_distance(x_1, x_2) / 10)
    # Fewer than two points cannot contain both end points (short segments would
    # otherwise yield an empty path), so two is the floor.
    n_points = max(2, int(chunks))
    points = np.empty((n_points, x_2.shape[1]))
    for j, att in enumerate(x_2.columns) :
        points[:, j] = np.linspace(x_1[att].values[0], x_2[att].values[0], n_points)
    # Building the frame from the array keeps a numeric dtype.
    return pd.DataFrame(points, columns = x_2.columns)

def path(df_vertex) :
    """Polyline through the rows of ``df_vertex``, sampled segment by segment.

    Each pair of consecutive rows is joined with ``straight_path`` and the
    segments are stacked in order.

    Parameters
    ----------
    df_vertex : pandas.DataFrame
        One vertex per row, in visiting order.

    Returns
    -------
    pandas.DataFrame
        All segment points with a fresh 0..n-1 index. The old index is dropped
        rather than kept as an extra data column. With fewer than two vertices
        an empty frame with the columns of ``df_vertex`` is returned.
    """
    n_segments = len(df_vertex.index) - 1
    segments = [
        straight_path(df_vertex.iloc[[i]], df_vertex.iloc[[i + 1]])
        for i in range(n_segments)
    ]
    if not segments :
        return pd.DataFrame(columns = df_vertex.columns)
    # One concat over all segments instead of growing a frame inside the loop.
    return pd.concat(segments).reset_index(drop = True)

def avg_path_logl(x_cfx, x_og, bn, penalty = 1) :
    """Summed likelihood cost along the straight path from ``x_og`` to ``x_cfx``.

    Every path point contributes ``(1 - log_likelihood(point)) ** penalty``.
    Despite the name, the result is a sum over the points, not an average.

    Parameters
    ----------
    x_cfx, x_og : pandas.DataFrame
        End and start point of the path, passed on to ``straight_path``.
    bn :
        Model passed on to ``log_likelihood``.
    penalty : number, optional
        Exponent applied to each point's cost. The default of 1 leaves the cost
        unexponentiated, so the function can be called without it.
    """
    point_cost = (-log_likelihood(straight_path(x_og, x_cfx), bn) + 1) ** penalty
    return np.sum(point_cost)

def increase_monotonically(path,bn) :
    """Placeholder that is not implemented yet: ignores its arguments and returns None."""
    return None

In [None]:
# Two synthetic rows (positions 100 and 200) with the label column removed,
# plus the straight path from the first to the second.
x_cfx = synt.iloc[[100]].drop(columns = "class")
x_og = synt.iloc[[200]].drop(columns = "class")
path_x = straight_path(x_cfx, x_og)

In [None]:
# Scratch check of path(): go from x_og to x_cfx and back again.
path(pd.concat([x_og,x_cfx,x_og]))

In [None]:
# Display the two end points stacked in one frame.
pd.concat([x_og,x_cfx])

In [None]:
# Scratch check of the isinstance test used inside accuracy().
isinstance("a",str)

In [None]:
# Evaluate accuracy() for label "a" at x_og under the CLG model.
accuracy(x_og,"a",learned)

In [None]:
class BNCfx(ElementwiseProblem):
    """Four-objective pymoo problem over a 2-variable candidate point.

    Every evaluation reads the notebook globals ``x_og``, ``y_og`` and
    ``learned``, so they must be defined before the optimiser runs. Each
    variable is bounded to [0, 700].
    """

    def __init__(self):
        n_features = 2
        super().__init__(
            n_var=n_features,
            n_obj=4,
            n_ieq_constr=2,
            xl=np.full(n_features, 0),
            xu=np.full(n_features, 700),
        )

    def _evaluate(self, x, out, *args, **kwargs):
        # Candidate point: same labels as x_og, values taken from the decision vector.
        candidate = x_og.copy()
        candidate[:] = x

        objectives = [
            euclidean_distance(candidate, x_og),                    # distance to the original point
            -log_likelihood(candidate, learned),                    # negative log-likelihood of the candidate
            avg_path_logl(candidate, x_og, learned, penalty = 20),  # likelihood cost along the straight path
            delta_distance(candidate, x_og, eps = 15),              # number of attributes changed by more than 15
        ]
        out["F"] = np.column_stack(objectives)

        # The first constraint is a fixed negative constant; the second is the
        # accuracy of the original label at the candidate minus 0.05
        # (pymoo treats G <= 0 as feasible).
        constraints = [-0.1, accuracy(candidate, y_og, learned) - 0.05]
        out["G"] = np.column_stack(constraints)

In [None]:
# Instantiate the problem; x_og and y_og are only read when it is evaluated,
# so defining them below is early enough.
problem = BNCfx()

# Start from a synthetic row (for its index and columns), then overwrite both
# coordinates with hand-picked values.
x_og = synt.iloc[[300]].copy()
x_og = x_og.drop("class",axis = 1)
x_og.columns = ["x","y"]
x_og["x"] = 192.069
# NOTE(review): looks like an intercept + slope * x + 2 * sqrt(variance)
# expression with constants copied from a fitted model - confirm their origin.
x_og["y"] = 444.152 + -1.215*192.069 + np.sqrt(16335.215)*2
y_og = "a"
#x_cfx["class"] = pd.Categorical("a", categories = ["a","b","c"])

algorithm = NSGA2(pop_size=100)

# 10 generations with a fixed seed.
res = minimize(problem,
               algorithm,
               ('n_gen', 10),
               seed=1,
               verbose=True)

# calculate a hash to show that all executions end with the same result
print("hash", res.F.sum())

In [None]:
# Decision vectors stored in the optimisation result.
res.X

In [None]:
# The original point the search started from.
x_og

In [None]:
# Data coloured by class, result points in red, original point in black.
to_plot = df.drop(columns = "class")
colours = df["class"].to_numpy()
# Replace each class label by its plotting colour, in place.
for label, shade in (("a", "green"), ("b", "yellow"), ("c", "blue")) :
    colours[colours == label] = shade
plt.scatter(to_plot["x"], to_plot["y"], color = colours)
plt.scatter(res.X.T[0], res.X.T[1], color = "red")
plt.scatter(x_og["x"], x_og["y"], color = "black")
plt.show()

In [None]:
# Probe the three measures at a hand-picked point (450, 425).
x_test = x_og.copy()
x_test.x = 450.0
x_test.y = 425.0
print("Logl:",log_likelihood(x_test,learned))
# penalty is passed explicitly; 1 leaves the per-point cost unexponentiated.
print("Path logl:",avg_path_logl(x_test,x_og,learned, penalty = 1))
print("Accuracy:",accuracy(x_test,"a",learned))

In [None]:
# Scratch arithmetic.
0.9**2

In [None]:
# Probe the three measures at the corner point (700, 700).
x_test = x_og.copy()
x_test.x = 700.0
x_test.y = 700.0
print("Logl:",log_likelihood(x_test,learned))
# penalty is passed explicitly; 1 leaves the per-point cost unexponentiated.
print("Path logl:",avg_path_logl(x_test,x_og,learned, penalty = 1))
print("Accuracy:",accuracy(x_test,"a",learned))

In [None]:
# End points of the path search: x_1 is the same object as x_og (no copy),
# x_2 is a copy moved to (600, 50).
x_1 = x_og
x_2 = x_1.copy()
x_2.x = 600
x_2.y = 50
class BestPathFinder(ElementwiseProblem):
    """Single-objective pymoo problem: place intermediate vertices between two points.

    The decision vector holds ``n_vertex`` two-dimensional vertices, each
    coordinate bounded to [0, 700]. The objective is the sum over all points of
    the polyline x_1 -> vertices -> x_2 of ``(1 - log_likelihood) ** penalty``.
    The notebook globals ``x_1``, ``x_2`` and ``learned`` are read on every
    evaluation.

    Parameters
    ----------
    n_vertex : int
        Number of intermediate vertices.
    penalty : number
        Exponent applied to each point's cost.
    """

    def __init__(self, n_vertex = 1, penalty = 1):
        super().__init__(n_var=2*n_vertex,
                         n_obj=1,
                         n_ieq_constr=1,
                         xl=np.array([0]*2*n_vertex),
                         xu=np.array([700]*2*n_vertex))
        self.n_vertex = n_vertex
        self.penalty = penalty

    def _evaluate(self, x, out, *args, **kwargs):
        # Rows of the reshaped decision vector are the intermediate vertices.
        df_vertex = pd.DataFrame(columns = x_1.columns, data=np.resize(x,new_shape=(self.n_vertex, 2)))
        # drop=True discards the old row labels; otherwise they would become an
        # "index" data column and be treated as a coordinate when the path is built.
        df_vertex = pd.concat([x_1,df_vertex,x_2]).reset_index(drop=True)
        path_x = path(df_vertex)
        likelihood_path = (-log_likelihood(path_x,learned)+1)**self.penalty
        f1 = np.sum(likelihood_path)
        out["F"] = np.column_stack([f1])

        # Fixed negative constant used as the single constraint value.
        g1 = -0.1
        out["G"] = np.column_stack([g1])


# Three intermediate vertices, cost exponent 5; 10 NSGA2 generations with a fixed seed.
# This rebinds `problem`, `algorithm` and `res` from the earlier counterfactual search.
problem = BestPathFinder(n_vertex = 3, penalty = 5)
algorithm = NSGA2(pop_size=100)

res = minimize(problem,
               algorithm,
               ('n_gen', 10),
               seed=1,
               verbose=True)

In [None]:
# Polyline x_1 -> optimised vertices -> x_2 drawn over the class-coloured data.
df_vertex = pd.concat([
    x_1,
    pd.DataFrame(np.resize(res.X, (3, 2)), columns = x_1.columns),
    x_2,
])
to_plot = df.drop(columns = "class")
colours = df["class"].to_numpy()
# Replace each class label by its plotting colour, in place.
for label, shade in (("a", "green"), ("b", "yellow"), ("c", "blue")) :
    colours[colours == label] = shade
plt.scatter(to_plot["x"], to_plot["y"], color = colours)
plt.plot(df_vertex["x"], df_vertex["y"], color = "red")
plt.show()

In [None]:
# Build a middle vertex from the reversed result vector.
# NOTE(review): x_mid has the shape of x_og, so this assignment presumably
# expects res.X to hold a single vertex; the preceding search used
# n_vertex = 3 - verify that the shapes match.
x_mid = x_og.copy()
x_mid[:] = np.flip(res.X)

In [None]:
# Cost of the polyline x_1 -> x_mid -> x_2 with exponent 1. drop=True discards the
# old row labels so they are not carried into the path as an extra "index" coordinate.
likelihood_path = (-log_likelihood(path(pd.concat([x_1,x_mid,x_2]).reset_index(drop=True)),learned)+1)
np.sum(likelihood_path)

In [None]:
# Display the three vertices of that polyline.
pd.concat([x_1,x_mid,x_2])

In [None]:
# Same end points as in the NSGA2 path search; the globals below are read by fitness_func.
x_1 = x_og
x_2 = x_1.copy()
x_2.x = 600
x_2.y = 50
n_vertex = 3   # intermediate vertices per candidate path
n_attr = 2     # coordinates per vertex
penalty = 1    # exponent applied to each point's cost

def fitness_func(ga_instance, solution, solution_idx):
    """Fitness of one candidate path: the negated summed likelihood cost.

    ``solution`` is reshaped into ``n_vertex`` rows of ``n_attr`` coordinates
    and placed between the globals ``x_1`` and ``x_2``. Every point of the
    resulting polyline contributes ``-(1 - log_likelihood) ** penalty``.
    The cost is negated, presumably because the optimiser maximises fitness -
    confirm against the pygad documentation.

    ``ga_instance`` and ``solution_idx`` are not used.
    """
    inner = pd.DataFrame(columns = x_1.columns, data=np.resize(solution,new_shape=(n_vertex, n_attr)))
    # drop=True discards the old row labels; otherwise they would become an
    # "index" data column and be treated as a coordinate when the path is built.
    df_vertex = pd.concat([x_1,inner,x_2]).reset_index(drop=True)
    path_x = path(df_vertex)
    likelihood_path = -(-log_likelihood(path_x,learned)+1)**penalty
    return np.sum(likelihood_path)

# Genetic-algorithm configuration for the same path search.
fitness_function = fitness_func

num_generations = 50
num_parents_mating = 4

sol_per_pop = 100
# One gene per coordinate of every intermediate vertex.
num_genes = n_vertex*n_attr

# Initial genes are drawn from the same [0, 700] range used as bounds in the pymoo problems.
init_range_low = 0
init_range_high = 700

parent_selection_type = "sss"
keep_parents = 1

crossover_type = "single_point"

mutation_type = "random"
mutation_percent_genes = 10

# NOTE(review): `pygad` is not imported in any cell visible in this notebook -
# confirm it is imported before this cell runs, otherwise this line raises NameError.
ga_instance = pygad.GA(num_generations=num_generations,
                       num_parents_mating=num_parents_mating,
                       fitness_func=fitness_function,
                       sol_per_pop=sol_per_pop,
                       num_genes=num_genes,
                       init_range_low=init_range_low,
                       init_range_high=init_range_high,
                       parent_selection_type=parent_selection_type,
                       keep_parents=keep_parents,
                       crossover_type=crossover_type,
                       mutation_type=mutation_type,
                       mutation_percent_genes=mutation_percent_genes)

ga_instance.run()

# `solution` is reused by the plotting cell further down.
solution, solution_fitness, solution_idx = ga_instance.best_solution()
print("Parameters of the best solution : {solution}".format(solution=solution))
print("Fitness value of the best solution = {solution_fitness}".format(solution_fitness=solution_fitness))

In [None]:
def best_path(x_1,x_2, vertex = 0) :
    """Path from ``x_1`` to ``x_2`` using ``vertex`` intermediate vertices.

    Only ``vertex == 0`` is implemented and yields ``straight_path(x_1, x_2)``;
    any other value returns None.
    """
    if vertex != 0 :
        return None
    return straight_path(x_1, x_2)

In [None]:
# Decision vector of the most recent pymoo run (`res` is not touched by the pygad cell).
res.X

In [None]:
# Polyline x_1 -> best GA vertices -> x_2 drawn over the class-coloured data.
df_vertex = pd.concat([
    x_1,
    pd.DataFrame(np.resize(solution, (3, 2)), columns = x_1.columns),
    x_2,
])
to_plot = df.drop(columns = "class")
colours = df["class"].to_numpy()
# Replace each class label by its plotting colour, in place.
for label, shade in (("a", "green"), ("b", "yellow"), ("c", "blue")) :
    colours[colours == label] = shade
plt.scatter(to_plot["x"], to_plot["y"], color = colours)
plt.plot(df_vertex["x"], df_vertex["y"], color = "red")
plt.show()

In [None]:
# FACE query with graph_type "integral" and a smaller distance threshold (50);
# `chunks` and `f_tilde` are not passed here, unlike in the earlier FACE cells.
y_pred = OneHotEncoder().fit_transform(df.loc[:,["class"]].values).toarray()
face_algorithm(dataset = df.drop("class",axis = 1), y_pred = y_pred, instance = 0, y_instance = 0, graph_type= "integral", distance_threshold=50, bn=learned)