In [1]:
import numpy as np
from kmeans import  kmeans_pp_init, random_init, phi, KMeans
from dataset import gaussian_blobs
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.neighbors import NearestNeighbors
from plotly.subplots import make_subplots
from scipy.spatial.distance import mahalanobis
from scipy.spatial.distance import cdist
from joblib import Parallel, delayed
import pandas as pd
from experiments import exponent_experiment, get_n_outliers

In [2]:
# Build the 2-D point set that the initialization plots below scatter as background.
# NOTE(review): `d` and `uniform_noise` are defined by dataset.gaussian_blobs — presumably the
# blob spacing and the number of uniform noise points; confirm in dataset.py.
samples = gaussian_blobs(d=1.5, uniform_noise=50, n_samples=500)

In [3]:
# Initial centers from the two strategies compared in the next figure.
# NOTE(review): the positional 5 is presumably the number of centers and `p` the exponent
# used by the k-means++ sampling — confirm in kmeans.py. The two calls use different seeds.
kmeanspp_centers = kmeans_pp_init(samples, 5, seed=5, p=4)
random_centers = random_init(samples, 5, seed=10)

In [4]:
# Side-by-side view of the two initializations: k-means++ on the left, random on the right.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=["kmeans++", "random initialization"],
)

# Centers are numbered from 1; the number is drawn in bold next to each marker.
labels = np.arange(1, kmeanspp_centers.shape[0] + 1).astype(str)
text = [f"<b>{label}</b>" for label in labels]

# The same background scatter of all samples goes into both panels.
trace_X = go.Scatter(
    x=samples[:, 0],
    y=samples[:, 1],
    mode="markers",
    marker={"color": "#a5b3cf"},
    showlegend=False,
)
fig.add_traces([trace_X, trace_X], rows=1, cols=[1, 2])

# px.scatter yields one trace per label (colour); add them all to the matching panel.
for panel_col, panel_centers in enumerate((kmeanspp_centers, random_centers), start=1):
    center_traces = px.scatter(
        x=panel_centers[:, 0],
        y=panel_centers[:, 1],
        color=labels,
        text=text,
    ).data
    fig.add_traces(center_traces, rows=1, cols=panel_col)

fig.update_layout(
    title="Initialization",
    margin=dict(t=50, r=5, l=5, b=5),
    width=700,
    height=300,
    showlegend=False,
)
# Lock each panel's y axis to its own x axis so distances are not distorted.
for panel_col in (1, 2):
    fig.update_yaxes(scaleanchor=f"x{panel_col}", scaleratio=1, row=1, col=panel_col)

# The chained call returns the figure, which the notebook displays as the cell output.
fig.update_traces(textposition="top center").update_annotations(font={})


In [5]:
# Grid of k-means++ initializations for consecutive exponents p, all drawn with the same seed.
n_rows, n_cols = 2, 3
offset_p = 1
p_range = np.arange(2, offset_p * n_cols * n_rows + 2, offset_p)
list_centers = [kmeans_pp_init(samples, 5, seed=4, p=p) for p in p_range]

fig = make_subplots(
    rows=n_rows,
    cols=n_cols,
    subplot_titles=[f"p={p}" for p in p_range],
    vertical_spacing=.1,
)

# Centers are numbered from 1 and labelled in bold.
labels = np.arange(1, kmeanspp_centers.shape[0] + 1).astype(str)
text = [f"<b>{label}</b>" for label in labels]

trace_X = go.Scatter(
    x=samples[:, 0],
    y=samples[:, 1],
    mode="markers",
    marker={"color": "#a5b3cf"},
    showlegend=False,
)

# Panels are filled row by row; n_plot is the 1-based panel number, which is also
# the suffix of that panel's x axis ("x1", "x2", ...).
for n_plot, centers in enumerate(list_centers, start=1):
    i, j = divmod(n_plot - 1, n_cols)
    i, j = i + 1, j + 1
    fig.add_trace(trace_X, row=i, col=j)
    center_traces = px.scatter(x=centers[:, 0], y=centers[:, 1], color=labels, text=text).data
    fig.add_traces(center_traces, rows=i, cols=j)
    # Equal aspect ratio: tie this panel's y axis to its own x axis.
    fig.update_yaxes(scaleanchor=f"x{n_plot}", scaleratio=1, row=i, col=j)

fig.update_layout(
    title="Exponent influence",
    margin=dict(t=50, r=5, l=5, b=5),
    width=800,
    height=500,
    showlegend=False,
)

# The chained call returns the figure, which the notebook displays as the cell output.
fig.update_traces(textposition="top center").update_annotations(font={})

In [6]:
# Exponent influence across several seeds: one column per exponent in list_p,
# one row per seed (the row number is used as the seed).
list_p = [2, 3, 4, 6, 15]
n_rows = 4
# Derived from list_p so the grid width cannot drift from the exponents being plotted.
n_cols = len(list_p)

# subplot_titles is consumed in row-major order, so the per-column titles repeat on every row.
subplot_titles = [f"p={p}" for _ in range(n_rows) for p in list_p]
fig = make_subplots(rows=n_rows, cols=n_cols, vertical_spacing=.05, subplot_titles=subplot_titles)

# Centers are numbered from 1 and labelled in bold.
labels = (1 + np.arange(kmeanspp_centers.shape[0])).astype(str)
text = [f"<b>{label}</b>" for label in labels]

trace_X = go.Scatter(x=samples[:, 0], y=samples[:, 1], mode="markers", marker=dict(color="#a5b3cf"), showlegend=False)

# n_plot is the 1-based, row-major panel number; it is also the suffix of the panel's x axis.
n_plot = 0
for i in range(1, n_rows+1):
    for j in range(1, n_cols+1):
        centers = kmeans_pp_init(samples, 5, seed=i, p=list_p[j-1])
        fig.add_trace(trace_X, row=i, col=j)
        fig.add_traces(px.scatter(x=centers[:, 0], y=centers[:, 1], color=labels, text=text).data, rows=i, cols=j)
        # The panel title is already set through subplot_titles, so no per-panel
        # annotation rewrite is needed here.
        n_plot += 1
        # Equal aspect ratio: tie this panel's y axis to its own x axis.
        fig.update_yaxes(scaleanchor=f"x{n_plot}", scaleratio=1, row=i, col=j)

fig.update_layout(title="Exponent influence", margin={"t":50, "r":5, "l":5, "b":5}, width=900, height=900, showlegend=False)

fig.update_traces(textposition='top center')
# Last expression returns the figure, which the notebook displays as the cell output.
fig.update_annotations(font=dict())

In [7]:
# Run the "outliers" experiment for every exponent in list_p.
# NOTE(review): n_experiments is presumably the number of repetitions per exponent, and the
# result is later reduced with .mean(axis=1), i.e. one row per exponent — confirm in experiments.py.
# NOTE: `n_experiments` is rebound to 50 by the "kmeans+phi" experiment cell, so code that plots
# p_outliers must not rely on this name for the number of runs.
n_experiments = 1000
list_p = np.array([2, 3, 4, 5, 6, 7])
p_outliers = exponent_experiment(list_p, n_experiments, "outliers")

In [8]:
# Run the "kmeans+phi" experiment for the same exponents, with 50 repetitions.
# NOTE: this rebinds the notebook-level `n_experiments` (1000 for the outliers experiment) and
# `list_p`; every later cell that reads `n_experiments` sees 50, whichever results it plots.
n_experiments = 50
list_p = np.array([2, 3, 4, 5, 6, 7])
p_phi = exponent_experiment(list_p, n_experiments, "kmeans+phi")

In [9]:
# Distribution of the outlier metric over the experiment runs, one violin per exponent p.
fig = go.Figure()

for i, p in enumerate(list_p):
    outlier_runs = p_outliers[i]
    fig.add_trace(go.Violin(
        # One x entry per y sample. The count is taken from the data itself: the
        # notebook-level `n_experiments` is rebound (1000 -> 50) by the phi experiment
        # cell, so it no longer matches the number of outlier runs at this point.
        x=[p] * len(outlier_runs),
        y=outlier_runs,
        name=f"{p}",
        box_visible=True,
        meanline_visible=True))
fig.update_layout(
    title="Exponent influence on the average number of selected outliers",
    margin={"t":50, "r":5, "l":5, "b":5},
    width=600,
    height=300,
    showlegend=True,
    legend_title_text="p"
    )
fig.update_yaxes(title="Average number of outliers")
fig.update_xaxes(title="p")
fig.show()


In [10]:
# Mean of the outlier metric per exponent, one bar (and one colour) per value of p.
# The update_* methods return the figure, so the whole configuration is chained.
fig = (
    px.bar(
        x=list_p,
        y=p_outliers.mean(axis=1),
        color=list_p.astype(str),
    )
    .update_layout(
        title="Exponent influence on the average number of selected outliers",
        margin=dict(t=50, r=5, l=5, b=5),
        width=600,
        height=300,
        showlegend=True,
        legend_title_text="p",
    )
    .update_yaxes(title="Average number of outliers")
    .update_xaxes(title="p")
)

fig.show()

In [11]:
# Mean phi per exponent, one bar (and one colour) per value of p.
# The update_* methods return the figure, so the whole configuration is chained.
fig = (
    px.bar(
        x=list_p,
        y=p_phi.mean(axis=1),
        color=list_p.astype(str),
    )
    .update_layout(
        title="Exponent influence on phi",
        margin=dict(t=50, r=5, l=5, b=5),
        width=600,
        height=300,
        showlegend=True,
        legend_title_text="p",
    )
    .update_yaxes(title="Average phi value")
    .update_xaxes(title="p")
)

fig.show()

In [12]:
# Distribution of phi over the experiment runs, one violin per exponent p.
fig = go.Figure()

for i, p in enumerate(list_p):
    phi_runs = p_phi[i]
    fig.add_trace(go.Violin(
        # One x entry per y sample, sized from the data rather than from the mutable
        # notebook-level `n_experiments`, so the cell stays correct whichever
        # experiment cell was executed last.
        x=[p] * len(phi_runs),
        y=phi_runs,
        name=f"{p}",
        box_visible=True,
        meanline_visible=True))
fig.update_layout(
    title="Exponent influence on phi",
    margin={"t":50, "r":5, "l":5, "b":5},
    width=600,
    height=300,
    showlegend=True,
    legend_title_text="p"
    )
fig.update_yaxes(title="Average phi value")
fig.update_xaxes(title="p")

fig.show()

In [13]:
def exponent_experiment_report(list_metric, list_p):
    """Print a per-exponent summary table of a metric.

    For every position ``i`` in ``list_p`` one row is produced holding the
    mean, standard deviation, minimum and maximum of ``list_metric[i]``.
    The table is indexed by ``p`` and printed with 3 digits of precision.

    Parameters
    ----------
    list_metric : indexable
        ``list_metric[i]`` must provide ``mean()``, ``std()``, ``min()`` and
        ``max()`` (e.g. a row of a NumPy array).
    list_p : sequence
        Exponent values; one table row is emitted per entry.

    Returns
    -------
    None
        The table is written to standard output only.
    """
    summary_rows = [
        {
            "p": exponent,
            "mean": list_metric[idx].mean(),
            "std": list_metric[idx].std(),
            "min": list_metric[idx].min(),
            "max": list_metric[idx].max(),
        }
        for idx, exponent in enumerate(list_p)
    ]
    report = pd.DataFrame(summary_rows).set_index("p", drop=True)
    # Limit the precision only for this print, leaving the global pandas options untouched.
    with pd.option_context("display.precision", 3):
        print(report)

    

In [14]:
# Print the per-exponent summary table (mean / std / min / max) of the phi experiment results.
exponent_experiment_report(p_phi, list_p)

    mean    std    min    max
p                            
2  2.512  0.338  1.948  3.345
3  2.534  0.317  1.948  3.360
4  2.541  0.341  1.948  3.342
5  2.582  0.368  1.960  3.663
6  2.575  0.375  1.948  3.622
7  2.643  0.432  1.948  3.783
