# MVD 8. cvičení

## 1. část - Vytvoření dat

Použijte stejný kód z minulého cvičení pro vytvoření dat. Navíc vytvořte jeden větší dataset, ve kterém sjednotíte výstupy různých funkcí (např. make_blobs + make_circles).

In [1]:
from collections import deque

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs, make_moons, make_circles
pio.renderers.default = "browser"

In [2]:
# Dataset 1: 100 two-dimensional samples generated around 2 blob centres.
data, labels = make_blobs(n_samples=100, n_features=2, centers=2)
df1 = pd.DataFrame(data, columns=['x', 'y']).assign(
    labels=labels.astype(str),  # generator labels, stored as strings
    clas='UNCLASSIFIED',        # marker that dbscan() treats as "not visited yet"
)

In [30]:
# Dataset 2: 200 two-dimensional samples generated around 4 blob centres.
data, labels = make_blobs(n_samples=200, n_features=2, centers=4)
df2 = pd.DataFrame(data, columns=['x', 'y']).assign(
    labels=labels.astype(str),  # generator labels, stored as strings
)

In [42]:
# Dataset 3: 200 samples from make_moons with noise 0.1.
data, labels = make_moons(n_samples=200, noise=0.1)
df3 = pd.DataFrame(data, columns=['x', 'y']).assign(
    labels=labels.astype(str),  # generator labels, stored as strings
)

In [5]:
# Dataset 4: 200 samples from make_circles with noise 0.1 and factor 0.2.
data, labels = make_circles(n_samples=200, noise=0.1, factor=0.2)
df4 = pd.DataFrame(data, columns=['x', 'y']).assign(
    labels=labels.astype(str),  # generator labels, stored as strings
)

In [12]:
# Dataset 5: a larger set that merges 600 blob samples (3 centres) with
# 200 samples from make_circles.
data1, labels1 = make_blobs(n_samples=600, n_features=2, centers=3)
data2, labels2 = make_circles(n_samples=200, noise=0.1, factor=0.1)

# The circle samples are placed first, followed by the blob samples.
data = np.concatenate((data2, data1), axis=0)
# Circle labels are shifted by 4 so they do not reuse the blob label values.
labels = np.concatenate((labels2 + 4, labels1), axis=0)

df5 = pd.DataFrame(data, columns=['x', 'y']).assign(
    labels=labels.astype(str),  # merged labels, stored as strings
)

In [13]:
def print_dataset(df):
    """Show a scatter plot of ``df`` coloured by its ``labels`` column.

    ``df`` must provide the columns ``x``, ``y`` and ``labels``.
    """
    scatter = px.scatter(df, x='x', y='y', color='labels')
    scatter.show()

In [31]:
# Preview the four-centre blob dataset coloured by its generated labels.
print_dataset(df2)

In [21]:
def print_subplots(df, eps, min_pts, lib_labels=None):
    """Show stacked scatter plots comparing the available labelings of ``df``.

    Row 1 is coloured by the ``labels`` column, row 2 by the ``clas`` column.
    When ``lib_labels`` is given, a third row coloured by those labels is
    added.

    Parameters
    ----------
    df : DataFrame with the columns ``x``, ``y``, ``labels`` and ``clas``.
    eps, min_pts : values shown in the figure title only.
    lib_labels : optional array of labels (one per row of ``df``); it is
        converted with ``.astype(str)`` before plotting.
    """
    with_library = lib_labels is not None
    n_rows = 3 if with_library else 2

    fig = make_subplots(
        rows=n_rows,
        cols=1,
        subplot_titles=("Original", "My func", "Sklearn lib"),
    )

    # One express figure per row; only their traces are copied into `fig`.
    panels = [
        px.scatter(df, x='x', y='y', color='labels'),
        px.scatter(df, x='x', y='y', color='clas'),
    ]
    if with_library:
        panels.append(
            px.scatter(x=df['x'], y=df['y'], color=lib_labels.astype(str))
        )

    for row, panel in enumerate(panels, start=1):
        fig.add_traces(panel.data, rows=row, cols=1)

    fig.update_layout(
        height=1000,
        width=1000,
        legend_tracegroupgap=180,
        showlegend=False,
        title_text=f"Eps: {eps}, Min_pts: {min_pts}",
    )
    fig.show()

## 2. část - Implementace DBSCAN algoritmu
Dle přednášky implementujte DBSCAN algoritmus.

In [22]:
def find_neighbours(point, df, eps):
    """Return the index labels of all rows of ``df`` within ``eps`` of ``point``.

    The distance is the Euclidean distance on the ``x`` and ``y`` columns.
    The neighbourhood is inclusive (``distance <= eps``), matching the DBSCAN
    definition of the eps-neighbourhood. ``point`` itself is part of the
    result when it is a row of ``df``, because its distance to itself is 0.

    Parameters
    ----------
    point : mapping-like object (e.g. a DataFrame row) with ``x`` and ``y``.
    df : DataFrame with numeric ``x`` and ``y`` columns.
    eps : neighbourhood radius.

    Returns
    -------
    list
        Index labels of the matching rows, in the row order of ``df``.
    """
    # Vectorised distance computation instead of a Python loop over the rows.
    dx = df['x'].to_numpy(dtype=float) - point['x']
    dy = df['y'].to_numpy(dtype=float) - point['y']
    within = np.sqrt(dx ** 2 + dy ** 2) <= eps
    return df.index[within].tolist()

def dbscan(df, eps, min_pts):
    """Cluster the rows of ``df`` in place with the DBSCAN algorithm.

    Every row whose ``clas`` value equals the string ``"UNCLASSIFIED"`` ends
    up with either a cluster id (0, 1, 2, ... in order of discovery) or ``-1``
    for noise. Rows that already hold a different value are not used as
    seeds; rows marked ``-1`` can still be claimed by a cluster as border
    points.

    Parameters
    ----------
    df : DataFrame with the columns ``x``, ``y`` and ``clas``; the ``clas``
        column is modified in place.
    eps : neighbourhood radius passed to ``find_neighbours``.
    min_pts : minimum number of neighbours (as returned by
        ``find_neighbours``) for a point to be treated as a core point.

    Returns
    -------
    None

    NOTE(review): integers are written into a column that initially holds
    strings, so ``clas`` is assumed to be an object-dtype column - confirm on
    pandas versions that infer a dedicated string dtype.
    """
    cluster_id = 0
    for point_idx, point in df.iterrows():
        if df.at[point_idx, 'clas'] != "UNCLASSIFIED":
            continue

        neighbours = find_neighbours(point, df, eps)
        if len(neighbours) < min_pts:
            # Not a core point: provisionally noise, may become a border
            # point of a cluster discovered later.
            df.at[point_idx, 'clas'] = -1
            continue

        # Core point: start a new cluster and grow it breadth-first.
        df.at[point_idx, 'clas'] = cluster_id
        queue = deque(neighbours)  # O(1) removal from the front
        while queue:
            idx = queue.popleft()
            state = df.at[idx, 'clas']
            if state == -1:
                # Former noise reached from a core point: border point,
                # claimed by the cluster but not expanded further.
                df.at[idx, 'clas'] = cluster_id
                continue
            if state != "UNCLASSIFIED":
                # Already assigned to a cluster.
                continue

            df.at[idx, 'clas'] = cluster_id
            # `idx` is an index label, so the row is looked up with .loc
            # (label-based) rather than .iloc (position-based).
            reachable = find_neighbours(df.loc[idx], df, eps)
            if len(reachable) >= min_pts:
                queue.extend(reachable)
        cluster_id += 1

## 3. část - Vyhodnocení 
Aplikujte Váš DBSCAN na vytvořené datasety. Experimentálně najděte parametry ```Eps``` a ```MinPts```.

In [23]:
def evaluate(df, eps=0.5, min_pts=5):
    """Cluster ``df`` with the custom ``dbscan`` and hand it to ``comparasion``.

    The ``clas`` column is reset to ``'UNCLASSIFIED'`` before clustering, so
    results of an earlier run on the same frame are discarded.

    Parameters
    ----------
    df : DataFrame with ``x`` and ``y`` columns; modified in place.
    eps : neighbourhood radius forwarded to both steps (default 0.5).
    min_pts : minimum neighbourhood size forwarded to both steps (default 5).
    """
    df['clas'] = 'UNCLASSIFIED'
    dbscan(df, eps=eps, min_pts=min_pts)
    comparasion(df, eps=eps, min_pts=min_pts)

In [44]:
# Run the evaluation on every dataset with its own eps / min_pts setting.
evaluate(df1, eps=2, min_pts=2)     # 2 blob centres
evaluate(df2, eps=2, min_pts=2)     # 4 blob centres
evaluate(df3, eps=0.2, min_pts=2)   # moons
evaluate(df4, eps=0.3, min_pts=5)   # circles
evaluate(df5, eps=0.43, min_pts=6)  # blobs + circles combined

Equal outputs:  True
Equal outputs:  True
Equal outputs:  True
Equal outputs:  True
Equal outputs:  True


## 4. část - Porovnání s výstupem z knihovny

Porovnejte výstup s výstupem z knihovny sklearn. Dokumentaci naleznete [zde](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html).

In [25]:
def comparasion(df, eps, min_pts):
    """Compare the ``clas`` column of ``df`` with scikit-learn's DBSCAN labels.

    Fits ``sklearn.cluster.DBSCAN`` on the ``x``/``y`` coordinates using
    ``eps`` and ``min_samples=min_pts``, prints whether its labels are
    element-wise equal to ``df['clas']``, and then plots both labelings with
    ``print_subplots``.

    Parameters
    ----------
    df : DataFrame with the columns ``x``, ``y``, ``labels`` and ``clas``.
    eps : neighbourhood radius for the library model.
    min_pts : value used as the library's ``min_samples``.
    """
    coords = df[['x', 'y']].to_numpy()

    model = DBSCAN(eps=eps, min_samples=min_pts)
    model.fit(coords)
    lib_labels = model.labels_

    same = np.array_equal(lib_labels, df['clas'].to_numpy())
    print("Equal outputs: ", same)
    print_subplots(df, eps, min_pts, lib_labels)