# Libraries

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Dataset Overview

In [None]:
fig, ax = plt.subplots(2,2)

for dataset_i, dataset_name in enumerate(["Compound", "D31", "pathbased", "spiral"]):
    path = "./data/"+dataset_name+".txt"
    dataset = pd.read_csv(path, sep="\t", names=["X", "Y", "Class"])

    for class_id in set(dataset["Class"]):
        ax[int(dataset_i/2)][dataset_i%2].scatter(
            dataset[dataset["Class"]==class_id]["X"],
            dataset[dataset["Class"]==class_id]["Y"]
        )
    ax[int(dataset_i/2)][dataset_i%2].title.set_text(dataset_name)

fig.set_size_inches(11, 11)
plt.show()

In [None]:
dataset = pd.read_csv("./data/rings.txt", sep="\t", names=["Class", "X", "Y", "Z"])

fig = plt.figure(figsize=(11, 11))
ax = fig.add_subplot(projection='3d')

for class_id in set(dataset["Class"]):
    ax.scatter(dataset[dataset["Class"]==class_id]["X"],
               dataset[dataset["Class"]==class_id]["Y"],
               dataset[dataset["Class"]==class_id]["Z"]
    )
    
ax.title.set_text("rings")

plt.show()

# DBSCAN

In [None]:
def find_neighbours(dataset, data, e):
    remain_columns = list(set(dataset.columns) - set(["Class", "Status", "Cluster"]))
    
    for i, data2 in dataset.iterrows():
        dataset.at[i, "Distance"] = np.linalg.norm(data[remain_columns] - data2[remain_columns])
        
    dataset_output = dataset[dataset["Distance"]<e]
    dataset_output = dataset_output.drop([data.name])
    dataset_output = dataset_output.drop(columns = ["Distance"])
    
    del dataset["Distance"]
    
    return dataset_output

In [None]:
from queue import Queue

def dbscan(dataset, e, min_pts):
    del dataset["Class"]
    dataset["Status"] = "Unknown"
    dataset["Cluster"] = -1
    
    number_cluster = 0
    for i, data in dataset.iterrows():
        print(f"i = {i}")
        print(dataset)
        if data["Status"] == "Unknown":
            neighbours = find_neighbours(dataset, data, e)
            
            if len(neighbours) < min_pts:
                dataset.at[i, "Status"] = "Outlier"
            else:
                dataset.at[i, "Status"] = "Core"
                dataset.at[i, "Cluster"] = number_cluster
                                
                q = Queue()
                
                for j, data_n in neighbours.iterrows():
                    q.put((j, data_n))
                    
                while(not q.empty()):
                    j, data_n = q.get()
                          
                    if data_n["Status"] == "Outlier":
                        dataset.at[j, "Status"] = "Border"
                        dataset.at[j, "Cluster"] = number_cluster
                    
                    if data_n["Status"] == "Unknown":
                        print(f"In while item = {data_n.name}")
                        
                        if data_n.name == 5:
                            print(f"dataset = \n {dataset}\n")
                        
                        dataset.at[j, "Cluster"] = number_cluster
                        neighbours_n = find_neighbours(dataset, data_n, e)
                        
                        if len(neighbours_n) <min_pts:
                            dataset.at[j, "Status"] = "Border"
                        else:
                            dataset.at[j, "Status"] = "Core"
                            for j, data_n in neighbours_n.iterrows():
                                q.put((j, data_n)) 
                                
                number_cluster += 1
                

In [167]:
dataset = pd.read_csv("./data/spiral.txt", sep="\t", names=["Class", "X", "Y"])

In [None]:
dataset_clustered = dbscan(dataset, e=5, min_pts=5)

i = 0
         X  Y   Status  Cluster
0     7.95  3  Unknown       -1
1     7.30  3  Unknown       -1
2     6.65  3  Unknown       -1
3     6.00  3  Unknown       -1
4     5.55  3  Unknown       -1
..     ... ..      ...      ...
307  13.85  2  Unknown       -1
308  14.05  2  Unknown       -1
309  14.25  2  Unknown       -1
310  14.50  2  Unknown       -1
311  14.60  2  Unknown       -1

[312 rows x 4 columns]
In while item = 1
In while item = 2
In while item = 3
In while item = 4
In while item = 5
dataset = 
          X  Y   Status  Cluster
0     7.95  3     Core        0
1     7.30  3     Core        0
2     6.65  3     Core        0
3     6.00  3     Core        0
4     5.55  3     Core        0
..     ... ..      ...      ...
307  13.85  2  Unknown       -1
308  14.05  2  Unknown       -1
309  14.25  2  Unknown       -1
310  14.50  2  Unknown       -1
311  14.60  2  Unknown       -1

[312 rows x 4 columns]

In while item = 6
In while item = 7
In while item = 8
In while item = 9
In 

In [116]:
dataset

Unnamed: 0,X,Y,Status,Cluster,Distance,0,1
0,7.95,3.0,Unknown,-1.0,0.000000,,
1,7.30,3.0,Unknown,-1.0,0.650000,,
2,6.65,3.0,Unknown,-1.0,1.300000,,
3,6.00,3.0,Unknown,-1.0,1.950000,,
4,5.55,3.0,Unknown,-1.0,2.400000,,
...,...,...,...,...,...,...,...
309,14.25,2.0,Unknown,-1.0,6.378871,,
310,14.50,2.0,Unknown,-1.0,6.625896,,
311,14.60,2.0,Unknown,-1.0,6.724768,,
Status,,,,,,Core,


In [38]:
list(set([1, 2, 3]) - set([3, 4]))

[1, 2]