# DBSCAN
This notebook applies DBSCAN, a density-based clustering algorithm, to the engineered races dataset.



In [None]:
import itertools as it
from os import path

import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()

RACES_PATH = path.join("..", "dataset", "engineered_races.csv")
#CYCLIST_PATH=path.join("..","dataset","engineered_cyclists.csv")

races_df = pd.read_csv(RACES_PATH)

# Numeric columns are standardized and stored next to the originals under a
# "std_" prefix, so the raw values remain available.
num_cols = races_df.select_dtypes(include=['number']).columns
std_df = pd.DataFrame(
    std_scaler.fit_transform(races_df[num_cols]),
    columns=[f"std_{col}" for col in num_cols],
    index=races_df.index,
)

# Object/bool columns are passed through pd.get_dummies and stored under an
# "enc_" prefix.
# NOTE(review): object columns with many distinct values (ids, names, urls)
# would produce one column per value — confirm every column listed below is a
# real categorical feature.
cat_cols = races_df.select_dtypes(include=['object', 'bool']).columns

print(cat_cols)

data_cat_enc = pd.get_dummies(races_df[cat_cols])

# The prefixed frames are attached with a single concat. The previous
# `data[map(...)] = data_cat_enc` handed pandas a lazy `map` object, which is
# treated as one column label instead of a list of names, so the encoded
# columns were never added under their "enc_" names.
# concat builds a new frame, so `races_df` itself is left untouched.
data = pd.concat(
    [races_df, std_df, data_cat_enc.add_prefix("enc_")],
    axis=1,
)

# Show a preview only; displaying the whole frame bloats the notebook.
data.head()

A few notes are due before starting. First, `eps` is difficult to set. For now, a good strategy is to take inspiration from the paper that first introduced the algorithm, which you can find [here](https://dl.acm.org/doi/10.5555/3001460.3001507), and use the distance to the k-th nearest neighbour, varying k until we find a good `eps` value for our data.



In [None]:
# silhouette_score is provided by sklearn.metrics; scipy.spatial.distance has
# no such function, so the previous import raised ImportError.
from sklearn.metrics import silhouette_score
import itertools as it

# Keep the previously used (misspelled) name bound so that existing references
# to it still resolve.
silhoutte_score = silhouette_score

In [None]:
# Grid search over (eps, metric) for DBSCAN, scored with the silhouette
# coefficient. One row per configuration is collected in `results`.

# DBSCAN needs a purely numeric matrix: keep only the standardized ("std_")
# and encoded ("enc_") columns rather than the whole frame, which still
# contains the raw string columns.
feature_cols = [col for col in data.columns if str(col).startswith(("std_", "enc_"))]
features = data[feature_cols].astype(float)
X = features.to_numpy()
# NOTE(review): DBSCAN rejects NaN values; if the engineered dataset contains
# missing values they must be imputed or dropped before this point — confirm.

metrics = ['euclidean', 'cosine', 'l1']
#NOTE: this grid might have to be revisited, it is only meant to check that everything works

dimension = features.shape[0]   # number of samples (name kept from the original cell)
n_features = features.shape[1]
# min_samples follows the common "2 * number of features" rule of thumb.
# The previous value (2 * number of rows) exceeded the dataset size, so no
# point could ever be a core point and every sample was labelled as noise.
min_pts = 2 * n_features

# using the method seen at laboratory to select initial values
maximum_distance = (features.max() - features.min()).abs().sum().item()
average_concentration = dimension / maximum_distance

# use different scales for eps values (must come after average_concentration
# is defined, otherwise a fresh kernel raises NameError)
eps_values = average_concentration * np.array([10, 5, 2.5, 1, 0.1, 0.01, 0.0001])

# useful for reference: the three kinds of points DBSCAN distinguishes.
# These codes are NOT the values stored in `labels_`, which holds cluster ids
# (0, 1, 2, ...) with -1 marking noise.
db_scan_mapping={
    -1:'noisy',
    0:'border',
    1:'core'

}

# Records are accumulated in a list and turned into a frame once; growing a
# DataFrame with concat inside the loop is quadratic, and concat does not
# accept plain dicts.
records = []

for idx, (eps, metric_name) in enumerate(it.product(eps_values, metrics)):
    db_scan = DBSCAN(
        eps=eps,
        min_samples=min_pts,
        metric=metric_name,
    ).fit(X)
    labels = db_scan.labels_

    #NOTE: the noisy labels are NOT taken into account
    clustered = labels != -1
    n_clusters = len(set(labels[clustered]))

    # The silhouette is only defined for 2 <= n_clusters <= n_points - 1;
    # configurations outside that range get NaN instead of raising.
    if 2 <= n_clusters < int(clustered.sum()):
        score = silhouette_score(X[clustered], labels[clustered])
    else:
        score = np.nan

    records.append({
        'index': idx,
        'eps': float(eps),
        'metric': metric_name,
        'min_samples': min_pts,
        'n_clusters': n_clusters,
        'n_noise': int((~clustered).sum()),
        'silhoutte_score': score,
    })

results = pd.DataFrame(records)

results.sort_values(by='silhoutte_score')

In [None]:
# Select the grid-search row with the highest silhouette score and configure
# a DBSCAN estimator with its parameters.
scores = results['silhoutte_score']

# Without this guard an all-NaN column would make argmax return a meaningless
# position and silently select an arbitrary configuration.
if not scores.notna().any():
    raise ValueError("no DBSCAN configuration produced a valid silhouette score")

best_idx = scores.argmax()          # positional index; NaN scores are skipped
best_params = results.iloc[best_idx]
best_eps = best_params['eps']
best_metric = best_params['metric']

# min_samples is read from the winning row rather than from the global
# `min_pts`, so the estimator matches exactly what was evaluated.
# The estimator is only configured here; it is not fitted in this cell.
best_dbscan = DBSCAN(
    eps=best_eps,
    metric=best_metric,
    min_samples=int(best_params['min_samples']),
)
