# DBSCAN
This notebook uses DBSCAN as a clustering density based approach.



Note: before starting, for reasons of time we could,for this delivery, do the clustering on the full dataset so for now we decided to employ some sort of data reduction as to make it feasible to run such an algorithm

The approach we used was:
- aggregation of races features for each races instance (the year races couple in the dataset)
- remove useless features that don't mean anything after aggregation e.g. stages
- chunking on the dataset reduction every two years to get small enough clusters

In [1]:
import datetime
from sklearn.cluster import DBSCAN
from os import path
import pandas as pd
from sklearn.preprocessing import StandardScaler

def most_frequent(series):
    return series.mode()[0] if not series.mode().empty else series.iloc[0]
RACES_PATH=path.join("..","dataset","engineered_races.csv")
races_df=pd.read_csv(RACES_PATH)

#aggregation of races to reduce dataset size and improve clustering quality
aggregated_data=races_df.groupby(['date','stage','std_name','cyclist']).agg({
    'profile':most_frequent,
    'is_tarmac':most_frequent,
    'difficulty_level':most_frequent,

    'points':'sum',

    'length':'mean',
    'climb_total':'mean',
    'competitive_age':'mean',
    'startlist_quality':'mean',
    'delta':'mean',
    'performance_index':'mean',
    'difficulty':'mean',
    'convenience_score':'mean',
    'difficulty_score':'mean',
    'gain_ratio':'mean',

    'cyclist_age':'first',
    'position':'first',
    'cyclist_team':'first',
}).reset_index()


#convert to timestamp(units are useless since it's getting normalized)
aggregated_data['date']=pd.to_datetime(aggregated_data['date'])
aggregated_data['day']=aggregated_data['date'].dt.day
aggregated_data['month']=aggregated_data['date'].dt.month
aggregated_data['year']=aggregated_data['date'].dt.year

#one hot encoding difficulty
ohe_diff_lvl=pd.get_dummies(races_df['difficulty_level']).astype(float)
aggregated_data[ohe_diff_lvl.columns]=ohe_diff_lvl

clustering_data=aggregated_data.copy()
aggregated_data=aggregated_data.drop(columns='date')


#dividing into chunks
dec_cut=pd.date_range(
    start=clustering_data['date'].min(),
    end=clustering_data['date'].max(),
    freq='2YE'
)
#apply chunks
clustering_data['decade']=pd.cut(
    clustering_data['date'],
    bins=dec_cut,
)
#remove useless columns
clustering_data=clustering_data.drop(columns="date")

clustering_data

Unnamed: 0,stage,std_name,cyclist,profile,is_tarmac,difficulty_level,points,length,climb_total,competitive_age,...,cyclist_age,position,cyclist_team,day,month,year,easy,hard,moderate,decade
0,result,omloop-het-nieuwsblad,andre-dierickx,3.0,False,moderate,125.0,195000.0,2330.469215,23.0,...,23.0,2,spain-1991,28,2,1970,1.0,0.0,0.0,
1,result,omloop-het-nieuwsblad,christian-callens,3.0,False,moderate,125.0,195000.0,2330.469215,23.0,...,23.0,9,free-agent,28,2,1970,1.0,0.0,0.0,
2,result,omloop-het-nieuwsblad,daniel-van-ryckeghem,3.0,False,moderate,125.0,195000.0,2330.469215,25.0,...,25.0,5,norway-1987,28,2,1970,1.0,0.0,0.0,
3,result,omloop-het-nieuwsblad,eddy-merckx,3.0,False,moderate,125.0,195000.0,2330.469215,25.0,...,25.0,6,team-monex-2005,28,2,1970,1.0,0.0,0.0,
4,result,omloop-het-nieuwsblad,englebert-opdebeeck,3.0,False,moderate,125.0,195000.0,2330.469215,24.0,...,24.0,36,free-agent,28,2,1970,1.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
589734,result,san-sebastian,txomin-juaristi,2.0,False,moderate,225.0,230300.0,4057.000000,28.0,...,28.0,91,norway-2021,29,7,2023,1.0,0.0,0.0,
589735,result,san-sebastian,urko-berrade-fernandez,2.0,False,moderate,225.0,230300.0,4057.000000,26.0,...,26.0,33,atala-1985,29,7,2023,1.0,0.0,0.0,
589736,result,san-sebastian,victor-de-la-parte,2.0,False,moderate,225.0,230300.0,4057.000000,37.0,...,37.0,88,c-a-1978,29,7,2023,1.0,0.0,0.0,
589737,result,san-sebastian,welay-hagos-berhe,2.0,False,moderate,225.0,230300.0,4057.000000,22.0,...,22.0,40,bankgiroloterij-batavus-2000,29,7,2023,1.0,0.0,0.0,


In [2]:
cat_cols=aggregated_data.select_dtypes(include=['object','bool']).columns

one_hot_enc=pd.get_dummies(aggregated_data[cat_cols])

aggregated_data[one_hot_enc.columns]=one_hot_enc

aggregated_data=aggregated_data.drop(columns=cat_cols)

aggregated_data

  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.columns]=one_hot_enc
  aggregated_data[one_hot_enc.c

Unnamed: 0,profile,points,length,climb_total,competitive_age,startlist_quality,delta,performance_index,difficulty,convenience_score,...,cyclist_team_venezuela-2022,cyclist_team_ville-de-charleroi-new-systems-2000,cyclist_team_vini-fantini-2013,cyclist_team_vini-ricordi-pinarello-sidermec-1986,cyclist_team_watney-avia-1972,cyclist_team_yugoslavia-1986,cyclist_team_yugoslavia-1987,cyclist_team_yugoslavia-1989,cyclist_team_zimbabwe-2001,cyclist_team_zimbabwe-2004
0,3.0,125.0,195000.0,2330.469215,23.0,430.0,0.0,0.339576,5.341637e+06,0.000138,...,False,False,False,False,False,False,False,False,False,False
1,3.0,125.0,195000.0,2330.469215,23.0,430.0,25.0,0.300752,2.507851e+06,0.000299,...,False,False,False,False,False,False,False,False,False,False
2,3.0,125.0,195000.0,2330.469215,25.0,430.0,25.0,0.474094,2.476709e+06,0.000302,...,False,False,False,False,False,False,False,False,False,False
3,3.0,125.0,195000.0,2330.469215,25.0,430.0,25.0,0.333014,6.481262e+05,0.001165,...,False,False,False,False,False,False,False,False,False,False
4,3.0,125.0,195000.0,2330.469215,24.0,430.0,195.0,0.212850,6.693088e+05,0.001128,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
589734,2.0,225.0,230300.0,4057.000000,28.0,787.0,1255.0,0.298428,7.842131e+05,0.001735,...,False,False,False,False,False,False,False,False,False,False
589735,2.0,225.0,230300.0,4057.000000,26.0,787.0,521.0,0.336842,4.464273e+05,0.003051,...,False,False,False,False,False,False,False,False,False,False
589736,2.0,225.0,230300.0,4057.000000,37.0,787.0,1200.0,0.363910,5.835860e+05,0.002333,...,False,False,False,False,False,False,False,False,False,False
589737,2.0,225.0,230300.0,4057.000000,22.0,787.0,564.0,0.646890,9.297436e+04,0.014663,...,False,False,False,False,False,False,False,False,False,False


# bulk clustering
For a first test we can do bulk clustering i.e. try on the whole dataset.

First we must pick the values, for the minpts we use some heuristics to pick the values.

For MinPts we pick $2*D$ with $D$ equal to the dimensionality of the dataset.

For $Eps$ we try the elbow method to graphically detect the best eps value using the elbow method.

In [5]:
from sklearn.neighbors import NearestNeighbors
import utils
import numpy as np
import matplotlib.pyplot as plt

def plot_sorted_distances(
        clustering_data,
        n_neighbors=20,
):
    min_pts=n_neighbors
    nn=NearestNeighbors(n_neighbors=min_pts,n_jobs=-1).fit(clustering_data)
    distances,indices= nn.kneighbors(clustering_data,n_neighbors=min_pts)
    k_distances= np.sort(distances[:, -1])
    plt.plot(k_distances)
    
plot_sorted_distances(
    clustering_data=aggregated_data
)

: 

# clustering organization

A few notes are due before starting, first the eps are difficulty to setup for now a good strategy would be to take inspiration using the first paper the introduced the algorithm, which you can find [here](https://dl.acm.org/doi/10.5555/3001460.3001507), and use the distance from the k-th NN varying K until we find a good eps value for us.

A few notes before starting, for DBSCAN only the silhoutte score is used since BSS and SSE are not meaningful given the nature of the algorithm and fail to caputure meaningufl details about the clusterings.

## applying the elbow method
In this part since it is diifcult to estimate values we picked a kth neighbor that is not too low to have an eps taht is higher and manages to reach more points.

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
import itertools as it
import numpy as np
import utils
clustering_data=clustering_data.drop(columns=["difficulty_level","stage","std_name","cyclist","cyclist_team","easy","hard","moderate","is_tarmac","gain_ratio","difficulty_score","position"]).drop_duplicates()

std_scaler=StandardScaler()

print(clustering_data.columns)

dec_groups=clustering_data.groupby('decade')
normalized_decade_groups={k:std_scaler.fit_transform(g.drop(columns="decade").drop_duplicates()) for k,g in dec_groups }

print({k:len(g) for k,g in normalized_decade_groups.items()})

initial_eps=dict()

kth_neighbor=30

kth_neighbors=[
    4,# value ued in theoriginal DBSCAN paper
    clustering_data.shape[1]+1,# common heuristics used to pick kth_neighbors 
    2*clustering_data.shape[1]
    ]

for k,data in normalized_decade_groups.items():
    min_pts=data.shape[1]
    nn=NearestNeighbors(n_neighbors=min_pts-1,n_jobs=-1)
    nn.fit(data)
    distances,indices= nn.kneighbors(data)
    k_distances= np.sort(distances[:, -1])

    initial_eps[k]=k_distances[kth_neighbor-1]


print(f"""
number of groups={len(normalized_decade_groups)}
initial eps values per group={initial_eps}
""")

Index(['profile', 'points', 'length', 'climb_total', 'competitive_age',
       'startlist_quality', 'delta', 'performance_index', 'difficulty',
       'convenience_score', 'cyclist_age', 'day', 'month', 'year', 'decade'],
      dtype='object')
{Interval(1970-12-31 00:00:00, 1972-12-31 00:00:00, closed='right'): 753, Interval(1972-12-31 00:00:00, 1974-12-31 00:00:00, closed='right'): 459, Interval(1974-12-31 00:00:00, 1976-12-31 00:00:00, closed='right'): 1233, Interval(1976-12-31 00:00:00, 1978-12-31 00:00:00, closed='right'): 3686, Interval(1978-12-31 00:00:00, 1980-12-31 00:00:00, closed='right'): 7789, Interval(1980-12-31 00:00:00, 1982-12-31 00:00:00, closed='right'): 9836, Interval(1982-12-31 00:00:00, 1984-12-31 00:00:00, closed='right'): 11363, Interval(1984-12-31 00:00:00, 1986-12-31 00:00:00, closed='right'): 11256, Interval(1986-12-31 00:00:00, 1988-12-31 00:00:00, closed='right'): 10791, Interval(1988-12-31 00:00:00, 1990-12-31 00:00:00, closed='right'): 10995, Interval(1990

  dec_groups=clustering_data.groupby('decade')



number of groups=26
initial eps values per group={Interval(1970-12-31 00:00:00, 1972-12-31 00:00:00, closed='right'): 0.9155102427433038, Interval(1972-12-31 00:00:00, 1974-12-31 00:00:00, closed='right'): 1.3017056548170938, Interval(1974-12-31 00:00:00, 1976-12-31 00:00:00, closed='right'): 0.8180223071229451, Interval(1976-12-31 00:00:00, 1978-12-31 00:00:00, closed='right'): 0.5231390365005472, Interval(1978-12-31 00:00:00, 1980-12-31 00:00:00, closed='right'): 0.4627659081819203, Interval(1980-12-31 00:00:00, 1982-12-31 00:00:00, closed='right'): 0.4631630266953081, Interval(1982-12-31 00:00:00, 1984-12-31 00:00:00, closed='right'): 0.4465417852131229, Interval(1984-12-31 00:00:00, 1986-12-31 00:00:00, closed='right'): 0.45026786677913466, Interval(1986-12-31 00:00:00, 1988-12-31 00:00:00, closed='right'): 0.46040209204023885, Interval(1988-12-31 00:00:00, 1990-12-31 00:00:00, closed='right'): 0.4565313815062814, Interval(1990-12-31 00:00:00, 1992-12-31 00:00:00, closed='right'):

Now that we have the sorted distances we can pick the eps values and proceed to test dbscan , in this case we have more starting eps values given the segmentation hance we have a lot of tests to do.


NOTE: since we didn't manage to make execution feasible we had to cut the clusterings and we go only from 1970 to 1994 with jumps of two years

NOTE: DBSCAN relies a lot on the density we where afraid that sampling would make clustering meaningless because of too many points removed and having a too approximated distribution.

In [3]:

# useful for reference
db_scan_mapping={
    -1:'noisy',
    0:'border',
    1:'core'

}

group_results=pd.DataFrame()


for k,decade_data in normalized_decade_groups.items():
    #NOTE: this might have to be revisited for it's just to try if everyting works
    dimension=decade_data.shape[0]
    min_pts=int(dimension-1)
    #using the method seen at laboratory to select initial values
    #print(decade_data.drop(columns="decade").info())
    maximum_distance = abs(decade_data.max() - decade_data.min()).sum().item()
    average_concentration = dimension / maximum_distance
    #use diferent scales for eps values
    # during the tests a lot of low values where not taken into consideration
    eps_values=initial_eps[k] * np.array([500,250,100,50,10, 5, 2.5, 1, 0.1, 0.01, 0.0001])
    #try various metrics
    metrics=['euclidean']

    min_pts_values=[min_pts]
    print(
    f"""
    period {k}
    maxium distance: {maximum_distance}
    average concentration:{average_concentration}
    eps values:{eps_values}
    used metrics:{metrics}
    number of minimum samples:{min_pts}
    number of samples used:{decade_data.shape[0]}
    """
    )
    #normalization is done for each group
    result=utils.run_dbscan(min_pts_values,eps_values,metrics,decade_data)
    result["group"]=k
    group_results=pd.concat([group_results,result])
group_results.reset_index()


    period (1970-12-31 00:00:00, 1972-12-31 00:00:00]
    maxium distance: 26.4432145906155
    average concentration:28.476114256820882
    eps values:[4.57755121e+02 2.28877561e+02 9.15510243e+01 4.57755121e+01
 9.15510243e+00 4.57755121e+00 2.28877561e+00 9.15510243e-01
 9.15510243e-02 9.15510243e-03 9.15510243e-05]
    used metrics:['euclidean']
    number of minimum samples:752
    number of samples used:753
    
Running DBSCAN: 0 - eps=457.7551213716519, metric=euclidean, min_samples=752
DBSCAN done in 0.00 seconds | Silhouette Score: all noise
Running DBSCAN: 1 - eps=228.87756068582596, metric=euclidean, min_samples=752
DBSCAN done in 0.00 seconds | Silhouette Score: all noise
Running DBSCAN: 2 - eps=91.55102427433039, metric=euclidean, min_samples=752
DBSCAN done in 0.00 seconds | Silhouette Score: all noise
Running DBSCAN: 3 - eps=45.775512137165194, metric=euclidean, min_samples=752
DBSCAN done in 0.00 seconds | Silhouette Score: all noise
Running DBSCAN: 4 - eps=9.155102427

Unnamed: 0,index,SSE,BSS,group_index,eps,metric,min_samples,silhouette_score,execution_time(s),group
0,0,not needed,not needed,0,457.755121,euclidean,752,all noise,0.001918,"(1970-12-31 00:00:00, 1972-12-31 00:00:00]"
1,0,not needed,not needed,1,228.877561,euclidean,752,all noise,0.001915,"(1970-12-31 00:00:00, 1972-12-31 00:00:00]"
2,0,not needed,not needed,2,91.551024,euclidean,752,all noise,0.001148,"(1970-12-31 00:00:00, 1972-12-31 00:00:00]"
3,0,not needed,not needed,3,45.775512,euclidean,752,all noise,0.000931,"(1970-12-31 00:00:00, 1972-12-31 00:00:00]"
4,0,not needed,not needed,4,9.155102,euclidean,752,all noise,0.001391,"(1970-12-31 00:00:00, 1972-12-31 00:00:00]"
...,...,...,...,...,...,...,...,...,...,...
281,0,not needed,not needed,6,1.000888,euclidean,39698,all noise,0.025224,"(2020-12-31 00:00:00, 2022-12-31 00:00:00]"
282,0,not needed,not needed,7,0.400355,euclidean,39698,all noise,0.023848,"(2020-12-31 00:00:00, 2022-12-31 00:00:00]"
283,0,not needed,not needed,8,0.040036,euclidean,39698,all noise,0.024552,"(2020-12-31 00:00:00, 2022-12-31 00:00:00]"
284,0,not needed,not needed,9,0.004004,euclidean,39698,all noise,0.025165,"(2020-12-31 00:00:00, 2022-12-31 00:00:00]"


As for the result we only managed to find a meaningful clustering in th first two years with a silhoutte score of 0.79 all the otehr are all noise , however after some consideration we found out that we don't have any meaningful clusteriong because we have all points taht are core so eps is too high bnut lowering it doesn't change even after testing very different scales both big and small.

In [4]:
best_idx=0
best_params=group_results.iloc[best_idx]

best_dbscan=DBSCAN(eps=best_params['eps'],min_samples=best_params['min_samples']).fit(normalized_decade_groups[best_params['group']])

labels=best_dbscan.labels_

statistics=np.unique(best_dbscan.labels_,return_counts=True)

print(
f"""
results:{best_params}
statistics:
    raw counts: noise {statistics[0][0]}| core {statistics[1][0]}
"""
)





results:SSE                                                  not needed
BSS                                                  not needed
group_index                                                   0
eps                                                  457.755121
metric                                                euclidean
min_samples                                                 752
silhouette_score                                      all noise
execution_time(s)                                      0.001918
group                (1970-12-31 00:00:00, 1972-12-31 00:00:00]
Name: 0, dtype: object
statistics:
    raw counts: noise 0| core 753



So for this first delivery we can only say taht after some consideration the dataset tends to be very sparse, probably some sensd approaches would be to:
- use sampling and make a bigger hyperparameters space.
- find a more refined method to select the eps values.
- use different segmentations.

As you can see clustyring by years even after trying to reduce the dimension to get less sparse clusters is not effective, aside from a nice 0.71 in the first part we cannot get much more than that, we can infer very different densities across years which makes for very bad clusterings.

So we can try other kind of segmentations, a first approach could be geospatial: group by the race occurencies across time.

## geospatial clustering

In [2]:
ohe_diff_lvl=pd.get_dummies(races_df['difficulty_level']).astype(float)
clustering_data[ohe_diff_lvl.columns]=ohe_diff_lvl

ohe_tarmac=pd.get_dummies(races_df['is_tarmac']).astype(float)

cols=list(ohe_tarmac.columns)
cols[0]='True_is_tarmac'
cols[1]='False_is_tarmac'
clustering_data[cols]=ohe_tarmac


clustering_data=clustering_data.drop(columns=["cyclist","cyclist_team","is_tarmac","difficulty_level","date","stage"]).drop_duplicates()

std_scaler=StandardScaler()

clustering_data=utils.random_sampling_reduce(clustering_data,0.25)

races_groups=clustering_data.groupby('std_name')
normalized_races_groups={k:std_scaler.fit_transform(g.drop(columns="std_name").drop_duplicates()) for k,g in races_groups }

print({k:len(g) for k,g in normalized_races_groups.items()})

initial_eps=dict()

kth_neighbor=4

for k,data in normalized_races_groups.items():
    min_pts=data.shape[1]
    nn=NearestNeighbors(n_neighbors=min_pts-1,n_jobs=-1)
    nn.fit(data)
    distances,indices= nn.kneighbors(data)
    k_distances= np.sort(distances[:, -1])

    initial_eps[k]=k_distances[kth_neighbor-1]


print(f"""
number of groups={len(normalized_races_groups)}
initial eps values per group={initial_eps}
""")

{'amstel-gold-race': 1108, 'dauphine': 6671, 'dwars-door-vlaanderen': 648, 'e3-harelbeke': 821, 'giro-d-italia': 23911, 'gp-montreal': 269, 'gp-quebec': 327, 'gran-camino': 175, 'il-lombardia': 749, 'itzulia-basque-country': 4528, 'la-fleche-wallone': 1271, 'liege-bastogne-liege': 1190, 'milano-sanremo': 1638, 'omloop-het-nieuwsblad': 1005, 'paris-nice': 8011, 'paris-roubaix': 909, 'ronde-van-vlaanderen': 1076, 'san-sebastian': 1168, 'strade-bianche': 304, 'tirreno-adriatico': 6844, 'tour-de-france': 36613, 'tour-de-romandie': 4985, 'tour-de-suisse': 8383, 'uae-tour': 1072, 'volta-a-catalunya': 6528, 'vuelta-a-espana': 26282, 'world-championship': 949}

number of groups=27
initial eps values per group={'amstel-gold-race': 1.331096502661081, 'dauphine': 0.8401145858418447, 'dwars-door-vlaanderen': 1.2632881010334707, 'e3-harelbeke': 1.3823696824421974, 'giro-d-italia': 0.7261308576322284, 'gp-montreal': 2.0184520099061922, 'gp-quebec': 1.882460185176687, 'gran-camino': 2.48516596897522,

In [3]:
import gc

group_results=pd.DataFrame()
for k,decade_data in normalized_races_groups.items():
    #NOTE: this might have to be revisited for it's just to try if everyting works
    dimension=decade_data.shape[0]
    min_pts=int(dimension-1)
    #using the method seen at laboratory to select initial values
    #print(decade_data.drop(columns="decade").info())
    maximum_distance = abs(decade_data.max() - decade_data.min()).sum().item()
    average_concentration = dimension / maximum_distance
    #use diferent scales for eps values
    # during the tests a lot of low values where not taken into consideration
    eps_values=initial_eps[k] * np.array([500,250,100,50,10, 5, 2.5, 1, 0.1, 0.01, 0.0001])
    #try various metrics
    metrics=['euclidean']

    min_pts_values=[min_pts]
    print(
    f"""
    period {k}
    maxium distance: {maximum_distance}
    average concentration:{average_concentration}
    eps values:{eps_values}
    used metrics:{metrics}
    number of minimum samples:{min_pts}
    number of samples used:{decade_data.shape[0]}
    """
    )
    gc.collect()
    #normalization is done for each group
    result=utils.run_dbscan(min_pts_values,eps_values,metrics,decade_data)
    result["group"]=k
    group_results=pd.concat([group_results,result])
group_results.reset_index()


    period amstel-gold-race
    maxium distance: 13.372756617282896
    average concentration:82.85501873024634
    eps values:[6.65548251e+02 3.32774126e+02 1.33109650e+02 6.65548251e+01
 1.33109650e+01 6.65548251e+00 3.32774126e+00 1.33109650e+00
 1.33109650e-01 1.33109650e-02 1.33109650e-04]
    used metrics:['euclidean']
    number of minimum samples:1107
    number of samples used:1108
    
Running DBSCAN: 0 - eps=665.5482513305406, metric=euclidean, min_samples=1107
DBSCAN done in 0.02 seconds | Silhouette Score: all noise
Running DBSCAN: 1 - eps=332.7741256652703, metric=euclidean, min_samples=1107
DBSCAN done in 0.02 seconds | Silhouette Score: all noise
Running DBSCAN: 2 - eps=133.1096502661081, metric=euclidean, min_samples=1107
DBSCAN done in 0.02 seconds | Silhouette Score: all noise
Running DBSCAN: 3 - eps=66.55482513305405, metric=euclidean, min_samples=1107
DBSCAN done in 0.02 seconds | Silhouette Score: all noise
Running DBSCAN: 4 - eps=13.31096502661081, metric=euclid

: 