# DBSCAN
This notebook uses DBSCAN as a clustering density based approach.



In [1]:
import datetime
from sklearn.cluster import DBSCAN
from os import path
import pandas as pd
from sklearn.preprocessing import StandardScaler

def most_frequent(series):
    return series.mode()[0] if not series.mode().empty else series.iloc[0]


RACES_PATH=path.join("..","dataset","engineered_races.csv")

races_df=pd.read_csv(RACES_PATH)

#print(races_df.describe())



#clustering_data=clustering_data.drop(columns=["date","difficulty_level"])
#print(races_df.info())
clustering_data=races_df.groupby(['date','stage','std_name','cyclist']).agg({
    'profile':most_frequent,
    'is_tarmac':most_frequent,
    'difficulty_level':most_frequent,

    'points':'sum',

    'length':'mean',
    'climb_total':'mean',
    'competitive_age':'mean',
    'startlist_quality':'mean',
    'delta':'mean',
    'performance_index':'mean',
    'difficulty':'mean',
    'convenience_score':'mean',
    'difficulty_score':'mean',
    'gain_ratio':'mean',

    'cyclist_age':'first',
    'position':'first',
    'cyclist_team':'first',
}).reset_index()

#clustering_data=races_df[cols].copy()
#convert to timestamp(units are useless since it's getting normalized)
clustering_data['date']=pd.to_datetime(clustering_data['date'])
clustering_data['day']=clustering_data['date'].dt.day
clustering_data['month']=clustering_data['date'].dt.month
clustering_data['year']=clustering_data['date'].dt.year

#one hot encoding difficulty
ohe_diff_lvl=pd.get_dummies(races_df['difficulty_level']).astype(float)

#clustering_data
dec_cut=pd.date_range(
    start=clustering_data['date'].min(),
    end=clustering_data['date'].max(),
    freq='2YE'
)
clustering_data['decade']=pd.cut(
    clustering_data['date'],
    bins=dec_cut,
)

clustering_data[ohe_diff_lvl.columns]=ohe_diff_lvl

clustering_data=clustering_data.drop(columns="date")

clustering_data



Unnamed: 0,stage,std_name,cyclist,profile,is_tarmac,difficulty_level,points,length,climb_total,competitive_age,...,cyclist_age,position,cyclist_team,day,month,year,decade,easy,hard,moderate
0,result,omloop-het-nieuwsblad,andre-dierickx,3.0,False,moderate,125.0,195000.0,2330.469215,23.0,...,23.0,2,spain-1991,28,2,1970,,1.0,0.0,0.0
1,result,omloop-het-nieuwsblad,christian-callens,3.0,False,moderate,125.0,195000.0,2330.469215,23.0,...,23.0,9,free-agent,28,2,1970,,1.0,0.0,0.0
2,result,omloop-het-nieuwsblad,daniel-van-ryckeghem,3.0,False,moderate,125.0,195000.0,2330.469215,25.0,...,25.0,5,norway-1987,28,2,1970,,1.0,0.0,0.0
3,result,omloop-het-nieuwsblad,eddy-merckx,3.0,False,moderate,125.0,195000.0,2330.469215,25.0,...,25.0,6,team-monex-2005,28,2,1970,,1.0,0.0,0.0
4,result,omloop-het-nieuwsblad,englebert-opdebeeck,3.0,False,moderate,125.0,195000.0,2330.469215,24.0,...,24.0,36,free-agent,28,2,1970,,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
589734,result,san-sebastian,txomin-juaristi,2.0,False,moderate,225.0,230300.0,4057.000000,28.0,...,28.0,91,norway-2021,29,7,2023,,1.0,0.0,0.0
589735,result,san-sebastian,urko-berrade-fernandez,2.0,False,moderate,225.0,230300.0,4057.000000,26.0,...,26.0,33,atala-1985,29,7,2023,,1.0,0.0,0.0
589736,result,san-sebastian,victor-de-la-parte,2.0,False,moderate,225.0,230300.0,4057.000000,37.0,...,37.0,88,c-a-1978,29,7,2023,,1.0,0.0,0.0
589737,result,san-sebastian,welay-hagos-berhe,2.0,False,moderate,225.0,230300.0,4057.000000,22.0,...,22.0,40,bankgiroloterij-batavus-2000,29,7,2023,,1.0,0.0,0.0


Note: before starting, for reasons of time we could,for this delivery, do the clustering on the full dataset so for now we decided to employ some sort of data reduction as to make it feasible to run such an algorithm

# clustering organization

For reasons of time using DBSCAN on the whole dataset is not feasible, a second approach would be to try and a segmentation, for this part we wanted to employ a clusterization that is time based and analyses clusters across decades and see what we can find.

In [2]:
import numpy as np
import random




def random_sampling_reduce(data,reduction_percent):
    num_samples=data.shape[0]

    reduction_num_samples=int(np.ceil(reduction_percent*num_samples))

    RANDOM_SEED=42

    np.random.seed(RANDOM_SEED)

    reduction_idx=np.random.choice(range(len(clustering_data)),reduction_num_samples,replace=False)

    return data.iloc[reduction_idx]



A few notes are due before starting, first the eps are difficulty to setup for now a good strategy would be to take inspiration using the first paper the introduced the algorithm, which you can find [here](https://dl.acm.org/doi/10.5555/3001460.3001507), and use the distance from the k-th NN varying K until we find a good eps value for us.



In [3]:
from sklearn.metrics import silhouette_score
import itertools as it
import numpy as np
import utils

Given that we have highly dimensional features just trying the values isn't enough. A good idea would be to use a KNN and the elbow method to estimate the correct eps value. We still need to do some kind of aggregation before.




## applying the elbow method
so given the unfeasibility of using the whole dataset for this part we divide everything going from decade to decade.

In [4]:
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
clustering_data=clustering_data.drop(columns=["difficulty_level","stage","std_name","cyclist","cyclist_team","easy","hard","moderate","is_tarmac","gain_ratio","difficulty_score","position"]).drop_duplicates()

#clustering_data=random_sampling_reduce(clustering_data,1)

std_scaler=StandardScaler()

print(clustering_data.columns)

dec_groups=clustering_data.groupby('decade')
normalized_decade_groups={k:std_scaler.fit_transform(g.drop(columns="decade").drop_duplicates()) for k,g in dec_groups }

print({k:len(g) for k,g in normalized_decade_groups.items()})

initial_eps=dict()

kth_neighbor=30

for k,data in normalized_decade_groups.items():
    min_pts=data.shape[1]
    nn=NearestNeighbors(n_neighbors=min_pts-1,n_jobs=-1)
    nn.fit(data)
    distances,indices= nn.kneighbors(data)
    k_distances= np.sort(distances[:, -1])

    initial_eps[k]=k_distances[kth_neighbor-1]


print(f"""
number of groups={len(normalized_decade_groups)}
initial eps values per group={initial_eps}
""")

Index(['profile', 'points', 'length', 'climb_total', 'competitive_age',
       'startlist_quality', 'delta', 'performance_index', 'difficulty',
       'convenience_score', 'cyclist_age', 'day', 'month', 'year', 'decade'],
      dtype='object')
{Interval(1970-12-31 00:00:00, 1972-12-31 00:00:00, closed='right'): 753, Interval(1972-12-31 00:00:00, 1974-12-31 00:00:00, closed='right'): 459, Interval(1974-12-31 00:00:00, 1976-12-31 00:00:00, closed='right'): 1233, Interval(1976-12-31 00:00:00, 1978-12-31 00:00:00, closed='right'): 3686, Interval(1978-12-31 00:00:00, 1980-12-31 00:00:00, closed='right'): 7789, Interval(1980-12-31 00:00:00, 1982-12-31 00:00:00, closed='right'): 9836, Interval(1982-12-31 00:00:00, 1984-12-31 00:00:00, closed='right'): 11363, Interval(1984-12-31 00:00:00, 1986-12-31 00:00:00, closed='right'): 11256, Interval(1986-12-31 00:00:00, 1988-12-31 00:00:00, closed='right'): 10791, Interval(1988-12-31 00:00:00, 1990-12-31 00:00:00, closed='right'): 10995, Interval(1990

  dec_groups=clustering_data.groupby('decade')



number of groups=26
initial eps values per group={Interval(1970-12-31 00:00:00, 1972-12-31 00:00:00, closed='right'): 0.9155102427433038, Interval(1972-12-31 00:00:00, 1974-12-31 00:00:00, closed='right'): 1.3017056548170938, Interval(1974-12-31 00:00:00, 1976-12-31 00:00:00, closed='right'): 0.8180223071229451, Interval(1976-12-31 00:00:00, 1978-12-31 00:00:00, closed='right'): 0.5231390365005472, Interval(1978-12-31 00:00:00, 1980-12-31 00:00:00, closed='right'): 0.4627659081819203, Interval(1980-12-31 00:00:00, 1982-12-31 00:00:00, closed='right'): 0.4631630266953081, Interval(1982-12-31 00:00:00, 1984-12-31 00:00:00, closed='right'): 0.4465417852131229, Interval(1984-12-31 00:00:00, 1986-12-31 00:00:00, closed='right'): 0.45026786677913466, Interval(1986-12-31 00:00:00, 1988-12-31 00:00:00, closed='right'): 0.46040209204023885, Interval(1988-12-31 00:00:00, 1990-12-31 00:00:00, closed='right'): 0.4565313815062814, Interval(1990-12-31 00:00:00, 1992-12-31 00:00:00, closed='right'):

Now that we have the sorted distances we can pick the eps values and proceed to test dbscan , in this case we have more starting eps values given the segmentation hance we have a lot of tests to do.

In [None]:

from sklearn.metrics import pairwise_distances
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
# useful for reference
db_scan_mapping={
    -1:'noisy',
    0:'border',
    1:'core'

}

std_scaler=StandardScaler()

group_results=pd.DataFrame()


for k,decade_data in normalized_decade_groups.items():
    #NOTE: this might have to be revisited for it's just to try if everyting works
    dimension=decade_data.shape[0]
    min_pts=int(dimension-1)
    #using the method seen at laboratory to select initial values
    #print(decade_data.drop(columns="decade").info())
    maximum_distance = abs(decade_data.max() - decade_data.min()).sum().item()
    average_concentration = dimension / maximum_distance
    #use diferent scales for eps values
    # during the tests a lot of low values where not taken into consideration
    eps_values=initial_eps[k] * np.array([500,250,100,50,10, 5, 2.5, 1, 0.1, 0.01, 0.0001])
    #try various metrics
    metrics=['euclidean']

    min_pts_values=[min_pts]
    print(
    f"""
    period {k}
    maxium distance: {maximum_distance}
    average concentration:{average_concentration}
    eps values:{eps_values}
    used metrics:{metrics}
    number of minimum samples:{min_pts}
    number of samples used:{decade_data.shape[0]}
    """
    )
    #normalization is done for each group
    result=utils.run_dbscan(min_pts_values,eps_values,metrics,decade_data)
    result["group"]=k
    group_results=pd.concat([group_results,result])
group_results.reset_index().sort_values(by='silhoutte_score')


    period (1970-12-31 00:00:00, 1972-12-31 00:00:00]
    maxium distance: 26.4432145906155
    average concentration:28.476114256820882
    eps values:[9.15510243e+00 4.57755121e+00 2.28877561e+00 9.15510243e-01
 9.15510243e-02 9.15510243e-03 9.15510243e-05]
    used metrics:['euclidean']
    number of minimum samples:752
    number of samples used:753
    
-0 - (9.155102427433038, 'euclidean', 752)
dbscan done, time=0.02748727798461914 seconds | silhoutte score:0.7908954023847292
-1 - (4.577551213716519, 'euclidean', 752)
dbscan done, time=0.020033597946166992 seconds | silhoutte score:all noise
-2 - (2.2887756068582594, 'euclidean', 752)
dbscan done, time=0.019155502319335938 seconds | silhoutte score:all noise
-3 - (0.9155102427433038, 'euclidean', 752)
dbscan done, time=0.018077850341796875 seconds | silhoutte score:all noise
-4 - (0.09155102427433039, 'euclidean', 752)
dbscan done, time=0.017905712127685547 seconds | silhoutte score:all noise
-5 - (0.009155102427433039, 'euclide



dbscan done, time=0.0058727264404296875 seconds | silhoutte score:all noise
-3 - (1.3017056548170938, 'euclidean', 458)
dbscan done, time=0.006792306900024414 seconds | silhoutte score:all noise
-4 - (0.13017056548170938, 'euclidean', 458)
dbscan done, time=0.006266355514526367 seconds | silhoutte score:all noise
-5 - (0.013017056548170938, 'euclidean', 458)
dbscan done, time=0.0057904720306396484 seconds | silhoutte score:all noise
-6 - (0.00013017056548170937, 'euclidean', 458)
dbscan done, time=0.005656719207763672 seconds | silhoutte score:all noise

    period (1974-12-31 00:00:00, 1976-12-31 00:00:00]
    maxium distance: 11.144002200913832
    average concentration:110.64247635368301
    eps values:[8.18022307e+00 4.09011154e+00 2.04505577e+00 8.18022307e-01
 8.18022307e-02 8.18022307e-03 8.18022307e-05]
    used metrics:['euclidean']
    number of minimum samples:1232
    number of samples used:1233
    
-0 - (8.18022307122945, 'euclidean', 1232)
dbscan done, time=0.03167438507

KeyboardInterrupt: 

In [None]:
group_results

In [None]:
best_idx=group_results['silhoutte_score'].argmax()
best_params=group_results.iloc[best_idx]
best_eps=best_params['eps']
best_metric=best_params['metric']

best_dbscan=DBSCAN(eps=best_eps,metric=best_metric,min_samples=min_pts).fit(reduction_data)

labels=best_dbscan.labels_

statistics=np.unique(best_dbscan.labels_,return_counts=True)

print(
f"""
results:{best_params}
statistics:
    raw counts:{statistics}
    percentags:{statistics/np.sum(statistics)}
"""
)




- segmentazione sui migliori o per anni
- rifare per altri in caso
- plot BSS e SSE sì
- aggregazione gare sì