# DBSCAN
This notebook uses DBSCAN as a clustering density based approach.



In [1]:
import datetime
from sklearn.cluster import DBSCAN
from os import path
import pandas as pd
from sklearn.preprocessing import StandardScaler


RACES_PATH=path.join("..","dataset","engineered_races.csv")

races_df=pd.read_csv(RACES_PATH)

#print(races_df.describe())

cols=list(races_df.columns)
#too noisy to use since it cuts away too much information
cols.remove("is_tarmac")
# not really relevant unless we want to find a usefull correlation
cols.remove("stage")
# useless unless we care about grouping together performances of cyclists but overall might be noisy
cols.remove("std_name")
# not really usefull unless we care about teams performances
# also the way it was filled is difficult to make it useful it might be very noisy
cols.remove("cyclist_team")
#same as above
cols.remove("cyclist")


clustering_data=races_df[cols].copy()
#convert to timestamp(units are useless since it's getting normalized)
clustering_data['date']=pd.to_datetime(clustering_data['date'])
clustering_data['day']=clustering_data['date'].dt.day
clustering_data['month']=clustering_data['date'].dt.month
clustering_data['year']=clustering_data['date'].dt.year

#one hot encoding difficulty
ohe_diff_lvl=pd.get_dummies(races_df['difficulty_level']).astype(float)

#clustering_data
dec_cut=pd.date_range(
    start=clustering_data['date'].min(),
    end=clustering_data['date'].max(),
    freq='10YE'
)
dec_cuts=pd.cut(
    clustering_data['date'],
    bins=dec_cut
)

clustering_data=clustering_data.drop(columns=["date","difficulty_level"])
clustering_data['decade']=dec_cuts
clustering_data[ohe_diff_lvl.columns]=ohe_diff_lvl

clustering_data.describe()

Unnamed: 0,points,length,climb_total,profile,startlist_quality,position,cyclist_age,delta,climbing_efficiency,competitive_age,...,convenience_score,difficulty_score,performance_index,gain_ratio,day,month,year,easy,hard,moderate
count,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,...,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0
mean,89.221635,166776.180584,2330.469215,2.673627,1101.161178,74.219491,28.486208,418.292794,0.023028,28.485343,...,0.035668,1.244299,0.328403,179.78689,15.090092,5.900314,2006.166425,0.316364,0.111405,0.572231
std,54.413315,64545.605664,1191.967186,1.34981,380.586928,48.404023,3.855262,842.961596,0.062654,3.856015,...,0.058427,0.549808,0.172467,1027.92069,7.948271,1.981774,11.477521,0.465057,0.314633,0.494756
min,18.0,1000.0,2.0,1.0,115.0,0.0,13.0,-6906.0,6.9e-05,13.0,...,0.0,0.001593,0.0,11.834738,1.0,2.0,1970.0,0.0,0.0,0.0
25%,50.0,152500.0,1666.0,2.0,844.0,32.0,26.0,10.0,0.009397,26.0,...,0.004432,0.874988,0.192755,40.91899,9.0,4.0,1999.0,0.0,0.0,0.0
50%,80.0,178200.0,2330.469215,3.0,988.0,70.0,28.0,156.0,0.013032,28.0,...,0.013905,1.262876,0.310048,61.405397,14.0,6.0,2008.0,0.0,0.0,1.0
75%,100.0,203500.0,2863.0,3.0,1309.0,112.0,31.0,624.0,0.018083,31.0,...,0.042161,1.582046,0.446206,116.276626,22.0,7.0,2015.0,1.0,0.0,1.0
max,350.0,338000.0,6974.0,5.0,2047.0,209.0,56.0,61547.0,1.370864,56.0,...,1.0,2.752226,0.990431,50233.876744,31.0,11.0,2023.0,1.0,1.0,1.0


Note: before starting, for reasons of time we could,for this delivery, do the clustering on the full dataset so for now we decided to employ some sort of data reduction as to make it feasible to run such an algorithm

# clustering organization

For reasons of time using DBSCAN on the whole dataset is not feasible, a second approach would be to try and a segmentation, for this part we wanted to employ a clusterization that is time based and analyses clusters across decades and see what we can find.

In [2]:
import numpy as np
import random

clustering_data=clustering_data.drop_duplicates()

num_samples=clustering_data.shape[0]

reduction_percent=0.8

reduction_num_samples=int(np.ceil(reduction_percent*num_samples))

RANDOM_SEED=42

np.random.seed(RANDOM_SEED)

reduction_idx=np.random.choice(range(len(clustering_data)),reduction_num_samples,replace=False)

reduction_data=clustering_data.iloc[reduction_idx]


dec_groups=reduction_data.groupby('decade')

dec_groups.describe()


  dec_groups=reduction_data.groupby('decade')


Unnamed: 0_level_0,points,points,points,points,points,points,points,points,length,length,...,hard,hard,moderate,moderate,moderate,moderate,moderate,moderate,moderate,moderate
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
decade,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
"(1970-12-31 00:00:00, 1980-12-31 00:00:00]",11163.0,116.966317,70.661337,50.0,80.0,100.0,100.0,350.0,11163.0,169754.670787,...,0.0,1.0,11163.0,0.5804,0.493516,0.0,0.0,1.0,1.0,1.0
"(1980-12-31 00:00:00, 1990-12-31 00:00:00]",43512.0,111.688633,57.693682,50.0,100.0,100.0,100.0,350.0,43512.0,172376.040173,...,0.0,1.0,43512.0,0.497012,0.499997,0.0,0.0,0.0,1.0,1.0
"(1990-12-31 00:00:00, 2000-12-31 00:00:00]",79122.0,95.979991,55.074528,50.0,80.0,80.0,100.0,350.0,79122.0,173879.013422,...,0.0,1.0,79122.0,0.575124,0.494327,0.0,0.0,1.0,1.0,1.0
"(2000-12-31 00:00:00, 2010-12-31 00:00:00]",142380.0,81.894437,50.323952,18.0,50.0,80.0,80.0,350.0,142380.0,162432.915438,...,0.0,1.0,142380.0,0.578431,0.493812,0.0,0.0,1.0,1.0,1.0
"(2010-12-31 00:00:00, 2020-12-31 00:00:00]",150837.0,85.446305,52.293645,50.0,50.0,80.0,100.0,350.0,150837.0,166492.978845,...,0.0,1.0,150837.0,0.577259,0.493997,0.0,0.0,1.0,1.0,1.0


A few notes are due before starting, first the eps are difficulty to setup for now a good strategy would be to take inspiration using the first paper the introduced the algorithm, which you can find [here](https://dl.acm.org/doi/10.5555/3001460.3001507), and use the distance from the k-th NN varying K until we find a good eps value for us.



In [3]:
from sklearn.metrics import silhouette_score
import itertools as it
import numpy as np
import utils

In [4]:



# useful for reference
db_scan_mapping={
    -1:'noisy',
    0:'border',
    1:'core'

}

std_scaler=StandardScaler()

group_results=pd.DataFrame()

for k,decade_data in dec_groups:
    #NOTE: this might have to be revisited for it's just to try if everyting works
    dimension=decade_data.shape[0]
    min_pts=int(dimension/2)
    #using the method seen at laboratory to select initial values
    #print(decade_data.drop(columns="decade").info())
    maximum_distance = abs(decade_data.drop(columns="decade").max() - decade_data.drop(columns="decade").min()).sum().item()
    average_concentration = dimension / maximum_distance
    #use diferent scales for eps values
    eps_values=average_concentration * np.array([10, 5, 2.5, 1, 0.1, 0.01, 0.0001])
    #try various metrics
    metrics=['euclidean','cosine','l1']

    min_pts_values=[min_pts]
    print(
    f"""
    group no.{k}

    average concentration:{average_concentration}

    eps values:{eps_values}

    used metrics:{metrics}

    number of minimum samples:{min_pts}

    number of samples used:{decade_data.shape[0]}
    """
    )
    #normalization is done for each group
    result=utils.run_dbscan(min_pts_values,eps_values,metrics,std_scaler.fit_transform(decade_data.drop(columns="decade")))
    result["group"]=k

    group_results=pd.concat([group_results,result])
    print(result.sort_values(by='silhoutte_score'))

group_results.sort_values(by='silhoutte_score')


    group no.(1970-12-31 00:00:00, 1980-12-31 00:00:00]

    average concentration:5.095399183535452e-05

    eps values:[5.09539918e-04 2.54769959e-04 1.27384980e-04 5.09539918e-05
 5.09539918e-06 5.09539918e-07 5.09539918e-09]

    used metrics:['euclidean', 'cosine', 'l1']

    number of minimum samples:5581

    number of samples used:11163
    
starting 0
 {'index': 0, 'eps': 0.0005095399183535452, 'metric': 'euclidean', 'min_samples': 5581}
(array([-1]), array([11163]))


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [None]:
best_idx=results['silhoutte_score'].argmax()
best_params=results.iloc[best_idx]
best_eps=best_params['eps']
best_metric=best_params['metric']

best_dbscan=DBSCAN(eps=best_eps,metric=best_metric,min_samples=min_pts).fit(reduction_data)

labels=best_dbscan.labels_

statistics=np.unique(best_dbscan.labels_,return_counts=True)

print(f"""
results:{best_params}
statistics:
    raw counts:{statistics}
    percentags:{statistics/np.sum(statistics)}
"""
)


- segmentazione sui migliori o per anni
- rifare per altri in caso
- plot BSS e SSE sì
- aggregazione gare sì