# DBSCAN
This notebook uses DBSCAN as a clustering density based approach.



In [1]:
import datetime
from sklearn.cluster import DBSCAN
from os import path
import pandas as pd
from sklearn.preprocessing import StandardScaler,OneHotEncoder

std_scaler=StandardScaler()
encoder=OneHotEncoder()


RACES_PATH=path.join("..","dataset","engineered_races.csv")

races_df=pd.read_csv(RACES_PATH)

cols=list(races_df.columns)
#too noisy to use since it cuts away too much information
cols.remove("is_tarmac")
# not really relevant unless we want to find a usefull correlation
cols.remove("stage")
# useless unless we care about grouping together performances of cyclists but overall might be noisy
cols.remove("std_name")
# not really usefull unless we care about teams performances
# also the way it was filled is difficult to make it useful it might be very noisy
cols.remove("cyclist_team")
#same as above
cols.remove("cyclist")


clustering_data=races_df[cols].copy()
#convert to timestamp(units are useless since it's getting normalized)
clustering_data['date']=pd.to_datetime(clustering_data['date'])
clustering_data['day']=clustering_data['date'].dt.day
clustering_data['month']=clustering_data['date'].dt.month
clustering_data['year']=clustering_data['date'].dt.year


#one hot encoding difficulty
ohe_diff_lvl=pd.get_dummies(races_df['difficulty_level']).astype(int)

clustering_data=clustering_data.drop(columns=["date","difficulty_level"])

clustering_data[clustering_data.columns]=std_scaler.fit_transform(clustering_data[clustering_data.columns])
clustering_data[ohe_diff_lvl.columns]=ohe_diff_lvl

clustering_data.describe()




Unnamed: 0,points,length,climb_total,profile,startlist_quality,position,cyclist_age,delta,climbing_efficiency,competitive_age,...,convenience_score,difficulty_score,performance_index,gain_ratio,day,month,year,easy,hard,moderate
count,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,...,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0,589865.0
mean,6.032413000000001e-17,-6.991413000000001e-17,-7.610329e-16,3.15758e-16,-2.224688e-16,5.998835e-17,-2.294673e-15,2.792831e-17,4.243152e-18,-3.610579e-16,...,-1.15598e-16,-6.07111e-18,4.868573e-16,-5.4832720000000005e-17,-1.517778e-18,1.324683e-16,4.355663e-15,0.316364,0.111405,0.572231
std,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,...,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,0.465057,0.314633,0.494756
min,-1.308902,-2.56836,-1.953469,-1.2399,-2.591161,-1.533334,-4.016905,-8.688769,-0.3664412,-4.015896,...,-0.6104685,-2.260255,-1.904144,-0.1633903,-1.772726,-1.968095,-3.151069,0.0,0.0,0.0
25%,-0.7208101,-0.2211799,-0.5574564,-0.4990538,-0.6756969,-0.8722318,-0.6448874,-0.4843556,-0.2175541,-0.6445373,...,-0.5346063,-0.6717084,-0.786514,-0.135096,-0.7662166,-0.9588967,-0.6243884,0.0,0.0,0.0
50%,-0.169474,0.1769885,-7.630206e-16,0.2417921,-0.2973336,-0.08717239,-0.1261155,-0.3111565,-0.159534,-0.1258666,...,-0.3724805,0.0337882,-0.1064245,-0.1151661,-0.1371484,0.05030121,0.1597538,0.0,0.0,1.0
75%,0.1980834,0.5689597,0.4467667,0.2417921,0.5461013,0.7805248,0.6520425,0.2440294,-0.07892303,0.6521393,...,0.1111404,0.6143,0.6830501,-0.06178523,0.8693607,0.5549002,0.7696421,1.0,0.0,1.0
max,4.792551,2.652759,3.89569,1.723484,2.485213,2.784492,7.136692,72.51666,21.51231,7.135522,...,16.50501,2.742643,3.83857,48.69455,2.001683,2.573296,1.466657,1.0,1.0,1.0


Note: before starting, for reasons of time we could,for this delivery, do the clustering on the full dataset so for now we decided to employ some sort of data reduction as to make it feasible to run such an algorithm

In [2]:
import numpy as np
import random
num_samples=clustering_data.shape[0]

reduction_percent=0.5

reduction_num_samples=int(np.ceil(reduction_percent*num_samples))

print(reduction_num_samples)


RANDOM_SEED=42

random.seed(RANDOM_SEED)

population_idx=[int(random.uniform(0,num_samples)) for _ in range(reduction_num_samples)]

reduction_data=clustering_data.iloc[population_idx]

reduction_data


294933


Unnamed: 0,points,length,climb_total,profile,startlist_quality,position,cyclist_age,delta,climbing_efficiency,competitive_age,...,convenience_score,difficulty_score,performance_index,gain_ratio,day,month,year,easy,hard,moderate
377175,0.198083,0.700650,6.707663e-01,-0.499054,1.867745,0.801184,0.133271,-0.287431,-0.131892,0.133469,...,2.089204,0.146353,0.074299,-0.101469,1.120988,0.554900,-1.669911,0,0,1
14752,-0.169474,-1.824699,-1.873769e+00,-1.239900,-0.278941,-1.037507,0.911428,-0.207949,-0.335941,0.911475,...,0.204807,-1.979308,1.761840,0.323799,-0.892030,1.564098,0.072627,1,0,0
162230,-0.720810,0.858678,4.467667e-01,-0.499054,-0.539066,1.131736,-1.423045,-0.098810,-0.161888,-1.422543,...,-0.294501,0.131750,-0.572500,-0.137962,1.624243,-1.463496,0.682515,0,0,1
131664,-0.720810,0.056144,-7.630206e-16,-0.499054,-0.160702,1.379649,-1.163659,-0.006279,-0.149252,-1.163208,...,-0.538003,-0.286743,-1.095646,-0.130140,-0.262962,-1.463496,-0.101627,0,0,1
434418,-0.169474,0.170791,5.658976e-01,-0.499054,-0.045091,-0.025194,-1.163659,0.478916,-0.097786,-1.163208,...,0.454887,-0.070837,-0.718347,-0.110336,1.372615,1.564098,-0.363008,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367940,-0.169474,1.242902,1.219060e-02,0.241792,-0.783425,-1.409378,-0.126115,-0.496219,-0.216008,-0.125867,...,-0.571854,0.585168,0.348554,-0.125207,1.624243,-0.958897,-1.495657,0,0,1
268965,0.198083,-0.050758,-1.017201e+00,-1.239900,1.342241,-1.264761,0.392656,-0.496219,-0.258399,0.392804,...,0.049814,-1.094989,1.824459,-0.023434,-0.892030,0.554900,0.682515,1,0,0
432681,-0.720810,-2.507937,-7.630206e-16,0.241792,-0.546948,1.069757,-0.385501,-0.447580,7.223429,-0.385202,...,-0.390800,-0.725256,-1.180459,-0.117376,-1.395285,-0.454298,-0.624388,1,0,0
384440,-0.720810,-0.283152,-8.552837e-01,-1.239900,0.149345,-1.078826,-0.904273,-0.420296,-0.226632,-0.903873,...,-0.392970,-1.125597,2.246146,-0.097131,-1.269471,-1.463496,0.943896,1,0,0


A few notes are due before starting, first the eps are difficulty to setup for now a good strategy would be to take inspiration using the first paper the introduced the algorithm, which you can find [here](https://dl.acm.org/doi/10.5555/3001460.3001507), and use the distance from the k-th NN varying K until we find a good eps value for us.



In [None]:
from sklearn.metrics import silhouette_score
import itertools as it
import numpy as np
import utils

: 

In [None]:

#NOTE: this might have to be revisited for it's just to try if everyting works
dimension=reduction_data.shape[0]
min_pts=dimension*2
#using the method seen at laboratory to select initial values
maximum_distance = abs(reduction_data.max() - reduction_data.min()).sum().item()
average_concentration = dimension / maximum_distance
#use diferent scales for eps values
eps_values=average_concentration * np.array([10, 5, 2.5, 1, 0.1, 0.01, 0.0001])
#try various metrics
metrics=['euclidean','cosine','l1']

min_pts_values=[min_pts]

print(
f"""
average concentration:{average_concentration}

eps values:{eps_values}

used metrics:{metrics}

number of minimum samples:{min_pts}

nmber of samples used:{reduction_data.shape[0]}
"""
)

# useful for reference
db_scan_mapping={
    -1:'noisy',
    0:'border',
    1:'core'

}

results=utils.run_dbscan(min_pts_values,eps_values,metrics,reduction_data)

results.sort_values(by='silhoutte_score')

589865

average concentration:600.5588889209139

eps values:[6.00558889e+03 3.00279444e+03 1.50139722e+03 6.00558889e+02
 6.00558889e+01 6.00558889e+00 6.00558889e-02]

used metrics:['euclidean', 'cosine', 'l1']

number of minimum samples:589866

nmber of samples used:294933



In [None]:
best_idx=results['silhoutte_score'].argmax()
best_params=results.iloc[best_idx]
best_eps=best_params['eps']
best_metric=best_params['metric']

best_dbscan=DBSCAN(eps=best_eps,metric=best_metric,min_samples=min_pts).fit(reduction_data)

labels=best_dbscan.labels_

statistics=np.unique(best_dbscan.labels_,return_counts=True)[1]

print(f"""
results:{best_params}
statistics:
    raw counts:{statistics}
    percentags:{statistics/np.sum(statistics)}
""")
