# Utility file to show main information about the dataset

In [119]:
import warnings
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns

from collections import defaultdict
from scipy.stats import pearsonr

import os

# Load the two source tables; both paths are relative to the current working directory
cyclists = pd.read_csv(filepath_or_buffer='./dataset/cyclists.csv')
races = pd.read_csv(filepath_or_buffer='./dataset/races.csv')

## Cyclist dataset

In [120]:
# Summarise the cyclists table: column names, dtypes and non-null counts
cyclists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6134 entries, 0 to 6133
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   _url         6134 non-null   object 
 1   name         6134 non-null   object 
 2   birth_year   6121 non-null   float64
 3   weight       3078 non-null   float64
 4   height       3143 non-null   float64
 5   nationality  6133 non-null   object 
dtypes: float64(3), object(3)
memory usage: 287.7+ KB


In [121]:
# Preview the first rows of the cyclists table
cyclists.head()

Unnamed: 0,_url,name,birth_year,weight,height,nationality
0,bruno-surra,Bruno Surra,1964.0,,,Italy
1,gerard-rue,Gérard Rué,1965.0,74.0,182.0,France
2,jan-maas,Jan Maas,1996.0,69.0,189.0,Netherlands
3,nathan-van-hooydonck,Nathan Van Hooydonck,1995.0,78.0,192.0,Belgium
4,jose-felix-parra,José Félix Parra,1997.0,55.0,171.0,Spain


## Races dataset

In [122]:
# Summarise the races table: column names, dtypes and non-null counts
races.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589865 entries, 0 to 589864
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   _url                 589865 non-null  object 
 1   name                 589865 non-null  object 
 2   points               589388 non-null  float64
 3   uci_points           251086 non-null  float64
 4   length               589865 non-null  float64
 5   climb_total          442820 non-null  float64
 6   profile              441671 non-null  float64
 7   startlist_quality    589865 non-null  int64  
 8   average_temperature  29933 non-null   float64
 9   date                 589865 non-null  object 
 10  position             589865 non-null  int64  
 11  cyclist              589865 non-null  object 
 12  cyclist_age          589752 non-null  float64
 13  is_tarmac            589865 non-null  bool   
 14  is_cobbled           589865 non-null  bool   
 15  is_gravel        

In [123]:
# Preview the first rows of the races table
races.head()

Unnamed: 0,_url,name,points,uci_points,length,climb_total,profile,startlist_quality,average_temperature,date,position,cyclist,cyclist_age,is_tarmac,is_cobbled,is_gravel,cyclist_team,delta
0,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,0,sean-kelly,22.0,True,False,False,vini-ricordi-pinarello-sidermec-1986,0.0
1,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,1,gerrie-knetemann,27.0,True,False,False,norway-1987,0.0
2,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,2,rene-bittinger,24.0,True,False,False,,0.0
3,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,3,joseph-bruyere,30.0,True,False,False,navigare-blue-storm-1993,0.0
4,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,4,sven-ake-nilsson,27.0,True,False,False,spain-1991,0.0


In [124]:
# Number of entries recorded on each kind of terrain
# (summing a boolean flag column counts its True values)
race_cobbled, race_tarmac, race_gravel = (
    races[flag].sum() for flag in ('is_cobbled', 'is_tarmac', 'is_gravel')
)
print(f"Cobble:{race_cobbled} Tarmac:{race_tarmac} Gravel:{race_gravel}")

Cobble:0 Tarmac:536042 Gravel:0


In [125]:
# Preview the last rows of the races table
races.tail()

Unnamed: 0,_url,name,points,uci_points,length,climb_total,profile,startlist_quality,average_temperature,date,position,cyclist,cyclist_age,is_tarmac,is_cobbled,is_gravel,cyclist_team,delta
589860,giro-d-italia/2010/stage-1,Giro d'Italia,80.0,16.0,8400.0,60.0,1.0,878,,2010-05-08 00:11:38,192,anders-lund-1,25.0,True,False,False,watney-avia-1972,80.0
589861,giro-d-italia/2010/stage-1,Giro d'Italia,80.0,16.0,8400.0,60.0,1.0,878,,2010-05-08 00:11:40,193,andrea-masciarelli,28.0,True,False,False,,82.0
589862,giro-d-italia/2010/stage-1,Giro d'Italia,80.0,16.0,8400.0,60.0,1.0,878,,2010-05-08 00:11:41,194,marco-corti,24.0,True,False,False,kazakhstan-2001,83.0
589863,giro-d-italia/2010/stage-1,Giro d'Italia,80.0,16.0,8400.0,60.0,1.0,878,,2010-05-08 00:11:48,195,robbie-mcewen,38.0,True,False,False,radio-popular-paredes-boavista-2023,90.0
589864,giro-d-italia/2010/stage-1,Giro d'Italia,80.0,16.0,8400.0,60.0,1.0,878,,2010-05-08 00:11:49,196,martin-pedersen,27.0,True,False,False,kazakhstan-2001,91.0


In [126]:
# show the entries whose date string contains '1978'
# NOTE(review): the previous comment mentioned "uci_points > 500", which this filter does not do
races[races['date'].str.contains('1978')]

Unnamed: 0,_url,name,points,uci_points,length,climb_total,profile,startlist_quality,average_temperature,date,position,cyclist,cyclist_age,is_tarmac,is_cobbled,is_gravel,cyclist_team,delta
0,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,0,sean-kelly,22.0,True,False,False,vini-ricordi-pinarello-sidermec-1986,0.0
1,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,1,gerrie-knetemann,27.0,True,False,False,norway-1987,0.0
2,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,2,rene-bittinger,24.0,True,False,False,,0.0
3,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,3,joseph-bruyere,30.0,True,False,False,navigare-blue-storm-1993,0.0
4,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,,1978-07-05 04:02:24,4,sven-ake-nilsson,27.0,True,False,False,spain-1991,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580807,tour-de-suisse/1978/stage-9a,Tour de Suisse,50.0,,97500.0,,,691,,1978-06-23 02:18:22,1,piero-spinelli,30.0,True,False,False,,0.0
580808,tour-de-suisse/1978/stage-9a,Tour de Suisse,50.0,,97500.0,,,691,,1978-06-23 02:18:22,2,gustaaf-van-roosbroeck,30.0,True,False,False,team-saxo-bank-tinkoff-bank-2012,0.0
580809,tour-de-suisse/1978/stage-9a,Tour de Suisse,50.0,,97500.0,,,691,,1978-06-23 02:18:22,3,jean-chassang,27.0,True,False,False,,0.0
580810,tour-de-suisse/1978/stage-9a,Tour de Suisse,50.0,,97500.0,,,691,,1978-06-23 02:18:22,4,clyde-sefton,27.0,True,False,False,,0.0


In [127]:
# Show the first rows of the races whose _url contains "result"
races.loc[races['_url'].str.contains('result')].head()

Unnamed: 0,_url,name,points,uci_points,length,climb_total,profile,startlist_quality,average_temperature,date,position,cyclist,cyclist_age,is_tarmac,is_cobbled,is_gravel,cyclist_team,delta
1269,ronde-van-vlaanderen/1980/result,Ronde van Vlaanderen / Tour des Flandres,275.0,,265000.0,,,602,,1980-03-30 06:36:45,0,michel-pollentier,29.0,False,False,False,australia-1983,0.0
1270,ronde-van-vlaanderen/1980/result,Ronde van Vlaanderen / Tour des Flandres,275.0,,265000.0,,,602,,1980-03-30 06:36:45,1,francesco-moser,29.0,False,False,False,japan-2008,0.0
1271,ronde-van-vlaanderen/1980/result,Ronde van Vlaanderen / Tour des Flandres,275.0,,265000.0,,,602,,1980-03-30 06:36:45,2,jan-raas,28.0,False,False,False,liberty-seguros-wurth-team-2005,0.0
1272,ronde-van-vlaanderen/1980/result,Ronde van Vlaanderen / Tour des Flandres,275.0,,265000.0,,,602,,1980-03-30 06:37:05,3,roger-de-vlaeminck,33.0,False,False,False,liquigas-2007,20.0
1273,ronde-van-vlaanderen/1980/result,Ronde van Vlaanderen / Tour des Flandres,275.0,,265000.0,,,602,,1980-03-30 06:37:05,4,marc-demeyer,30.0,False,False,False,ag2r-prevoyance-2001,20.0


In [130]:
# Rows whose climb_total is greater than 6500 (missing climb_total values never match)
races.loc[races['climb_total'].gt(6500)]

Unnamed: 0,_url,name,points,uci_points,length,climb_total,profile,startlist_quality,average_temperature,date,position,cyclist,cyclist_age,is_tarmac,is_cobbled,is_gravel,cyclist_team,delta
56822,giro-d-italia/2011/stage-15,Giro d'Italia,80.0,16.0,229000.0,6939.0,5.0,891,,2011-05-22 07:27:14,0,mikel-nieve,27.0,True,False,False,japan-2008,0.0
56823,giro-d-italia/2011/stage-15,Giro d'Italia,80.0,16.0,229000.0,6939.0,5.0,891,,2011-05-22 07:28:55,1,stefano-garzelli,38.0,True,False,False,south-africa-2022,101.0
56824,giro-d-italia/2011/stage-15,Giro d'Italia,80.0,16.0,229000.0,6939.0,5.0,891,,2011-05-22 07:29:05,2,alberto-contador,29.0,True,False,False,new-zealand-1991,111.0
56825,giro-d-italia/2011/stage-15,Giro d'Italia,80.0,16.0,229000.0,6939.0,5.0,891,,2011-05-22 07:29:11,3,michele-scarponi,32.0,True,False,False,lotto-super-club-1991,117.0
56826,giro-d-italia/2011/stage-15,Giro d'Italia,80.0,16.0,229000.0,6939.0,5.0,891,,2011-05-22 07:29:42,4,john-gadret,32.0,True,False,False,team-volksbank-2008,148.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337313,volta-a-catalunya/1995/stage-4,Volta Ciclista a Catalunya,50.0,,226800.0,6687.0,,737,,1995-06-19 06:31:10,15,massimo-donati,28.0,True,False,False,o.n.c.e.-deutsche-bank-2000,180.0
337314,volta-a-catalunya/1995/stage-4,Volta Ciclista a Catalunya,50.0,,226800.0,6687.0,,737,,1995-06-19 06:31:28,16,yvon-ledanois,26.0,True,False,False,,198.0
337315,volta-a-catalunya/1995/stage-4,Volta Ciclista a Catalunya,50.0,,226800.0,6687.0,,737,,1995-06-19 06:32:00,17,jose-manuel-garcia-114,27.0,True,False,False,,230.0
337316,volta-a-catalunya/1995/stage-4,Volta Ciclista a Catalunya,50.0,,226800.0,6687.0,,737,,1995-06-19 06:32:15,18,pascal-herve,31.0,True,False,False,,245.0
