In [12]:
import datacleaner as dc
import pandas as pd
import numpy as np

In [89]:
# Locations of the two raw CSV exports handed to the project's DataCleaner.
races_csv = r"./dataset/races.csv"
cyclists_csv = r"./dataset/cyclists.csv"

# `dm.df` is the frame inspected by the cells below (presumably built from both files - TODO confirm).
dm = dc.DataCleaner(races_csv, cyclists_csv)

In [90]:
# Column names, non-null counts and dtypes of the frame exposed by the DataCleaner.
dm.df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589865 entries, 0 to 589864
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   _url                 589865 non-null  object        
 1   location             589865 non-null  object        
 2   points               589388 non-null  float64       
 3   uci_points           251086 non-null  float64       
 4   length               589865 non-null  float64       
 5   climb_total          442820 non-null  float64       
 6   profile              441671 non-null  float64       
 7   startlist_quality    589865 non-null  int64         
 8   average_temperature  29933 non-null   float64       
 9   date                 589865 non-null  datetime64[ns]
 10  position             589865 non-null  int64         
 11  cyclist              589865 non-null  object        
 12  cyclist_age          589752 non-null  float64       
 13  is_tarmac     

In [91]:
# Also read the raw CSVs directly with pandas, independently of the DataCleaner instance.
df_cyclists = pd.read_csv("dataset/cyclists.csv")
# `date` is parsed while reading so the column is a datetime rather than a plain string.
df_races = pd.read_csv("dataset/races.csv", parse_dates=["date"])


## Cyclists analysis

One cyclist (scott-davies) has NaN in every field except `_url` and `name`. Consider removing this record.

In [92]:
# Schema overview: the non-null counts show which cyclist attributes are incomplete.
df_cyclists.info()

# Rows with no nationality. A boolean mask selects them directly; going through
# mask -> .index -> .loc gives the same rows only while index labels are unique.
print(df_cyclists[df_cyclists.nationality.isna()])

# Full record for that cyclist (last expression, so it is rendered as the cell output).
df_cyclists[df_cyclists['_url'] == 'scott-davies']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6134 entries, 0 to 6133
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   _url         6134 non-null   object 
 1   name         6134 non-null   object 
 2   birth_year   6121 non-null   float64
 3   weight       3078 non-null   float64
 4   height       3143 non-null   float64
 5   nationality  6133 non-null   object 
dtypes: float64(3), object(3)
memory usage: 287.7+ KB
           _url           name  birth_year  weight  height nationality
9  scott-davies  Scott  Davies         NaN     NaN     NaN         NaN


Unnamed: 0,_url,name,birth_year,weight,height,nationality
9,scott-davies,Scott Davies,,,,


These cyclists have a missing birth_year. Try to infer it from their age (cyclist_age) in the races dataset.

In [93]:
# Cyclists whose birth_year is missing, selected with a plain boolean mask
# (no need to go through the mask's index and .loc).
df_cyclists[df_cyclists.birth_year.isna()]

Unnamed: 0,_url,name,birth_year,weight,height,nationality
9,scott-davies,Scott Davies,,,,
601,vladimir-malakov,Vladimir Malakov,,,,Russia
894,antonio-zanini,Antonio Zanini,,,,Italy
2408,filippo-simonetti,Filippo Simonetti,,,,Italy
2515,carlos-garcia,Carlos García,,,,Spain
2536,alexandr-osipov,Alexandr Osipov,,,,Russia
3046,nicolai-kosyakov,Nicolai Kosyakov,,,,Russia
3551,nevens-guy,Guy Nevens,,,,Belgium
4142,oscar-pumar,Oscar Pumar,,,,Venezuela
4384,javier-luquin,Javier Luquin,,,,Spain


## Races analysis

In [94]:
# Column names, non-null counts and dtypes of the raw races CSV as loaded by pandas.
df_races.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589865 entries, 0 to 589864
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   _url                 589865 non-null  object        
 1   name                 589865 non-null  object        
 2   points               589388 non-null  float64       
 3   uci_points           251086 non-null  float64       
 4   length               589865 non-null  float64       
 5   climb_total          442820 non-null  float64       
 6   profile              441671 non-null  float64       
 7   startlist_quality    589865 non-null  int64         
 8   average_temperature  29933 non-null   float64       
 9   date                 589865 non-null  datetime64[ns]
 10  position             589865 non-null  int64         
 11  cyclist              589865 non-null  object        
 12  cyclist_age          589752 non-null  float64       
 13  is_tarmac     

The is_cobbled and is_gravel columns can be removed, as they are constant (always False).

In [95]:
# Distinct values taken by each boolean surface flag; a single value means the column is constant.
for surface_flag in ("is_cobbled", "is_gravel"):
    print(df_races[surface_flag].unique())

[False]
[False]


In [96]:
# Re-display the schema of the DataCleaner frame before looking at position and delta.
dm.df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589865 entries, 0 to 589864
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   _url                 589865 non-null  object        
 1   location             589865 non-null  object        
 2   points               589388 non-null  float64       
 3   uci_points           251086 non-null  float64       
 4   length               589865 non-null  float64       
 5   climb_total          442820 non-null  float64       
 6   profile              441671 non-null  float64       
 7   startlist_quality    589865 non-null  int64         
 8   average_temperature  29933 non-null   float64       
 9   date                 589865 non-null  datetime64[ns]
 10  position             589865 non-null  int64         
 11  cyclist              589865 non-null  object        
 12  cyclist_age          589752 non-null  float64       
 13  is_tarmac     

Positions are 0-indexed: position 0 is first place. We infer this because the number of rows with position 0 coincides exactly with the number of distinct races in the dataset (5281). The delta also appears to be zero for riders well down the classification: the highest position with a zero delta is 204, which seems unlikely, so we need to understand whether 0 is the (unfortunate) default value.

0 is probably the default value for delta.

In [97]:
# Keep only the rows whose delta is exactly zero; both checks below start from them.
zero_delta_rows = dm.df[dm.df.delta == 0]

# Shape of the zero-delta rows recorded at position 2.
print(zero_delta_rows[zero_delta_rows.position == 2].shape)
# Largest position value that still carries a zero delta.
print(zero_delta_rows.position.max())

# Number of distinct races (_url values), shown as the cell output.
dm.df._url.unique().shape

(2235, 20)
204


(5281,)

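In pandas, a column that contains NaN cannot use the default (NumPy) integer dtype, so it is stored as float. After imputation we therefore have to recast to integer those of the following columns that hold only whole numbers (the check below prints True for them).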

In [98]:
# For each float column, print whether every non-missing value is a whole number,
# i.e. whether the column could be recast to an integer dtype after imputation.
# The remainder test is vectorised, so no Python-level loop runs over every value.
# One True/False is printed per column, in the order listed here. Note that `length`
# does NOT pass: some values carry floating-point noise (e.g. 32200.000000000004).
for column in ("points", "uci_points", "climb_total", "length"):
    observed = dm.df[column].dropna()
    print((observed % 1 == 0).all())

True
True
True
False


The "profile" column is categorical, with values ranging from 1 to 5 (it also needs to be recast after imputation).

In [99]:
# Distinct profile values, including NaN for rows where the profile is missing.
dm.df.profile.unique()

array([ 1.,  5., nan,  3.,  2.,  4.])

The startlist_quality values are integers representing how strong the lineup is. The observed values are sparse.

In [100]:
# Distinct startlist_quality values, in order of first appearance.
# NOTE(review): this dumps a very large array; a summary (nunique / describe) may read better.
dm.df.startlist_quality.unique()

array([1241,  821, 1699,  804, 1551,  899,  659,  388,  900,  541,  830,
        789,  602,  817, 1400, 1161, 1040,  896,  791,  819,  670,  225,
        520, 1057,  809,  828,  722,  747,  714,  815,  376,  621,  760,
        798,  933, 1994, 1437, 1362,  884, 1150,  971,  881, 1112, 1175,
        891,  878,  400,  936,  692,  727, 1002,  928, 1196, 1489,  687,
        585,  835,  673, 1328,  885,  502,  861,  982,  923, 1036, 1690,
        925,  989, 1109, 1084,  803,  792,  548,  668, 1713, 1520, 1959,
        883,  859, 2047, 1024,  533, 1269,  751, 1202,  570, 1703, 1416,
        251, 1158, 1139,  657, 1048,  521, 1034, 1099,  880, 1029,  627,
        340,  767,  737, 1220,  824,  369, 1255, 1019,  995, 1059, 1342,
        968,  676, 1381,  705,  680,  779, 1123, 1020, 1632, 1251, 1470,
        708,  892,  523,  590,  604,  781,  614, 1309,  951, 1812,  877,
       1758,  843, 1041, 1646,  935,  660,  961, 1238, 1496,  765,  494,
       1933,  956, 1849, 1136,  393,  882,  597,  8

We cannot infer birth_year from cyclist_age: both are NaN in exactly the same cases.

In [101]:
# Flag every race row whose cyclist_age is missing, then reduce per cyclist:
# a cyclist is kept only when *all* of their rows lack an age.
age_is_missing = dm.df["cyclist_age"].isna()
age_never_known = age_is_missing.groupby(dm.df["cyclist"]).all()

# Identifiers of the cyclists for whom no age is ever recorded.
ages = age_never_known[age_never_known].index

# For each of them, print the birth_year stored in the cyclists table.
for rank, cyclist_url in enumerate(ages):
    print(rank, cyclist_url)
    print(df_cyclists.loc[df_cyclists["_url"] == cyclist_url].birth_year)



0 alexandr-osipov
2536   NaN
Name: birth_year, dtype: float64
1 antonio-zanini
894   NaN
Name: birth_year, dtype: float64
2 batik-odriozola
6080   NaN
Name: birth_year, dtype: float64
3 carlos-garcia
2515   NaN
Name: birth_year, dtype: float64
4 filippo-simonetti
2408   NaN
Name: birth_year, dtype: float64
5 javier-luquin
4384   NaN
Name: birth_year, dtype: float64
6 nevens-guy
3551   NaN
Name: birth_year, dtype: float64
7 nicolai-kosyakov
3046   NaN
Name: birth_year, dtype: float64
8 oscar-pumar
4142   NaN
Name: birth_year, dtype: float64
9 scott-davies
9   NaN
Name: birth_year, dtype: float64
10 sergei-jermachenko
6072   NaN
Name: birth_year, dtype: float64
11 thierry-lauder
4756   NaN
Name: birth_year, dtype: float64
12 vladimir-malakov
601   NaN
Name: birth_year, dtype: float64


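Each _url has exactly one profile value; profile is not unique per location.


Each _url has exactly one location.


Each _url has exactly one points value; points is not unique per location.

Each _url has exactly one uci_points value; uci_points is not unique per location.

Each _url has exactly one length value; length is not unique per location.

Each _url has exactly one climb_total value (this is strange: the values are floats, but their decimal part is very small, with a lot of zeros).

Each _url has exactly one startlist_quality value; startlist_quality is not unique per location.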

In [135]:
# Count the distinct startlist_quality values seen for each location; a count above 1
# means the column is not determined by the location alone.
# (The variable keeps its historical name even though it counts startlist_quality.)
profile_counts = dm.df.groupby("location").startlist_quality.agg("nunique")

# Show only the locations that have more than one distinct value.
print(profile_counts.loc[lambda counts: counts > 1])

location
Amstel Gold Race                                   53
Clasica Ciclista San Sebastian                     34
Clásica Ciclista San Sebastián                      2
Clásica San Sebastián                               2
Criterium du Dauphiné Libére                        2
Critérium du Dauphiné                              13
Critérium du Dauphiné Libéré                       23
Donostia San Sebastian Klasikoa                     3
Dwars door België / À travers la Belgique          12
Dwars door Vlaanderen                              13
Dwars door Vlaanderen - A travers la Flandre ME     5
Dwars door Vlaanderen / A travers la Flandre        3
E3 Harelbeke                                        3
E3 Prijs Vlaanderen                                10
E3 Saxo Bank Classic                                2
E3-Prijs Harelbeke                                 32
Giro d'Italia                                      33
Giro di Lombardia                                  40
Grand Prix Cycliste

In [103]:
# Lengths that are not whole numbers. The mask is computed on, and applied to, the same
# NaN-free series: indexing the full column with a mask built from `dropna()` would
# raise an IndexingError as soon as `length` contained a missing value.
observed_lengths = dm.df.length.dropna()
non_integer_lengths = observed_lengths[observed_lengths % 1 != 0]
print(non_integer_lengths)

11632      32200.0
11633      32200.0
11634      32200.0
11635      32200.0
11636      32200.0
            ...   
509529    258100.0
509530    258100.0
509531    258100.0
509532    258100.0
509533    258100.0
Name: length, Length: 1451, dtype: float64


In [104]:
# Full-precision length of row 11632 (the first row listed by the non-integer check):
# the printed 32200.0 hides a tiny floating-point residue.
dm.df.loc[11632].length

np.float64(32200.000000000004)

Every _url has exactly one row with position 0, so position 0 means first place.

In [130]:
# Check that every _url has a position 0, and only one of them.

# All result rows recorded at position 0.
first_place_rows = dm.df.loc[dm.df.position == 0]
print(first_place_rows.shape)

# Keep only the two relevant columns and collapse repeated (_url, position) pairs;
# an unchanged row count means no race has more than one position-0 row.
position_zero = first_place_rows[["_url", "position"]].drop_duplicates()
print(position_zero.shape)

(5281, 20)
(5281, 2)
