In [2]:
import pandas as pd
import ast

### Nettoyage Résultats

In [3]:
results = pd.read_html('imports/olympic_results.html', index_col=0, encoding='utf-8')[0]
results.head()

Unnamed: 0,discipline_title,event_title,slug_game,participant_type,medal_type,athletes,rank_equal,rank_position,country_name,country_code,country_3_letter_code,athlete_url,athlete_full_name,value_unit,value_type
0,Curling,Mixed Doubles,beijing-2022,GameTeam,GOLD,"[('Stefania CONSTANTINI', 'https://olympics.co...",False,1,Italy,IT,ITA,,,,
1,Curling,Mixed Doubles,beijing-2022,GameTeam,SILVER,"[('Kristin SKASLIEN', 'https://olympics.com/en...",False,2,Norway,NO,NOR,,,,
2,Curling,Mixed Doubles,beijing-2022,GameTeam,BRONZE,"[('Almida DE VAL', 'https://olympics.com/en/at...",False,3,Sweden,SE,SWE,,,,
3,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Jennifer DODDS', 'https://olympics.com/en/a...",False,4,Great Britain,GB,GBR,,,,
4,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Rachel HOMAN', 'https://olympics.com/en/ath...",False,5,Canada,CA,CAN,,,,


In [4]:
# informations des colonnes
results.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162804 entries, 0 to 162803
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   discipline_title       162804 non-null  object
 1   event_title            162804 non-null  object
 2   slug_game              162804 non-null  object
 3   participant_type       162804 non-null  object
 4   medal_type             20206 non-null   object
 5   athletes               7976 non-null    object
 6   rank_equal             32526 non-null   object
 7   rank_position          158926 non-null  object
 8   country_name           162804 non-null  object
 9   country_code           157768 non-null  object
 10  country_3_letter_code  162804 non-null  object
 11  athlete_url            129991 non-null  object
 12  athlete_full_name      141646 non-null  object
 13  value_unit             78646 non-null   object
 14  value_type             90049 non-null   object
dtypes: ob

In [5]:
# dimensions
results.shape

(162804, 15)

In [6]:
def explode_athletes(row: pd.Series) -> list:
    '''
    Duplique une ligne du DataFrame en fonction du nombre d'athlètes
    '''
    athletes = row['athletes']
    if pd.isna(athletes) or athletes == '[]':
        return [row]
    
    athletes = ast.literal_eval(athletes)
    new_rows = []
    for athlete in athletes:
        new_row = row.copy()
        new_row['athlete_full_name'] = athlete[0]
        new_row['athlete_url'] = athlete[1]
        new_rows.append(new_row)
    return new_rows


In [7]:
# applatir les lignes selon les athlètes
expanded_rows = []
for index, row in results.iterrows():
    expanded_rows.extend(explode_athletes(row))

In [8]:
cleaned_results = pd.DataFrame(expanded_rows)

In [9]:
# suppression de la colonne athletes
cleaned_results.drop(columns=['athletes', 'medal_type'], inplace=True)

In [10]:
# dimensions
cleaned_results.shape

(170780, 13)

In [11]:
# vérifier les doublons
cleaned_results.duplicated().sum()

144

In [12]:
# supprimer les doublons
cleaned_results.drop_duplicates(inplace=True)

In [13]:
# dimensions
cleaned_results.shape

(170636, 13)

In [14]:
# on vérifie les valeurs uniques de la colonne 'value_type'
cleaned_results['value_type'].unique()

array([nan, 'POINTS', 'SC_REST$IRM', 'IRM', 'CODE', 'TIME', 'NO_TIME',
       'IRM_POINTS', 'STROKES', 'WEIGHT', 'DISTANCE', 'RANK', 'SCORE'],
      dtype=object)

In [15]:
# supprimer les value_type associés à des valeurs nulles
for value_type in cleaned_results['value_type'].unique():
    value_type_rows = cleaned_results[cleaned_results['value_type'] == value_type]
    if value_type_rows['value_unit'].isna().all():
        cleaned_results.loc[cleaned_results['value_type'] == value_type, 'value_type'] = None

In [16]:
# voir les lignes dont value_type est manquante et la valeur de value_unit est renseignée
cleaned_results[cleaned_results['value_type'].isna() & cleaned_results['value_unit'].notna()][0:5]

Unnamed: 0,discipline_title,event_title,slug_game,participant_type,rank_equal,rank_position,country_name,country_code,country_3_letter_code,athlete_url,athlete_full_name,value_unit,value_type
3364,Speed skating,Women's Mass Start,beijing-2022,Athlete,False,11,Netherlands,NL,NED,https://olympics.com/en/athletes/marijke-groen...,Marijke GROENEWOUD,27.46,
3369,Speed skating,Women's Mass Start,beijing-2022,Athlete,False,8,Japan,JP,JPN,https://olympics.com/en/athletes/ayano-sato,Ayano SATO,33.88,
3370,Speed skating,Women's Mass Start,beijing-2022,Athlete,False,26,Republic of Korea,KR,KOR,https://olympics.com/en/athletes/ji-woo-park,Ji Woo PARK,33.7,
3375,Speed skating,Women's Mass Start,beijing-2022,Athlete,False,15,ROC,ROC,ROC,https://olympics.com/en/athletes/elizaveta-gol...,Elizaveta GOLUBEVA,27.81,
3376,Speed skating,Women's Mass Start,beijing-2022,Athlete,False,7,Belarus,BY,BLR,https://olympics.com/en/athletes/maryna-zuyeva,Maryna ZUYEVA,45.47,


In [17]:
# il s'agit d'un chrono donc on crée une nouvelle valeur pour value_type
cleaned_results.loc[cleaned_results['value_type'].isna() & cleaned_results['value_unit'].notna(), 'value_type'] = 'CHRONO'

In [18]:
# maintenant l'inverse, voir les lignes dont value_type est renseignée et value_unit est manquante
cleaned_results.loc[cleaned_results['value_type'].notna() & cleaned_results['value_unit'].isna()][0:5]

Unnamed: 0,discipline_title,event_title,slug_game,participant_type,rank_equal,rank_position,country_name,country_code,country_3_letter_code,athlete_url,athlete_full_name,value_unit,value_type
10267,Rowing,Lightweight Women's Double Sculls,tokyo-2020,GameTeam,False,1,Italy,IT,ITA,https://olympics.com/en/athletes/valentina-rodini,Valentina RODINI,,TIME
10267,Rowing,Lightweight Women's Double Sculls,tokyo-2020,GameTeam,False,1,Italy,IT,ITA,https://olympics.com/en/athletes/federica-cesa...,Federica CESARINI,,TIME
10268,Rowing,Lightweight Women's Double Sculls,tokyo-2020,GameTeam,False,2,France,FR,FRA,https://olympics.com/en/athletes/laura-tarantola,Laura TARANTOLA,,TIME
10268,Rowing,Lightweight Women's Double Sculls,tokyo-2020,GameTeam,False,2,France,FR,FRA,https://olympics.com/en/athletes/claire-bove,Claire BOVE,,TIME
10269,Rowing,Lightweight Women's Double Sculls,tokyo-2020,GameTeam,False,3,Netherlands,NL,NED,https://olympics.com/en/athletes/marieke-keijser,Marieke KEIJSER,,TIME


In [19]:
# ces valeurs sont inutilisables, on les remplace par des NaN
cleaned_results.loc[cleaned_results['value_type'].notna() & cleaned_results['value_unit'].isna(), 'value_type'] = None

In [20]:
# valeurs manquantes
cleaned_results.isna().sum()

discipline_title              0
event_title                   0
slug_game                     0
participant_type              0
rank_equal               137300
rank_position              3879
country_name                  0
country_code               5238
country_3_letter_code         0
athlete_url               25586
athlete_full_name         13155
value_unit                87841
value_type                87841
dtype: int64

In [21]:
# exportation
cleaned_results.to_csv('exports/olympic_results_cleaned.csv', index=False , encoding='utf-8')

### Nettoyage Athlètes

In [22]:
athletes = pd.read_json('imports/olympic_athletes.json', encoding='utf-8')
athletes.head()

Unnamed: 0,athlete_url,athlete_full_name,games_participations,first_game,athlete_year_birth,athlete_medals,bio
0,https://olympics.com/en/athletes/cooper-woods-...,Cooper WOODS-TOPALOVIC,1,Beijing 2022,2000.0,,
1,https://olympics.com/en/athletes/elofsson,Felix ELOFSSON,2,PyeongChang 2018,1995.0,,
2,https://olympics.com/en/athletes/dylan-walczyk,Dylan WALCZYK,1,Beijing 2022,1993.0,,
3,https://olympics.com/en/athletes/olli-penttala,Olli PENTTALA,1,Beijing 2022,1995.0,,
4,https://olympics.com/en/athletes/reikherd,Dmitriy REIKHERD,1,Beijing 2022,1989.0,,


In [23]:
# informations des colonnes
athletes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75904 entries, 0 to 75903
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   athlete_url           75904 non-null  object 
 1   athlete_full_name     75904 non-null  object 
 2   games_participations  75904 non-null  int64  
 3   first_game            75882 non-null  object 
 4   athlete_year_birth    73448 non-null  float64
 5   athlete_medals        15352 non-null  object 
 6   bio                   22842 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 4.1+ MB


In [24]:
# vérifier les doublons
athletes.duplicated().sum()

0

In [25]:
# vérifier les valeurs manquantes
athletes['athlete_year_birth'].isna().sum()

2456

In [26]:
# remplacer les valeurs nulles par 0
athletes.fillna({ 'athlete_year_birth': 0 }, inplace=True)

In [27]:
# convertir l'année de naissance en entier
athletes['athlete_year_birth'] = athletes['athlete_year_birth'].astype(int)

In [28]:
# vérifier le type des dates de naissance
athletes['athlete_year_birth'].dtype

dtype('int64')

In [29]:
# vérifier les valeurs nulles
athletes.isna().sum()

athlete_url                 0
athlete_full_name           0
games_participations        0
first_game                 22
athlete_year_birth          0
athlete_medals          60552
bio                     53062
dtype: int64

In [30]:
# nettoyer les biographies
athletes['bio'] = athletes['bio'].str.strip().replace('\n', ' ')

In [31]:
# formater le nombre de médailles
athletes['athlete_medals'] = athletes['athlete_medals'].replace('\n', '')

In [32]:
# en prévision de l'insertion en bdd, on vérifie les différences entre les dataframes cleaned_results et athletes
cleaned_results_athletes = cleaned_results['athlete_full_name'].unique()
athletes_athletes = athletes['athlete_full_name'].unique()

# différences
missing_athletes = set(cleaned_results_athletes) - set(athletes_athletes)
missing_athletes

{'Yury Zakharov',
 'Abdel Malek El-Aouad',
 'Bob Curry',
 'Rubem Ribeiro',
 'Jong-Hun Sin',
 'George Smith',
 'Leri Khabelovi',
 'Rodolfo Wagner',
 '- Viéville',
 'Cesare Locatelli',
 'Dimitar Galinchev',
 'Carl Körting',
 'Casimiro Vega',
 'Judit Kéri-Novák',
 'Charlie Purdy',
 'Fazlollah Dehkhoda',
 'Greetje Gaillard',
 'Georgi Georgiev',
 'Hayk Yeghiazaryan',
 'Yelena Pavluxina',
 'Marcel Moret',
 'William Plant',
 'Yanan Wu',
 'Dmitry Bochkaryov',
 'Bandi Damdinjavyn',
 'Bob Maslen-Jones',
 'Agnes Olsen',
 'Erich Gallwitz',
 'Ali Heidar Ali Mohamed',
 'P. F. Koukoudakis',
 'Freddie Meachem',
 'Alexandros Khalkokondylis',
 'Chung-Yeol Yang',
 'Mohamed Al-Takroni',
 'Josef Pavlík',
 'Cemil Sarıbacak',
 'John Walker',
 'Tin Dekkers',
 'Guðmundur Helgason',
 'Abdul Rehman',
 'Bob Fowler',
 'Austin Anthony',
 'Jean Madelaine',
 'Edmond Filiâtre',
 "Frank O'Neill",
 'Harold Simpkins',
 'Umberto Del Carlo',
 'Lyuben Khristov',
 'Tarek Fouad',
 '- Pelat',
 'Konstantinos Loudaros',
 'Alekse

In [33]:
# oula, il y a des différences... on va essayer de corriger ça
missing_athletes_df = cleaned_results[cleaned_results['athlete_full_name'].isin(missing_athletes)][['athlete_full_name', 'athlete_url']]
# supprimer les doublons
missing_athletes_df.drop_duplicates(inplace=True)


In [34]:
def get_games_participations(athlete_full_name: str) -> int:
    '''
    Retourne le nombre de participations à des JO
    déterminé par le nombre de valeurs uniques de la colonne 'slug_game'
    '''
    return len(cleaned_results[cleaned_results['athlete_full_name'] == athlete_full_name]['slug_game'].unique())

In [35]:
missing_athletes_df['games_participations'] = missing_athletes_df['athlete_full_name'].apply(get_games_participations)

In [36]:
# on détermine les premiers jeux olympiques de chaque athlète en fonction de la date parsée dans le slug_game
def get_first_games(athlete_full_name: str) -> str:
    '''
    Retourne les premiers jeux olympiques
    '''
    all_athlete_games = cleaned_results[cleaned_results['athlete_full_name'] == athlete_full_name]['slug_game']
    all_athlete_games = all_athlete_games.drop_duplicates()

    if(all_athlete_games.empty):
        return None
    
    if(all_athlete_games.shape[0] == 1):
        return all_athlete_games.iloc[0]
    
    # sinon on récupère les 4 derniers caractères du slug (année des JO) et on renvoi le slug dont l'année est la plus petite
    return all_athlete_games[all_athlete_games.str[-4:].astype(int).idxmin()]

In [37]:
missing_athletes_df['first_game'] = missing_athletes_df['athlete_full_name'].apply(get_first_games)

In [38]:
# on rempli les colonnes athlete_year_birth	athlete_medals	bio avec des valeurs nulles
missing_athletes_df['athlete_year_birth'] = 0
missing_athletes_df['athlete_medals'] = None
missing_athletes_df['bio'] = None

In [39]:
missing_athletes_df.head()

Unnamed: 0,athlete_full_name,athlete_url,games_participations,first_game,athlete_year_birth,athlete_medals,bio
10,,,0,,0,,
1920,Alexandru Stefan STEFANESCU,https://olympics.com/en/athletes/alexandru-ste...,1,beijing-2022,0,,
2692,Raimo VIGANTS,https://olympics.com/en/athletes/raimo-vigants,1,beijing-2022,0,,
2698,Thibaut DE MARRE,https://olympics.com/en/athletes/thibaut-de-marre,1,beijing-2022,0,,
3142,CIRENZHANDUI,https://olympics.com/en/athletes/cirenzhandui,1,beijing-2022,0,,


In [40]:
# on merge les deux dataframes
athletes = pd.concat([athletes, missing_athletes_df], ignore_index=True)

In [41]:
athletes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83875 entries, 0 to 83874
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   athlete_url           75929 non-null  object
 1   athlete_full_name     83874 non-null  object
 2   games_participations  83875 non-null  int64 
 3   first_game            83852 non-null  object
 4   athlete_year_birth    83875 non-null  int64 
 5   athlete_medals        15352 non-null  object
 6   bio                   22842 non-null  object
dtypes: int64(2), object(5)
memory usage: 4.5+ MB


In [42]:
# exportation
athletes.to_csv('exports/olympic_athletes_cleaned.csv', index=False, encoding='utf-8')

In [43]:
# TODO: déterminer la matrice des pays des athlètes

### Pays

On exporte les données dédupliquées et nettoyées des pays pour faciliter l'insertion en base de données

In [44]:
countries = cleaned_results[['country_name', 'country_code', 'country_3_letter_code']]
countries.head()

Unnamed: 0,country_name,country_code,country_3_letter_code
0,Italy,IT,ITA
0,Italy,IT,ITA
1,Norway,NO,NOR
1,Norway,NO,NOR
2,Sweden,SE,SWE


In [45]:
# suppression des doublons
countries = countries.drop_duplicates()

In [46]:
# vérification des valeurs nulles
countries.isnull().sum()

country_name              0
country_code             22
country_3_letter_code     0
dtype: int64

In [47]:
# on vérifie l'unicité des codes pays pour assurer l'intégrité des données
countries['country_3_letter_code'].duplicated().sum()

9

In [48]:
# voir les valeurs dupliquées
countries[countries['country_3_letter_code'].duplicated()]

Unnamed: 0,country_name,country_code,country_3_letter_code
10366,Singapore,,SGP
10686,Norway,,NOR
12976,The Former Yugoslav Republic of Macedonia,MK,MKD
14637,Ivory Coast,CI,CIV
18087,US Virgin Islands,VI,ISV
21592,British Virgin Islands,VG,IVB
26082,Swaziland,SZ,SWZ
67250,Nigeria,,NGR
89264,Zambia,,ZAM


In [49]:
# on garde la ligne avec la valeur non nulle de country_code
countries = countries.sort_values('country_code').drop_duplicates('country_3_letter_code', keep='first')

In [50]:
# exportation
countries.to_csv('exports/olympic_countries.csv', index=False, encoding='utf-8')

### Discipline

On exporte les données dédupliquées et nettoyées des disciplines pour faciliter l'insertion en base de données

In [51]:
disciplines = cleaned_results[['discipline_title']]
disciplines.head()

Unnamed: 0,discipline_title
0,Curling
0,Curling
1,Curling
1,Curling
2,Curling


In [52]:
# supprimer les doublons
disciplines = disciplines.drop_duplicates()

In [53]:
# vérification des valeurs nulles
disciplines.isnull().sum()

discipline_title    0
dtype: int64

In [54]:
# exportation
disciplines.to_csv('exports/olympic_disciplines.csv', index=False, encoding='utf-8')

###  Jeux

In [55]:
hosts = pd.read_xml('imports/olympic_hosts.xml', encoding='utf-8')
hosts.set_index('index', inplace=True)
hosts.head()

Unnamed: 0_level_0,game_slug,game_end_date,game_start_date,game_location,game_name,game_season,game_year
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,beijing-2022,2022-02-20T12:00:00Z,2022-02-04T15:00:00Z,China,Beijing 2022,Winter,2022
1,tokyo-2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z,Japan,Tokyo 2020,Summer,2020
2,pyeongchang-2018,2018-02-25T08:00:00Z,2018-02-08T23:00:00Z,Republic of Korea,PyeongChang 2018,Winter,2018
3,rio-2016,2016-08-21T21:00:00Z,2016-08-05T12:00:00Z,Brazil,Rio 2016,Summer,2016
4,sochi-2014,2014-02-23T16:00:00Z,2014-02-07T04:00:00Z,Russian Federation,Sochi 2014,Winter,2014


In [56]:
hosts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53 entries, 0 to 52
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   game_slug        53 non-null     object
 1   game_end_date    53 non-null     object
 2   game_start_date  53 non-null     object
 3   game_location    53 non-null     object
 4   game_name        53 non-null     object
 5   game_season      53 non-null     object
 6   game_year        53 non-null     int64 
dtypes: int64(1), object(6)
memory usage: 3.3+ KB


In [57]:
# vérifier les valeurs dupliquées
hosts.duplicated().sum()

0

In [58]:
# vérifier les valeurs nulles
hosts.isnull().sum()

game_slug          0
game_end_date      0
game_start_date    0
game_location      0
game_name          0
game_season        0
game_year          0
dtype: int64

In [59]:
# vérifier si on n'a que deux saisons
hosts['game_season'].unique()

array(['Winter', 'Summer'], dtype=object)

In [60]:
# on vérifie si les données des résultats sont cohérentes avec les données des hôtes
results_hosts = cleaned_results['slug_game'].unique()
hosts_games = hosts['game_slug'].unique()

# différences
missing_hosts = set(results_hosts) - set(hosts_games)
missing_hosts

set()

In [61]:
# on projette d'utiliser la table des pays comme clé étrangère pour la colonne game_location
# on vérifie si les pays sont bien renseignés
countrie_names = countries['country_name'].unique()
hosts_countries = hosts['game_location'].unique()

# différences
missing_countries = set(hosts_countries) - set(countrie_names)
missing_countries

{'Australia, Sweden', 'China', 'USSR', 'United States'}

In [62]:
# heureusement il n'y a pas beaucoup de différence, on les règle à la main
hosts = hosts.replace({ 'game_location': { 
    'United States': 'United States of America',
    'Australia, Sweden': 'Australia',
    'USSR': 'Soviet Union',
    'China': 'People\'s Republic of China' }})

In [63]:
# on vérifie si le netttoyage a bien été effectué
hosts_countries = hosts['game_location'].unique()
missing_countries = set(hosts_countries) - set(countrie_names)
missing_countries

set()

In [64]:
# exportation
hosts.to_csv('exports/olympic_hosts_cleaned.csv', encoding='utf-8')

### Evenements

In [65]:
events = cleaned_results[['event_title', 'discipline_title', 'slug_game']]
events.head()

Unnamed: 0,event_title,discipline_title,slug_game
0,Mixed Doubles,Curling,beijing-2022
0,Mixed Doubles,Curling,beijing-2022
1,Mixed Doubles,Curling,beijing-2022
1,Mixed Doubles,Curling,beijing-2022
2,Mixed Doubles,Curling,beijing-2022


In [66]:
# supprimer les doublons
events = events.drop_duplicates()

In [67]:
# vérifier des valeurs nulles
events.isnull().sum()

event_title         0
discipline_title    0
slug_game           0
dtype: int64

In [68]:
def get_event_gender(title: str) -> str:
    '''
    Détermine le genre de l'événement en fonction du titre
    '''
    title = title.lower()
    if "women" in title or "ladi" in title:
        return 'Women'
    if "men" in title:
        return 'Men'
    if "mixed" in title:
        return 'Mixed'
    return 'Men'

In [69]:
# ajout de la colonne event_gender
events['event_gender'] = events.apply(lambda x: get_event_gender(x['event_title']), axis=1)

In [70]:
# exportation
events.to_csv('exports/olympic_events.csv', index=False, encoding='utf-8')