In [330]:
# %pip install ftfy
# %pip install plotly

import pandas as pd
import ast
from unidecode import unidecode
import unicodedata
import ftfy
import warnings
import plotly.express as px
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [331]:
athletes_df = pd.read_csv('clean/athletes.csv')
hosts_df = pd.read_csv('clean/hosts.csv')
results_df = pd.read_csv('clean/results_jo.csv')

In [332]:
hosts_df.isnull().sum()

game_slug          0
game_end_date      0
game_start_date    0
game_location      0
game_name          0
game_season        0
game_year          0
dtype: int64

In [333]:
hosts_df.head(10)

Unnamed: 0,game_slug,game_end_date,game_start_date,game_location,game_name,game_season,game_year
0,beijing-2022,2022-02-20T12:00:00Z,2022-02-04T15:00:00Z,China,Beijing 2022,Winter,2022
1,tokyo-2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z,Japan,Tokyo 2020,Summer,2020
2,pyeongchang-2018,2018-02-25T08:00:00Z,2018-02-08T23:00:00Z,Republic of Korea,PyeongChang 2018,Winter,2018
3,rio-2016,2016-08-21T21:00:00Z,2016-08-05T12:00:00Z,Brazil,Rio 2016,Summer,2016
4,sochi-2014,2014-02-23T16:00:00Z,2014-02-07T04:00:00Z,Russian Federation,Sochi 2014,Winter,2014
5,london-2012,2012-08-12T19:00:00Z,2012-07-27T07:00:00Z,Great Britain,London 2012,Summer,2012
6,vancouver-2010,2010-02-28T04:00:00Z,2010-02-12T16:00:00Z,Canada,Vancouver 2010,Winter,2010
7,beijing-2008,2008-08-24T12:00:00Z,2008-08-08T00:00:00Z,China,Beijing 2008,Summer,2008
8,turin-2006,2006-02-26T19:00:00Z,2006-02-10T07:00:00Z,Italy,Turin 2006,Winter,2006
9,athens-2004,2004-08-29T18:00:00Z,2004-08-13T06:00:00Z,Greece,Athens 2004,Summer,2004


In [334]:
jo_types = hosts_df[['game_slug', 'game_season', 'game_year', 'game_end_date', 'game_start_date']]
jo_types.head(10)

Unnamed: 0,game_slug,game_season,game_year,game_end_date,game_start_date
0,beijing-2022,Winter,2022,2022-02-20T12:00:00Z,2022-02-04T15:00:00Z
1,tokyo-2020,Summer,2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z
2,pyeongchang-2018,Winter,2018,2018-02-25T08:00:00Z,2018-02-08T23:00:00Z
3,rio-2016,Summer,2016,2016-08-21T21:00:00Z,2016-08-05T12:00:00Z
4,sochi-2014,Winter,2014,2014-02-23T16:00:00Z,2014-02-07T04:00:00Z
5,london-2012,Summer,2012,2012-08-12T19:00:00Z,2012-07-27T07:00:00Z
6,vancouver-2010,Winter,2010,2010-02-28T04:00:00Z,2010-02-12T16:00:00Z
7,beijing-2008,Summer,2008,2008-08-24T12:00:00Z,2008-08-08T00:00:00Z
8,turin-2006,Winter,2006,2006-02-26T19:00:00Z,2006-02-10T07:00:00Z
9,athens-2004,Summer,2004,2004-08-29T18:00:00Z,2004-08-13T06:00:00Z


In [335]:
results_df.head(10)

Unnamed: 0,discipline_title,event_title,slug_game,participant_type,medal_type,athletes,rank_equal,rank_position,country_name,country_code,country_3_letter_code,value_unit,value_type,event_gender
0,Curling,Mixed Doubles,beijing-2022,GameTeam,GOLD,"[('Stefania CONSTANTINI', 'https://olympics.co...",False,1,Italy,IT,ITA,,,Mixed
1,Curling,Mixed Doubles,beijing-2022,GameTeam,SILVER,"[('Kristin SKASLIEN', 'https://olympics.com/en...",False,2,Norway,NO,NOR,,,Mixed
2,Curling,Mixed Doubles,beijing-2022,GameTeam,BRONZE,"[('Almida DE VAL', 'https://olympics.com/en/at...",False,3,Sweden,SE,SWE,,,Mixed
3,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Jennifer DODDS', 'https://olympics.com/en/a...",False,4,Great Britain,GB,GBR,,,Mixed
4,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Rachel HOMAN', 'https://olympics.com/en/ath...",False,5,Canada,CA,CAN,,,Mixed
5,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Zuzana HAJKOVA', 'https://olympics.com/en/a...",False,6,Czech Republic,CZ,CZE,,,Mixed
6,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Jenny PERRET', 'https://olympics.com/en/ath...",False,7,Switzerland,CH,SUI,,,Mixed
7,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Vicky PERSINGER', 'https://olympics.com/en/...",False,8,United States of America,US,USA,,,Mixed
8,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Suyuan FAN', 'https://olympics.com/en/athle...",False,9,People's Republic of China,CN,CHN,,,Mixed
9,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Tahli GILL', 'https://olympics.com/en/athle...",False,10,Australia,AU,AUS,,,Mixed


In [336]:
hosts_results_merge = results_df.merge(jo_types, left_on='slug_game', right_on="game_slug")
hosts_results_merge.head(10)

Unnamed: 0,discipline_title,event_title,slug_game,participant_type,medal_type,athletes,rank_equal,rank_position,country_name,country_code,country_3_letter_code,value_unit,value_type,event_gender,game_slug,game_season,game_year,game_end_date,game_start_date
0,Curling,Mixed Doubles,beijing-2022,GameTeam,GOLD,"[('Stefania CONSTANTINI', 'https://olympics.co...",False,1,Italy,IT,ITA,,,Mixed,beijing-2022,Winter,2022,2022-02-20T12:00:00Z,2022-02-04T15:00:00Z
1,Curling,Mixed Doubles,beijing-2022,GameTeam,SILVER,"[('Kristin SKASLIEN', 'https://olympics.com/en...",False,2,Norway,NO,NOR,,,Mixed,beijing-2022,Winter,2022,2022-02-20T12:00:00Z,2022-02-04T15:00:00Z
2,Curling,Mixed Doubles,beijing-2022,GameTeam,BRONZE,"[('Almida DE VAL', 'https://olympics.com/en/at...",False,3,Sweden,SE,SWE,,,Mixed,beijing-2022,Winter,2022,2022-02-20T12:00:00Z,2022-02-04T15:00:00Z
3,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Jennifer DODDS', 'https://olympics.com/en/a...",False,4,Great Britain,GB,GBR,,,Mixed,beijing-2022,Winter,2022,2022-02-20T12:00:00Z,2022-02-04T15:00:00Z
4,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Rachel HOMAN', 'https://olympics.com/en/ath...",False,5,Canada,CA,CAN,,,Mixed,beijing-2022,Winter,2022,2022-02-20T12:00:00Z,2022-02-04T15:00:00Z
5,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Zuzana HAJKOVA', 'https://olympics.com/en/a...",False,6,Czech Republic,CZ,CZE,,,Mixed,beijing-2022,Winter,2022,2022-02-20T12:00:00Z,2022-02-04T15:00:00Z
6,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Jenny PERRET', 'https://olympics.com/en/ath...",False,7,Switzerland,CH,SUI,,,Mixed,beijing-2022,Winter,2022,2022-02-20T12:00:00Z,2022-02-04T15:00:00Z
7,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Vicky PERSINGER', 'https://olympics.com/en/...",False,8,United States of America,US,USA,,,Mixed,beijing-2022,Winter,2022,2022-02-20T12:00:00Z,2022-02-04T15:00:00Z
8,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Suyuan FAN', 'https://olympics.com/en/athle...",False,9,People's Republic of China,CN,CHN,,,Mixed,beijing-2022,Winter,2022,2022-02-20T12:00:00Z,2022-02-04T15:00:00Z
9,Curling,Mixed Doubles,beijing-2022,GameTeam,,"[('Tahli GILL', 'https://olympics.com/en/athle...",False,10,Australia,AU,AUS,,,Mixed,beijing-2022,Winter,2022,2022-02-20T12:00:00Z,2022-02-04T15:00:00Z


In [337]:
hosts_results_merge.game_season.unique()

array(['Winter', 'Summer'], dtype=object)

In [338]:
summer_games_results = hosts_results_merge.loc[(hosts_results_merge.game_season == 'Summer')].copy()
winter_games_results = hosts_results_merge.loc[(hosts_results_merge.game_season == 'Winter')].copy()

In [339]:
summer_games_results.drop(['game_season'], axis=1, inplace=True)
summer_games_results.head()

Unnamed: 0,discipline_title,event_title,slug_game,participant_type,medal_type,athletes,rank_equal,rank_position,country_name,country_code,country_3_letter_code,value_unit,value_type,event_gender,game_slug,game_year,game_end_date,game_start_date
3784,Shooting,Trap Mixed Team,tokyo-2020,GameTeam,GOLD,"[('Fatima GALVEZ', 'https://olympics.com/en/at...",False,1,Spain,ES,ESP,,,Mixed,tokyo-2020,2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z
3785,Shooting,Trap Mixed Team,tokyo-2020,GameTeam,SILVER,"[('Alessandra PERILLI', 'https://olympics.com/...",False,2,San Marino,SM,SMR,,,Mixed,tokyo-2020,2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z
3786,Shooting,Trap Mixed Team,tokyo-2020,GameTeam,BRONZE,"[('Madelynn Ann BERNAU', 'https://olympics.com...",False,3,United States of America,US,USA,,,Mixed,tokyo-2020,2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z
3787,Shooting,Trap Mixed Team,tokyo-2020,GameTeam,,"[('Zuzana STEFECEKOVA', 'https://olympics.com/...",False,4,Slovakia,SK,SVK,,,Mixed,tokyo-2020,2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z
3788,Shooting,Trap Mixed Team,tokyo-2020,GameTeam,,"[('Yukie NAKAYAMA', 'https://olympics.com/en/a...",False,5,Japan,JP,JPN,,,Mixed,tokyo-2020,2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z


In [340]:
summer_games_results.isnull().sum()

discipline_title              0
event_title                   0
slug_game                     0
participant_type              0
medal_type               101672
athletes                  10586
rank_equal                90721
rank_position              3858
country_name                  0
country_code               3555
country_3_letter_code         0
value_unit                68966
value_type                61920
event_gender                  0
game_slug                     0
game_year                     0
game_end_date                 0
game_start_date               0
dtype: int64

In [341]:
# summer_games_results.drop_duplicates(inplace=True)
# summer_games_results.duplicated().sum()

dopplers = summer_games_results.loc[(summer_games_results.duplicated())]
dopplers

Unnamed: 0,discipline_title,event_title,slug_game,participant_type,medal_type,athletes,rank_equal,rank_position,country_name,country_code,country_3_letter_code,value_unit,value_type,event_gender,game_slug,game_year,game_end_date,game_start_date
154567,Water Polo,Water Polo Women,antwerp-1920,GameTeam,,,,1,Netherlands,NL,NED,2,SCORE,Women,antwerp-1920,1920,1920-09-12T19:00:00Z,1920-04-20T07:00:00Z
155542,Sailing,8m mixed,stockholm-1912,GameTeam,,,,5,Russian Federation,RU,RUS,0,POINTS,Mixed,stockholm-1912,1912,1912-07-27T20:00:00Z,1912-05-05T08:00:00Z
157220,Shooting,trap 125 targets men,stockholm-1912,Athlete,GOLD,"[('Jay Graham', nan)]",,1,United States of America,US,USA,96,POINTS,Men,stockholm-1912,1912,1912-07-27T20:00:00Z,1912-05-05T08:00:00Z
157221,Shooting,trap 125 targets men,stockholm-1912,Athlete,SILVER,"[('Alfred GOELDEL', 'https://olympics.com/en/a...",,2,Germany,DE,GER,94,POINTS,Men,stockholm-1912,1912,1912-07-27T20:00:00Z,1912-05-05T08:00:00Z
157222,Shooting,trap 125 targets men,stockholm-1912,Athlete,BRONZE,"[('Harry Blaus', nan)]",,3,Russian Federation,RU,RUS,91,POINTS,Men,stockholm-1912,1912,1912-07-27T20:00:00Z,1912-05-05T08:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161942,Sailing,Â½-1 Ton Race Two Open,paris-1900,GameTeam,,"[('LETOT LETOT', 'https://olympics.com/en/athl...",,DNF,France,FR,FRA,,IRM,Men,paris-1900,1900,1900-10-28T19:50:39Z,1900-05-14T08:50:39Z
161943,Sailing,Â½-1 Ton Race Two Open,paris-1900,GameTeam,,"[('LETOT LETOT', 'https://olympics.com/en/athl...",,DNF,France,FR,FRA,,IRM,Men,paris-1900,1900,1900-10-28T19:50:39Z,1900-05-14T08:50:39Z
161944,Sailing,Â½-1 Ton Race Two Open,paris-1900,GameTeam,,"[('LETOT LETOT', 'https://olympics.com/en/athl...",,DNF,France,FR,FRA,,IRM,Men,paris-1900,1900,1900-10-28T19:50:39Z,1900-05-14T08:50:39Z
161945,Sailing,Â½-1 Ton Race Two Open,paris-1900,GameTeam,,"[('LETOT LETOT', 'https://olympics.com/en/athl...",,DNF,France,FR,FRA,,IRM,Men,paris-1900,1900,1900-10-28T19:50:39Z,1900-05-14T08:50:39Z


In [342]:
summer_games_results.medal_type.unique()

array(['GOLD', 'SILVER', 'BRONZE', nan], dtype=object)

In [343]:
summer_games_results['medal_type'] = summer_games_results['medal_type'].fillna('None')

In [344]:
summer_games_results['total_medals'] = summer_games_results['medal_type'].apply(lambda x: 0 if x == 'None' else 1)
summer_games_results['gold_medals'] = summer_games_results['medal_type'].apply(lambda x: 1 if x == 'GOLD' else 0)
summer_games_results['silver_medals'] = summer_games_results['medal_type'].apply(lambda x: 1 if x == 'SILVER' else 0)
summer_games_results['bronze_medals'] = summer_games_results['medal_type'].apply(lambda x: 1 if x == 'BRONZE' else 0)
summer_games_results.head(10)

Unnamed: 0,discipline_title,event_title,slug_game,participant_type,medal_type,athletes,rank_equal,rank_position,country_name,country_code,...,value_type,event_gender,game_slug,game_year,game_end_date,game_start_date,total_medals,gold_medals,silver_medals,bronze_medals
3784,Shooting,Trap Mixed Team,tokyo-2020,GameTeam,GOLD,"[('Fatima GALVEZ', 'https://olympics.com/en/at...",False,1,Spain,ES,...,,Mixed,tokyo-2020,2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z,1,1,0,0
3785,Shooting,Trap Mixed Team,tokyo-2020,GameTeam,SILVER,"[('Alessandra PERILLI', 'https://olympics.com/...",False,2,San Marino,SM,...,,Mixed,tokyo-2020,2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z,1,0,1,0
3786,Shooting,Trap Mixed Team,tokyo-2020,GameTeam,BRONZE,"[('Madelynn Ann BERNAU', 'https://olympics.com...",False,3,United States of America,US,...,,Mixed,tokyo-2020,2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z,1,0,0,1
3787,Shooting,Trap Mixed Team,tokyo-2020,GameTeam,,"[('Zuzana STEFECEKOVA', 'https://olympics.com/...",False,4,Slovakia,SK,...,,Mixed,tokyo-2020,2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z,0,0,0,0
3788,Shooting,Trap Mixed Team,tokyo-2020,GameTeam,,"[('Yukie NAKAYAMA', 'https://olympics.com/en/a...",False,5,Japan,JP,...,,Mixed,tokyo-2020,2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z,0,0,0,0
3789,Shooting,Trap Mixed Team,tokyo-2020,GameTeam,,"[('Penny SMITH', 'https://olympics.com/en/athl...",False,6,Australia,AU,...,,Mixed,tokyo-2020,2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z,0,0,0,0
3790,Shooting,Trap Mixed Team,tokyo-2020,GameTeam,,"[('Laetisha SCANLAN', 'https://olympics.com/en...",False,7,Australia,AU,...,,Mixed,tokyo-2020,2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z,0,0,0,0
3791,Shooting,Trap Mixed Team,tokyo-2020,GameTeam,,"[('Jana SPOTAKOVA', 'https://olympics.com/en/a...",False,8,Slovakia,SK,...,,Mixed,tokyo-2020,2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z,0,0,0,0
3792,Shooting,Trap Mixed Team,tokyo-2020,GameTeam,,"[('Xiaojing WANG', 'https://olympics.com/en/at...",False,9,People's Republic of China,CN,...,,Mixed,tokyo-2020,2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z,0,0,0,0
3793,Shooting,Trap Mixed Team,tokyo-2020,GameTeam,,"[('Kirsty HEGARTY', 'https://olympics.com/en/a...",False,10,Great Britain,GB,...,,Mixed,tokyo-2020,2020,2021-08-08T14:00:00Z,2021-07-23T11:00:00Z,0,0,0,0


In [345]:
historic_medalsbycountry = summer_games_results.groupby(['game_year','country_name']).agg({'total_medals':'sum', 'gold_medals':'sum', 'silver_medals':'sum','bronze_medals':'sum'})
historic_medalsbycountry.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_medals,gold_medals,silver_medals,bronze_medals
game_year,country_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1896,Australia,2,2,0,0
1896,Austria,5,2,1,2
1896,Chile,0,0,0,0
1896,Denmark,6,1,2,3
1896,France,11,5,4,2
1896,Germany,13,6,5,2
1896,Great Britain,7,2,3,2
1896,Greece,47,10,18,19
1896,Hungary,6,2,1,3
1896,Italy,0,0,0,0


In [346]:
historic_medalsbycountry = historic_medalsbycountry.sort_values(by=['game_year','total_medals'],ascending=[True, False])
historic_medalsbycountry.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_medals,gold_medals,silver_medals,bronze_medals
game_year,country_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1896,Greece,47,10,18,19
1896,United States of America,20,11,7,2
1896,Germany,13,6,5,2
1896,France,11,5,4,2
1896,Great Britain,7,2,3,2
1896,Denmark,6,1,2,3
1896,Hungary,6,2,1,3
1896,Austria,5,2,1,2
1896,Switzerland,3,1,2,0
1896,Australia,2,2,0,0


In [347]:
historic_sportsbycountry = summer_games_results.groupby(['game_year','country_name','discipline_title']).count()
historic_sportsbycountry.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,event_title,slug_game,participant_type,medal_type,athletes,rank_equal,rank_position,country_code,country_3_letter_code,value_unit,value_type,event_gender,game_slug,game_end_date,game_start_date,total_medals,gold_medals,silver_medals,bronze_medals
game_year,country_name,discipline_title,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1896,Australia,Athletics,3,3,3,3,3,0,3,3,3,2,3,3,3,3,3,3,3,3,3
1896,Australia,Tennis,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1
1896,Austria,Cycling Track,4,4,4,4,4,0,4,4,4,2,4,4,4,4,4,4,4,4,4
1896,Austria,Fencing,1,1,1,1,1,0,1,1,1,0,0,1,1,1,1,1,1,1,1
1896,Austria,Swimming,4,4,4,4,4,0,4,4,4,2,4,4,4,4,4,4,4,4,4
1896,Chile,Athletics,2,2,2,2,2,0,2,2,2,0,0,2,2,2,2,2,2,2,2
1896,Denmark,Athletics,5,5,5,5,5,0,3,5,5,0,5,5,5,5,5,5,5,5,5
1896,Denmark,Fencing,1,1,1,1,1,0,1,1,1,0,0,1,1,1,1,1,1,1,1
1896,Denmark,Gymnastics Artistic,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1
1896,Denmark,Shooting,7,7,7,7,7,1,7,7,7,3,7,7,7,7,7,7,7,7,7


In [348]:
historic_sportsbycountry = historic_sportsbycountry[['event_title']].reset_index()
historic_sportsbycountry = historic_sportsbycountry.rename({'discipline_title':'sports', 'event_title':'epreuves'},axis=1)
historic_sportsbycountry

Unnamed: 0,game_year,country_name,sports,epreuves
0,1896,Australia,Athletics,3
1,1896,Australia,Tennis,1
2,1896,Austria,Cycling Track,4
3,1896,Austria,Fencing,1
4,1896,Austria,Swimming,4
...,...,...,...,...
23451,2020,Zambia,Boxing,3
23452,2020,Zambia,Football,1
23453,2020,Zambia,Judo,1
23454,2020,Zimbabwe,Golf,1


In [349]:
historic_sportsbycountry['country_name'].unique()

array(['Australia', 'Austria', 'Chile', 'Denmark', 'France', 'Germany',
       'Great Britain', 'Greece', 'Hungary', 'Italy', 'MIX', 'Sweden',
       'Switzerland', 'United States of America', 'Argentina', 'Belgium',
       'Bohemia', 'Brazil', 'Canada', 'Cuba', 'India', 'Luxembourg',
       'Netherlands', 'New Zealand', 'Norway', 'Romania',
       'Russian Federation', 'Spain', 'Newfoundland', 'South Africa',
       'Australasia', 'Finland', 'Iceland', 'Turkey', 'Japan', 'Portugal',
       'Serbia', 'Czechoslovakia', 'Egypt', 'Estonia', 'Monaco', 'Poland',
       'Yugoslavia', 'Bulgaria', 'Ecuador', 'Haiti', 'Ireland', 'Latvia',
       'Lithuania', 'Mexico', 'Philippines', 'Uruguay', 'Malta',
       'Zimbabwe', 'Colombia', 'Afghanistan', 'Bermuda', 'Liechtenstein',
       "People's Republic of China", 'Peru', 'Guyana', 'Iraq',
       'Islamic Republic of Iran', 'Jamaica', 'Lebanon', 'Myanmar',
       'Pakistan', 'Panama', 'Puerto Rico', 'Republic of Korea',
       'Singapore', 'Sri La

In [350]:
historic_sportsbycountry = historic_sportsbycountry.groupby(['game_year','country_name']).agg({'sports':'count'})
historic_sportsbycountry

Unnamed: 0_level_0,Unnamed: 1_level_0,sports
game_year,country_name,Unnamed: 2_level_1
1896,Australia,2
1896,Austria,3
1896,Chile,1
1896,Denmark,6
1896,France,7
...,...,...
2020,"Virgin Islands, British",1
2020,"Virgin Islands, US",1
2020,Yemen,2
2020,Zambia,3


In [351]:
historic_epreuvesbycountry = summer_games_results.groupby(['game_year','country_name','event_title']).count()
historic_epreuvesbycountry

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,discipline_title,slug_game,participant_type,medal_type,athletes,rank_equal,rank_position,country_code,country_3_letter_code,value_unit,value_type,event_gender,game_slug,game_end_date,game_start_date,total_medals,gold_medals,silver_medals,bronze_medals
game_year,country_name,event_title,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1896,Australia,1500m men,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1
1896,Australia,800m men,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1
1896,Australia,Singles men,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1
1896,Australia,marathon men,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1
1896,Austria,100km men,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020,Zambia,Men's Fly (48-52kg),1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1
2020,Zambia,Men's Welter (63-69kg),1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1
2020,Zambia,Women,1,1,1,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1,1
2020,Zimbabwe,Men's Individual Stroke Play,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [352]:
historic_epreuvesbycountry = historic_epreuvesbycountry[['discipline_title']].reset_index()
historic_epreuvesbycountry = historic_epreuvesbycountry.rename({'event_title':'epreuves', 'discipline_title':'participation'},axis=1)
historic_epreuvesbycountry = historic_epreuvesbycountry.groupby(['game_year','country_name']).agg({'epreuves':'count'})
historic_epreuvesbycountry = historic_epreuvesbycountry.reset_index()
historic_epreuvesbycountry

Unnamed: 0,game_year,country_name,epreuves
0,1896,Australia,4
1,1896,Austria,8
2,1896,Chile,2
3,1896,Denmark,14
4,1896,France,20
...,...,...,...
2884,2020,"Virgin Islands, British",2
2885,2020,"Virgin Islands, US",1
2886,2020,Yemen,2
2887,2020,Zambia,5


In [353]:
historic_olympic_data = historic_medalsbycountry.merge(historic_sportsbycountry, on=['game_year','country_name'])
historic_olympic_data = historic_olympic_data.merge(historic_epreuvesbycountry, on=['game_year','country_name'])
historic_olympic_data = historic_olympic_data.sort_values(by=['game_year','total_medals'],ascending=[True, False])
historic_olympic_data

Unnamed: 0,game_year,country_name,total_medals,gold_medals,silver_medals,bronze_medals,sports,epreuves
0,1896,Greece,47,10,18,19,10,39
1,1896,United States of America,20,11,7,2,3,17
2,1896,Germany,13,6,5,2,7,27
3,1896,France,11,5,4,2,7,20
4,1896,Great Britain,7,2,3,2,9,20
...,...,...,...,...,...,...,...,...
2884,2020,"Virgin Islands, British",0,0,0,0,1,2
2885,2020,"Virgin Islands, US",0,0,0,0,1,1
2886,2020,Yemen,0,0,0,0,2,2
2887,2020,Zambia,0,0,0,0,3,5


In [354]:
historic_olympic_data.game_year.dtype

dtype('int64')

In [355]:
historic_olympic_data.loc[(historic_olympic_data.country_name=='German Democratic Republic (Germany)')]

Unnamed: 0,game_year,country_name,total_medals,gold_medals,silver_medals,bronze_medals,sports,epreuves
688,1968,German Democratic Republic (Germany),25,9,9,7,20,104
792,1972,German Democratic Republic (Germany),66,20,23,23,20,142
911,1976,German Democratic Republic (Germany),90,40,25,25,18,134
1017,1980,German Democratic Republic (Germany),126,47,37,42,18,159
1228,1988,German Democratic Republic (Germany),102,37,35,30,17,154


In [356]:
historic_olympic_data.loc[(historic_olympic_data.country_name=='ROC')]

Unnamed: 0,game_year,country_name,total_medals,gold_medals,silver_medals,bronze_medals,sports,epreuves
2701,2020,ROC,70,20,28,22,38,182


In [357]:
historic_olympic_data.loc[(historic_olympic_data.country_name=='Unified Team')]

Unnamed: 0,game_year,country_name,total_medals,gold_medals,silver_medals,bronze_medals,sports,epreuves
1383,1992,Unified Team,112,45,38,29,30,207


In [358]:
historic_olympic_data.loc[(historic_olympic_data.game_year==1992)]['country_name'].unique()

array(['Unified Team', 'United States of America', 'Germany',
       "People's Republic of China", 'Cuba', 'Hungary', 'France',
       'Republic of Korea', 'Australia', 'Japan', 'Spain',
       'Great Britain', 'Italy', 'Poland', 'Canada', 'Romania',
       'Bulgaria', 'Netherlands', 'Sweden', 'New Zealand',
       "Democratic People's Republic of Korea", 'Kenya', 'Czechoslovakia',
       'Norway', 'Denmark', 'Turkey', 'Finland', 'Indonesia', 'Jamaica',
       'Nigeria', 'Belgium', 'Brazil', 'Croatia', 'Ethiopia',
       'Independent Olympic Athletes', 'Islamic Republic of Iran',
       'Latvia', 'Morocco', 'Algeria', 'Austria', 'Estonia', 'Greece',
       'Ireland', 'Israel', 'Lithuania', 'Mongolia', 'Namibia',
       'Slovenia', 'South Africa', 'Argentina', 'Bahamas',
       'Chinese Taipei', 'Colombia', 'Ghana', 'Malaysia', 'Mexico',
       'Pakistan', 'Peru', 'Philippines', 'Puerto Rico', 'Qatar',
       'Suriname', 'Switzerland', 'Thailand', 'Albania', 'American Samoa',
       'An

In [359]:
fig = px.bar(historic_olympic_data, 
             x="country_name", 
             y="total_medals", 
             animation_frame="game_year", 
             hover_name="country_name")

fig.update_layout(
        title="Country ranked on Total medals since 1896",
        xaxis_title="Country",
        yaxis_title="Total medals",
        showlegend=False)

In [360]:
game_part = summer_games_results.groupby(['country_name','game_year']).agg({'total_medals':'sum', 'gold_medals':'sum', 'silver_medals':'sum','bronze_medals':'sum'})
game_part

Unnamed: 0_level_0,Unnamed: 1_level_0,total_medals,gold_medals,silver_medals,bronze_medals
country_name,game_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,1936,0,0,0,0
Afghanistan,1948,0,0,0,0
Afghanistan,1956,0,0,0,0
Afghanistan,1960,0,0,0,0
Afghanistan,1964,0,0,0,0
...,...,...,...,...,...
Zimbabwe,2004,3,1,1,1
Zimbabwe,2008,4,1,3,0
Zimbabwe,2012,0,0,0,0
Zimbabwe,2016,0,0,0,0


In [361]:
game_p = game_part.reset_index()
game_p_france = game_p.loc[(game_p.country_name=='France')]
game_p_france

Unnamed: 0,country_name,game_year,total_medals,gold_medals,silver_medals,bronze_medals
886,France,1896,11,5,4,2
887,France,1900,97,26,36,35
888,France,1904,1,0,1,0
889,France,1908,19,5,5,9
890,France,1912,14,7,4,3
891,France,1920,41,9,19,13
892,France,1924,38,13,15,10
893,France,1928,21,6,10,5
894,France,1932,19,10,5,4
895,France,1936,18,7,5,6


In [362]:
import warnings
warnings.filterwarnings('ignore')

In [363]:
game_p_france = game_p.loc[(game_p.country_name=='France')]
game_p_france['game_part'] = range(0, game_p_france.shape[0])
game_p_france['prec_game_medal'] = game_p_france['total_medals'].shift(1, fill_value=0)
game_p_france['prec_game_gold'] = game_p_france['gold_medals'].shift(1, fill_value=0)
game_p_france['prec_game_silver'] = game_p_france['silver_medals'].shift(1, fill_value=0)
game_p_france['prec_game_bronze'] = game_p_france['bronze_medals'].shift(1, fill_value=0)
#game_p_france['total_game_medal'] = 0
#game_p_france['total_game_medal'] = game_p_france['total_game_medal'].shift(1, fill_value=0) + game_p_france['total_medals'].shift(1, fill_value=0)
#game_p_france['prec_game_medal'].shift(1, fill_value=0) + game_p_france['prec_game_medal'].shift(2, fill_value=0)
game_p_france

Unnamed: 0,country_name,game_year,total_medals,gold_medals,silver_medals,bronze_medals,game_part,prec_game_medal,prec_game_gold,prec_game_silver,prec_game_bronze
886,France,1896,11,5,4,2,0,0,0,0,0
887,France,1900,97,26,36,35,1,11,5,4,2
888,France,1904,1,0,1,0,2,97,26,36,35
889,France,1908,19,5,5,9,3,1,0,1,0
890,France,1912,14,7,4,3,4,19,5,5,9
891,France,1920,41,9,19,13,5,14,7,4,3
892,France,1924,38,13,15,10,6,41,9,19,13
893,France,1928,21,6,10,5,7,38,13,15,10
894,France,1932,19,10,5,4,8,21,6,10,5
895,France,1936,18,7,5,6,9,19,10,5,4


In [364]:
liste_country = list(game_p.country_name.unique())
liste_country.remove('France')
liste_country

['Afghanistan',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australasia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bohemia',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'British Virgin Islands',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'Chinese Taipei',
 'Colombia',
 'Comoros',
 'Congo',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Czechoslovakia',
 "CÃ´te d'Ivoire",
 "Democratic People's Republic of Korea",
 'Democratic Republic of Timor-Leste',
 'Democratic Republic of the Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Gu

In [365]:
for country in liste_country:
    game_p_temp = game_p.loc[(game_p.country_name==country)]
    game_p_temp['game_part'] = range(0, game_p_temp.shape[0])
    game_p_temp['prec_game_medal'] = game_p_temp['total_medals'].shift(1, fill_value=0)
    game_p_temp['prec_game_gold'] = game_p_temp['gold_medals'].shift(1, fill_value=0)
    game_p_temp['prec_game_silver'] = game_p_temp['silver_medals'].shift(1, fill_value=0)
    game_p_temp['prec_game_bronze'] = game_p_temp['bronze_medals'].shift(1, fill_value=0)
    game_p_france = pd.concat([game_p_france,game_p_temp])
game_p_france

Unnamed: 0,country_name,game_year,total_medals,gold_medals,silver_medals,bronze_medals,game_part,prec_game_medal,prec_game_gold,prec_game_silver,prec_game_bronze
886,France,1896,11,5,4,2,0,0,0,0,0
887,France,1900,97,26,36,35,1,11,5,4,2
888,France,1904,1,0,1,0,2,97,26,36,35
889,France,1908,19,5,5,9,3,1,0,1,0
890,France,1912,14,7,4,3,4,19,5,5,9
...,...,...,...,...,...,...,...,...,...,...,...
2884,Zimbabwe,2004,3,1,1,1,9,0,0,0,0
2885,Zimbabwe,2008,4,1,3,0,10,3,1,1,1
2886,Zimbabwe,2012,0,0,0,0,11,4,1,3,0
2887,Zimbabwe,2016,0,0,0,0,12,0,0,0,0


In [366]:
game_p_france = game_p_france[['game_year', 'country_name', 'total_medals', 'gold_medals', 'silver_medals', 'bronze_medals', 'game_part', 'prec_game_medal', 'prec_game_gold', 'prec_game_silver', 'prec_game_bronze']]
game_p_france

Unnamed: 0,game_year,country_name,total_medals,gold_medals,silver_medals,bronze_medals,game_part,prec_game_medal,prec_game_gold,prec_game_silver,prec_game_bronze
886,1896,France,11,5,4,2,0,0,0,0,0
887,1900,France,97,26,36,35,1,11,5,4,2
888,1904,France,1,0,1,0,2,97,26,36,35
889,1908,France,19,5,5,9,3,1,0,1,0
890,1912,France,14,7,4,3,4,19,5,5,9
...,...,...,...,...,...,...,...,...,...,...,...
2884,2004,Zimbabwe,3,1,1,1,9,0,0,0,0
2885,2008,Zimbabwe,4,1,3,0,10,3,1,1,1
2886,2012,Zimbabwe,0,0,0,0,11,4,1,3,0
2887,2016,Zimbabwe,0,0,0,0,12,0,0,0,0


In [367]:
game_p_france = game_p_france.sort_values(by=['game_year','total_medals'],ascending=[True, False])
game_p_france

Unnamed: 0,game_year,country_name,total_medals,gold_medals,silver_medals,bronze_medals,game_part,prec_game_medal,prec_game_gold,prec_game_silver,prec_game_bronze
1008,1896,Greece,47,10,18,19,0,0,0,0,0
2738,1896,United States of America,20,11,7,2,0,0,0,0,0
943,1896,Germany,13,6,5,2,0,0,0,0,0
886,1896,France,11,5,4,2,0,0,0,0,0
979,1896,Great Britain,7,2,3,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2831,2020,"Virgin Islands, British",0,0,0,0,0,0,0,0,0
2832,2020,"Virgin Islands, US",0,0,0,0,0,0,0,0,0
2840,2020,Yemen,0,0,0,0,6,0,0,0,0
2874,2020,Zambia,0,0,0,0,14,0,0,0,0


In [368]:
game_p_france = game_p_france[['game_year', 'country_name', 'game_part', 'prec_game_medal', 'prec_game_gold', 'prec_game_silver', 'prec_game_bronze']]
game_p_france.head(20)

Unnamed: 0,game_year,country_name,game_part,prec_game_medal,prec_game_gold,prec_game_silver,prec_game_bronze
1008,1896,Greece,0,0,0,0,0
2738,1896,United States of America,0,0,0,0,0
943,1896,Germany,0,0,0,0,0
886,1896,France,0,0,0,0,0
979,1896,Great Britain,0,0,0,0,0
699,1896,Denmark,0,0,0,0,0
1149,1896,Hungary,0,0,0,0,0
152,1896,Austria,0,0,0,0,0
2519,1896,Switzerland,0,0,0,0,0
124,1896,Australia,0,0,0,0,0


In [369]:
historic_olympic_data = pd.merge(historic_olympic_data, game_p_france, on=['game_year','country_name'])
historic_olympic_data

Unnamed: 0,game_year,country_name,total_medals,gold_medals,silver_medals,bronze_medals,sports,epreuves,game_part,prec_game_medal,prec_game_gold,prec_game_silver,prec_game_bronze
0,1896,Greece,47,10,18,19,10,39,0,0,0,0,0
1,1896,United States of America,20,11,7,2,3,17,0,0,0,0,0
2,1896,Germany,13,6,5,2,7,27,0,0,0,0,0
3,1896,France,11,5,4,2,7,20,0,0,0,0,0
4,1896,Great Britain,7,2,3,2,9,20,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2884,2020,"Virgin Islands, British",0,0,0,0,1,2,0,0,0,0,0
2885,2020,"Virgin Islands, US",0,0,0,0,1,1,0,0,0,0,0
2886,2020,Yemen,0,0,0,0,2,2,6,0,0,0,0
2887,2020,Zambia,0,0,0,0,3,5,14,0,0,0,0


In [370]:
historic_olympic_data = historic_olympic_data.sort_values(by=['game_year','total_medals'],ascending=[True, False])
historic_olympic_data

Unnamed: 0,game_year,country_name,total_medals,gold_medals,silver_medals,bronze_medals,sports,epreuves,game_part,prec_game_medal,prec_game_gold,prec_game_silver,prec_game_bronze
0,1896,Greece,47,10,18,19,10,39,0,0,0,0,0
1,1896,United States of America,20,11,7,2,3,17,0,0,0,0,0
2,1896,Germany,13,6,5,2,7,27,0,0,0,0,0
3,1896,France,11,5,4,2,7,20,0,0,0,0,0
4,1896,Great Britain,7,2,3,2,9,20,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2884,2020,"Virgin Islands, British",0,0,0,0,1,2,0,0,0,0,0
2885,2020,"Virgin Islands, US",0,0,0,0,1,1,0,0,0,0,0
2886,2020,Yemen,0,0,0,0,2,2,6,0,0,0,0
2887,2020,Zambia,0,0,0,0,3,5,14,0,0,0,0


In [371]:
liste_pays = list(historic_olympic_data.country_name.unique())
dict_pays = {}
for pays in enumerate(liste_pays):
    dict_pays[pays[1]] = pays[0]
dict_pays

{'Greece': 0,
 'United States of America': 1,
 'Germany': 2,
 'France': 3,
 'Great Britain': 4,
 'Denmark': 5,
 'Hungary': 6,
 'Austria': 7,
 'Switzerland': 8,
 'Australia': 9,
 'MIX': 10,
 'Chile': 11,
 'Italy': 12,
 'Sweden': 13,
 'Belgium': 14,
 'Netherlands': 15,
 'Norway': 16,
 'Bohemia': 17,
 'Canada': 18,
 'Cuba': 19,
 'India': 20,
 'Luxembourg': 21,
 'Spain': 22,
 'Argentina': 23,
 'Brazil': 24,
 'New Zealand': 25,
 'Romania': 26,
 'Russian Federation': 27,
 'Newfoundland': 28,
 'South Africa': 29,
 'Australasia': 30,
 'Finland': 31,
 'Iceland': 32,
 'Turkey': 33,
 'Japan': 34,
 'Portugal': 35,
 'Serbia': 36,
 'Estonia': 37,
 'Czechoslovakia': 38,
 'Egypt': 39,
 'Monaco': 40,
 'Poland': 41,
 'Yugoslavia': 42,
 'Haiti': 43,
 'Uruguay': 44,
 'Bulgaria': 45,
 'Ecuador': 46,
 'Ireland': 47,
 'Latvia': 48,
 'Lithuania': 49,
 'Mexico': 50,
 'Philippines': 51,
 'Malta': 52,
 'Zimbabwe': 53,
 'Colombia': 54,
 'Afghanistan': 55,
 'Bermuda': 56,
 'Liechtenstein': 57,
 "People's Republic 

In [372]:
data_all_train = historic_olympic_data.loc[(historic_olympic_data.game_year < 2020)]
data_all_test = historic_olympic_data.loc[(historic_olympic_data.game_year == 2020)]
data_all_test.shape

(190, 13)

In [373]:
X_all_train = data_all_train[['country_name', 'sports', 'epreuves', 'game_part', 'prec_game_medal', 'prec_game_gold', 'prec_game_silver', 'prec_game_bronze']]
X_all_test = data_all_test[['country_name', 'sports', 'epreuves', 'game_part', 'prec_game_medal', 'prec_game_gold', 'prec_game_silver', 'prec_game_bronze']]

y_all_total_train = data_all_train['total_medals']
y_all_gold_train = data_all_train['gold_medals']
y_all_silver_train = data_all_train['silver_medals']
y_all_bronze_train = data_all_train['bronze_medals']

y_all_total_test = data_all_test['total_medals']
y_all_gold_test = data_all_test['gold_medals']
y_all_silver_test = data_all_test['silver_medals']
y_all_bronze_test = data_all_test['bronze_medals']

In [374]:
X_all_train = X_all_train.replace(dict_pays)
X_all_test = X_all_test.replace(dict_pays)

In [375]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [376]:
model = LinearRegression()
model.fit(X_all_train, y_all_total_train)

In [377]:
predictions = model.predict(X_all_test)

print('Predicted labels : ', np.round(predictions)[:10])
print('Actual labels : ' , y_all_total_test[:10])

mse = mean_squared_error(y_all_total_test, predictions)
print("MSE:", mse)

rmse = np.sqrt(mse)
print("RMSE:", rmse)

r2 = r2_score(y_all_total_test, predictions)
print("R2:", r2)

Predicted labels :  [102.  61.  37.  60.  37.  31.  33.  45.  18.  30.]
Actual labels :  2699    113
2700     88
2701     70
2702     65
2703     57
2704     45
2705     40
2706     37
2707     36
2708     33
Name: total_medals, dtype: int64
MSE: 32.30623748434444
RMSE: 5.683857623511029
R2: 0.8449657359930957


In [378]:
result_df_2020_total_medals = pd.DataFrame({'pays': data_all_test['country_name'],'reglog_total_medals_pred':np.round(predictions),'total_medals_truth':data_all_test['total_medals']})
result_df_2020_total_medals

Unnamed: 0,pays,reglog_total_medals_pred,total_medals_truth
2699,United States of America,102.0,113
2700,People's Republic of China,61.0,88
2701,ROC,37.0,70
2702,Great Britain,60.0,65
2703,Japan,37.0,57
...,...,...,...
2884,"Virgin Islands, British",0.0,0
2885,"Virgin Islands, US",0.0,0
2886,Yemen,-2.0,0
2887,Zambia,-3.0,0


In [379]:
from sklearn.tree import DecisionTreeRegressor

model_1 = DecisionTreeRegressor()
model_1.fit(X_all_train, y_all_total_train)

In [380]:
predictions_1 = model_1.predict(X_all_test)

print('Predicted labels : ', np.round(predictions_1)[:10])
print('Actual labels : ' , y_all_total_test[:10])

mse = mean_squared_error(y_all_total_test, predictions_1)
print("MSE:", mse)

rmse = np.sqrt(mse)
print("RMSE:", rmse)

r2 = r2_score(y_all_total_test, predictions_1)
print("R2:", r2)

Predicted labels :  [104.  56. 112.  67.  19.  27.  27.  38.  18.  16.]
Actual labels :  2699    113
2700     88
2701     70
2702     65
2703     57
2704     45
2705     40
2706     37
2707     36
2708     33
Name: total_medals, dtype: int64
MSE: 33.73157894736842
RMSE: 5.807889371137197
R2: 0.8381256709813303


In [381]:
result_df_2020_total_medals['tree_total_medals_pred'] = np.round(predictions_1)
result_df_2020_total_medals

Unnamed: 0,pays,reglog_total_medals_pred,total_medals_truth,tree_total_medals_pred
2699,United States of America,102.0,113,104.0
2700,People's Republic of China,61.0,88,56.0
2701,ROC,37.0,70,112.0
2702,Great Britain,60.0,65,67.0
2703,Japan,37.0,57,19.0
...,...,...,...,...
2884,"Virgin Islands, British",0.0,0,1.0
2885,"Virgin Islands, US",0.0,0,0.0
2886,Yemen,-2.0,0,0.0
2887,Zambia,-3.0,0,0.0


In [382]:
from sklearn.ensemble import GradientBoostingRegressor

model_2 = GradientBoostingRegressor()
model_2.fit(X_all_train, y_all_total_train)

In [383]:
predictions_2 = model_2.predict(X_all_test)

print('Predicted labels : ', np.round(predictions_2)[:10])
print('Actual labels : ' , y_all_total_test[:10])

mse = mean_squared_error(y_all_total_test, predictions_2)
print("MSE:", mse)

rmse = np.sqrt(mse)
print("RMSE:", rmse)

r2 = r2_score(y_all_total_test, predictions_2)
print("R2:", r2)

Predicted labels :  [112.  59.  53.  66.  38.  31.  32.  35.  19.  34.]
Actual labels :  2699    113
2700     88
2701     70
2702     65
2703     57
2704     45
2705     40
2706     37
2707     36
2708     33
Name: total_medals, dtype: int64
MSE: 14.754757651144143
RMSE: 3.8411922174168978
R2: 0.9291934570172745


In [384]:
result_df_2020_total_medals['XGB_total_medals_pred'] = np.round(predictions_2)
result_df_2020_total_medals.columns

Index(['pays', 'reglog_total_medals_pred', 'total_medals_truth',
       'tree_total_medals_pred', 'XGB_total_medals_pred'],
      dtype='object')

In [385]:
result_df_2020_total_medals = result_df_2020_total_medals[['pays', 'reglog_total_medals_pred', 'tree_total_medals_pred', 'XGB_total_medals_pred', 'total_medals_truth']]
result_df_2020_total_medals

Unnamed: 0,pays,reglog_total_medals_pred,tree_total_medals_pred,XGB_total_medals_pred,total_medals_truth
2699,United States of America,102.0,104.0,112.0,113
2700,People's Republic of China,61.0,56.0,59.0,88
2701,ROC,37.0,112.0,53.0,70
2702,Great Britain,60.0,67.0,66.0,65
2703,Japan,37.0,19.0,38.0,57
...,...,...,...,...,...
2884,"Virgin Islands, British",0.0,1.0,0.0,0
2885,"Virgin Islands, US",0.0,0.0,0.0,0
2886,Yemen,-2.0,0.0,0.0,0
2887,Zambia,-3.0,0.0,0.0,0


In [386]:
# ------------------------------------------------------------------------------------------------------------

print(historic_olympic_data['sports'])

data_all_train = historic_olympic_data.loc[(historic_olympic_data.game_year < 2020)]
data_all_test = historic_olympic_data.loc[(historic_olympic_data.game_year == 2020)]

X_all_train = data_all_train[['country_name', 'sports', 'epreuves', 'game_part', 'prec_game_medal', 'prec_game_gold', 'prec_game_silver', 'prec_game_bronze']]
X_all_test = data_all_test[['country_name', 'sports', 'epreuves', 'game_part', 'prec_game_medal', 'prec_game_gold', 'prec_game_silver', 'prec_game_bronze']]

y_all_total_train = data_all_train['total_medals']
y_all_gold_train = data_all_train['gold_medals']
y_all_silver_train = data_all_train['silver_medals']
y_all_bronze_train = data_all_train['bronze_medals']

y_all_total_test = data_all_test['total_medals']
y_all_gold_test = data_all_test['gold_medals']
y_all_silver_test = data_all_test['silver_medals']
y_all_bronze_test = data_all_test['bronze_medals']

0       10
1        3
2        7
3        7
4        9
        ..
2884     1
2885     1
2886     2
2887     3
2888     2
Name: sports, Length: 2889, dtype: int64


In [387]:
data_all_test['country_name']

2699      United States of America
2700    People's Republic of China
2701                           ROC
2702                 Great Britain
2703                         Japan
                   ...            
2884       Virgin Islands, British
2885            Virgin Islands, US
2886                         Yemen
2887                        Zambia
2888                      Zimbabwe
Name: country_name, Length: 190, dtype: object

In [388]:
X_all_train = X_all_train.replace(dict_pays)
X_all_test = X_all_test.replace(dict_pays)
print(X_all_train.shape)
print(X_all_test.shape)

(2699, 8)
(190, 8)


In [389]:
print(X_all_test.to_json())
print(X_all_test)

{"country_name":{"2699":1,"2700":58,"2701":229,"2702":4,"2703":34,"2704":9,"2705":12,"2706":2,"2707":15,"2708":3,"2709":18,"2710":24,"2711":6,"2712":25,"2713":62,"2714":191,"2715":22,"2716":19,"2717":41,"2718":8,"2719":33,"2720":86,"2721":193,"2722":5,"2723":89,"2724":60,"2725":36,"2726":13,"2727":185,"2728":198,"2729":194,"2730":16,"2731":7,"2732":201,"2733":192,"2734":14,"2735":20,"2736":63,"2737":45,"2738":39,"2739":97,"2740":54,"2741":128,"2742":79,"2743":188,"2744":200,"2745":197,"2746":87,"2747":0,"2748":47,"2749":80,"2750":50,"2751":116,"2752":51,"2753":35,"2754":26,"2755":196,"2756":93,"2757":74,"2758":23,"2759":46,"2760":207,"2761":167,"2762":99,"2763":29,"2764":76,"2765":37,"2766":88,"2767":31,"2768":152,"2769":226,"2770":48,"2771":114,"2772":82,"2773":85,"2774":103,"2775":157,"2776":56,"2777":159,"2778":146,"2779":230,"2780":77,"2781":162,"2782":132,"2783":49,"2784":96,"2785":187,"2786":231,"2787":64,"2788":199,"2789":141,"2790":73,"2791":212,"2792":55,"2793":136,"2794":104,

In [390]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_all_train, y_all_total_train)
predictions = model.predict(X_all_test)

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
# model_1 = DecisionTreeRegressor()
# model_1.fit(X_all_train, y_all_total_train)
# predictions_1 = model_1.predict(X_all_test)

param_grid_dt = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search_dt = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=param_grid_dt, cv=5, scoring='neg_mean_squared_error')
grid_search_dt.fit(X_all_train, y_all_total_train)
best_model_dt = grid_search_dt.best_estimator_
predictions_1 = best_model_dt.predict(X_all_test)

from sklearn.ensemble import GradientBoostingRegressor
model_2 = GradientBoostingRegressor()
model_2.fit(X_all_train, y_all_total_train)
predictions_2 = model_2.predict(X_all_test)

In [391]:
import joblib
joblib.dump(model, './h5/linear_regression_model.h5')
joblib.dump(best_model_dt, './h5/decision_tree_model.h5')
joblib.dump(model_2, './h5/gradient_boosting_model.h5')

['./h5/gradient_boosting_model.h5']

In [392]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, r2

mse, rmse, r2 = evaluate_model(y_all_total_test, predictions)
print("Linear Regression - MSE:", mse, "RMSE:", rmse, "R2:", r2)

mse_1, rmse_1, r2_1 = evaluate_model(y_all_total_test, predictions_1)
print("Decision Tree - MSE:", mse_1, "RMSE:", rmse_1, "R2:", r2_1)

mse_2, rmse_2, r2_2 = evaluate_model(y_all_total_test, predictions_2)
print("Gradient Boosting - MSE:", mse_2, "RMSE:", rmse_2, "R2:", r2_2)


Linear Regression - MSE: 32.30623748434444 RMSE: 5.683857623511029 R2: 0.8449657359930957
Decision Tree - MSE: 13.547861409476067 RMSE: 3.680741964533247 R2: 0.9349852261965353
Gradient Boosting - MSE: 14.75339511772916 RMSE: 3.841014855182047 R2: 0.9291999956730149


In [393]:
data_all_test['total_medals']

2699    113
2700     88
2701     70
2702     65
2703     57
       ... 
2884      0
2885      0
2886      0
2887      0
2888      0
Name: total_medals, Length: 190, dtype: int64

In [394]:
data_all_test['epreuves'].count()

190

In [413]:
data_all_test['sports'].count()

190

In [396]:
print(data_all_test['sports'].unique())
print(data_all_test['sports'].count())
print(data_all_test['sports'])

[43 41 38 35 46 40 36 31 32 24 25 26 13 27 17 16 19  6  4 15 22  8 23 10
 12 14 18  5  3 20 29  9 28  1 11  7  2]
190
2699    43
2700    41
2701    38
2702    35
2703    46
        ..
2884     1
2885     1
2886     2
2887     3
2888     2
Name: sports, Length: 190, dtype: int64


In [397]:
# pays participants
# sports
# nombre d'épreuves par pays
# nombre de participations aux jeux précédents ?
# nombre de médailles aux jeux précédents ?

print(
    len(data_all_test['country_name']),
    len(data_all_test['sports']),
    len(data_all_test['epreuves']),
    len(data_all_test['game_part']),
    len(data_all_test['prec_game_medal']),
    len(data_all_test['prec_game_gold']),
    len(data_all_test['prec_game_silver']),
    len(data_all_test['prec_game_bronze'])
)

countries = data_all_test['country_name']
sports = data_all_test['sports']
epreuves = data_all_test['epreuves']
game_part = data_all_test['game_part']
prec_game_medal = data_all_test['total_medals']
prec_game_gold = data_all_test['prec_game_gold']
prec_game_silver = data_all_test['prec_game_silver']
prec_game_bronze = data_all_test['prec_game_bronze']

data_pred_2024 = pd.DataFrame({
    'country_name': countries,
    'sports': sports,
    'epreuves': epreuves,
    'game_part': game_part,
    'prec_game_medal': prec_game_medal,
    'prec_game_gold': prec_game_gold,
    'prec_game_silver': prec_game_silver,
    'prec_game_bronze': prec_game_bronze
})

X_all_pred = data_pred_2024.replace(dict_pays)


190 190 190 190 190 190 190 190


In [398]:
predictions_2024 = model.predict(X_all_pred)
result_df_2024_total_medals = pd.DataFrame({'pays': data_pred_2024['country_name'], 'pred_total_medals_2024': np.round(predictions_2024)})
result_df_2024_total_medals

Unnamed: 0,pays,pred_total_medals_2024
2699,United States of America,99.0
2700,People's Republic of China,67.0
2701,ROC,58.0
2702,Great Britain,60.0
2703,Japan,42.0
...,...,...
2884,"Virgin Islands, British",0.0
2885,"Virgin Islands, US",0.0
2886,Yemen,-2.0
2887,Zambia,-3.0


In [399]:
predictions_2024_1 = model_1.predict(X_all_pred)
result_df_2024_total_medals_1 = pd.DataFrame({'pays': data_pred_2024['country_name'], 'pred_total_medals_2024': np.round(predictions_2024_1)})
result_df_2024_total_medals_1

Unnamed: 0,pays,pred_total_medals_2024
2699,United States of America,104.0
2700,People's Republic of China,56.0
2701,ROC,40.0
2702,Great Britain,67.0
2703,Japan,46.0
...,...,...
2884,"Virgin Islands, British",1.0
2885,"Virgin Islands, US",0.0
2886,Yemen,0.0
2887,Zambia,0.0


In [400]:
predictions_2024_2 = model_2.predict(X_all_pred)
result_df_2024_total_medals_2 = pd.DataFrame({'pays': data_pred_2024['country_name'], 'pred_total_medals_2024': np.round(predictions_2024_2)})
result_df_2024_total_medals_2

Unnamed: 0,pays,pred_total_medals_2024
2699,United States of America,112.0
2700,People's Republic of China,63.0
2701,ROC,59.0
2702,Great Britain,66.0
2703,Japan,43.0
...,...,...
2884,"Virgin Islands, British",0.0
2885,"Virgin Islands, US",0.0
2886,Yemen,0.0
2887,Zambia,0.0


In [401]:
predictions_2024_models = (
    result_df_2024_total_medals
    .merge(result_df_2024_total_medals_1, on='pays')
    .merge(result_df_2024_total_medals_2, on='pays')
    .rename(columns={
        'pred_total_medals_2024_x': 'pred_2024_linearRegression',
        'pred_total_medals_2024_y': 'pred_2024_DecisionTree',
        'pred_total_medals_2024': 'pred_2024_GradientBoosting'
    })
)

predictions_2024_models

Unnamed: 0,pays,pred_2024_linearRegression,pred_2024_DecisionTree,pred_2024_GradientBoosting
0,United States of America,99.0,104.0,112.0
1,People's Republic of China,67.0,56.0,63.0
2,ROC,58.0,40.0,59.0
3,Great Britain,60.0,67.0,66.0
4,Japan,42.0,46.0,43.0
...,...,...,...,...
185,"Virgin Islands, British",0.0,1.0,0.0
186,"Virgin Islands, US",0.0,0.0,0.0
187,Yemen,-2.0,0.0,0.0
188,Zambia,-3.0,0.0,0.0


In [402]:
combined_df = pd.merge(result_df_2020_total_medals, predictions_2024_models, on='pays')
combined_df

Unnamed: 0,pays,reglog_total_medals_pred,tree_total_medals_pred,XGB_total_medals_pred,total_medals_truth,pred_2024_linearRegression,pred_2024_DecisionTree,pred_2024_GradientBoosting
0,United States of America,102.0,104.0,112.0,113,99.0,104.0,112.0
1,People's Republic of China,61.0,56.0,59.0,88,67.0,56.0,63.0
2,ROC,37.0,112.0,53.0,70,58.0,40.0,59.0
3,Great Britain,60.0,67.0,66.0,65,60.0,67.0,66.0
4,Japan,37.0,19.0,38.0,57,42.0,46.0,43.0
...,...,...,...,...,...,...,...,...
185,"Virgin Islands, British",0.0,1.0,0.0,0,0.0,1.0,0.0
186,"Virgin Islands, US",0.0,0.0,0.0,0,0.0,0.0,0.0
187,Yemen,-2.0,0.0,0.0,0,-2.0,0.0,0.0
188,Zambia,-3.0,0.0,0.0,0,-3.0,0.0,0.0


In [403]:
# ------------------------------------------------------------------------------------------------------------

# Préparation des données
data_all_train = historic_olympic_data.loc[(historic_olympic_data.game_year < 2020)]
data_all_test = historic_olympic_data.loc[(historic_olympic_data.game_year == 2020)]

X_all_train = data_all_train[['country_name', 'sports', 'epreuves', 'game_part', 'prec_game_medal', 'prec_game_gold', 'prec_game_silver', 'prec_game_bronze']]
X_all_test = data_all_test[['country_name', 'sports', 'epreuves', 'game_part', 'prec_game_medal', 'prec_game_gold', 'prec_game_silver', 'prec_game_bronze']]

y_all_gold_train = data_all_train['gold_medals']
y_all_silver_train = data_all_train['silver_medals']
y_all_bronze_train = data_all_train['bronze_medals']

y_all_gold_test = data_all_test['gold_medals']
y_all_silver_test = data_all_test['silver_medals']
y_all_bronze_test = data_all_test['bronze_medals']

X_all_train = X_all_train.replace(dict_pays)
X_all_test = X_all_test.replace(dict_pays)

In [404]:
# Fonction d'évaluation
def evaluate_model(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, r2

param_grid_dt = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

def grid_search_cv(model, param_grid, X_train, y_train):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

# Modèles pour les médailles d'or
model_gold = LinearRegression()
model_gold.fit(X_all_train, y_all_gold_train)
predictions_gold = model_gold.predict(X_all_test)

model_gold_1 = grid_search_cv(DecisionTreeRegressor(), param_grid_dt, X_all_train, y_all_gold_train)
predictions_gold_1 = model_gold_1.predict(X_all_test)
#
# model_gold_1 = DecisionTreeRegressor()
# model_gold_1.fit(X_all_train, y_all_gold_train)
# predictions_gold_1 = model_gold_1.predict(X_all_test)
#
# grid_search_dt = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=param_grid_dt, cv=5, scoring='neg_mean_squared_error')
# grid_search_dt.fit(X_all_train, y_all_total_train)
# model_gold_1 = grid_search_dt.best_estimator_
# predictions_gold_1 = model_gold_1.predict(X_all_test)

model_gold_2 = GradientBoostingRegressor()
model_gold_2.fit(X_all_train, y_all_gold_train)
predictions_gold_2 = model_gold_2.predict(X_all_test)

# Modèles pour les médailles d'argent
model_silver = LinearRegression()
model_silver.fit(X_all_train, y_all_silver_train)
predictions_silver = model_silver.predict(X_all_test)

model_silver_1 = grid_search_cv(DecisionTreeRegressor(), param_grid_dt, X_all_train, y_all_gold_train)
predictions_silver_1 = model_silver_1.predict(X_all_test)
#
# model_silver_1 = DecisionTreeRegressor()
# model_silver_1.fit(X_all_train, y_all_silver_train)
# predictions_silver_1 = model_silver_1.predict(X_all_test)
#
# grid_search_dt = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=param_grid_dt, cv=5, scoring='neg_mean_squared_error')
# grid_search_dt.fit(X_all_train, y_all_total_train)
# model_silver_1 = grid_search_dt.best_estimator_
# predictions_silver_1 = model_silver_1.predict(X_all_test)

model_silver_2 = GradientBoostingRegressor()
model_silver_2.fit(X_all_train, y_all_silver_train)
predictions_silver_2 = model_silver_2.predict(X_all_test)

# Modèles pour les médailles de bronze
model_bronze = LinearRegression()
model_bronze.fit(X_all_train, y_all_bronze_train)
predictions_bronze = model_bronze.predict(X_all_test)

model_bronze_1 = grid_search_cv(DecisionTreeRegressor(), param_grid_dt, X_all_train, y_all_gold_train)
predictions_bronze_1 = model_bronze_1.predict(X_all_test)
#
# model_bronze_1 = DecisionTreeRegressor()
# model_bronze_1.fit(X_all_train, y_all_bronze_train)
# predictions_bronze_1 = model_bronze_1.predict(X_all_test)
#
# grid_search_dt = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=param_grid_dt, cv=5, scoring='neg_mean_squared_error')
# grid_search_dt.fit(X_all_train, y_all_total_train)
# model_bronze_1 = grid_search_dt.best_estimator_
# predictions_bronze_1 = model_bronze_1.predict(X_all_test)

model_bronze_2 = GradientBoostingRegressor()
model_bronze_2.fit(X_all_train, y_all_bronze_train)
predictions_bronze_2 = model_bronze_2.predict(X_all_test)


In [405]:
# Évaluation des modèles pour les médailles d'or
mse_gold, rmse_gold, r2_gold = evaluate_model(y_all_gold_test, predictions_gold)
mse_gold_1, rmse_gold_1, r2_gold_1 = evaluate_model(y_all_gold_test, predictions_gold_1)
mse_gold_2, rmse_gold_2, r2_gold_2 = evaluate_model(y_all_gold_test, predictions_gold_2)

print("Gold Medals - Linear Regression - MSE:", mse_gold, "RMSE:", rmse_gold, "R2:", r2_gold)
print("Gold Medals - Decision Tree - MSE:", mse_gold_1, "RMSE:", rmse_gold_1, "R2:", r2_gold_1)
print("Gold Medals - Gradient Boosting - MSE:", mse_gold_2, "RMSE:", rmse_gold_2, "R2:", r2_gold_2)

# Évaluation des modèles pour les médailles d'argent
mse_silver, rmse_silver, r2_silver = evaluate_model(y_all_silver_test, predictions_silver)
mse_silver_1, rmse_silver_1, r2_silver_1 = evaluate_model(y_all_silver_test, predictions_silver_1)
mse_silver_2, rmse_silver_2, r2_silver_2 = evaluate_model(y_all_silver_test, predictions_silver_2)

print("Silver Medals - Linear Regression - MSE:", mse_silver, "RMSE:", rmse_silver, "R2:", r2_silver)
print("Silver Medals - Decision Tree - MSE:", mse_silver_1, "RMSE:", rmse_silver_1, "R2:", r2_silver_1)
print("Silver Medals - Gradient Boosting - MSE:", mse_silver_2, "RMSE:", rmse_silver_2, "R2:", r2_silver_2)

# Évaluation des modèles pour les médailles de bronze
mse_bronze, rmse_bronze, r2_bronze = evaluate_model(y_all_bronze_test, predictions_bronze)
mse_bronze_1, rmse_bronze_1, r2_bronze_1 = evaluate_model(y_all_bronze_test, predictions_bronze_1)
mse_bronze_2, rmse_bronze_2, r2_bronze_2 = evaluate_model(y_all_bronze_test, predictions_bronze_2)

print("Bronze Medals - Linear Regression - MSE:", mse_bronze, "RMSE:", rmse_bronze, "R2:", r2_bronze)
print("Bronze Medals - Decision Tree - MSE:", mse_bronze_1, "RMSE:", rmse_bronze_1, "R2:", r2_bronze_1)
print("Bronze Medals - Gradient Boosting - MSE:", mse_bronze_2, "RMSE:", rmse_bronze_2, "R2:", r2_bronze_2)


Gold Medals - Linear Regression - MSE: 6.2329152575277185 RMSE: 2.496580713201101 R2: 0.7705891352987679
Gold Medals - Decision Tree - MSE: 7.442289354338225 RMSE: 2.7280559661301353 R2: 0.7260764881933078
Gold Medals - Gradient Boosting - MSE: 4.200928955270855 RMSE: 2.049616782540301 R2: 0.8453791357080962
Silver Medals - Linear Regression - MSE: 5.144142679996289 RMSE: 2.268070254642984 R2: 0.7895044997252774
Silver Medals - Decision Tree - MSE: 8.023254266618927 RMSE: 2.832534954174251 R2: 0.6716928309063791
Silver Medals - Gradient Boosting - MSE: 3.288029477844515 RMSE: 1.8132924413465454 R2: 0.8654556350957561
Bronze Medals - Linear Regression - MSE: 4.411405435595273 RMSE: 2.1003346008660793 R2: 0.807578675932204
Bronze Medals - Decision Tree - MSE: 11.765751817413921 RMSE: 3.430124169387155 R2: 0.4867890570447275
Bronze Medals - Gradient Boosting - MSE: 3.1954219414620053 RMSE: 1.7875743177451406 R2: 0.8606187234639346


In [406]:
# Préparation des données pour 2024
countries = data_all_test['country_name']
sports = data_all_test['sports']
epreuves = data_all_test['epreuves']
game_part = data_all_test['game_part']
prec_game_medal = result_df_2020_total_medals['total_medals_truth']
prec_game_gold = data_all_test['prec_game_gold']
prec_game_silver = data_all_test['prec_game_silver']
prec_game_bronze = data_all_test['prec_game_bronze']

data_pred_2024 = pd.DataFrame({
    'country_name': countries,
    'sports': sports,
    'epreuves': epreuves,
    'game_part': game_part,
    'prec_game_medal': prec_game_medal,
    'prec_game_gold': prec_game_gold,
    'prec_game_silver': prec_game_silver,
    'prec_game_bronze': prec_game_bronze
})

X_all_pred = data_pred_2024.replace(dict_pays)

# Prédictions pour 2024
# Linear Regression
predictions_gold_2024 = model_gold.predict(X_all_pred)
predictions_silver_2024 = model_silver.predict(X_all_pred)
predictions_bronze_2024 = model_bronze.predict(X_all_pred)

# Decision Tree
predictions_gold_2024_1 = model_gold_1.predict(X_all_pred)
predictions_silver_2024_1 = model_silver_1.predict(X_all_pred)
predictions_bronze_2024_1 = model_bronze_1.predict(X_all_pred)

# Gradient Boosting
predictions_gold_2024_2 = model_gold_2.predict(X_all_pred)
predictions_silver_2024_2 = model_silver_2.predict(X_all_pred)
predictions_bronze_2024_2 = model_bronze_2.predict(X_all_pred)

# Création des DataFrames pour les résultats
result_df_2024_gold_medals = pd.DataFrame({
    'pays': data_pred_2024['country_name'],
    'pred_gold_medals_2024_LR': np.round(predictions_gold_2024),
    'pred_gold_medals_2024_DT': np.round(predictions_gold_2024_1),
    'pred_gold_medals_2024_GB': np.round(predictions_gold_2024_2)
})

result_df_2024_silver_medals = pd.DataFrame({
    'pays': data_pred_2024['country_name'],
    'pred_silver_medals_2024_LR': np.round(predictions_silver_2024),
    'pred_silver_medals_2024_DT': np.round(predictions_silver_2024_1),
    'pred_silver_medals_2024_GB': np.round(predictions_silver_2024_2)
})

result_df_2024_bronze_medals = pd.DataFrame({
    'pays': data_pred_2024['country_name'],
    'pred_bronze_medals_2024_LR': np.round(predictions_bronze_2024),
    'pred_bronze_medals_2024_DT': np.round(predictions_bronze_2024_1),
    'pred_bronze_medals_2024_GB': np.round(predictions_bronze_2024_2)
})

# Fusion des résultats
predictions_2024_models = (
    result_df_2024_gold_medals
    .merge(result_df_2024_silver_medals, on='pays')
    .merge(result_df_2024_bronze_medals, on='pays')
)

predictions_2024_models



Unnamed: 0,pays,pred_gold_medals_2024_LR,pred_gold_medals_2024_DT,pred_gold_medals_2024_GB,pred_silver_medals_2024_LR,pred_silver_medals_2024_DT,pred_silver_medals_2024_GB,pred_bronze_medals_2024_LR,pred_bronze_medals_2024_DT,pred_bronze_medals_2024_GB
0,United States of America,37.0,52.0,53.0,31.0,52.0,33.0,31.0,52.0,30.0
1,People's Republic of China,24.0,47.0,29.0,21.0,47.0,15.0,22.0,47.0,26.0
2,ROC,20.0,16.0,18.0,18.0,16.0,19.0,20.0,16.0,17.0
3,Great Britain,23.0,26.0,26.0,19.0,26.0,20.0,18.0,26.0,21.0
4,Japan,13.0,16.0,17.0,13.0,16.0,11.0,15.0,16.0,15.0
...,...,...,...,...,...,...,...,...,...,...
185,"Virgin Islands, British",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
186,"Virgin Islands, US",0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0
187,Yemen,-0.0,0.0,0.0,-1.0,0.0,0.0,-0.0,0.0,0.0
188,Zambia,-1.0,0.0,0.0,-1.0,0.0,0.0,-1.0,0.0,0.0


In [407]:
# %pip install onnx
# %pip install skl2onnx
# 
# import onnx
# from skl2onnx import convert_sklearn
# import joblib

In [408]:
# joblib.dump(model, 'linear_regression_model.pkl')
# joblib.dump(best_model_dt, 'decision_tree_model.pkl')
# joblib.dump(model_2, 'gradient_boosting_model.pkl')

In [409]:
# joblib.dump(model, './h5/linear_regression_model.h5')
# joblib.dump(best_model_dt, './h5/decision_tree_model.h5')
# joblib.dump(model_2, './h5/gradient_boosting_model.h5')

['./h5/gradient_boosting_model.h5']

In [410]:
# # %pip install onnxmltools
# import onnxmltools
# from onnxmltools.convert import convert_sklearn
# from skl2onnx.common.data_types import FloatTensorType

# # Define initial types for input features
# initial_types = [('float_input', FloatTensorType([None, X_all_train.shape[1]]))]

# # Convert Linear Regression model to ONNX format
# onnx_model_linear = convert_sklearn(model, 'linear_regression_model', initial_types=initial_types)

# # Save the ONNX model to a file
# onnxmltools.utils.save_model(onnx_model_linear, 'linear_regression_model.onnx')

# # Convert Decision Tree model to ONNX format
# onnx_model_tree = convert_sklearn(best_model_dt, 'decision_tree_model', initial_types=initial_types)

# # Save the ONNX model to a file
# onnxmltools.utils.save_model(onnx_model_tree, 'decision_tree_model.onnx')

# # Convert Gradient Boosting model to ONNX format
# onnx_model_boosting = convert_sklearn(model_2, 'gradient_boosting_model', initial_types=initial_types)

# # Save the ONNX model to a file
# onnxmltools.utils.save_model(onnx_model_boosting, 'gradient_boosting_model.onnx')



In [411]:
# from skl2onnx import to_onnx
# import numpy

# # onx = to_onnx(model, X_all_train[:1].astype(numpy.float32), target_opset=12)
# # with open("filename.onnx", "wb") as f:
# #     f.write(onx.SerializeToString())

# numerical_features = ['game_part', 'prec_game_medal', 'prec_game_gold', 'prec_game_silver', 'prec_game_bronze']
# onx = to_onnx(model, X_all_train['game_part'][:1].astype(numpy.float32), target_opset=12)
# with open("filename.onnx", "wb") as f:
#     f.write(onx.SerializeToString())

In [412]:
# onnx_model_lr = convert_sklearn(model, 'linear_regression_model.onnx')
# onnx.save_model(onnx_model_lr, 'linear_regression_model.onnx')

# onnx_model_dt = convert_sklearn(best_model_dt, 'decision_tree_model.onnx')
# onnx.save_model(onnx_model_dt, 'decision_tree_model.onnx')

# onnx_model_gb = convert_sklearn(model_2, 'gradient_boosting_model.onnx')
# onnx.save_model(onnx_model_gb, 'gradient_boosting_model.onnx')