# On se base sur le même jeu de données que celui du notebook de Deep Learning

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned Olympic dataset from the project's data folder.
olympic_data = pd.read_csv('data/olympic_data_cleaned.csv')

# Explanatory variables and prediction targets.
features = [
    'sports', 'epreuves', 'game_part',
    'prec_game_medal', 'prec_game_gold', 'prec_game_silver', 'prec_game_bronze',
]
targets = ['gold_medals', 'silver_medals', 'bronze_medals']

# Temporal split: every edition before 2020 is used for fitting,
# the 2020 edition is held out as the test set.
game_year = olympic_data['game_year']
train_data = olympic_data[game_year < 2020]
test_data = olympic_data[game_year == 2020]

X_train, y_train = train_data[features], train_data[targets]
X_test, y_test = test_data[features], test_data[targets]

## Linear Regression

In [3]:
# Fit a multi-output linear regression on the training split and predict the held-out test set.
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred_linear = linear_reg.predict(X_test)

# Score each medal type separately; prediction column i matches the i-th target
# (gold, silver, bronze).
medal_columns = ('gold_medals', 'silver_medals', 'bronze_medals')
linear_mse = [
    mean_squared_error(y_test[column], y_pred_linear[:, position])
    for position, column in enumerate(medal_columns)
]
linear_r2 = [
    r2_score(y_test[column], y_pred_linear[:, position])
    for position, column in enumerate(medal_columns)
]

# Assemble the report: model name first, then the three MSE values, then the three R2 scores.
results_linear = dict(zip(
    (
        "Model",
        "MSE (Gold)", "MSE (Silver)", "MSE (Bronze)",
        "R2 Score (Gold)", "R2 Score (Silver)", "R2 Score (Bronze)",
    ),
    ("Linear Regression", *linear_mse, *linear_r2),
))

results_linear

{'Model': 'Linear Regression',
 'MSE (Gold)': 6.056399546351201,
 'MSE (Silver)': 4.774153267686079,
 'MSE (Bronze)': 4.242308057554072,
 'R2 Score (Gold)': 0.7770860344641226,
 'R2 Score (Silver)': 0.8046442637803142,
 'R2 Score (Bronze)': 0.8149545432955921}

## Random forest

In [4]:
# Fit a multi-output random forest (fixed seed for reproducibility) and predict the held-out test set.
random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)

# Score each medal type separately; prediction column i matches the i-th target
# (gold, silver, bronze).
medal_columns = ('gold_medals', 'silver_medals', 'bronze_medals')
rf_mse = [
    mean_squared_error(y_test[column], y_pred_rf[:, position])
    for position, column in enumerate(medal_columns)
]
rf_r2 = [
    r2_score(y_test[column], y_pred_rf[:, position])
    for position, column in enumerate(medal_columns)
]

# Assemble the report: model name first, then the three MSE values, then the three R2 scores.
results_rf = dict(zip(
    (
        "Model",
        "MSE (Gold)", "MSE (Silver)", "MSE (Bronze)",
        "R2 Score (Gold)", "R2 Score (Silver)", "R2 Score (Bronze)",
    ),
    ("Random Forest", *rf_mse, *rf_r2),
))

results_rf

{'Model': 'Random Forest',
 'MSE (Gold)': 5.032383759929051,
 'MSE (Silver)': 2.4096175956414703,
 'MSE (Bronze)': 2.553349404238355,
 'R2 Score (Gold)': 0.8147763185967515,
 'R2 Score (Silver)': 0.9013997680823093,
 'R2 Score (Bronze)': 0.8886253189954271}

## Gradient Boosting Regressor

In [7]:
# One Gradient Boosting model per medal type, each fitted on its own target column
# (same seed for all three so the run is reproducible).
gradient_boost_gold = GradientBoostingRegressor(random_state=42).fit(X_train, y_train['gold_medals'])
gradient_boost_silver = GradientBoostingRegressor(random_state=42).fit(X_train, y_train['silver_medals'])
gradient_boost_bronze = GradientBoostingRegressor(random_state=42).fit(X_train, y_train['bronze_medals'])

# Predict the held-out test set with each model.
y_pred_gb_gold, y_pred_gb_silver, y_pred_gb_bronze = (
    model.predict(X_test)
    for model in (gradient_boost_gold, gradient_boost_silver, gradient_boost_bronze)
)

# Side-by-side (gold, silver, bronze) prediction matrix.
y_pred_gb = np.column_stack((y_pred_gb_gold, y_pred_gb_silver, y_pred_gb_bronze))

# Pair every observed medal column with the predictions of its dedicated model, then score.
gb_scored_pairs = (
    (y_test['gold_medals'], y_pred_gb_gold),
    (y_test['silver_medals'], y_pred_gb_silver),
    (y_test['bronze_medals'], y_pred_gb_bronze),
)
gb_mse = [mean_squared_error(observed, predicted) for observed, predicted in gb_scored_pairs]
gb_r2 = [r2_score(observed, predicted) for observed, predicted in gb_scored_pairs]

# Assemble the report: model name first, then the three MSE values, then the three R2 scores.
results_gb = dict(zip(
    (
        "Model",
        "MSE (Gold)", "MSE (Silver)", "MSE (Bronze)",
        "R2 Score (Gold)", "R2 Score (Silver)", "R2 Score (Bronze)",
    ),
    ("Gradient Boosting", *gb_mse, *gb_r2),
))

results_gb

{'Model': 'Gradient Boosting',
 'MSE (Gold)': 4.298433996533897,
 'MSE (Silver)': 2.514013961679817,
 'MSE (Bronze)': 3.0094894717148977,
 'R2 Score (Gold)': 0.8417903309667084,
 'R2 Score (Silver)': 0.8971279259770042,
 'R2 Score (Bronze)': 0.8687289215716055}

## Paris 2024 prediction using the Gradient Boosting Regressor

In [12]:
# Paris 2024 forecast: refit one Gradient Boosting model per medal type on every
# edition up to and including 2020, then predict from the 2020 rows.
#
# NOTE: this cell rebinds `train_data`, `X_train` and the `gradient_boost_*` models
# that the evaluation cells also define; run the notebook top to bottom rather than
# re-executing an evaluation cell after this one.
train_data = olympic_data[olympic_data['game_year'] <= 2020]

# Explanatory variables and targets (repeated here so the cell is self-contained).
features = ['sports', 'epreuves', 'game_part', 'prec_game_medal', 'prec_game_gold', 'prec_game_silver', 'prec_game_bronze']
targets = ['gold_medals', 'silver_medals', 'bronze_medals']

X_train = train_data[features]
y_train_gold = train_data['gold_medals']
y_train_silver = train_data['silver_medals']
y_train_bronze = train_data['bronze_medals']

# One independent model per medal type, all with the same fixed seed.
gradient_boost_gold = GradientBoostingRegressor(random_state=42)
gradient_boost_silver = GradientBoostingRegressor(random_state=42)
gradient_boost_bronze = GradientBoostingRegressor(random_state=42)

gradient_boost_gold.fit(X_train, y_train_gold)
gradient_boost_silver.fit(X_train, y_train_silver)
gradient_boost_bronze.fit(X_train, y_train_bronze)

# Prediction inputs: the 2020 rows are reused as the basis for 2024.
# NOTE(review): these rows are also part of the training set above, and their
# `prec_game_*` columns presumably describe the edition before 2020 - confirm whether
# they should be rebuilt from the 2020 results before being used as 2024 inputs.
test_data_2020 = olympic_data[olympic_data['game_year'] == 2020]
X_test_2024 = test_data_2020[features]

# Raw (float) predictions for Paris 2024.
y_pred_2024_gb_gold = gradient_boost_gold.predict(X_test_2024)
y_pred_2024_gb_silver = gradient_boost_silver.predict(X_test_2024)
y_pred_2024_gb_bronze = gradient_boost_bronze.predict(X_test_2024)

# Stack the three outputs as (gold, silver, bronze) columns. A regressor's output is
# not bounded below, so clamp at zero before rounding: a medal count cannot be negative.
y_pred_2024_gb = np.column_stack((y_pred_2024_gb_gold, y_pred_2024_gb_silver, y_pred_2024_gb_bronze))
y_pred_2024_gb = np.clip(y_pred_2024_gb, 0, None).round().astype(int)

# Final table; it keeps the row index of the 2020 rows through the `country_name` Series.
predictions_2024 = pd.DataFrame({
    'country_name': test_data_2020['country_name'],
    'gold_medals': y_pred_2024_gb[:, 0],
    'silver_medals': y_pred_2024_gb[:, 1],
    'bronze_medals': y_pred_2024_gb[:, 2],
})

predictions_2024



Unnamed: 0,country_name,gold_medals_gb,silver_medals_gb,bronze_medals_gb
2699,United States of America,41,40,33
2700,People's Republic of China,36,28,21
2701,ROC,22,26,20
2702,Great Britain,25,21,21
2703,Japan,25,14,15
...,...,...,...,...
2884,"Virgin Islands, British",0,0,0
2885,"Virgin Islands, US",0,0,0
2886,Yemen,0,0,0
2887,Zambia,0,0,0
