In [24]:
import requests
from bs4 import BeautifulSoup
import json
import time
import re

def _lire_entier(balise):
    """Return the integer written in ``balise``'s text.

    Returns None when ``balise`` is None or its stripped text is not a valid
    integer literal.
    """
    if balise is None:
        return None
    try:
        return int(balise.text.strip())
    except ValueError:
        return None


def _extraire_medailles(soup):
    """Collect medal counts from the ``div.rnd.teaser`` element of a parsed page.

    Returns a dict with the keys 'summer', 'winter', 'gold', 'silver',
    'bronze' and 'total'. Every count defaults to 0 when the corresponding
    markup is missing or not numeric; 'total' is gold + silver + bronze.
    """
    medal_counts = {'summer': 0, 'winter': 0, 'gold': 0, 'silver': 0, 'bronze': 0, 'total': 0}

    medal_teaser = soup.select_one('div.rnd.teaser')
    if medal_teaser is None:
        return medal_counts

    # Only the direct child <div>s are inspected for the per-season counts.
    season_divs = medal_teaser.find_all('div', recursive=False)
    if season_divs:
        first_div = season_divs[0]
        count = _lire_entier(first_div.select_one('span.mal')) or 0
        # The first child may describe either season; its text decides which.
        if "Summer" in first_div.text:
            medal_counts['summer'] = count
        elif "Winter" in first_div.text:
            medal_counts['winter'] = count

        # The second child is only ever used for the winter count.
        if len(season_divs) >= 2:
            second_div = season_divs[1]
            winter_count = _lire_entier(second_div.select_one('span.mal'))
            if winter_count is not None and "Winter" in second_div.text:
                medal_counts['winter'] = winter_count

    # The data-medal attribute encodes the medal colour as '1', '2' or '3'.
    medal_type_map = {'1': 'gold', '2': 'silver', '3': 'bronze'}
    for medal_div in medal_teaser.select('div > div.the-medal'):
        medal_type = medal_div.get('data-medal')
        if medal_type not in medal_type_map:
            continue
        # The count is read from a span.mal under the medal icon's parent.
        count = _lire_entier(medal_div.parent.select_one('span.mal'))
        if count is not None:
            medal_counts[medal_type_map[medal_type]] = count

    medal_counts['total'] = medal_counts['gold'] + medal_counts['silver'] + medal_counts['bronze']
    return medal_counts


def extraire_informations_pays(url_pays, timeout=10):
    """Download one nation page and extract its name, period, stats and medals.

    Args:
        url_pays: URL of the nation page to fetch.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block the whole crawl.

    Returns:
        A dict with the keys 'nom', 'periode', 'statistiques' and 'medailles',
        or None when the download or the parsing raised an exception (the
        error is printed). An HTTP error status counts as a failure.
    """
    try:
        response = requests.get(url_pays, timeout=timeout)
        # Fail fast on 4xx/5xx instead of parsing an error page into an empty record.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        # Country name: the <h1> text with the fixed "Olympic medals from" prefix removed.
        nom_pays = ""
        h1_title = soup.select_one('h1')
        if h1_title is not None and "Olympic medals from" in h1_title.text:
            nom_pays = h1_title.text.replace("Olympic medals from", "").strip()

        # Period: stripped text of the first <h3>, empty when there is none.
        h3_periode = soup.select_one('h3')
        periode = h3_periode.text.strip() if h3_periode is not None else ""

        # Statistics: "<number> <label>" pairs searched in the first <h2>.
        # NOTE(review): the sample output stored in this notebook shows an empty
        # 'statistiques' dict for every nation, so this selector or these
        # patterns may not match the live page -- verify against the site.
        stats = {}
        h2_stats = soup.select_one('h2')
        if h2_stats is not None:
            stats_text = h2_stats.text.strip()
            pattern_dict = {
                'nb_medals': r'(\d+)\s+Medals',
                'nb_games': r'(\d+)\s+Games',
                'nb_sports': r'(\d+)\s+Sports',
                'nb_events': r'(\d+)\s+Events'
            }
            for key, pattern in pattern_dict.items():
                match = re.search(pattern, stats_text)
                if match:
                    stats[key] = int(match.group(1))

        return {
            'nom': nom_pays,
            'periode': periode,
            'statistiques': stats,
            'medailles': _extraire_medailles(soup)
        }

    except Exception as e:
        # Deliberate best effort: one broken page must not abort the crawl.
        print(f"Erreur pour {url_pays}: {str(e)}")
        return None

def collecter_pays(timeout=10):
    """Fetch the nations index page and list every nation card found on it.

    Args:
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot hang the notebook.

    Returns:
        A list of dicts with the keys 'id', 'nom', 'url' and 'drapeau'.
        'id' is the numeric part of the card's href (None when the href does
        not match ``/nation/<slug>/<digits>``); 'nom' is the text of the
        card's ``div.bez``; 'url' and 'drapeau' are absolute URLs, or "" when
        the card has no href / no image source. Returns [] when the download
        or the parsing raised an exception (the error is printed).
    """
    # Local import: urljoin is only needed here and the notebook's import cell
    # does not provide it.
    from urllib.parse import urljoin

    base_url = "https://olympics-statistics.com"
    try:
        response = requests.get("https://olympics-statistics.com/nations", timeout=timeout)
        # Treat an HTTP error status as a failure rather than parsing the error page.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        pays_links = []
        for card in soup.select('a.card.nation'):
            href = card.get('href', '')
            id_match = re.search(r'/nation/[^/]+/(\d+)', href) if href else None

            nom_div = card.select_one('div.bez')
            img = card.select_one('img')
            # An <img> without a src attribute must not produce a URL ending in "None".
            src = img.get('src') if img is not None else None

            pays_links.append({
                'id': id_match.group(1) if id_match else None,
                'nom': nom_div.get_text(strip=True) if nom_div is not None else "",
                'url': urljoin(base_url, href) if href else "",
                'drapeau': urljoin(base_url, src) if src else ""
            })

        return pays_links

    except Exception as e:
        # Deliberate best effort: the caller receives an empty list on any failure.
        print(f"Erreur lors de la collecte des pays: {str(e)}")
        return []

def main(limite_test=None, fichier_sortie='olympics_nations.json', delai=0.5):
    """Crawl every nation page and save the collected records as JSON.

    Args:
        limite_test: When truthy, only the first ``limite_test`` nations are
            processed (handy for a quick trial run). None processes them all.
        fichier_sortie: Path of the JSON file to write.
        delai: Seconds to pause after each page request.

    Each successfully extracted record is completed with the nation's 'id'
    and 'drapeau' from the index page. Nations whose page could not be
    extracted are left out of the output file.
    """
    print("Collecte des pays...")
    pays_links = collecter_pays()
    print(f"{len(pays_links)} pays trouvés.")

    resultats = []
    pays_a_traiter = pays_links[:limite_test] if limite_test else pays_links

    for i, pays in enumerate(pays_a_traiter):
        print(f"Traitement du pays {i+1}/{len(pays_a_traiter)}: {pays['nom']}")
        pays_info = extraire_informations_pays(pays['url'])

        if pays_info:
            pays_info.update({'id': pays['id'], 'drapeau': pays['drapeau']})
            resultats.append(pays_info)

        # Pause after every request, including failed ones, so that a run of
        # errors does not turn into back-to-back requests against the server.
        time.sleep(delai)

    with open(fichier_sortie, 'w', encoding='utf-8') as f:
        json.dump(resultats, f, ensure_ascii=False, indent=4)

    print(f"Terminé. Les données de {len(resultats)} pays ont été sauvegardées.")

# Run the crawl when this code is executed directly (as a script or as a
# notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()

Collecte des pays...
178 pays trouvés.
Traitement du pays 1/178: Afghanistan
Traitement du pays 2/178: Albania
Traitement du pays 3/178: Algeria
Traitement du pays 4/178: Argentina
Traitement du pays 5/178: Armenia
Traitement du pays 6/178: Australia


KeyboardInterrupt: 

In [27]:
import json
import pandas as pd
import numpy as np


# Lower-case country name -> two-letter country code.
# Lookups are done on the lower-cased, stripped name, so every key must be
# lower case. Aliases are listed BEFORE the historical key that shares their
# code, so that the inverted mapping built below keeps its original values.
pays_dict = {
    'afghanistan': 'af',
    'albania': 'al',
    'algeria': 'dz',
    'argentina': 'ar',
    'armenia': 'am',
    'australia': 'au',
    'austria': 'at',
    'azerbaijan': 'az',
    'bahamas': 'bs',
    'bahrain': 'bh',
    'barbados': 'bb',
    'belarus': 'by',
    'belgium': 'be',
    'bermuda': 'bm',
    'botswana': 'bw',
    'brazil': 'br',
    'bulgaria': 'bg',
    # Space-separated alias of the hyphenated key below (other multi-word
    # keys in this table use spaces).
    'burkina faso': 'bf',
    'burkina-faso': 'bf',
    'burundi': 'bi',
    'cameroon': 'cm',
    'canada': 'ca',
    'cape verde': 'cv',
    'chile': 'cl',
    'china': 'cn',
    'chinese taipei': 'tw',
    'colombia': 'co',
    'costa rica': 'cr',
    'cote divoire': 'ci',
    'croatia': 'hr',
    'cuba': 'cu',
    'cyprus': 'cy',
    'czech republic': 'cz',
    'denmark': 'dk',
    'djibouti': 'dj',
    # 'dm' is the ISO 3166-1 code of Dominica; the historical 'dominican' key
    # is kept for backward compatibility.
    'dominica': 'dm',
    'dominican': 'dm',
    'dominican republic': 'do',
    'ecuador': 'ec',
    'egypt': 'eg',
    'eritrea': 'er',
    'estonia': 'ee',
    'ethiopia': 'et',
    'fiji': 'fj',
    'finland': 'fi',
    'france': 'fr',
    'gabon': 'ga',
    'georgia': 'ge',
    'germany': 'de',
    'ghana': 'gh',
    'great britain': 'gb',
    'greece': 'gr',
    'grenada': 'gd',
    'guatemala': 'gt',
    'guyana': 'gy',
    'haiti': 'ht',
    'hong kong': 'hk',
    'hungary': 'hu',
    'iceland': 'is',
    'india': 'in',
    'indonesia': 'id',
    'iran': 'ir',
    'iraq': 'iq',
    'ireland': 'ie',
    'israel': 'il',
    'italy': 'it',
    'jamaica': 'jm',
    'japan': 'jp',
    'jordan': 'jo',
    'kazakhstan': 'kz',
    'kenya': 'ke',
    'kosovo': 'xk',
    'kuwait': 'kw',
    'kyrgyzstan': 'kg',
    'latvia': 'lv',
    'lebanon': 'lb',
    'liechtenstein': 'li',
    'lithuania': 'lt',
    'luxembourg': 'lu',
    'malaysia': 'my',
    'mauritius': 'mu',
    'mexico': 'mx',
    'moldova': 'md',
    'mongolia': 'mn',
    'montenegro': 'me',
    'morocco': 'ma',
    'mozambique': 'mz',
    'namibia': 'na',
    'netherlands': 'nl',
    'new zealand': 'nz',
    'niger': 'ne',
    'nigeria': 'ng',
    'north korea': 'kp',
    # Space-separated alias of the hyphenated key below.
    'north macedonia': 'mk',
    'north-macedonia': 'mk',
    'norway': 'no',
    'pakistan': 'pk',
    'panama': 'pa',
    'paraguay': 'py',
    'peru': 'pe',
    'philippines': 'ph',
    'poland': 'pl',
    'portugal': 'pt',
    'puerto rico': 'pr',
    'qatar': 'qa',
    'romania': 'ro',
    'russia': 'ru',
    'saint lucia': 'lc',
    'samoa': 'ws',
    'san marino': 'sm',
    'saudi arabia': 'sa',
    'senegal': 'sn',
    'serbia': 'rs',
    'singapore': 'sg',
    'slovakia': 'sk',
    'slovenia': 'si',
    'south africa': 'za',
    'south korea': 'kr',
    'spain': 'es',
    'sri lanka': 'lk',
    'sudan': 'sd',
    'suriname': 'sr',
    'sweden': 'se',
    'switzerland': 'ch',
    'syria': 'sy',
    'tajikistan': 'tj',
    'tanzania': 'tz',
    'thailand': 'th',
    'togo': 'tg',
    'tonga': 'to',
    'trinidad & tobago': 'tt',
    'tunisia': 'tn',
    'turkey': 'tr',
    'turkmenistan': 'tm',
    'uganda': 'ug',
    'ukraine': 'ua',
    'united arab emirates': 'ae',
    'united states of america': 'us',
    'united states': 'us',
    'uruguay': 'uy',
    'uzbekistan': 'uz',
    'venezuela': 've',
    'vietnam': 'vn',
    'virgin islands': 'vi',
    'zambia': 'zm',
    'zimbabwe': 'zw'
}

# Inverted mapping (code -> name) for lookups by code. When several names
# share a code, the name listed last in pays_dict wins.
codes_pays_dict = {v: k for k, v in pays_dict.items()}


# Load the records written by the scraping cell. When the file is missing,
# continue with an empty list so the rest of the cell still runs.
try:
    with open('olympics_nations.json', 'r', encoding='utf-8') as f:
        nations = json.load(f)
except FileNotFoundError:
    nations = []
    print("Fichier olympics_nations.json introuvable. Exécutez d'abord le script principal.")
else:
    print(f"Données chargées : {len(nations)} nations trouvées")

# One row per nation record.
df = pd.DataFrame(nations)

# Quick look at the raw table before any cleaning.
print(
    "\nAperçu avant nettoyage:",
    df.head(),
    f"Colonnes: {df.columns.tolist()}",
    sep="\n",
)


def _trouver_code_pays(nom):
    """Resolve a country name to its two-letter code; return '' when unknown.

    The name is lower-cased and stripped, then matched in this order:
    1. any name containing 'united states', or exactly 'usa' -> 'us';
    2. an exact key of ``pays_dict``;
    3. for names such as "Name (XYZ)": the part before the parenthesis as a
       key of ``pays_dict``, otherwise the parenthesised text itself when it
       is at most 3 characters long and is a known code in ``codes_pays_dict``.
    """
    if not isinstance(nom, str):
        return ''

    nom_pays = nom.lower().strip()
    if 'united states' in nom_pays or nom_pays == 'usa':
        return 'us'
    if nom_pays in pays_dict:
        return pays_dict[nom_pays]

    if '(' in nom_pays:
        morceaux = nom_pays.split('(')
        nom_base = morceaux[0].strip()
        if nom_base in pays_dict:
            return pays_dict[nom_base]
        # Strip the padding so that "( gb )" is recognised like "(gb)".
        code_potentiel = morceaux[1].replace(')', '').strip()
        if len(code_potentiel) <= 3 and code_potentiel in codes_pays_dict:
            return code_potentiel

    return ''


def nettoyer_data_nations(chemin_sortie='olympics_nations_clean.json'):
    """Flatten and enrich the global ``df`` and save the result as JSON.

    Side effects: the module-level ``df`` is modified in place (medal columns
    and 'pays_code' are added, 'nom' is filled and title-cased), and the
    cleaned records are written to ``chemin_sortie``.

    Args:
        chemin_sortie: Path of the JSON file receiving the cleaned records.

    Returns:
        A new DataFrame equal to ``df`` without the nested 'medailles' and
        'statistiques' columns. When ``df`` is empty, ``df`` itself is
        returned and nothing is written.
    """
    if len(df) == 0:
        print("Pas de données à nettoyer.")
        return df

    # Output column -> key inside the nested 'medailles' dict. The first four
    # are the medal colours and total, the last two the per-season counts.
    colonnes_medailles = {
        'or': 'gold',
        'argent': 'silver',
        'bronze': 'bronze',
        'total_medailles': 'total',
        'medailles_ete': 'summer',
        'medailles_hiver': 'winter',
    }
    for colonne, cle in colonnes_medailles.items():
        # cle=cle binds the current key; a cell that is not a dict counts as 0.
        df[colonne] = df['medailles'].apply(
            lambda x, cle=cle: x.get(cle, 0) if isinstance(x, dict) else 0
        )

    df['nom'] = df['nom'].fillna('Unknown')

    # Add the country code derived from the name ('' when none was found).
    df['pays_code'] = df['nom'].map(_trouver_code_pays)

    # Normalise the country names to title case.
    df['nom'] = df['nom'].apply(lambda x: x.title() if isinstance(x, str) else x)

    # Report the nations for which no code could be determined.
    pays_sans_code = df[df['pays_code'] == '']['nom'].tolist()
    if pays_sans_code:
        print(f"\nATTENTION: Pays sans code ISO détecté: {', '.join(pays_sans_code)}")

    # The nested dict columns have been flattened (or are not exported).
    df_clean = df.drop(columns=['medailles', 'statistiques'])

    nations_clean_dict = df_clean.to_dict(orient='records')

    with open(chemin_sortie, 'w', encoding='utf-8') as f:
        json.dump(nations_clean_dict, f, ensure_ascii=False, indent=4)

    print(f"\nDonnées nettoyées enregistrées au format JSON dans '{chemin_sortie}'")
    print(f"Nombre de nations enregistrées: {len(nations_clean_dict)}")

    if len(nations_clean_dict) > 0:
        print("\nAperçu de la première nation au format JSON:")
        print(json.dumps(nations_clean_dict[0], ensure_ascii=False, indent=2))

    return df_clean


# Run the cleaning step, then show the first rows of the cleaned table.
df_clean = nettoyer_data_nations()

print("\nAperçu après nettoyage:", df_clean.head(), sep="\n")

Données chargées : 178 nations trouvées

Aperçu avant nettoyage:
           nom      periode statistiques  \
0  Afghanistan  2008 - 2012           {}   
1      Albania         2024           {}   
2      Algeria  1984 - 2024           {}   
3    Argentina  1924 - 2024           {}   
4      Armenia  1996 - 2024           {}   

                                           medailles  id  \
0  {'summer': 2, 'winter': 0, 'gold': 0, 'silver'...   1   
1  {'summer': 2, 'winter': 0, 'gold': 0, 'silver'...   3   
2  {'summer': 20, 'winter': 0, 'gold': 7, 'silver...   4   
3  {'summer': 80, 'winter': 0, 'gold': 22, 'silve...  10   
4  {'summer': 22, 'winter': 0, 'gold': 2, 'silver...  11   

                                             drapeau  
0  https://olympics-statistics.com/media/flagge/a...  
1  https://olympics-statistics.com/media/flagge/a...  
2  https://olympics-statistics.com/media/flagge/d...  
3  https://olympics-statistics.com/media/flagge/a...  
4  https://olympics-statistics.com