# Webscraping!

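I use multithreading to speed up the scraping. Worker threads write their results to a `queue.Queue`, which is thread-safe, and the main thread collects them once all workers have finished.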

In [12]:
import requests
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import logging
from queue import Queue
# Inclusive range of athlete IDs to scrape (bounds were checked manually).
start_id, end_id = 1, 16460

## Athlete Information (birthday, country, height, etc.)

In [None]:
# Route ERROR-level (and above) records to this section's log file.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger; without it basicConfig() silently does nothing when logging was
# configured earlier in the same kernel, and errors would go to the wrong file.
logging.basicConfig(
    filename='athlete_data_errors.log',
    level=logging.ERROR,
    format='%(asctime)s %(levelname)s: %(message)s',
    force=True,
)

def fetch_athlete_data(athlete_id, data_queue, failed_queue):
    """Download one athlete's profile and enqueue the outcome.

    Sends a GET request for ``athlete_id`` to the IFSC results API. On an
    HTTP 200 response, a dict of selected profile fields is put on
    ``data_queue`` (fields missing from the response are stored as ``None``).
    On any other status, or if any exception is raised while requesting or
    decoding, the failure is logged and ``athlete_id`` is put on
    ``failed_queue``. The function itself returns nothing.
    """
    # NOTE(review): the CSRF token and session cookie are hard-coded here.
    headers = {
        'X-Csrf-Token': 'QsiFWuxEY1S9h_-dQgRA_7S5w9uvvmXsjq56QbTPw4i_g_XR68rMCBFFhW6HngBRtHskfN5yjX8GQmawqs8BlQ',
        'Referer': 'https://ifsc.results.info',
        'Cookie': 'session_id=_verticallife_resultservice_session=6RHN3xZrXnftTiScNfSHg7BVvuebLzGAmC9P5vIpzdySn2vG7VwQpjSZRDHug%2BPKCWlkt831HjLvHsPoVKrzTGsPVR6mqSOtjHB%2Bwht%2Bj39KxYO%2FJlaU6zmh8VhNFEl9bXHiOlPGk8AxnZqiBSYKTxJFCqh34nqdurXfFDcsRnbEtYCixcOdx%2F32E4zYGLVw7DSXXIKOVTUivS43UJZq5zDWPctX95UWm%2FD7%2B6UYT2s0B%2B3XJVPgjMWCMR%2FVZs%2FQC45Gjm4uCpHHe8Yt73nM3J%2Br43V1HuHGSvRpRczrJ4QdovlJHDEpg4rjUA%3D%3D',
    }
    url = f"https://ifsc.results.info/api/v1/athletes/{athlete_id}"

    # Profile fields copied verbatim from the response, in output column order.
    copied_fields = (
        'firstname',
        'lastname',
        'age',
        'gender',
        'country',
        'height',
        'arm_span',
        'paraclimbing_sport_class',
        'birthday',
    )

    try:
        response = requests.get(url, headers=headers, timeout=120)

        # Guard clause: anything but 200 counts as a failure for this ID.
        if response.status_code != 200:
            logging.error(f"Failed to fetch athlete ID {athlete_id}: Status {response.status_code}, Reason: {response.reason}")
            failed_queue.put(athlete_id)
            return

        athlete_data = response.json()
        # The API's 'id' is stored under 'athlete_id'; everything else keeps its name.
        record = {'athlete_id': athlete_data.get('id', None)}
        for field_name in copied_fields:
            record[field_name] = athlete_data.get(field_name, None)
        data_queue.put(record)

    except Exception as e:
        logging.error(f"Error fetching data for athlete ID {athlete_id}: {e}")
        failed_queue.put(athlete_id)

In [None]:
def retry_failed_athlete_info(failed_ids, max_retries=2, delay=2):
    """Re-fetch athlete profiles whose earlier download attempt failed.

    Each round submits every still-failing ID to ``fetch_athlete_data`` on a
    thread pool, then collects successes and failures from the queues once
    all workers have finished.

    Args:
        failed_ids: Iterable of athlete IDs to retry.
        max_retries: Maximum number of retry rounds.
        delay: Seconds to wait between two consecutive rounds.

    Returns:
        A ``(retry_results, failed_ids)`` tuple: the list of profile dicts
        recovered across all rounds, and the list of IDs that still failed.
    """
    retry_results = []
    failed_ids = list(failed_ids)
    failed_queue = Queue()

    for retry_count in range(max_retries):
        print(f"Retry attempt {retry_count + 1} for {len(failed_ids)} failed athlete IDs")
        data_queue = Queue()
        still_failed = []

        with ThreadPoolExecutor(max_workers=20) as executor:
            retry_futures = {
                executor.submit(fetch_athlete_data, athlete_id, data_queue, failed_queue): athlete_id
                for athlete_id in failed_ids
            }

            for future in as_completed(retry_futures):
                try:
                    future.result()
                except Exception as e:
                    # The worker raised instead of reporting through
                    # failed_queue: keep the ID so it is not silently lost.
                    athlete_id = retry_futures[future]
                    logging.error(f"Error during retry for athlete ID {athlete_id}: {e}")
                    still_failed.append(athlete_id)

        # All workers are done here (the executor has shut down), so the
        # queues can be drained without racing against producers.
        while not data_queue.empty():
            retry_results.append(data_queue.get())

        while not failed_queue.empty():
            still_failed.append(failed_queue.get())

        failed_ids = still_failed
        if not failed_ids:
            break  # Nothing left to retry

        # Pause only when another round follows; sleeping after the final
        # attempt would just delay the return.
        if retry_count < max_retries - 1:
            time.sleep(delay)

    if failed_ids:
        print(f"Final failed athlete IDs after {max_retries} retries: {failed_ids}")

    return retry_results, failed_ids


In [None]:
def scrape_athletes_parallel(start_id, end_id, max_workers=25):
    """Fetch athlete profiles for every ID in ``[start_id, end_id]`` in parallel.

    Each ID is submitted to ``fetch_athlete_data`` on a thread pool. IDs that
    fail are handed to ``retry_failed_athlete_info`` once the first pass ends.

    Args:
        start_id: First athlete ID to fetch (inclusive).
        end_id: Last athlete ID to fetch (inclusive).
        max_workers: Size of the thread pool used for the first pass.

    Returns:
        A ``(athletes_info, failed_ids)`` tuple: the list of profile dicts
        that were fetched, and the list of IDs that still failed after the
        retries.
    """
    athletes_info = []
    failed_ids = []
    data_queue = Queue()
    failed_queue = Queue()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its athlete ID so failures can be attributed.
        futures = {
            executor.submit(fetch_athlete_data, athlete_id, data_queue, failed_queue): athlete_id
            for athlete_id in range(start_id, end_id + 1)
        }

        for future in as_completed(futures):
            try:
                future.result()
            except Exception as e:
                # The worker raised instead of reporting through failed_queue:
                # record the ID so it is retried rather than silently dropped.
                athlete_id = futures[future]
                logging.error(f"Error during scraping athlete ID {athlete_id}: {e}")
                failed_ids.append(athlete_id)

    # All workers are done here, so the queues can be drained safely.
    while not data_queue.empty():
        athletes_info.append(data_queue.get())

    while not failed_queue.empty():
        failed_ids.append(failed_queue.get())

    # Retry for failed athlete IDs
    if failed_ids:
        retry_results, failed_ids = retry_failed_athlete_info(failed_ids)
        athletes_info.extend(retry_results)  # Add successful retries

    return athletes_info, failed_ids


In [13]:
# Run the profile scrape over the whole ID range and tabulate the records
athletes_info_list, failed_ids = scrape_athletes_parallel(start_id=start_id, end_id=end_id)
athletes_info_df = pd.DataFrame(data=athletes_info_list)

# Write the table to CSV, leaving out the DataFrame index column
athletes_info_df.to_csv(path_or_buf='athlete_information.csv', index=False)

print(f"Scraped {len(athletes_info_df)} athletes")
if len(failed_ids) > 0:
    print(f"Failed to fetch data for {len(failed_ids)} athlete IDs after retries: {failed_ids}")


Retry attempt 1 for 236 failed athlete IDs
Retry attempt 2 for 202 failed athlete IDs
Final failed athlete IDs after 2 retries: [1861, 1460, 2130, 332, 3423, 548, 581, 3307, 449, 3325, 608, 1414, 34, 545, 2300, 1096, 1253, 453, 2127, 590, 3666, 3720, 3495, 4250, 3902, 4255, 4259, 4264, 4256, 4262, 4268, 3924, 4254, 4257, 4263, 4253, 4265, 4270, 4267, 4271, 4276, 4273, 4278, 4269, 4261, 4279, 4277, 4280, 4266, 4294, 4295, 4281, 4298, 4258, 4302, 4297, 4287, 4291, 4292, 4312, 4309, 4311, 4315, 4313, 4314, 4321, 4317, 4326, 4296, 4348, 4351, 4350, 4331, 4288, 4325, 4316, 4354, 4332, 4353, 4356, 4352, 4349, 4335, 4355, 4357, 4346, 4362, 4897, 5680, 4318, 4671, 5867, 5864, 5866, 6186, 5967, 5519, 5899, 5553, 7249, 7844, 7491, 8727, 9340, 8366, 8038, 10336, 4361, 10828, 9807, 11003, 10042, 10045, 9780, 10146, 10174, 10332, 9712, 10592, 10341, 10794, 11183, 10548, 11002, 11236, 7370, 11392, 11922, 11649, 11923, 11927, 11532, 11539, 11930, 11925, 11926, 11929, 11407, 11924, 11928, 11933, 11934

## Athlete Results

In [None]:
# Route ERROR-level (and above) records to this section's log file.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger; without it basicConfig() silently does nothing once logging has been
# configured earlier in the same kernel, so 'scraping_errors.log' would never
# receive the errors logged by the results scraper.
logging.basicConfig(
    filename='scraping_errors.log',
    level=logging.ERROR,
    format='%(asctime)s %(levelname)s: %(message)s',
    force=True,
)

def fetch_athlete_results(athlete_id, results_queue, failed_queue):
    """Download one athlete's competition results and enqueue the outcome.

    Sends a GET request for ``athlete_id`` to the IFSC results API. On an
    HTTP 200 response, a list with one dict per entry of the response's
    ``all_results`` field is put on ``results_queue`` (an empty list when the
    field is absent). On any other status, or if any exception is raised
    while requesting or decoding, the failure is logged and ``athlete_id`` is
    put on ``failed_queue``.

    Args:
        athlete_id: ID of the athlete to fetch.
        results_queue: Queue receiving one list of result dicts per athlete.
        failed_queue: Queue receiving the IDs of athletes that failed.
    """
    # NOTE(review): the CSRF token and session cookie are hard-coded here.
    headers = {
        'X-Csrf-Token': 'QsiFWuxEY1S9h_-dQgRA_7S5w9uvvmXsjq56QbTPw4i_g_XR68rMCBFFhW6HngBRtHskfN5yjX8GQmawqs8BlQ',
        'Referer': 'https://ifsc.results.info',
        'Cookie': 'session_id=_verticallife_resultservice_session=6RHN3xZrXnftTiScNfSHg7BVvuebLzGAmC9P5vIpzdySn2vG7VwQpjSZRDHug%2BPKCWlkt831HjLvHsPoVKrzTGsPVR6mqSOtjHB%2Bwht%2Bj39KxYO%2FJlaU6zmh8VhNFEl9bXHiOlPGk8AxnZqiBSYKTxJFCqh34nqdurXfFDcsRnbEtYCixcOdx%2F32E4zYGLVw7DSXXIKOVTUivS43UJZq5zDWPctX95UWm%2FD7%2B6UYT2s0B%2B3XJVPgjMWCMR%2FVZs%2FQC45Gjm4uCpHHe8Yt73nM3J%2Br43V1HuHGSvRpRczrJ4QdovlJHDEpg4rjUA%3D%3D',
    }
    url = f"https://ifsc.results.info/api/v1/athletes/{athlete_id}"

    try:
        # Bound the request: without a timeout a stalled connection would
        # block this worker thread indefinitely. 120 s matches the timeout
        # used by fetch_athlete_data.
        response = requests.get(url, headers=headers, timeout=120)

        if response.status_code == 200:
            athlete_data = response.json()
            results = []

            for result in athlete_data.get('all_results', []):
                # Use `.get()` with default values to avoid KeyErrors if fields are missing
                results.append({
                    'athlete_id': athlete_id,
                    'rank': result.get('rank', None),  # Use None if 'rank' is missing
                    'discipline': result.get('discipline', None),
                    'season': result.get('season', None),
                    'date': result.get('date', None),
                    'event_id': result.get('event_id', None),
                    'event_location': result.get('event_location', None),
                    'd_cat': result.get('d_cat', None),
                })
            # One queue item per athlete: the whole list of that athlete's results.
            results_queue.put(results)
        else:
            logging.error(f"Failed to fetch athlete ID {athlete_id}: Status {response.status_code}, Reason: {response.reason}")
            failed_queue.put(athlete_id)

    except Exception as e:
        logging.error(f"Error fetching results for athlete ID {athlete_id}: {e}")
        failed_queue.put(athlete_id)

In [None]:
def retry_failed_athletes(failed_ids, max_retries=3, delay=2):
    """Re-fetch competition results for athletes whose earlier attempt failed.

    Each round submits every still-failing ID to ``fetch_athlete_results`` on
    a thread pool, then collects successes and failures from the queues once
    all workers have finished.

    Args:
        failed_ids: Iterable of athlete IDs to retry.
        max_retries: Maximum number of retry rounds.
        delay: Seconds to wait between two consecutive rounds.

    Returns:
        A ``(retry_results, failed_ids)`` tuple: the flat list of result
        dicts recovered across all rounds, and the list of IDs that still
        failed.
    """
    retry_results = []
    failed_ids = list(failed_ids)
    failed_queue = Queue()

    for retry_count in range(max_retries):
        print(f"Retry attempt {retry_count + 1} for {len(failed_ids)} failed athlete IDs")
        results_queue = Queue()
        still_failed = []

        with ThreadPoolExecutor(max_workers=20) as executor:
            retry_futures = {
                executor.submit(fetch_athlete_results, athlete_id, results_queue, failed_queue): athlete_id
                for athlete_id in failed_ids
            }

            for future in as_completed(retry_futures):
                try:
                    future.result()
                except Exception as e:
                    # The worker raised instead of reporting through
                    # failed_queue: keep the ID so it is not silently lost.
                    athlete_id = retry_futures[future]
                    logging.error(f"Error during retry for athlete ID {athlete_id}: {e}")
                    still_failed.append(athlete_id)

        # All workers are done here (the executor has shut down), so the
        # queues can be drained without racing against producers.
        while not results_queue.empty():
            # Each queue item is one athlete's list of results; flatten it.
            retry_results.extend(results_queue.get())

        while not failed_queue.empty():
            still_failed.append(failed_queue.get())

        failed_ids = still_failed
        if not failed_ids:
            break  # Nothing left to retry

        # Pause only when another round follows; sleeping after the final
        # attempt would just delay the return.
        if retry_count < max_retries - 1:
            time.sleep(delay)

    if failed_ids:
        print(f"Final failed athlete IDs after {max_retries} retries: {failed_ids}")

    return retry_results, failed_ids


In [None]:
def scrape_athlete_results_parallel(start_id, end_id, max_workers=60):
    """Fetch competition results for every athlete ID in ``[start_id, end_id]``.

    Each ID is submitted to ``fetch_athlete_results`` on a thread pool. IDs
    that fail are handed to ``retry_failed_athletes`` once the first pass
    ends.

    Args:
        start_id: First athlete ID to fetch (inclusive).
        end_id: Last athlete ID to fetch (inclusive).
        max_workers: Size of the thread pool used for the first pass.

    Returns:
        A ``(athlete_results, failed_ids)`` tuple: the flat list of result
        dicts that were fetched, and the list of IDs that still failed after
        the retries.
    """
    athlete_results = []
    failed_ids = []
    results_queue = Queue()
    failed_queue = Queue()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its athlete ID so failures can be attributed.
        futures = {
            executor.submit(fetch_athlete_results, athlete_id, results_queue, failed_queue): athlete_id
            for athlete_id in range(start_id, end_id + 1)
        }

        for future in as_completed(futures):
            try:
                future.result()
            except Exception as e:
                # The worker raised instead of reporting through failed_queue:
                # record the ID so it is retried rather than silently dropped.
                athlete_id = futures[future]
                logging.error(f"Error during scraping athlete ID {athlete_id}: {e}")
                failed_ids.append(athlete_id)

    # All workers are done here, so the queues can be drained safely.
    while not results_queue.empty():
        # Each queue item is one athlete's list of results; flatten it.
        athlete_results.extend(results_queue.get())

    while not failed_queue.empty():
        failed_ids.append(failed_queue.get())

    # Retry for failed athlete IDs
    if failed_ids:
        retry_results, failed_ids = retry_failed_athletes(failed_ids)
        athlete_results.extend(retry_results)  # Add successful retries

    return athlete_results, failed_ids

In [10]:
# Run the results scrape over the whole ID range and tabulate the rows
athlete_results_list, failed_ids = scrape_athlete_results_parallel(start_id=start_id, end_id=end_id)
athlete_results_df = pd.DataFrame(data=athlete_results_list)

# Write the table to CSV, leaving out the DataFrame index column
athlete_results_df.to_csv(path_or_buf='athlete_results.csv', index=False)

print(f"Scraped results for {len(athlete_results_df)} athlete events")
if len(failed_ids) > 0:
    print(f"Failed to fetch data for {len(failed_ids)} athlete IDs after retries: {failed_ids}")


Retry attempt 1 for 221 failed athlete IDs
Retry attempt 2 for 202 failed athlete IDs
Retry attempt 3 for 202 failed athlete IDs
Final failed athlete IDs after 3 retries: [545, 3325, 34, 332, 2300, 548, 1414, 2127, 449, 590, 3423, 1460, 2130, 581, 1253, 1096, 3307, 453, 608, 1861, 3666, 3495, 3902, 3924, 3720, 4268, 4254, 4256, 4258, 4259, 4257, 4253, 4262, 4250, 4264, 4255, 4266, 4267, 4261, 4263, 4265, 4270, 4269, 4287, 4278, 4273, 4277, 4279, 4280, 4276, 4292, 4291, 4296, 4298, 4297, 4271, 4294, 4281, 4288, 4295, 4309, 4313, 4302, 4315, 4318, 4317, 4311, 4312, 4321, 4332, 4326, 4346, 4335, 4316, 4351, 4314, 4331, 4325, 4350, 4348, 4352, 4349, 4356, 4354, 4353, 4355, 4671, 4361, 4897, 5519, 4362, 5867, 5967, 5553, 5899, 5680, 4357, 5866, 6186, 5864, 7491, 8366, 7249, 7370, 7844, 9340, 8727, 8038, 9780, 9807, 10042, 10174, 10045, 10146, 10336, 9712, 10341, 10548, 10592, 10332, 10794, 10828, 11183, 11002, 11236, 11392, 11003, 11539, 11407, 11649, 11925, 11928, 11924, 11532, 11922, 1193