In [80]:
import json
import random
from datetime import date, datetime, timedelta
from itertools import product
from time import sleep

import google.cloud.logging as cloud_logging
import requests
from dateutil.relativedelta import relativedelta
from google.cloud import storage

In [70]:
def generate_year_month_list(start_date: datetime, end_date: datetime):
    """Return every month from ``start_date`` to ``end_date`` as ``"YYYY/MM"`` strings.

    The range starts at the month containing ``start_date``. When ``end_date``
    falls after the 1st of its month, that month is treated as incomplete and
    the range stops at the previous month; when it falls exactly on the 1st,
    its month is included.

    Only the year and month of each argument are compared, so a time-of-day
    component (e.g. from ``datetime.today()``) cannot drop a month, and plain
    ``date`` objects are accepted as well.

    Args:
        start_date: Any moment inside the first month to include.
        end_date: Upper bound of the range (see the day-of-month rule above).

    Returns:
        list[str]: Month labels in chronological order; empty when the start
        month is later than the (adjusted) end month.
    """
    year, month = start_date.year, start_date.month
    last_year, last_month = end_date.year, end_date.month

    # Move the end to the last complete month, rolling back into December
    # of the previous year when stepping back from January
    if end_date.day > 1:
        last_month -= 1
        if last_month == 0:
            last_year, last_month = last_year - 1, 12

    date_list = []

    while (year, month) <= (last_year, last_month):
        date_list.append(datetime(year, month, 1).strftime("%Y/%m"))
        # Move to the next month
        month += 1
        if month > 12:
            year, month = year + 1, 1

    return date_list

def upload_json_to_gcs_bucket(bucket_name, object_name, data):
    """Serialize a response's JSON payload and store it as an object in a GCS bucket.

    Args:
        bucket_name: Name of the destination bucket.
        object_name: Name (path) of the object to create inside the bucket.
        data: Response-like object exposing ``.json()``, or ``None`` when the
            request that should have produced it failed.

    Returns:
        None. The outcome is reported with ``print``.
    """
    # exponential_backoff_request returns None once its retries are exhausted;
    # report it instead of failing with AttributeError on data.json()
    if data is None:
        print(f'Failure | No response to upload for {object_name} | Skipped GCS bucket: {bucket_name}')
        return

    # Serialize first so a malformed payload fails before any GCS call is made
    payload = json.dumps(data.json())

    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    blob = bucket.blob(object_name)
    blob.upload_from_string(payload, content_type="application/json")
    print(f'Success | Uploaded {object_name} to GCS bucket: {bucket_name}')
    
def exponential_backoff_request(url, headers, max_retries=5, base_delay=1, max_delay=30, timeout=30):
    """GET ``url``, retrying with exponential backoff until a 200 response arrives.

    Args:
        url: Address to request.
        headers: HTTP headers sent with every attempt.
        max_retries: Maximum number of attempts before giving up.
        base_delay: Delay in seconds used for the first retry; doubled on each
            subsequent retry.
        max_delay: Upper bound in seconds for a single wait.
        timeout: Seconds to wait for the server on each attempt, so a stalled
            connection cannot block forever.

    Returns:
        The response object of the first attempt that returned status 200, or
        ``None`` when every attempt failed.
    """
    retries = 0
    while retries < max_retries:
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as error:
            # Network-level failures (timeouts, dropped connections) are retried like bad statuses
            print(f"Request error | URL: {url} | {error}")
        else:
            if response.status_code == 200:
                return response

        # Exponential delay plus up to one second of random jitter, capped at max_delay
        wait_time = min(base_delay * (2 ** retries) + random.uniform(0, 1), max_delay)
        print(f"Retry {retries + 1}/{max_retries} | URL: {url} | Waiting {wait_time:.2f} seconds before retrying...")
        sleep(wait_time)
        retries += 1

    print("Max retries reached. Request failed.")
    return None

def list_files_in_gcs(bucket_name):
    """Collect the name of every object listed in the given GCS bucket.

    Args:
        bucket_name: Name of the bucket to list.

    Returns:
        list[str]: Object names in the order the listing yields them.
    """
    object_names = []
    for gcs_object in storage.Client().bucket(bucket_name).list_blobs():
        object_names.append(gcs_object.name)
    return object_names

def append_prefix_to_gcs_files(prefix, excluded_prefixes):
    """Rename objects in the bucket so that their names start with ``prefix/``.

    The bucket is taken from the module-level ``bucket_name`` variable, which
    must be defined before this function is called.

    Args:
        prefix: Folder-style prefix to put in front of each object name
            (without a trailing slash).
        excluded_prefixes: A single prefix or an iterable of prefixes whose
            objects must be left untouched. A trailing slash is optional:
            ``"leaderboards"`` and ``"leaderboards/"`` are equivalent.
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(excluded_prefixes, str):
        excluded_prefixes = [excluded_prefixes]

    # Normalize to exactly one trailing slash so only whole path segments match
    skip_prefixes = tuple(excluded.rstrip("/") + "/" for excluded in excluded_prefixes if excluded)

    client = storage.Client()
    bucket = client.bucket(bucket_name)
    # Consume the listing before renaming, as a precaution against objects
    # created by the renames showing up later in the same listing
    blobs = list(bucket.list_blobs())

    # The for loop will exclude any files that should not be targeted in the renaming
    for blob in blobs:
        old_name = blob.name
        if old_name.startswith(skip_prefixes):
            print(f"Skipping {old_name} | Excluded from renaming process")
            continue

        new_name = f"{prefix}/{old_name}"
        bucket.rename_blob(blob, new_name)
        print(f"Renamed {old_name} -> {new_name}")

    

In [82]:
# Date Variables
# 'default' -> rolling window covering the last 12 months
# 'manual'  -> the fixed window defined below
script_setting = 'manual'

if script_setting == 'default':
    end_date = datetime.today()
    start_date = end_date - relativedelta(months=12)
elif script_setting == 'manual':
    start_date = datetime(2024, 9, 1)
    end_date = datetime(2025, 2, 1)
else:
    # Fail with a clear message instead of a NameError on start_date below
    raise ValueError(f"Unknown script_setting: {script_setting!r} (expected 'default' or 'manual')")
year_month_list = generate_year_month_list(start_date, end_date)

# Input Variables
headers = {"User-Agent": "gcs_chess_ingestion.ipynb (Python 3.11) (username: filiplivancic; contact: filiplivancic@gmail.com)"}
project_name = 'checkmate-453316'
bucket_name = 'chess-api'

# GCS Endpoints
# Template only: `player` and `period` are not known in this cell, so this must not be an
# f-string. Fill it in per request with gcs_player_endpoint.format(player=..., period=...)
gcs_player_endpoint = "players/{player}/games/{period}"
gcs_leaderboard_endpoint = f"leaderboards/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

In [77]:
# Run this cell only when the GCS filename prefixes need to be amended
# Default boolean value should be False
is_renaming_gcs_files = False
if is_renaming_gcs_files:
    print('Changing GCS file prefixes')
    prefix = 'players'
    # Must be a list (a bare string would be iterated one character at a time);
    # no trailing slash, append_prefix_to_gcs_files adds it when matching
    excluded_prefixes = ['leaderboards']
    append_prefix_to_gcs_files(prefix, excluded_prefixes)


In [83]:
# Getting leaderboard of the current top players
print('Requesting the latest leaderboards')
leaderboards_url = 'https://api.chess.com/pub/leaderboards'
leaderboards_response = exponential_backoff_request(leaderboards_url, headers)

# The request helper returns None once its retries are exhausted; stop here because
# the following cells cannot run without this response
if leaderboards_response is None:
    raise RuntimeError(f'Failed to retrieve leaderboards from {leaderboards_url}')

upload_json_to_gcs_bucket(bucket_name, gcs_leaderboard_endpoint, leaderboards_response)

Requesting the latest leaderboards
Success | Uploaded leaderboards/2025-03-11_16-01-01 to GCS bucket: chess-api


In [49]:
# Find all possible chess formats being tracked on leaderboard
# The payload is parsed once here instead of on every loop iteration
leaderboards_data = leaderboards_response.json()
format_list = list(leaderboards_data.keys())

# Get all the top player names from each chess format
print('Retrieving the names of top chess players')

# Deduplicate the lower-cased usernames; sorted so the order is the same on every run
top_player_list = sorted({
    entry.get('username').lower()
    for form in format_list
    for entry in leaderboards_data.get(form)
})

Retrieving the names of top chess players


In [69]:
# Listing the current objects in the chess api storage bucket
gcs_file_list = list_files_in_gcs(bucket_name)
existing_objects = set(gcs_file_list)  # set for constant-time membership checks

# Cross product of usernames with the date period selected
# The object name is built per (player, period) pair so every element is distinct
player_date_permutations = [
    f"players/{player}/games/{period}"
    for player, period in product(top_player_list, year_month_list)
]

# Keep only the combos that do not exist in GCS yet
# (built as a new list: removing items from a list while iterating over it skips elements)
remaining_game_requests = [
    combo for combo in player_date_permutations if combo not in existing_objects
]

print(f"Total request combinations: {len(player_date_permutations)}")
print(f"Number of remaing requests: {len(remaining_game_requests)}")

Total request combinations: 7020
Number of remaing requests: 6691


In [None]:
# Iterating through each player/period combination and storing the monthly game archive in GCS
print('Requesting archived game data')
already_uploaded = set(gcs_file_list)  # objects found in the bucket by the listing cell
for player in top_player_list:
    for period in year_month_list:
        # Object name is built per combination so each archive gets its own object
        object_name = f"players/{player}/games/{period}"
        if object_name in already_uploaded:
            continue  # already stored in GCS, no need to request it again

        print(f'Requesting Game Data | player: {player} | period {period}')
        games_url = f'https://api.chess.com/pub/player/{player}/games/{period}'
        games_response = exponential_backoff_request(games_url, headers)
        if games_response is not None:
            upload_json_to_gcs_bucket(bucket_name, object_name, games_response)
        else:
            # The request helper gave up; keep going with the remaining combinations
            print(f'Skipping upload | player: {player} | period {period} | request failed')
        sleep(1)