# Collecting (month, average players, peak players) per game per month each year
Takes in an array of arrays that looks like: [ [rank, app_id], [rank, app_id], ... ]
Outputs: [ [app_id, month, avg_players, peak_players], ... ] 

In [None]:
# imports and globals
import csv
import time
import hashlib
from pathlib import Path 
from typing import List, Tuple, Union, Optional

import requests
import pandas as pd
from bs4 import BeautifulSoup # for parsing HTML docs

# URL template for a game's SteamCharts page.
# Substitute the numeric app_id for the {app_id} placeholder,
# e.g. app_id=730 -> https://steamcharts.com/app/730
BASE_URL = "https://steamcharts.com/app/{app_id}"

# Apparently some sites block requests that do not carry a browser-like
# User-Agent, so a Mozilla-compatible one is declared here.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; SteamChartsYearScraper/1.0)"}

In [None]:

def _read_rank_appid_csv(csv_path: Union[str, Path]) -> List[Tuple[int, int]]:
    """
    Read rank/app_id pairs from a CSV file.

    Schema validation lives here so that callers can assume every returned
    pair is already a pair of integers.

    Required CSV columns (extra columns are ignored):
    - rank
    - app_id

    Args:
    - csv_path: path to the CSV file, as a string or Path.

    Returns:
    - list of tuples in file order: [(rank, app_id), ...]

    Raises:
    - ValueError: if the header is missing or lacks a required column, or if
      a row's rank/app_id cannot be parsed as an integer. For a bad row the
      underlying conversion error is attached as __cause__.
    - OSError: propagated from opening the file (e.g. file not found).
    """
    csv_path = Path(csv_path)
    pairs: List[Tuple[int, int]] = []

    # utf-8-sig handles a potential BOM from Excel-exported CSVs
    with csv_path.open("r", newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        required = {"rank", "app_id"}

        # fieldnames is None when the file has no header row at all
        if reader.fieldnames is None or not required.issubset(reader.fieldnames):
            raise ValueError(
                f"CSV must contain columns {required}. Found: {reader.fieldnames}"
            )

        for row in reader:
            try:
                rank = int(row["rank"])
                app_id = int(row["app_id"])
            except (TypeError, ValueError) as err:
                # TypeError: a short row leaves the missing column as None.
                # ValueError: the cell text is not an integer.
                # reader.line_num is the physical line just consumed, so the
                # message stays accurate even though DictReader silently
                # skips blank lines (a counter over rows would drift).
                raise ValueError(
                    f"Invalid rank/app_id at CSV line {reader.line_num}: {row}"
                ) from err
            pairs.append((rank, app_id))

    return pairs


"""
collect_steamcharts_year_data(...)

Inputs:
- input_csv: file with columns rank, app_id (250 rows expected)
- year: target year, e.g. 2018 or 2023
- output_csv: optional file path to save result
- cache_dir: folder for permanently cached game pages
- use_cache: if True, reuse cached HTML when available
- request_delay_sec: polite delay between real (non-cached) requests (steamcharts, I am not a bot, except I am, but please be nice and let me see your data :D)

For each app_id
- builds URL: https://steamcharts.com/app/{app_id}
- fetches HTML
- parses row for target year

Failure states handled:
- request error 
- parse error 
- no_data_for_year

Returns DataFrame with:
- rank, app_id, month, avg_players, peak_players, status(is there info or not)

Output columns: 
- rank 
- app_id
- month 
- avg_players
- peak_players
- status:
    * ok
    * no_data_for_year
    * request_error
    * parse_error 
"""
