# Collecting (month, average players, peak players) per game per month each year
Takes in an array of arrays that looks like: [ [rank, app_id], [rank, app_id], ... ]
Outputs: [ [app_id, month, avg_players, peak_players], ... ] 

In [None]:
# imports and globals
import csv
import hashlib
import math
import time
from pathlib import Path
from typing import Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup  # for parsing HTML docs

# Base URL pattern for SteamCharts game pages.
# The app id is injected into the {appid} placeholder,
# e.g. appid=730 -> https://steamcharts.com/app/730
BASE_URL = "https://steamcharts.com/app/{appid}"

# Apparently some sites block requests that do not provide a browser-like user agent.
# This dict is passed as the request headers on every live fetch.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; SteamChartsYearScraper/1.0)"
}

In [4]:
# Some utility helpers! ==========================================================================

def clean_num(value: Optional[str]) -> Optional[float]:
    """
    Convert numeric text to a finite float.

    Handles:
    - commas: "12,345.6" -> 12345.6
    - None / blanks / dashes ("-", en dash, em dash) -> None
    - invalid values ("abc") -> None
    - non-finite values ("nan", "inf", overflow such as "1e999") -> None

    Args:
    - value: raw cell text; non-string values are converted with str() first.

    Returns:
    - The parsed float, or None when the text does not hold a usable number.
    """
    if value is None:
        return None

    text = str(value).strip().replace(",", "")

    # Dash placeholders are written as escapes so the literals survive
    # any re-encoding of this source file.
    if text in {"", "-", "\u2013", "\u2014"}:
        return None

    try:
        number = float(text)
    except ValueError:
        return None

    # float() happily accepts "nan" / "inf"; those are not real player counts.
    return number if math.isfinite(number) else None


def build_game_url(appid: int) -> str:
    """
    Return the SteamCharts page URL for a single app id.

    The id is coerced with int() before being substituted into BASE_URL,
    so appid=730 yields "https://steamcharts.com/app/730".
    """
    numeric_id = int(appid)
    return BASE_URL.format(appid=numeric_id)



def cache_key_for_url(url: str) -> str:
    """
    Derive a deterministic cache filename key from a URL.

    The URL is UTF-8 encoded and hashed with md5; the hex digest is returned.

    Why this exists:
    - Raw URL text may not be ideal as a filename.
    - The hex digest is short, stable across runs, and filesystem-safe.
    """
    url_bytes = url.encode("utf-8")
    hasher = hashlib.md5()
    hasher.update(url_bytes)
    return hasher.hexdigest()



# Input and loading validation ==================================================================


def load_input_csv(file_path: Path) -> pd.DataFrame:
    """
    Load and validate one input CSV.
    (Probably not necessary, but it's good practice just in case.)

    Required columns:
    - rank
    - name
    - appid

    Any other columns in the file are ignored.

    Returns:
    - Cleaned DataFrame with exactly the columns rank:int, name:str, appid:int,
      sorted by rank with a fresh 0..n-1 index.
    - Rows whose name is missing or blank are dropped.
    - Duplicate (rank, appid) pairs are collapsed to their first occurrence.

    Raises:
    - ValueError if a required column is missing.
    - rank/appid values that cannot be converted to integers make pandas
      raise during conversion (fail loudly rather than guess).

    Why this exists:
    - Fail fast with clear errors if the schema is wrong.
    - Avoid repeated type conversions elsewhere.
    """
    # utf-8-sig transparently drops a BOM if the file has one.
    df = pd.read_csv(file_path, encoding="utf-8-sig")

    # Validate required columns
    required = ["rank", "name", "appid"]
    missing = sorted(set(required) - set(df.columns))
    if missing:
        raise ValueError(
            f"{file_path.name} is missing required columns: {missing}. "
            f"Found columns: {list(df.columns)}"
        )

    # Keep only needed columns in predictable order
    df = df[required].copy()

    # Convert numeric fields and fail LOUDLY if invalid
    df["rank"] = pd.to_numeric(df["rank"], errors="raise").astype(int)
    df["appid"] = pd.to_numeric(df["appid"], errors="raise").astype(int)

    # Empty CSV cells arrive as NaN. Remember which names were really present
    # BEFORE casting, because astype(str) would turn NaN into the text "nan"
    # and the blank-name filter below would then let those rows through.
    has_name = df["name"].notna()
    df["name"] = df["name"].astype(str).str.strip()

    # Remove rows with missing or empty names
    df = df[has_name & (df["name"] != "")]

    # Drop duplicates on rank+appid
    df = df.drop_duplicates(subset=["rank", "appid"])

    # Sort by rank for deterministic processing; a stable sort keeps rows
    # with equal rank in their original file order.
    return df.sort_values("rank", kind="stable").reset_index(drop=True)


# Network + cache =================================================================================


def get_game_page_html(
    appid: int,
    session: requests.Session,
    cache_dir: Path,
    use_cache: bool = True,
    request_delay_sec: float = 0.6,
) -> str:
    """
    Return HTML for one game page, using cache when available.

    Flow:
    1) Build game URL from appid
    2) Compute cache filename from URL hash
    3) If cache exists and use_cache=True -> return cached HTML
    4) Else fetch from network, sleep briefly, save cache, return HTML

    Args:
    - appid: Steam app id of the game page to fetch.
    - session: requests session used for the live request.
    - cache_dir: directory holding cached pages (created if missing).
    - use_cache: when False, always fetch live (the cache file is still refreshed).
    - request_delay_sec: pause after every live request, successful or not.

    Raises:
    - Whatever session.get() raises on a failed request, and the HTTP error
      raised by raise_for_status() on a 4xx/5xx response. Nothing is written
      to the cache in those cases.
    """
    # 1) Ensure cache_dir exists
    cache_dir.mkdir(parents=True, exist_ok=True)

    # 2) Compute URL and cache filename
    url = build_game_url(appid)
    cache_file = cache_dir / f"{cache_key_for_url(url)}.html"

    # 3) If cache hit and use_cache: read + return
    if use_cache and cache_file.exists():
        return cache_file.read_text(encoding="utf-8")

    # Live request path
    try:
        resp = session.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
        html = resp.text
    finally:
        # Don't attack our lord and savior GabeN with rapid-fire requests.
        # Sleeping in `finally` keeps the pacing even when the request fails,
        # so a run of errors does not turn into back-to-back hits.
        time.sleep(request_delay_sec)

    # Save to cache for future runs. Write to a sibling temp file and rename,
    # so an interrupted write never leaves a truncated page that later looks
    # like a valid cache hit.
    tmp_file = cache_file.with_name(cache_file.name + ".tmp")
    tmp_file.write_text(html, encoding="utf-8")
    tmp_file.replace(cache_file)

    return html


# HTML parsing ===================================================================================


def parse_year_data_from_html(html: str, target_year: int) -> list[dict]:
    """
    Extract the monthly player stats for one year from a SteamCharts app page.

    Input:
    - html: raw page HTML
    - target_year: year to keep (e.g., 2021)

    Output: a list of dicts, ordered by month ascending, each shaped like
    {
      "month": "YYYY-MM",
      "avg_players": float|None,
      "peak_players": float|None
    }

    Rows are skipped when they have fewer than 5 cells, are the
    "Last 30 Days" summary, have a month label that is not "<Month> <Year>",
    or belong to a different year.

    Why this exists:
    - Pure parser function (HTML in -> structured rows out).
    - Easy to test independently from I/O.
    """
    document = BeautifulSoup(html, "html.parser")
    monthly = []

    for table_row in document.select("table.common-table tbody tr"):
        cells = table_row.find_all("td")

        # Expected layout: Month | Avg. Players | Gain | Gain % | Peak Players
        if len(cells) < 5:
            continue

        label = cells[0].get_text(" ", strip=True)
        if label.lower() == "last 30 days":
            continue

        # Anything that is not e.g. "March 2021" coerces to NaT and is dropped.
        stamp = pd.to_datetime(label, format="%B %Y", errors="coerce")
        if pd.isna(stamp):
            continue
        if int(stamp.year) != int(target_year):
            continue

        avg_raw = cells[1].get_text(" ", strip=True)
        peak_raw = cells[4].get_text(" ", strip=True)

        entry = {
            "month": stamp.strftime("%Y-%m"),
            "avg_players": clean_num(avg_raw),
            "peak_players": clean_num(peak_raw),
        }
        monthly.append(entry)

    # "YYYY-MM" strings sort chronologically.
    return sorted(monthly, key=lambda entry: entry["month"])



# year collection ====================================================================================


def collect_one_year(
    input_csv: Path,
    year: int,
    cache_dir: Path,
    use_cache: bool = True,
    request_delay_sec: float = 0.6,
) -> pd.DataFrame:
    """
    Collect SteamCharts data for one year's input list.

    NOTE: not implemented yet. The body is only a placeholder, so calling
    this function currently returns None instead of the annotated DataFrame.
    Everything below describes the planned behavior.

    Planned steps:
    - Load CSV (rank, name, appid)
    - For each game:
      - Fetch or read cached HTML
      - Parse target-year monthly rows
      - Emit result rows with status labels

    Planned status values:
    - "ok"               : parsed monthly rows exist
    - "no_data_for_year" : page loaded, but no rows for that year
    - "request_error"    : failed HTTP request
    - "parse_error"      : page fetched but parse failed
    """
    # TODO: implement according to this plan
    # 1) games_df = load_input_csv(input_csv)
    # 2) Initialize out_rows = []
    # 3) Create requests.Session()
    # 4) Loop games_df rows:
    #    a) read rank, name, appid
    #    b) try get_game_page_html(...)
    #       - on failure append one request_error row, continue
    #    c) try parse_year_data_from_html(...)
    #       - on failure append one parse_error row, continue
    #    d) if parsed rows empty:
    #          append one no_data_for_year row
    #       else:
    #          append one row per month with status="ok"
    # 5) Build DataFrame with fixed column order
    # 6) Return DataFrame
    pass


def collect_year_range(
    start_year: int,
    end_year: int,
    input_dir: Path,
    input_pattern: str,      # e.g. "{year}_top250_ids.csv"
    output_dir: Path,
    cache_dir: Path,
    use_cache: bool = True,
    request_delay_sec: float = 0.6,
    write_combined: bool = True,
) -> pd.DataFrame:
    """
    Run collection across a year range using predictable filenames.

    NOTE: not implemented yet. The body is only a placeholder, so calling
    this function currently returns None and writes no files. Everything
    below describes the planned behavior.

    Example:
    - input_pattern "{year}_top250_ids.csv"
    - for year=2018 -> "2018_top250_ids.csv"

    Planned output:
    - Writes one CSV per year to output_dir
    - Optionally writes one combined CSV
    - Returns combined DataFrame (or empty DataFrame if nothing processed)
    """
    # TODO: implement according to this plan
    # 1) Ensure output_dir exists
    # 2) all_parts = []
    # 3) for year in range(start_year, end_year + 1):   (end_year is inclusive)
    #    a) build input_csv path from pattern
    #    b) if missing file: print skip and continue
    #    c) year_df = collect_one_year(...)
    #    d) write year_df to output_dir / f"steamcharts_{year}_top250.csv"
    #    e) append year_df to all_parts
    # 4) if all_parts empty: return empty DataFrame with expected columns
    # 5) combined_df = concat all_parts
    # 6) if write_combined: save combined CSV
    # 7) return combined_df
    pass

# TESTING ===================================================================================================
def _test_load_input_csv():
    """
    Manual smoke test: load "2018_top250_ids.csv" from the working directory
    and print its row count, column names, and first 10 rows.
    """
    loaded = load_input_csv(Path("2018_top250_ids.csv"))

    print("Loaded rows:", len(loaded))
    print("Columns:", loaded.columns.tolist())

    preview = loaded.head(10).to_string(index=False)
    print(preview)


def _test_fetch_html():
    """
    Manual smoke test for the fetch + cache path.

    Requests the page for appid 730 twice with caching enabled, then prints
    both HTML lengths, whether the two results match, and how many .html
    files sit in the test cache directory.
    """
    cache_dir = Path("steamcharts_cache_test")

    with requests.Session() as session:
        # Both calls use exactly the same arguments.
        fetch_args = {
            "appid": 730,  # CS
            "session": session,
            "cache_dir": cache_dir,
            "use_cache": True,
            "request_delay_sec": 0.6,
        }

        # With an empty cache this call goes to the network and stores the page.
        first_html = get_game_page_html(**fetch_args)
        print("First fetch length:", len(first_html))

        # This call should be served from the cache file written above.
        second_html = get_game_page_html(**fetch_args)
        print("Second fetch length:", len(second_html))

    print("Same content:", first_html == second_html)
    cached_pages = list(cache_dir.glob("*.html"))
    print("Cache files:", len(cached_pages))


def _test_parse_html():
    """
    Manual smoke test for the parser.

    Gets the page for appid 730 (from the test cache when present), parses
    the 2021 monthly rows, and prints the row count plus up to 12 rows.
    """
    appid, target_year = 730, 2021
    test_cache = Path("steamcharts_cache_test")

    with requests.Session() as session:
        # use_cache=True: reuses the cached page if an earlier run stored it.
        page_html = get_game_page_html(
            appid=appid,
            session=session,
            cache_dir=test_cache,
            use_cache=True,
            request_delay_sec=0.6,
        )

    monthly_rows = parse_year_data_from_html(page_html, target_year=target_year)

    print(f"Parsed rows for appid={appid}, year={target_year}: {len(monthly_rows)}")
    print("Preview:")
    for row in monthly_rows[:12]:
        print(row)

# MAIN =======================================================================================================


def main():
    """
    Central run configuration.

    Keep all user-editable settings here so the rest of the code
    stays stable and easy to reason about.

    NOTE: not wired up yet. The settings below are assigned but never
    used, and the function returns None without collecting anything.
    """
    # TODO: configure these
    start_year = 2018
    end_year = 2023
    input_dir = Path(".")
    # "{year}" is meant to be filled in per year, e.g. "2018_top250_ids.csv"
    input_pattern = "{year}_top250_ids.csv"
    output_dir = Path("outputs")
    cache_dir = Path("steamcharts_cache")

    # TODO: call collect_year_range(...) with the settings above
    # TODO: print small preview (head) and summary counts
    pass


# Entry point. Currently runs the parser smoke test rather than main().
if __name__ == "__main__":
    _test_parse_html()

Parsed rows for appid=730, year=2021: 12
Preview:
{'month': '2021-01', 'avg_players': 743209.66, 'peak_players': 1124553.0}
{'month': '2021-02', 'avg_players': 741013.24, 'peak_players': 1123485.0}
{'month': '2021-03', 'avg_players': 740927.82, 'peak_players': 1198581.0}
{'month': '2021-04', 'avg_players': 723346.52, 'peak_players': 1148077.0}
{'month': '2021-05', 'avg_players': 659888.89, 'peak_players': 1087197.0}
{'month': '2021-06', 'avg_players': 549347.08, 'peak_players': 929940.0}
{'month': '2021-07', 'avg_players': 506067.36, 'peak_players': 763523.0}
{'month': '2021-08', 'avg_players': 512081.96, 'peak_players': 802544.0}
{'month': '2021-09', 'avg_players': 512350.92, 'peak_players': 942519.0}
{'month': '2021-10', 'avg_players': 512435.85, 'peak_players': 864966.0}
{'month': '2021-11', 'avg_players': 548161.67, 'peak_players': 935593.0}
{'month': '2021-12', 'avg_players': 546614.19, 'peak_players': 950586.0}
