In [None]:
from pathlib import Path
import sys
import polars as pl

# ── Set up imports ──────────────────────────────────────────────
# Assumes the kernel's working directory is <repo>/notebooks, so the repo
# root is its parent. The repo root is placed first on sys.path so the
# `pipeline` package imported just below is found.
nb_dir   = Path.cwd()                    # …/notebooks
repo_dir = nb_dir.parent
# Only prepend when the repo root is not already first: re-running this cell
# must not stack duplicate entries on sys.path.
if sys.path[:1] != [str(repo_dir)]:
    sys.path.insert(0, str(repo_dir))

from pipeline.datasets import (
    SINGLE_FILE_ASSETS_NAMES,
)
from pipeline.utils.polars_wrapper import PolarsWrapper

# ── Create the context ─────────────────────────────────────────
# One shared PolarsWrapper instance for the whole notebook; the cells below
# go through it via `bulk_register_data`, `show_tables`, `lazy` and
# `run_query`.
ctx = PolarsWrapper()


In [None]:
# Register the cleaned open-data tables on the shared context.
# BASE_PATH is passed alongside `repo_root`, so it is presumably resolved
# relative to the repo root — confirm against PolarsWrapper.bulk_register_data.
# Presumably one table is registered per name in SINGLE_FILE_ASSETS_NAMES,
# reading the files matched by the "*.parquet" wildcard — TODO confirm.
BASE_PATH = "data/opendata/clean"
ctx.bulk_register_data(
    repo_root=repo_dir,
    base_path=BASE_PATH,
    table_names=SINGLE_FILE_ASSETS_NAMES,
    wildcard="*.parquet",
)


In [None]:
# Presumably lists the tables currently registered on `ctx` — confirm
# against PolarsWrapper.show_tables.
ctx.show_tables()

In [None]:
# Handle on the "nfl_pbp_2024" table that the query cells below build on.
# Those cells chain .filter/.group_by/.agg/.sort/.collect on it, so
# `ctx.lazy` presumably returns a polars LazyFrame — confirm against
# PolarsWrapper.lazy.
lf = ctx.lazy("nfl_pbp_2024")

In [None]:
# 1. Total yards gained by possession team in the 2024 season
# The query is assembled from named pieces and only executed by collect().
in_2024_season = pl.col("season") == 2024
team_yards_sum = pl.col("yards_gained").sum().alias("total_yards_2024")

total_yards_query = (
    lf.filter(in_2024_season)
      .group_by("posteam")
      .agg(team_yards_sum)
      .sort("total_yards_2024", descending=True)   # highest total first
)
df_total_yards = total_yards_query.collect()
print(df_total_yards)

In [None]:
# 2. Mean air yards (rounded to 2 decimals) per `pass_length` value, restricted
#    to rows with season == 2024, qtr == 1 and pass_attempt == 1
q1_pass_attempts_2024 = (
    (pl.col("season") == 2024)
    & (pl.col("qtr") == 1)
    & (pl.col("pass_attempt") == 1)
)
mean_air_yards_q1 = pl.col("air_yards").mean().round(2).alias("avg_air_yards_q1")

avg_air_query = (
    lf.filter(q1_pass_attempts_2024)
      .group_by("pass_length")
      .agg(mean_air_yards_q1)
      .sort("avg_air_yards_q1", descending=True)   # largest average first
)
df_avg_air = avg_air_query.collect()
print(df_avg_air)

In [None]:
# Row count of `nfl_pbp_ten_years`, run through the context's SQL entry point.
# NOTE(review): this is a different table from the "nfl_pbp_2024" handle used
# by the lazy queries above. It presumably has to be one of the names
# registered from SINGLE_FILE_ASSETS_NAMES for this query to resolve — confirm.
sql = """
SELECT COUNT(*) AS n FROM nfl_pbp_ten_years
"""
df_sql = ctx.run_query(sql)
print(df_sql)