In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# -------------------------
# Paths
# -------------------------
PROJECT_ROOT = Path.cwd()
if PROJECT_ROOT.name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent

DATA_PROCESSED = PROJECT_ROOT / "data_processed"
QA_DIR = PROJECT_ROOT / "qa"
QA_DIR.mkdir(exist_ok=True)

FILES_V4 = {
    "21-22": "pl_matchlist_21-22_v4.csv",
    "22-23": "pl_matchlist_22-23_v4.csv",
    "23-24": "pl_matchlist_23-24_v4.csv",
    "24-25": "pl_matchlist_24-25_v4.csv",
}

# Quick existence check
for season, fname in FILES_V4.items():
    p = DATA_PROCESSED / fname
    if not p.exists():
        raise FileNotFoundError(f"Missing file for {season}: {p.resolve()}")

print("✅ All v4 files found.")


✅ All v4 files found.


In [2]:
qa_rows = []
issues = []

for season, fname in FILES_V4.items():
    path = DATA_PROCESSED / fname
    df = pd.read_csv(path)

    # -------------------------
    # Required columns check
    # -------------------------
    required_cols = {"Season", "Team", "Date", "match_id", "points"}
    missing = required_cols - set(df.columns)
    if missing:
        issues.append({
            "season": season,
            "type": "missing_columns",
            "detail": f"Missing columns: {sorted(missing)}"
        })
        # Still continue to collect what we can
    # -------------------------
    # 1) 760 rows per season
    # -------------------------
    rows_ok = (len(df) == 760)

    # -------------------------
    # 2) 380 unique match_id per season
    # -------------------------
    unique_match_id = df["match_id"].nunique() if "match_id" in df.columns else np.nan
    unique_match_ok = (unique_match_id == 380)

    # -------------------------
    # 3) Each match_id appears exactly 2 times
    # -------------------------
    if "match_id" in df.columns:
        vc = df["match_id"].value_counts(dropna=False)
        match_id_eq_2 = int((vc == 2).sum())
        match_id_not_2 = int((vc != 2).sum())
        match_id_2_ok = (match_id_not_2 == 0)
    else:
        match_id_eq_2 = np.nan
        match_id_not_2 = np.nan
        match_id_2_ok = False

    # -------------------------
    # 4) Points per match: sum = 3 (win/loss) or 2 (draw)
    # -------------------------
    if {"match_id", "points"}.issubset(df.columns):
        pts_sum = df.groupby("match_id")["points"].sum()
        invalid_pts_matches = pts_sum[~pts_sum.isin([2, 3])]
        points_ok = (len(invalid_pts_matches) == 0)
        invalid_points_count = int(len(invalid_pts_matches))
    else:
        points_ok = False
        invalid_points_count = np.nan

    # -------------------------
    # 5) Each team has 38 games (per season)
    # -------------------------
    if "Team" in df.columns:
        games_per_team = df["Team"].value_counts()
        teams_with_38 = int((games_per_team == 38).sum())
        teams_not_38 = games_per_team[games_per_team != 38]
        team_games_ok = (len(teams_not_38) == 0)
        teams_not_38_count = int(len(teams_not_38))
    else:
        teams_with_38 = np.nan
        team_games_ok = False
        teams_not_38_count = np.nan
        teams_not_38 = pd.Series(dtype=int)

    # -------------------------
    # 6) Confirm 20 teams per season
    # -------------------------
    n_teams = df["Team"].nunique() if "Team" in df.columns else np.nan
    teams_ok = (n_teams == 20)

    # Collect season-level summary
    qa_rows.append({
        "season": season,
        "file": fname,
        "rows": len(df),
        "rows_ok_760": rows_ok,
        "unique_match_id": unique_match_id,
        "unique_match_ok_380": unique_match_ok,
        "match_id_eq_2": match_id_eq_2,
        "match_id_not_2": match_id_not_2,
        "match_id_all_eq_2": match_id_2_ok,
        "points_invalid_matches": invalid_points_count,
        "points_per_match_ok": points_ok,
        "unique_teams": n_teams,
        "teams_ok_20": teams_ok,
        "teams_with_38_games": teams_with_38,
        "teams_not_38_count": teams_not_38_count,
        "team_games_ok_38": team_games_ok,
    })

    # Log detailed issues (only if needed)
    if not rows_ok:
        issues.append({"season": season, "type": "rows_not_760", "detail": f"rows={len(df)}"})
    if not unique_match_ok:
        issues.append({"season": season, "type": "unique_match_id_not_380", "detail": f"unique_match_id={unique_match_id}"})
    if not match_id_2_ok and "match_id" in df.columns:
        bad = vc[vc != 2].head(10)
        issues.append({"season": season, "type": "match_id_count_not_2", "detail": f"examples={bad.to_dict()}"})
    if not points_ok and {"match_id","points"}.issubset(df.columns):
        sample = invalid_pts_matches.head(10).to_dict()
        issues.append({"season": season, "type": "invalid_points_sum", "detail": f"examples={sample}"})
    if not teams_ok:
        issues.append({"season": season, "type": "teams_not_20", "detail": f"unique_teams={n_teams}"})
    if not team_games_ok and "Team" in df.columns:
        issues.append({"season": season, "type": "team_games_not_38", "detail": f"teams={teams_not_38.to_dict()}"})

qa_df = pd.DataFrame(qa_rows)
issues_df = pd.DataFrame(issues)

qa_df


Unnamed: 0,season,file,rows,rows_ok_760,unique_match_id,unique_match_ok_380,match_id_eq_2,match_id_not_2,match_id_all_eq_2,points_invalid_matches,points_per_match_ok,unique_teams,teams_ok_20,teams_with_38_games,teams_not_38_count,team_games_ok_38
0,21-22,pl_matchlist_21-22_v4.csv,760,True,380,True,380,0,True,0,True,20,True,20,0,True
1,22-23,pl_matchlist_22-23_v4.csv,760,True,380,True,380,0,True,0,True,20,True,20,0,True
2,23-24,pl_matchlist_23-24_v4.csv,760,True,380,True,380,0,True,0,True,20,True,20,0,True
3,24-25,pl_matchlist_24-25_v4.csv,760,True,380,True,380,0,True,0,True,20,True,20,0,True


In [3]:
summary_path = QA_DIR / "qa_summary_by_season.csv"
issues_path = QA_DIR / "qa_issues.csv"
notes_path = QA_DIR / "QA_NOTES.md"

qa_df.to_csv(summary_path, index=False, encoding="utf-8")
issues_df.to_csv(issues_path, index=False, encoding="utf-8")

# Build a human-readable markdown report
lines = []
lines.append("# QA Notes — Premier League Matchlist (21-22 to 24-25)\n")
lines.append("This QA validates the per-season integrity of the enriched datasets (`*_v4.csv`).\n")
lines.append("## Checks\n")
lines.append("- 760 rows per season (20 teams × 38 matches)\n")
lines.append("- 380 unique `match_id` per season\n")
lines.append("- Each `match_id` appears exactly 2 times (home team + away team perspectives)\n")
lines.append("- Points sum per match is 3 (win/loss) or 2 (draw)\n")
lines.append("- Each team has exactly 38 games per season\n")
lines.append("- Exactly 20 teams per season\n")

lines.append("\n## Summary table\n")
lines.append(qa_df.to_markdown(index=False))

if len(issues_df) == 0:
    lines.append("\n\n## Issues\n✅ No issues found.\n")
else:
    lines.append("\n\n## Issues\n")
    lines.append(issues_df.to_markdown(index=False))
    lines.append("\n\n> Note: Only the first examples are shown for very large issue sets.\n")

notes_path.write_text("\n".join(lines), encoding="utf-8")

print(f"✅ Saved: {summary_path}")
print(f"✅ Saved: {issues_path}")
print(f"✅ Saved: {notes_path}")


✅ Saved: C:\Users\tozes\Documents\IronHack\pl_21-25_analysis\qa\qa_summary_by_season.csv
✅ Saved: C:\Users\tozes\Documents\IronHack\pl_21-25_analysis\qa\qa_issues.csv
✅ Saved: C:\Users\tozes\Documents\IronHack\pl_21-25_analysis\qa\QA_NOTES.md


In [4]:
all_ok = (
    qa_df["rows_ok_760"].all()
    and qa_df["unique_match_ok_380"].all()
    and qa_df["match_id_all_eq_2"].all()
    and qa_df["points_per_match_ok"].all()
    and qa_df["teams_ok_20"].all()
    and qa_df["team_games_ok_38"].all()
)

print("✅ QA PASS" if all_ok else "❌ QA FAIL — check qa/qa_issues.csv and qa/QA_NOTES.md")
all_ok


✅ QA PASS


np.True_