In [1]:
from pathlib import Path
import pandas as pd

# -------------------------
# Project paths (notebook normally lives inside /notebooks)
# -------------------------
def find_project_root(start: Path, marker: str = "data_processed") -> Path:
    """Locate the project root by looking for a ``marker`` directory near ``start``.

    Candidates are tried in this order:
      1. ``start.parent``  (the layout where the notebook runs from /notebooks),
      2. ``start`` itself  (kernel started from the project root),
      3. the remaining ancestors of ``start``.

    Parameters
    ----------
    start : Path
        Directory to search from (typically the current working directory).
    marker : str
        Name of a directory that must exist directly under the project root.

    Returns
    -------
    Path
        The first candidate containing ``marker``. If none does, ``start.parent``
        is returned, which is what the notebook assumed before this lookup existed.
    """
    start = start.resolve()
    candidates = [start.parent, start, *start.parent.parents]
    for candidate in candidates:
        if (candidate / marker).is_dir():
            return candidate
    # Nothing matched: keep the historical "one level above cwd" assumption so the
    # existence check printed below still reports the problem.
    return start.parent


PROJECT_ROOT = find_project_root(Path.cwd())
DATA_PROCESSED = PROJECT_ROOT / "data_processed"

print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_PROCESSED exists:", DATA_PROCESSED.exists())


PROJECT_ROOT: C:\Users\tozes\Documents\IronHack\pl_21-25_analysis
DATA_PROCESSED exists: True


In [2]:
# -------------------------
# Run configuration
# -------------------------

# Suffix that replaces "_v1" in the output file names, so cleaned files are
# written next to the inputs instead of on top of them.
OUTPUT_SUFFIX = "_v2"

# Processed match lists to clean, one per season (rename here if yours differ).
FILES = [
    "pl_matchlist_21-22_v1.csv",
    "pl_matchlist_22-23_v1.csv",
    "pl_matchlist_23-24_v1.csv",
    "pl_matchlist_24-25_v1.csv",
]

# Columns left untouched by the numeric conversion (kept as text).
TEXT_COLS = {"Season", "Team", "Match"}

# Columns expected to hold whole numbers; they are stored with a nullable
# integer dtype when no fractional values are present.
INT_COLS = {
    # goals
    "Goals", "Goals Conceded", "GD",
    # passes
    "Passes in Opposition Half", "Passes into Box",
    # shots
    "Shots", "Shots Faced",
    # recoveries
    "High Recoveries", "High Recoveries Against",
    # other counts
    "Crosses", "Corners", "Fouls", "Throw-Ins into the Box",
}

def read_csv_robust(path: Path) -> pd.DataFrame:
    """
    Read a CSV that is either comma- or semicolon-separated.

    The file is first read with ``sep=","``. The semicolon reader is used instead when:
    - the comma read raises a parser error, or
    - the comma read "succeeds" but yields a single column whose header contains
      ``;`` (a semicolon-separated file read with the wrong delimiter does not
      necessarily raise; it can come back as one wide text column).

    Parameters
    ----------
    path : Path
        Location of the CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed table.

    Raises
    ------
    Whatever ``pd.read_csv`` raises when neither delimiter can be used
    (e.g. missing file, empty file).
    """
    try:
        frame = pd.read_csv(path, sep=",")
    except pd.errors.ParserError:
        return pd.read_csv(path, sep=";")

    # Wrong-delimiter symptom: everything collapsed into one column "a;b;c".
    if frame.shape[1] == 1 and ";" in str(frame.columns[0]):
        return pd.read_csv(path, sep=";")
    return frame


In [4]:
def parse_match_dates(raw: pd.Series) -> pd.Series:
    """Parse a column of date values into datetimes (NaT where nothing matches).

    Up to three passes are made; each pass only fills values the previous ones
    left unparsed:
      1. explicit ISO format ``YYYY-MM-DD``,
      2. pandas' general parser with ``dayfirst=True``,
      3. pandas' general parser with ``dayfirst=False``.

    Matching ISO strings with an explicit format first means the day-first
    heuristic is never applied to dates that are already normalized.

    Parameters
    ----------
    raw : pd.Series
        The unparsed date column.

    Returns
    -------
    pd.Series
        Parsed datetimes, aligned on the index of ``raw``.
    """
    parsed = pd.to_datetime(raw, format="%Y-%m-%d", errors="coerce")
    for dayfirst in (True, False):
        pending = parsed.isna() & raw.notna()
        if not pending.any():
            break
        # Blank out already-parsed rows so only the leftovers are re-parsed.
        retry = pd.to_datetime(raw.where(pending), dayfirst=dayfirst, errors="coerce")
        parsed = parsed.fillna(retry)
    return parsed


def coerce_numeric_columns(frame: pd.DataFrame, text_cols, int_cols):
    """Convert every non-text, non-Date column to numeric on a copy of ``frame``.

    Parameters
    ----------
    frame : pd.DataFrame
        Table to convert; it is not modified.
    text_cols : collection of str
        Column names to leave as they are (``"Date"`` is always skipped too).
    int_cols : collection of str
        Columns expected to be whole numbers. They are cast to the nullable
        ``Int64`` dtype only when no non-missing value has a fractional part.

    Returns
    -------
    tuple
        ``(converted, nan_added_by_col, int_cols_with_decimals)`` where
        ``nan_added_by_col`` maps column name -> number of values that became
        missing through coercion, and ``int_cols_with_decimals`` lists the
        ``int_cols`` members that were left as float because decimals were found.
    """
    converted = frame.copy()
    nan_added = {}
    with_decimals = []

    for col in converted.columns:
        if col in text_cols or col == "Date":
            continue

        missing_before = int(converted[col].isna().sum())
        converted[col] = pd.to_numeric(converted[col], errors="coerce")
        added = int(converted[col].isna().sum()) - missing_before
        if added > 0:
            nan_added[col] = added

        if col in int_cols:
            present = converted[col].dropna()
            # A non-zero remainder mod 1 means a fractional value such as 1.5.
            if len(present) and ((present % 1) != 0).any():
                with_decimals.append(col)
            else:
                converted[col] = converted[col].round(0).astype("Int64")

    return converted, nan_added, with_decimals


results = []

for fname in FILES:
    fpath = DATA_PROCESSED / fname
    if not fpath.exists():
        raise FileNotFoundError(f"Missing file: {fpath}")

    # Work out the destination first: if the name has no "_v1.csv" token the
    # replace() is a no-op and saving would overwrite the input file.
    out_name = fname.replace("_v1.csv", f"{OUTPUT_SUFFIX}.csv")
    out_path = DATA_PROCESSED / out_name
    if out_path == fpath:
        raise ValueError(
            f"Output name equals input name for {fname}; refusing to overwrite the source file"
        )

    df = read_csv_robust(fpath)

    # -------------------------
    # 1) Normalize Date column to YYYY-MM-DD
    # -------------------------
    if "Date" not in df.columns:
        raise ValueError(f"'Date' column not found in {fname}")

    date_na_before = df["Date"].isna().sum()
    dt = parse_match_dates(df["Date"])
    date_na_after = dt.isna().sum()

    df["Date"] = dt.dt.strftime("%Y-%m-%d")

    # -------------------------
    # 2) Convert numeric columns (avoid numeric columns being read as text)
    # -------------------------
    df, nan_added_by_col, int_cols_with_decimals = coerce_numeric_columns(
        df, TEXT_COLS, INT_COLS
    )

    # -------------------------
    # 3) Quick per-file QA summary
    # -------------------------
    results.append({
        "file": fname,
        "rows": len(df),
        "cols": df.shape[1],
        "date_na_before": int(date_na_before),
        "date_na_after_parse": int(date_na_after),
        "numeric_cols_with_new_nans": len(nan_added_by_col),
        "new_nans_total": int(sum(nan_added_by_col.values())),
        "int_cols_with_decimals": ", ".join(sorted(set(int_cols_with_decimals))),
    })

    # Save output
    df.to_csv(out_path, index=False, sep=",")

    print(f"Saved: {out_name}")
    if nan_added_by_col:
        print("  New NaNs introduced by numeric coercion (top 10):")
        # Largest counts first, so the list really is the top 10.
        worst_first = sorted(nan_added_by_col.items(), key=lambda kv: kv[1], reverse=True)
        for k, added in worst_first[:10]:
            print(f"   - {k}: +{added}")
    if int_cols_with_decimals:
        print("  ⚠️ Columns expected as INT but decimals were found (kept as float):")
        print("   - " + ", ".join(sorted(set(int_cols_with_decimals))))
    print("-" * 50)

qa = pd.DataFrame(results)
qa


  d1 = pd.to_datetime(df["Date"], dayfirst=True, errors="coerce")
  d1 = pd.to_datetime(df["Date"], dayfirst=True, errors="coerce")


Saved: pl_matchlist_21-22_v2.csv
--------------------------------------------------
Saved: pl_matchlist_22-23_v2.csv
--------------------------------------------------
Saved: pl_matchlist_23-24_v2.csv
--------------------------------------------------
Saved: pl_matchlist_24-25_v2.csv
--------------------------------------------------


  d1 = pd.to_datetime(df["Date"], dayfirst=True, errors="coerce")


Unnamed: 0,file,rows,cols,date_na_before,date_na_after_parse,numeric_cols_with_new_nans,new_nans_total,int_cols_with_decimals
0,pl_matchlist_21-22_v1.csv,760,44,0,0,0,0,
1,pl_matchlist_22-23_v1.csv,760,44,0,0,0,0,
2,pl_matchlist_23-24_v1.csv,760,44,0,0,0,0,
3,pl_matchlist_24-25_v1.csv,760,44,0,0,0,0,
