In [11]:
import pandas as pd
# Relative path to the processed 2024 player-stats parquet file.
file_path = '../data/processed/player_stats_Primera_Division_Argentina_2024.parquet'
# Load the processed table into a DataFrame; later cells read it via this name.
df_final_limpio = pd.read_parquet(file_path)


In [12]:
import numpy as np

# ⚠️ USE THE CORRECT DATAFRAME
df = df_final_limpio  # <- the frame just read from 'processed' (an alias to the same object, not a copy)

# Columns the processed player-stats table is expected to contain.
# quality_report() reports any that are missing and prints dtypes in this order.
EXPECTED_COLS = [
    # descriptive fields (quality_report treats these as text)
    "Player",
    "Nation",
    "Pos",
    "Squad",
    "Born",
    # stat columns (quality_report coerces these to numeric)
    "MatchesPlayed",
    "Gls",
    "Ast",
    "xG",
    "xAG",
    "Shots",
    "SoT",
    "PassCmp",
    "PassAtt",
    "PassCmpPct",
    "Tkl",
    "TklW",
    "Blocks",
    "Int",
]

def parse_age_like_fbref(s):
    """
    Convert an FBref-style age 'YY-DDD' (years-days) to decimal years.

    Parameters
    ----------
    s : object
        Either a 'YY-DDD' string, something whose text form is a plain
        number (int, float, numeric string), or a missing value
        (None / float NaN).

    Returns
    -------
    float
        - plain numbers are returned as ``float(s)`` (including negative
          values and scientific notation such as '1e-05');
        - 'YY-DDD' is returned as ``YY + DDD/365`` rounded to 2 decimals;
          non-digit characters in the days part are ignored and an empty
          days part counts as 0;
        - anything else (missing or unparseable) yields ``np.nan``.
    """
    if s is None or (isinstance(s, float) and np.isnan(s)):
        return np.nan
    s = str(s)

    # Plain-number parse goes first: otherwise numeric text that happens to
    # contain '-' (a sign or an exponent) would be mistaken for 'YY-DDD'
    # and rejected. A genuine 'YY-DDD' string is never accepted by float().
    try:
        return float(s)
    except ValueError:
        pass

    if '-' not in s:
        return np.nan

    years_txt, days_txt = s.split('-', 1)
    try:
        years = int(years_txt)
        days = int(''.join(ch for ch in days_txt if ch.isdigit()) or 0)
        return round(years + days / 365, 2)
    except (ValueError, OverflowError):
        # Only parsing/arithmetic failures are treated as "not an age";
        # anything else (e.g. KeyboardInterrupt) propagates.
        return np.nan

def quality_report(df):
    """
    Print a data-quality report for a player-stats table and return a checked copy.

    The report covers: shape, expected columns that are missing, duplicated
    rows on (Player, Squad) (or Player alone if Squad is absent), the ten
    columns with the highest share of nulls, consistency rules
    (SoT <= Shots, PassCmp <= PassAtt, 0 <= PassCmpPct <= 100),
    negative values in count columns, and the dtypes of the expected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect. It is NOT modified; all changes are made on a copy.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the expected non-text columns have been
        coerced to numeric (unparseable values become NaN) and, when an
        ``Age`` column exists and ``AgeYears`` does not, an ``AgeYears``
        column has been added via ``parse_age_like_fbref``.
    """
    # Work on a copy so the caller's frame is left untouched and re-running
    # the cell always starts from the same input.
    df = df.copy()

    print("—"*60)
    print("Shape:", df.shape)
    print("Columnas presentes:", len(df.columns))
    missing = [c for c in EXPECTED_COLS if c not in df.columns]
    print("Faltantes (si hay):", missing)

    # Key: a player can appear for more than one club, so duplicates are
    # checked on (Player, Squad) when both columns exist.
    key = ["Player","Squad"] if {"Player","Squad"}.issubset(df.columns) else ["Player"]
    dup = df.duplicated(subset=key).sum()
    print(f"Duplicados en {key}:", dup)

    # Nulls (top 10), as a percentage of rows
    print("\n% Nulos (top 10):")
    print((df.isna().mean().sort_values(ascending=False).head(10)*100).round(2))

    # Expected types: everything in EXPECTED_COLS that is not a text column
    # is coerced to numeric.
    text_cols = {"Player","Nation","Pos","Squad","Born"}
    numeric_cols = [c for c in EXPECTED_COLS if c not in text_cols and c in df.columns]
    for c in numeric_cols:
        nulls_before = int(df[c].isna().sum())
        df[c] = pd.to_numeric(df[c], errors="coerce")
        coerced = int(df[c].isna().sum()) - nulls_before
        if coerced > 0:
            # The null report above was computed before coercion, so surface
            # values that only became NaN here instead of hiding them.
            print(f"Aviso: {coerced} valores no numéricos en '{c}' convertidos a NaN")

    # Consistency rules
    if {"SoT","Shots"}.issubset(df.columns):
        bad = (df["SoT"] > df["Shots"]).sum()
        print(f"\nChequeo SoT <= Shots -> violaciones: {bad}")

    if {"PassCmp","PassAtt"}.issubset(df.columns):
        bad = (df["PassCmp"] > df["PassAtt"]).sum()
        print(f"Chequeo PassCmp <= PassAtt -> violaciones: {bad}")
    if "PassCmpPct" in df.columns:
        bad_rng = ((df["PassCmpPct"] < 0) | (df["PassCmpPct"] > 100)).sum()
        print(f"Chequeo 0 <= PassCmpPct <= 100 -> violaciones: {bad_rng}")

    # If Age exists, add AgeYears (decimal years)
    if "Age" in df.columns and "AgeYears" not in df.columns:
        df["AgeYears"] = df["Age"].map(parse_age_like_fbref)
        print("\nAgeYears creado a partir de Age (formato FBref).")

    # Basic non-negativity checks on count columns
    nonneg_cols = [c for c in ["Gls","Ast","Shots","SoT","PassCmp","PassAtt","Tkl","TklW","Blocks","Int"] if c in df.columns]
    if nonneg_cols:
        negs = {c: int((df[c] < 0).sum()) for c in nonneg_cols}
        print("\nNo-negativos (violaciones por col):", negs)

    # Dtypes of the expected columns only (those present)
    print("\nDtypes:")
    show_cols = [c for c in EXPECTED_COLS if c in df.columns]
    print(df[show_cols].dtypes)
    print("—"*60)
    return df

# Run the quality report; df_checked is the frame it returns.
df_checked = quality_report(df)

# (Re)compute PassCmpPct from the raw counts as a safety measure
if {"PassCmp","PassAtt"}.issubset(df_checked.columns):
    # Zero attempts are turned into NaN so the division cannot produce inf;
    # fillna(0) then stores 0.0 for those rows (and for rows with a missing PassCmp).
    denom = df_checked["PassAtt"].replace(0, np.nan)
    df_checked["PassCmpPct"] = (df_checked["PassCmp"] / denom * 100).fillna(0).round(2)

final_parquet = '../data/processed/player_stats_Primera_Division_Argentina_2024.clean.parquet'
final_csv     = '../data/processed/player_stats_Primera_Division_Argentina_2024.clean.csv'

# Write the checked table in both formats, without the index column.
df_checked.to_parquet(final_parquet, index=False)
df_checked.to_csv(final_csv, index=False)

print("Archivos generados:")
print(" -", final_parquet)
print(" -", final_csv)

# Preview goes last: Jupyter only renders the final expression of a cell, and
# placing it here shows the data exactly as it was exported.
df_checked.head(10)


————————————————————————————————————————————————————————————
Shape: (925, 20)
Columnas presentes: 20
Faltantes (si hay): []
Duplicados en ['Player', 'Squad']: 0

% Nulos (top 10):
Player           0.0
Nation           0.0
Pos              0.0
Squad            0.0
Age              0.0
Born             0.0
MatchesPlayed    0.0
Gls              0.0
Ast              0.0
xG               0.0
dtype: float64

Chequeo SoT <= Shots -> violaciones: 0
Chequeo PassCmp <= PassAtt -> violaciones: 0
Chequeo 0 <= PassCmpPct <= 100 -> violaciones: 0

AgeYears creado a partir de Age (formato FBref).

No-negativos (violaciones por col): {'Gls': 0, 'Ast': 0, 'Shots': 0, 'SoT': 0, 'PassCmp': 0, 'PassAtt': 0, 'Tkl': 0, 'TklW': 0, 'Blocks': 0, 'Int': 0}

Dtypes:
Player           string[python]
Nation           string[python]
Pos              string[python]
Squad            string[python]
Born             string[python]
MatchesPlayed           float64
Gls                       int64
Ast                       

In [None]:
import json

# Map every column of the checked table to the string form of its dtype.
schema = {}
for col in df_checked.columns:
    schema[col] = str(df_checked[col].dtype)

# Store the mapping as pretty-printed UTF-8 JSON in the processed-data folder.
with open('../data/processed/player_stats_schema_2024.json', 'w', encoding='utf-8') as schema_file:
    schema_file.write(json.dumps(schema, ensure_ascii=False, indent=2))

# Last expression of the cell: display the mapping.
schema
