In [14]:
import os
import re
from getpass import getpass
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text

In [15]:
# Connection settings are read from the environment so that no credential is
# stored in the notebook itself (or in its version history).
#   FUTBOL_DW_USER      defaults to "root"
#   FUTBOL_DW_PASSWORD  no default; prompted for when unset
#   FUTBOL_DW_HOST      defaults to "localhost"
#   FUTBOL_DW_DB        defaults to "futbol_dw"
user = os.environ.get("FUTBOL_DW_USER", "root")
host = os.environ.get("FUTBOL_DW_HOST", "localhost")
database = os.environ.get("FUTBOL_DW_DB", "futbol_dw")

password = os.environ.get("FUTBOL_DW_PASSWORD")
if password is None:
    # Interactive fallback: the prompt does not echo the typed value.
    # For unattended "Restart & Run All" runs, set FUTBOL_DW_PASSWORD instead.
    password = getpass(f"MySQL password for {user}@{host}: ")

# quote_plus escapes characters such as '@', ':' or '/' in the credentials,
# which would otherwise be misread as URL delimiters.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}@{host}/{database}"
)

In [16]:
import numpy as np
import pandas as pd

# ============================================================
# 0. Constantes / helpers generales
# ============================================================

# League names used to restrict the player pool to the "big five" leagues;
# they are compared against the `league` column of the player data.
BIG5_LEAGUES = [
    "Laliga",
    "Premier_League",
    "Serie_A",
    "Bundesliga",
    "Ligue_1",
]

# Simple role -> position-code map (codes as found in position_clean of
# silver_players). Deliberately coarse: refine it later if more specific
# roles are needed.
POSITION_GROUPS = dict(
    portero=["GK"],
    defensa=["DF"],
    mediocampista=["MF"],
    delantero=["FW"],
)

def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity between ``a`` and ``b`` as a Python float.

    Both inputs are converted to float arrays first. When either vector has a
    Euclidean norm of exactly zero the similarity is defined as 0.0 instead of
    dividing by zero.
    """
    vec_a, vec_b = (np.asarray(v, dtype=float) for v in (a, b))
    norm_a, norm_b = np.linalg.norm(vec_a), np.linalg.norm(vec_b)

    # A zero-length vector has no direction, so report "no similarity".
    if not norm_a or not norm_b:
        return 0.0

    return float(np.dot(vec_a, vec_b) / (norm_a * norm_b))

def standardize(s: pd.Series) -> pd.Series:
    """Z-score a series, returning all zeros when the spread is unusable.

    Values are coerced to numeric (unparseable entries become NaN). If the
    standard deviation is 0 or NaN, a series of 0.0 with the same index is
    returned; otherwise ``(value - mean) / std`` is returned, with NaN inputs
    staying NaN.
    """
    values = pd.to_numeric(s, errors="coerce")
    spread = values.std()

    # No variation (or not enough data to measure it): everything is "average".
    if np.isnan(spread) or spread == 0:
        return pd.Series(0.0, index=values.index)

    return (values - values.mean()) / spread

# ============================================================
# 1. Perfil del equipo (silver_teams -> vector de estilo)
# ============================================================

def get_team_profile(engine, team_name: str, season: str) -> tuple[dict, np.ndarray]:
    """
    Load the style profile of one team for one season from silver_teams.

    The five ``idx_*_style`` columns are averaged per (team_name, league,
    season) group. The first group returned by the database is used; if the
    team has rows under several leagues for that season, the others are
    ignored.

    Returns a pair ``(row, x_team)`` where ``row`` is that result row as a dict
    and ``x_team`` is a float vector ordered as
    [possession, verticality, offensive, defensive, aggressiveness], with
    missing values replaced by 0.0.

    Raises ValueError when the query returns no rows.
    """
    query = """
        SELECT
            team_name,
            league,
            season,
            AVG(idx_possession_style)    AS pos_style,
            AVG(idx_verticality_style)   AS vert_style,
            AVG(idx_offensive_style)     AS off_style,
            AVG(idx_defensive_style)     AS def_style,
            AVG(idx_aggressiveness_style) AS aggr_style
        FROM silver_teams
        WHERE team_name = %(team_name)s
          AND season = %(season)s
        GROUP BY team_name, league, season;
    """
    # Values are bound by the driver through the named placeholders above.
    team_df = pd.read_sql(query, engine, params={"team_name": team_name, "season": season})
    if team_df.empty:
        raise ValueError(f"No se encontró perfil de equipo para team_name={team_name}, season={season}")

    row = team_df.iloc[0].to_dict()

    # Order here defines the axis order of the team style space.
    style_keys = ("pos_style", "vert_style", "off_style", "def_style", "aggr_style")
    raw_vector = np.array([row.get(key, np.nan) for key in style_keys], dtype=float)

    return row, np.nan_to_num(raw_vector, nan=0.0)

# ============================================================
# 2. Pool de jugadores (silver_players -> 1 fila por jugador-equipo-temporada)
# ============================================================

def get_players_pool(
    engine,
    season: str,
    leagues: list[str] | None = None,
    leagues_big5: bool = True,
    min_minutes: int = 600,
) -> pd.DataFrame:
    """
    Agrega silver_players a nivel jugador-equipo-temporada-posicion.
    Calcula minutos totales, índices promedio e intenta sacar edad promedio.
    """
    query = """
        SELECT
            player_name_clean         AS player_name,
            team_name,
            league,
            season,
            position_clean            AS position,
            SUM(minutes_played)       AS minutes_total,
            AVG(idx_finishing)        AS finishing,
            AVG(idx_playmaking)       AS playmaking,
            AVG(idx_progression)      AS progression,
            AVG(idx_involvement)      AS involvement,
            AVG(idx_defending)        AS defending,
            AVG(idx_discipline)       AS discipline,
            AVG(
                CASE
                    WHEN player_age IS NULL THEN NULL
                    ELSE CAST(SUBSTRING_INDEX(player_age, '-', 1) AS SIGNED)
                END
            ) AS age_years
        FROM silver_players
        WHERE season = %(season)s
        GROUP BY
            player_name_clean, team_name, league, season, position_clean
        HAVING minutes_total >= %(min_minutes)s;
    """

    df = pd.read_sql(query, engine, params={"season": season, "min_minutes": min_minutes})

    # Filtrar por ligas
    if leagues_big5:
        df = df[df["league"].isin(BIG5_LEAGUES)]
    elif leagues is not None:
        df = df[df["league"].isin(leagues)]

    df = df.reset_index(drop=True)

    # Asegurar tipos numéricos
    for col in ["minutes_total", "finishing", "playmaking", "progression",
                "involvement", "defending", "discipline", "age_years"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    return df

# ============================================================
# 3. Proyección jugador -> espacio de estilo del equipo
#    x_player_team = [posesión, verticalidad, ofensivo, defensivo, agresividad]
# ============================================================

def project_players_to_team_space(df_players: pd.DataFrame) -> pd.DataFrame:
    """
    Add five ``style_*`` columns that place each player in the team style space.

    The input frame is not modified; a copy with the extra columns is returned.
    Mapping from player indices to team-space axes:
      - style_pos  : row mean of involvement and playmaking (NaN skipped)
      - style_vert : progression
      - style_off  : row mean of finishing and playmaking (NaN skipped)
      - style_def  : defending
      - style_aggr : defending again (stand-in for intensity/duels)
    Missing values in the five new columns are replaced by 0.0; the source
    index columns are left untouched.
    """
    projected = df_players.assign(
        style_pos=df_players[["involvement", "playmaking"]].mean(axis=1, skipna=True),
        style_vert=df_players["progression"],
        style_off=df_players[["finishing", "playmaking"]].mean(axis=1, skipna=True),
        style_def=df_players["defending"],
        # Same source as style_def; could be blended with discipline later.
        style_aggr=df_players["defending"],
    )

    for style_col in ("style_pos", "style_vert", "style_off", "style_def", "style_aggr"):
        projected[style_col] = projected[style_col].fillna(0.0)

    return projected

# ============================================================
# 4. Construir vector objetivo del equipo (real + entrenador)
# ============================================================

def build_team_target_vector(
    x_team_real: np.ndarray,
    x_coach: np.ndarray | None = None,
    lam: float = 0.5,
) -> np.ndarray:
    """
    x_team_target = mezcla entre estilo real del equipo y preferencias del entrenador.
    lam = 0   -> solo estilo real
    lam = 1   -> solo preferencias del entrenador
    """
    x_team_real = np.asarray(x_team_real, dtype=float)
    if x_coach is None:
        return x_team_real

    x_coach = np.asarray(x_coach, dtype=float)
    if x_coach.shape != x_team_real.shape:
        raise ValueError(f"x_coach debe tener shape {x_team_real.shape} y recibí {x_coach.shape}")

    return (1 - lam) * x_team_real + lam * x_coach

# ============================================================
# 5. Recomendador principal
# ============================================================

def recommend_players_for_team(
    engine,
    team_name: str,
    season: str,
    x_ideal_role: list[float] | np.ndarray,
    *,
    leagues: list[str] | None = None,
    leagues_big5: bool = True,
    min_minutes: int = 600,
    role: str | None = None,
    position_whitelist: list[str] | None = None,
    x_coach: list[float] | np.ndarray | None = None,
    lam_coach: float = 0.5,
    alpha: float = 0.4,
    beta: float = 0.3,
    gamma: float = 0.2,
    delta: float = 0.1,
    top_n: int = 30,
) -> pd.DataFrame:
    """
    Rank candidate players for a team and return the ``top_n`` best scores.

    Steps:
      1. Load the team's style vector (get_team_profile).
      2. Blend it with the coach's preferences into a target vector
         (build_team_target_vector, weight ``lam_coach``).
      3. Needs vector: ``need = x_ideal_role - x_team_target``.
      4. Load the player pool (get_players_pool) and filter by position.
      5. Project players into the team style space.
      6. Per player compute
           style_fit  = cos(x_player, x_team_target)
           needs_fit  = need . x_player
           age_potential (younger = higher, min-max scaled within the pool)
           discipline (sign-flipped discipline index)
      7. Z-score each component within the pool and combine:
           score = alpha*style_fit_z + beta*needs_fit_z
                   + gamma*age_potential_z + delta*discipline_z

    All style vectors use the axis order
    [possession, verticality, offensive, defensive, aggressiveness].

    Parameters
    ----------
    engine : database connection passed through to the query helpers.
    team_name, season : identify the team profile; ``season`` also selects the
        player pool.
    x_ideal_role : target style for the role being scouted; must have the same
        shape as the team vector.
    leagues, leagues_big5, min_minutes : forwarded to get_players_pool.
    role : key of POSITION_GROUPS selecting allowed positions. Ignored when
        ``position_whitelist`` is given.
    position_whitelist : explicit list of allowed position codes.
    x_coach, lam_coach : coach preference vector and its blend weight.
    alpha, beta, gamma, delta : weights of the four score components.
    top_n : number of rows returned.

    Returns
    -------
    DataFrame sorted by ``score`` (descending) with identification columns,
    total minutes, the raw style/needs fits, age, discipline and score.

    Raises
    ------
    ValueError
        If ``x_ideal_role`` has the wrong shape, ``role`` is not a known key of
        POSITION_GROUPS, or the pool is empty before or after the position
        filter. Errors from the helper functions propagate unchanged.
    """

    # -----------------
    # 5.1 Team profile
    # -----------------
    # Only the vector is needed here; the descriptive row is discarded.
    _, x_team_real = get_team_profile(engine, team_name, season)

    x_ideal_role = np.asarray(x_ideal_role, dtype=float)
    if x_ideal_role.shape != x_team_real.shape:
        raise ValueError(
            f"x_ideal_role debe tener shape {x_team_real.shape}, recibí {x_ideal_role.shape}"
        )

    # Blend the observed style with the coach's preferences.
    x_team_target = np.asarray(
        build_team_target_vector(
            x_team_real=x_team_real,
            x_coach=np.asarray(x_coach, dtype=float) if x_coach is not None else None,
            lam=lam_coach,
        ),
        dtype=float,
    )

    # What the team lacks relative to the ideal for this role.
    need_vec = x_ideal_role - x_team_target

    # -----------------
    # 5.2 Player pool
    # -----------------
    df_players = get_players_pool(
        engine=engine,
        season=season,
        leagues=leagues,
        leagues_big5=leagues_big5,
        min_minutes=min_minutes,
    )

    if df_players.empty:
        raise ValueError("No se encontraron jugadores en el pool con esos filtros iniciales.")

    # Position filter: an explicit whitelist takes precedence over `role`.
    if position_whitelist is not None:
        allowed_positions = position_whitelist
    elif role is not None:
        # An unknown role used to be ignored silently, returning an unfiltered
        # ranking; fail loudly so a typo cannot go unnoticed.
        if role not in POSITION_GROUPS:
            raise ValueError(
                f"role desconocido: {role!r}. Valores válidos: {sorted(POSITION_GROUPS)}"
            )
        allowed_positions = POSITION_GROUPS[role]
    else:
        allowed_positions = None

    if allowed_positions is not None and "position" in df_players.columns:
        df_players = df_players[df_players["position"].isin(allowed_positions)].copy()

    if df_players.empty:
        raise ValueError("No hay jugadores en el pool después de filtrar por posición/rol.")

    # -----------------
    # 5.3 Projection into the team style space
    # -----------------
    df_players = project_players_to_team_space(df_players)
    style_cols = ["style_pos", "style_vert", "style_off", "style_def", "style_aggr"]

    # -----------------
    # 5.4 style_fit and needs_fit
    # -----------------
    # Work on one (n_players, 5) array instead of two row-wise DataFrame.apply
    # passes.
    player_vecs = df_players[style_cols].to_numpy(dtype=float)
    df_players["style_fit_raw"] = [cosine_sim(vec, x_team_target) for vec in player_vecs]
    df_players["needs_fit_raw"] = player_vecs @ need_vec

    # -----------------
    # 5.5 Age potential and discipline
    # -----------------
    # Age: younger = better, min-max scaled to [0, 1] within the pool.
    # Players with unknown age stay NaN here and are handled in 5.6.
    df_players["age_potential_raw"] = 0.0
    if "age_years" in df_players.columns:
        age = pd.to_numeric(df_players["age_years"], errors="coerce")
        age_min, age_max = age.min(), age.max()
        if np.isfinite(age_min) and np.isfinite(age_max) and age_max > age_min:
            df_players["age_potential_raw"] = (age_max - age) / (age_max - age_min)

    # Discipline: per the original author's note, a high idx_discipline means
    # more cards/fouls, so the sign is flipped to make "cleaner" score higher.
    df_players["discipline_raw"] = -pd.to_numeric(
        df_players["discipline"], errors="coerce"
    ).fillna(0.0)

    # -----------------
    # 5.6 Standardise components and compute the total score
    # -----------------
    # A missing component (e.g. unknown age) is set to 0.0 after z-scoring,
    # i.e. the pool average. Otherwise a single NaN would turn the whole score
    # into NaN and push the player to the bottom of the ranking.
    for component in ("style_fit", "needs_fit", "age_potential", "discipline"):
        df_players[f"{component}_z"] = standardize(df_players[f"{component}_raw"]).fillna(0.0)

    df_players["score"] = (
        alpha * df_players["style_fit_z"]
        + beta * df_players["needs_fit_z"]
        + gamma * df_players["age_potential_z"]
        + delta * df_players["discipline_z"]
    )

    # -----------------
    # 5.7 Sort and return top_n
    # -----------------
    df_players = df_players.sort_values("score", ascending=False).reset_index(drop=True)

    cols_out = [
        "player_name", "team_name", "league", "position",
        "minutes_total",
        "style_fit_raw", "needs_fit_raw",
        "age_years",
        "discipline",
        "score",
    ]
    cols_out = [c for c in cols_out if c in df_players.columns]

    return df_players[cols_out].head(top_n)


In [18]:
# Example run: forwards for Barcelona, season 2024-2025.
# Every style vector is ordered as
# [possession, verticality, offensive, defensive, aggressiveness].

# Ideal style for the role being scouted (forward).
x_ideal_role = [0.5, 0.8, 1.0, 0.0, 0.3]

# Coach's preferences for the team (sliders in the dashboard).
x_coach = [0.6, 0.7, 0.9, 0.3, 0.4]

recs = recommend_players_for_team(
    engine=engine,
    # --- who we are recruiting for ---
    team_name="Barcelona",  # must match silver_teams.team_name
    season="2024-2025",
    # --- candidate pool ---
    leagues_big5=True,
    min_minutes=600,
    role="delantero",  # key of POSITION_GROUPS
    # --- target style ---
    x_ideal_role=x_ideal_role,
    x_coach=x_coach,
    lam_coach=0.5,  # 50% observed team style, 50% coach preferences
    # --- score weights: style fit, needs fit, age potential, discipline ---
    alpha=0.4,
    beta=0.3,
    gamma=0.2,
    delta=0.1,
    top_n=30,
)

recs.head(10)

Unnamed: 0,player_name,team_name,league,position,minutes_total,style_fit_raw,needs_fit_raw,age_years,discipline,score
0,tomáš čvančara,Borussia_Monchengladbach,Bundesliga,FW,831.0,-0.074104,0.025525,24.0,-1.345573,1.139939
1,mathys tel,Tottenham,Premier_League,FW,1121.0,0.732835,0.037548,19.3889,-0.045561,1.065293
2,nelson weiper,Mainz_05,Bundesliga,FW,704.0,0.935077,-0.040574,19.3478,0.001718,0.844905
3,yankuba minteh,Brighton,Premier_League,FW,1838.0,0.9128,-0.022079,20.0,0.049613,0.841117
4,adam hložek,Hoffenheim,Bundesliga,FW,2392.0,0.381392,0.063546,22.0,0.088524,0.75175
5,ethan nwaneri,Arsenal,Premier_League,FW,895.0,0.95019,-0.089552,17.2692,0.048743,0.740227
6,wilson odobert,Tottenham,Premier_League,FW,990.0,0.687771,-0.00335,19.8,0.15375,0.734327
7,armindo sieb,Mainz_05,Bundesliga,FW,762.0,0.581896,0.013784,21.2222,0.057844,0.728327
8,augustine boakye,Saint_Etienne,Ligue_1,FW,733.0,0.81535,-0.034946,23.6471,-0.239955,0.727668
9,alberto moleiro,Las Palmas,Laliga,FW,2714.0,0.750545,-0.022491,20.8,0.073743,0.692395
