# PGA Tour – Predicción de Top 10 (Clasificación)

**Objetivo:** Entrenar un modelo de Machine Learning que prediga si un jugador termina en **Top 10** (1) o **No Top 10** (0), usando estadísticas del PGA Tour.

**Dataset:** `ASA-All-PGA-Raw-Data-Tourn-Level.csv`

**Target:** Se construye a partir de la columna `Finish`:
- `top_10 = 1` si el puesto final es ≤ 10
- `top_10 = 0` en caso contrario (incluye `CUT`)

In [2]:
import os
import json
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    accuracy_score
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [3]:
DATA_PATH = "data/ASA-All-PGA-Raw-Data-Tourn-Level.csv"

df = pd.read_csv(DATA_PATH)

df = df.loc[:, ~df.columns.astype(str).str.contains(r"^Unnamed")]

print("Shape:", df.shape)
display(df.head())
print("\nColumnas:")
print(df.columns.tolist())

Shape: (36864, 34)


Unnamed: 0,Player_initial_last,tournament id,player id,hole_par,strokes,hole_DKP,hole_FDP,hole_SDP,streak_DKP,streak_FDP,...,purse,season,no_cut,Finish,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total
0,A. Ancer,401353224,9261,288,289,60.0,51.1,56,3,7.6,...,12.0,2022,0,T32,0.2,-0.13,-0.08,0.86,0.65,0.85
1,A. Hadwin,401353224,5548,288,286,72.5,61.5,61,8,13.0,...,12.0,2022,0,T18,0.36,0.75,0.31,0.18,1.24,1.6
2,A. Lahiri,401353224,4989,144,147,21.5,17.4,27,0,0.0,...,12.0,2022,0,CUT,-0.56,0.74,-1.09,0.37,0.02,-0.54
3,A. Long,401353224,6015,144,151,20.5,13.6,17,0,0.4,...,12.0,2022,0,CUT,-1.46,-1.86,-0.02,0.8,-1.08,-2.54
4,A. Noren,401353224,3832,144,148,23.5,18.1,23,0,1.2,...,12.0,2022,0,CUT,0.53,-0.36,-1.39,0.19,-1.56,-1.04



Columnas:
['Player_initial_last', 'tournament id', 'player id', 'hole_par', 'strokes', 'hole_DKP', 'hole_FDP', 'hole_SDP', 'streak_DKP', 'streak_FDP', 'streak_SDP', 'n_rounds', 'made_cut', 'pos', 'finish_DKP', 'finish_FDP', 'finish_SDP', 'total_DKP', 'total_FDP', 'total_SDP', 'player', 'tournament name', 'course', 'date', 'purse', 'season', 'no_cut', 'Finish', 'sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total']


In [4]:
print("Info general:\n")
df.info()

print("\nNulos por columna (solo >0):")
nulls = df.isna().sum()
display(nulls[nulls > 0].sort_values(ascending=False))

print("\nResumen estadístico (numéricas):")
display(df.describe())

Info general:

<class 'pandas.DataFrame'>
RangeIndex: 36864 entries, 0 to 36863
Data columns (total 34 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Player_initial_last  36864 non-null  str    
 1   tournament id        36864 non-null  int64  
 2   player id            36864 non-null  int64  
 3   hole_par             36864 non-null  int64  
 4   strokes              36864 non-null  int64  
 5   hole_DKP             36864 non-null  float64
 6   hole_FDP             36864 non-null  float64
 7   hole_SDP             36864 non-null  int64  
 8   streak_DKP           36864 non-null  int64  
 9   streak_FDP           36864 non-null  float64
 10  streak_SDP           36864 non-null  int64  
 11  n_rounds             36864 non-null  int64  
 12  made_cut             36864 non-null  int64  
 13  pos                  21317 non-null  float64
 14  finish_DKP           36864 non-null  int64  
 15  finish_FDP           36864 non-n

pos         15547
sg_putt      7684
sg_arg       7684
sg_app       7684
sg_ott       7684
sg_t2g       7684
Finish       7683
sg_total     7683
dtype: int64


Resumen estadístico (numéricas):


Unnamed: 0,tournament id,player id,hole_par,strokes,hole_DKP,hole_FDP,hole_SDP,streak_DKP,streak_FDP,streak_SDP,...,total_SDP,purse,season,no_cut,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total
count,36864.0,36864.0,36864.0,36864.0,36864.0,36864.0,36864.0,36864.0,36864.0,36864.0,...,36864.0,36864.0,36864.0,36864.0,29180.0,29180.0,29180.0,29180.0,29180.0,29181.0
mean,233180700.0,79790.41,225.547065,224.114502,50.130249,44.375789,49.323703,1.764052,7.686844,1.683485,...,52.177762,7.529612,2018.539686,0.065294,-0.121005,-0.040744,-0.101759,-0.045896,-0.188346,-0.305491
std,197922100.0,575381.6,70.29559,66.789007,24.025854,24.153313,22.202915,2.846552,7.252924,2.668202,...,25.313295,2.192712,2.21305,0.247047,1.119451,0.727315,1.119763,0.808139,1.640817,1.966669
min,2230.0,5.0,70.0,66.0,-2.5,-21.4,-11.0,0.0,0.0,0.0,...,-11.0,3.0,2015.0,0.0,-5.99,-6.43,-9.25,-7.74,-13.95,-13.67
25%,2696.0,1170.0,143.0,146.0,27.0,22.6,28.0,0.0,0.8,0.0,...,28.0,6.4,2017.0,0.0,-0.77,-0.45,-0.74,-0.45,-1.08,-1.37
50%,401056500.0,3793.0,280.0,272.0,53.5,46.1,55.0,0.0,6.4,0.0,...,56.0,7.1,2019.0,0.0,-0.04,0.0,0.0,0.05,-0.01,-0.16
75%,401219500.0,6151.0,286.0,281.0,69.0,64.0,69.0,3.0,12.4,3.0,...,72.0,8.7,2021.0,0.0,0.63,0.42,0.64,0.48,0.92,1.06
max,401366900.0,4845309.0,292.0,325.0,174.0,134.7,107.0,23.0,43.6,22.0,...,141.0,20.0,2022.0,1.0,4.43,3.17,4.67,2.77,6.3,8.52


In [5]:
df["Finish_clean"] = (
    df["Finish"]
    .astype(str)
    .str.replace("T", "", regex=False)
)

df["Finish_clean"] = pd.to_numeric(df["Finish_clean"], errors="coerce")

df["top_10"] = (df["Finish_clean"] <= 10).astype(int)

print("Balance de clases top_10:")
print(df["top_10"].value_counts())
print(df["top_10"].value_counts(normalize=True))
display(df[["Finish", "Finish_clean", "top_10"]].head(15))

Balance de clases top_10:
top_10
0    34066
1     2798
Name: count, dtype: int64
top_10
0    0.924099
1    0.075901
Name: proportion, dtype: float64


Unnamed: 0,Finish,Finish_clean,top_10
0,T32,32.0,0
1,T18,18.0,0
2,CUT,,0
3,CUT,,0
4,CUT,,0
5,CUT,,0
6,T26,26.0,0
7,T26,26.0,0
8,T67,67.0,0
9,CUT,,0


In [6]:
TARGET = "top_10"
leak_cols = ["Finish", "Finish_clean", "top_10", "winner"]
leak_cols = [c for c in leak_cols if c in df.columns]

X = df.drop(columns=leak_cols)
y = df[TARGET]

print("Leak columns removidas:", leak_cols)
print("X shape:", X.shape)
print("y shape:", y.shape)

Leak columns removidas: ['Finish', 'Finish_clean', 'top_10']
X shape: (36864, 33)
y shape: (36864,)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train:", X_train.shape, "Test:", X_test.shape)
print("\nBalance train:")
print(y_train.value_counts(normalize=True))

Train: (29491, 33) Test: (7373, 33)

Balance train:
top_10
0    0.924112
1    0.075888
Name: proportion, dtype: float64


In [9]:
numeric_features = X_train.select_dtypes(include=[np.number]).columns
categorical_features = X_train.select_dtypes(exclude=[np.number]).columns

print("Numéricas:", len(numeric_features), list(numeric_features)[:10])
print("Categóricas:", len(categorical_features), list(categorical_features)[:10])

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

Numéricas: 28 ['tournament id', 'player id', 'hole_par', 'strokes', 'hole_DKP', 'hole_FDP', 'hole_SDP', 'streak_DKP', 'streak_FDP', 'streak_SDP']
Categóricas: 5 ['Player_initial_last', 'player', 'tournament name', 'course', 'date']


In [10]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=2000, class_weight="balanced"),
    "RandomForest": RandomForestClassifier(random_state=42, class_weight="balanced"),
    "SVC": SVC(probability=True, class_weight="balanced")
}

param_grids = {
    "LogisticRegression": {
        "model__C": [0.1, 1, 10]
    },
    "RandomForest": {
        "model__n_estimators": [200, 300],
        "model__max_depth": [None, 10, 20]
    },
    "SVC": {
        "model__C": [0.5, 1, 5],
        "model__kernel": ["rbf", "linear"]
    }
}

results = []
best_estimators = {}

for name, model in models.items():
    print(f"\n=== Entrenando {name} ===")

    pipe = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", model)
    ])

    grid = GridSearchCV(
        pipe,
        param_grids[name],
        cv=5,
        scoring="f1",
        n_jobs=-1
    )

    grid.fit(X_train, y_train)
    best_estimators[name] = grid.best_estimator_

    y_pred = grid.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    results.append({
        "model": name,
        "best_params": grid.best_params_,
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1
    })

    print("Best params:", grid.best_params_)
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification report:\n", classification_report(y_test, y_pred, zero_division=0))

results_df = pd.DataFrame(results).sort_values(by="f1", ascending=False)
display(results_df)


=== Entrenando LogisticRegression ===
Best params: {'model__C': 10}
Confusion matrix:
 [[6787   26]
 [  11  549]]
Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6813
           1       0.95      0.98      0.97       560

    accuracy                           0.99      7373
   macro avg       0.98      0.99      0.98      7373
weighted avg       1.00      0.99      1.00      7373


=== Entrenando RandomForest ===
Best params: {'model__max_depth': 20, 'model__n_estimators': 200}
Confusion matrix:
 [[6812    1]
 [   8  552]]
Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6813
           1       1.00      0.99      0.99       560

    accuracy                           1.00      7373
   macro avg       1.00      0.99      1.00      7373
weighted avg       1.00      1.00      1.00      7373


=== Entrenando SVC ===
Best params: {

Unnamed: 0,model,best_params,accuracy,precision,recall,f1
1,RandomForest,"{'model__max_depth': 20, 'model__n_estimators'...",0.998779,0.998192,0.985714,0.991914
2,SVC,"{'model__C': 5, 'model__kernel': 'rbf'}",0.996609,0.98725,0.967857,0.977457
0,LogisticRegression,{'model__C': 10},0.994982,0.954783,0.980357,0.967401


In [11]:
best_name = results_df.iloc[0]["model"]
best_model = best_estimators[best_name]

print("MEJOR MODELO:", best_name)

y_pred = best_model.predict(X_test)

print("Confusion matrix final:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report final:\n", classification_report(y_test, y_pred, zero_division=0))

MEJOR MODELO: RandomForest
Confusion matrix final:
 [[6812    1]
 [   8  552]]

Classification report final:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6813
           1       1.00      0.99      0.99       560

    accuracy                           1.00      7373
   macro avg       1.00      0.99      1.00      7373
weighted avg       1.00      1.00      1.00      7373



In [15]:
import os

BASE_DIR = os.getcwd()  # carpeta raíz del proyecto
MODEL_DIR = os.path.join(BASE_DIR, "models")

os.makedirs(MODEL_DIR, exist_ok=True)

MODEL_PATH = os.path.join(MODEL_DIR, "pga_top10_model.joblib")
META_PATH = os.path.join(MODEL_DIR, "meta.json")

joblib.dump(best_model, MODEL_PATH)

with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("Modelo guardado en:", MODEL_PATH)
print("Metadata guardada en:", META_PATH)

Modelo guardado en: /Users/alexgarcia/PycharmProjects/ExporarModelo/models/pga_top10_model.joblib
Metadata guardada en: /Users/alexgarcia/PycharmProjects/ExporarModelo/models/meta.json
