<a href="https://colab.research.google.com/github/Carlscamt/Performance-Evaluation-of-Sales-by-Branch-and-Customer-Type/blob/main/workking_ai_fullmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install pandas numpy xgboost scikit-learn glicko2 tqdm requests playwright beautifulsoup4 unidecode selenium cloudscraper undetected-chromedriver crawl4ai --quiet
!playwright install

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/65.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.4/65.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m94.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [None]:
"""
Tennis ATP Match-Winner Model – v6.0 (Complete GPU Version)
• Jeff Sackmann match data (2010-2024)
• Enhanced with Glicko uncertainty, activity tracking, and confidence weighting
• Strictly pre-match information → no data-leakage
• GPU-accelerated training with CPU fallback
• Expected accuracy ≈ 72-74%, AUC ≈ 0.80-0.82 (the recorded CPU run in this notebook reached 68.1% accuracy, AUC 0.751)
"""

# ╔══ 0. Imports & Configuration ════════════════════════════════════════════╗
import os, sys, warnings, pickle, requests
import pandas as pd, numpy as np
from tqdm import tqdm
import glicko2, xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score
import math
from datetime import datetime, timedelta
from collections import defaultdict
import subprocess
warnings.filterwarnings("ignore")

REPO = "https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/"
DATA = "tennis_atp_data"
MIN_YR = 2010

# ╔══ 1. Helper Functions ════════════════════════════════════════════════════╗
def parse_date(x):
    """Parse a date value into a ``pd.Timestamp``; return ``pd.NaT`` on failure.

    Numeric inputs are truncated to an integer and read as text (so
    ``20200115`` and ``20200115.0`` both become ``"20200115"``).  The text is
    tried against a fixed list of explicit formats, in order, before falling
    back to pandas' own inference with ``errors="coerce"``.

    Args:
        x: A number or string (or missing value) describing a date.

    Returns:
        The parsed timestamp, or ``pd.NaT`` when ``x`` is missing or cannot be
        interpreted as a date.
    """
    if pd.isna(x):
        return pd.NaT

    if isinstance(x, (int, float)):
        try:
            s = str(int(x))
        except (OverflowError, ValueError):
            # Non-finite floats (inf) cannot be converted to an integer.
            return pd.NaT
    else:
        s = str(x)

    for f in ("%Y%m%d", "%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y"):
        try:
            return pd.to_datetime(s, format=f)
        except (ValueError, TypeError):
            # Format mismatch or out-of-range date: try the next format.
            continue
    return pd.to_datetime(s, errors="coerce")

def glicko_win_prob(r1, rd1, r2, rd2):
    """Return the expected score of player 1 against player 2.

    The rating gap is damped by a factor derived from the opponent's rating
    deviation ``rd2`` before being passed through the base-10 logistic curve.

    Args:
        r1: Rating of player 1.
        rd1: Rating deviation of player 1.  Accepted for signature symmetry;
            it does not enter the formula below.
        r2: Rating of player 2.
        rd2: Rating deviation of player 2.

    Returns:
        A probability in (0, 1); 0.5 when both ratings are equal.
    """
    scale = math.log(10) / 400
    # Damping factor: 1.0 when rd2 is 0, shrinking as rd2 grows.
    damping = 1 / math.sqrt(1 + 3 * scale**2 * rd2**2 / (math.pi**2))
    exponent = -damping * (r1 - r2) / 400
    return 1 / (1 + 10**exponent)

def calculate_confidence_weight(rd):
    """Map a rating deviation to a weight that falls as the deviation grows.

    Args:
        rd: Rating deviation.

    Returns:
        ``1 / (1 + rd / 100)`` — 1.0 for ``rd == 0``, 0.5 for ``rd == 100``.
    """
    scaled_rd = rd / 100
    return 1 / (1 + scaled_rd)

def check_gpu_availability():
    """Report whether ``nvidia-smi`` runs successfully on this machine.

    Returns:
        True when the ``nvidia-smi`` command exists and exits with status 0,
        False when it is missing or exits with a non-zero status.  A status
        line is printed in every case.
    """
    try:
        probe = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
    except FileNotFoundError:
        # The executable is not on PATH at all.
        print("⚠️  nvidia-smi not found, using CPU")
        return False

    if probe.returncode != 0:
        print("⚠️  GPU not detected, using CPU")
        return False

    print("✅ GPU detected and available for training")
    return True

# ╔══ 2. Download Jeff Sackmann match CSVs ═══════════════════════════════════╗
def download_csvs():
    """Download ATP match CSV files from Jeff Sackmann's repository.

    If the data directory already holds at least one singles match CSV the
    download is skipped entirely and the existing file names are returned.
    Otherwise every candidate file name (main tour, futures and
    qualifying/challenger, 1968-2024) is requested; responses that are not OK
    are skipped silently and network errors are reported without aborting.

    Returns:
        List of match CSV file names (not full paths) present in ``DATA``.
    """
    def _local_match_csvs():
        # Singles match files only: doubles files share the "matches" stem.
        return [f for f in os.listdir(DATA)
                if f.endswith(".csv") and "matches" in f and "doubles" not in f]

    os.makedirs(DATA, exist_ok=True)
    existing = _local_match_csvs()
    if existing:
        print(f"✅ Found {len(existing)} existing CSV files")
        return existing

    years = range(1968, 2025)
    todo = [f"atp_matches_{y}.csv" for y in years]
    todo += [f"atp_matches_futures_{y}.csv" for y in years]
    todo += [f"atp_matches_qual_chall_{y}.csv" for y in years]

    print("📥 Downloading ATP match data...")
    for fn in tqdm(todo, desc="Downloading match CSVs"):
        url = REPO + fn
        try:
            r = requests.get(url, timeout=30)
            if r.ok:
                # Context manager guarantees the handle is flushed and closed.
                with open(os.path.join(DATA, fn), "wb") as fh:
                    fh.write(r.content)
        except Exception as e:
            print(f"Failed to download {fn}: {e}")

    return _local_match_csvs()

# ╔══ 3. Process matches → pre-match feature frame ═══════════════════════════╗
def _load_match_files(files):
    """Read and clean every match CSV, returning one frame in chronological order."""
    frames = []
    for fn in tqdm(files, desc="Parsing matches"):
        try:
            df = pd.read_csv(os.path.join(DATA, fn), low_memory=False, on_bad_lines="skip")
            df["tourney_date"] = df["tourney_date"].apply(parse_date)
            df = df.dropna(subset=["tourney_date", "winner_id", "loser_id"])
            df = df[df.tourney_date.dt.year >= MIN_YR]

            # Safe conversion to integers
            df["winner_id"] = pd.to_numeric(df["winner_id"], errors='coerce')
            df["loser_id"] = pd.to_numeric(df["loser_id"], errors='coerce')
            df = df.dropna(subset=["winner_id", "loser_id"])
            df = df.astype({"winner_id": int, "loser_id": int})
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole build.
            print(f"Error processing {fn}: {e}")
            continue
        if not df.empty:
            frames.append(df)

    if not frames:
        return pd.DataFrame()

    matches = pd.concat(frames, ignore_index=True, sort=False)

    # Sort on a separate, type-normalised key frame so that mixed dtypes in
    # the raw columns cannot break the sort and no helper columns leak into
    # the match rows.
    order = pd.DataFrame({"date": matches["tourney_date"]})
    if "tourney_id" in matches.columns:
        order["tourney"] = matches["tourney_id"].astype(str)
    if "match_num" in matches.columns:
        order["num"] = pd.to_numeric(matches["match_num"], errors="coerce")
    order = order.sort_values(list(order.columns), kind="stable")
    return matches.loc[order.index]


def build_feature_frame(files):
    """Build the per-match frame of pre-match features.

    All files are loaded first and the matches are then walked in one global
    chronological order (date, tournament, match number).  For each match the
    current state of both players is recorded *before* any rating, head-to-head
    or activity tracker is updated with the result.

    Notes:
        * Serving statistics (``*_ace``, ``*_df``, ``*_fs``, ``*_fw``) always
          carry their initial defaults because nothing in this function
          updates ``serve_hist``.
        * Surfaces other than Hard/Clay/Grass (or missing) are treated as Hard.

    Args:
        files: File names (relative to ``DATA``) of the match CSVs to process.

    Returns:
        DataFrame with one row per match, written from the winner/loser
        perspective (``w_*`` / ``l_*`` columns).  Empty when no file yields
        any match.
    """
    # Initialize tracking systems
    glicko, elo = {}, {}
    s_elo = {"Hard": {}, "Clay": {}, "Grass": {}}
    h2h = {}
    serve_hist = {}
    activity_hist = {}
    player_form = defaultdict(lambda: defaultdict(list))

    rec = []

    print("🔧 Processing matches and building features...")
    # Ratings are only meaningful when accumulated in time order, so the files
    # are merged and sorted up front instead of being walked one by one in
    # directory-listing order.
    matches = _load_match_files(files)

    k = 32  # Elo K-factor shared by the overall and surface ratings
    skipped = 0
    for _, r in tqdm(matches.iterrows(), total=len(matches), desc="Building features"):
        try:
            w, l = r.winner_id, r.loser_id
            current_date = r.tourney_date

            # Initialize tracking for new players
            for p in (w, l):
                glicko.setdefault(p, glicko2.Player())
                elo.setdefault(p, 1500)
                for s in s_elo:
                    s_elo[s].setdefault(p, 1500)
                serve_hist.setdefault(p, dict(ace=8, df=3, fs=65, fw=75, n=0))
                activity_hist.setdefault(p, {
                    "last_match": None,
                    "match_dates": [],
                    "recent_results": [],
                    "surface_matches": {"Hard": 0, "Clay": 0, "Grass": 0}
                })

            # Calculate activity metrics BEFORE the match
            w_activity = activity_hist[w]
            l_activity = activity_hist[l]

            # 180 days is the placeholder gap for a player's first match.
            w_days_since = ((current_date - w_activity["last_match"]).days
                            if w_activity["last_match"] is not None else 180)
            l_days_since = ((current_date - l_activity["last_match"]).days
                            if l_activity["last_match"] is not None else 180)

            w_matches_90d = sum(1 for d in w_activity["match_dates"]
                                if (current_date - d).days <= 90)
            l_matches_90d = sum(1 for d in l_activity["match_dates"]
                                if (current_date - d).days <= 90)

            surface = r.get("surface", "Hard") or "Hard"
            if surface not in s_elo:
                surface = "Hard"

            # Matches already played on THIS surface, taken from the
            # per-surface counter maintained below.
            w_surface_matches = w_activity["surface_matches"][surface]
            l_surface_matches = l_activity["surface_matches"][surface]

            w_recent_form = np.mean(w_activity["recent_results"][-10:]) if w_activity["recent_results"] else 0.5
            l_recent_form = np.mean(l_activity["recent_results"][-10:]) if l_activity["recent_results"] else 0.5

            w_surface_form = np.mean(player_form[w][surface][-15:]) if player_form[w][surface] else 0.5
            l_surface_form = np.mean(player_form[l][surface][-15:]) if player_form[l][surface] else 0.5

            # Get ratings BEFORE the match
            wp, lp = glicko[w], glicko[l]
            w_rating, w_rd = wp.rating, wp.rd
            l_rating, l_rd = lp.rating, lp.rd
            w_se, l_se = s_elo[surface][w], s_elo[surface][l]

            # h2h[pair] holds wins of pair[0] and pair[1] respectively.
            pair = tuple(sorted([w, l]))
            h2h.setdefault(pair, [0, 0])
            pre_h2h = h2h[pair][0] - h2h[pair][1] if pair[0] == w else h2h[pair][1] - h2h[pair][0]

            W, L = serve_hist[w], serve_hist[l]

            # Store match record with PRE-MATCH information only
            rec.append(dict(
                date=current_date, surface=surface,
                tlevel=r.get("tourney_level", "A"), draw=r.get("draw_size", 32),
                winner_id=w, loser_id=l,
                w_g=w_rating, l_g=l_rating,
                w_rd=w_rd, l_rd=l_rd,
                w_e=elo[w], l_e=elo[l],
                w_se=w_se, l_se=l_se,
                h2h=pre_h2h,
                w_rank=r.get("winner_rank", 100), l_rank=r.get("loser_rank", 100),
                w_pts=r.get("winner_rank_points", 1000), l_pts=r.get("loser_rank_points", 1000),
                w_age=r.get("winner_age", 25), l_age=r.get("loser_age", 25),
                w_ht=r.get("winner_ht", 180), l_ht=r.get("loser_ht", 180),
                w_hand=r.get("winner_hand", "R"), l_hand=r.get("loser_hand", "R"),
                w_ace=W["ace"], l_ace=L["ace"],
                w_df=W["df"], l_df=L["df"],
                w_fs=W["fs"], l_fs=L["fs"],
                w_fw=W["fw"], l_fw=L["fw"],
                w_form=w_surface_form, l_form=l_surface_form,
                w_days_since=w_days_since,
                l_days_since=l_days_since,
                w_matches_90d=w_matches_90d,
                l_matches_90d=l_matches_90d,
                w_surface_matches=w_surface_matches,
                l_surface_matches=l_surface_matches,
                w_recent_form=w_recent_form,
                l_recent_form=l_recent_form,
            ))

            # Update ratings and activity AFTER recording pre-match state.
            # Both Glicko updates must see the opponent's PRE-match rating and
            # deviation, hence the values captured above rather than the live
            # attributes (which change after the first update).
            wp.update_player([l_rating], [l_rd], [1])
            lp.update_player([w_rating], [w_rd], [0])

            expected = 1 / (1 + 10 ** ((elo[l] - elo[w]) / 400))
            elo[w] += k * (1 - expected)
            elo[l] -= k * (1 - expected)

            e_surface = s_elo[surface]
            expected_s = 1 / (1 + 10 ** ((e_surface[l] - e_surface[w]) / 400))
            e_surface[w] += k * (1 - expected_s)
            e_surface[l] -= k * (1 - expected_s)

            h2h[pair][0 if pair[0] == w else 1] += 1

            # Update activity tracking
            for p, result in [(w, 1), (l, 0)]:
                activity_hist[p]["last_match"] = current_date
                activity_hist[p]["match_dates"].append(current_date)
                activity_hist[p]["recent_results"].append(result)
                activity_hist[p]["surface_matches"][surface] += 1
                player_form[p][surface].append(result)

                # Keep only last 50 matches for memory efficiency
                if len(activity_hist[p]["match_dates"]) > 50:
                    activity_hist[p]["match_dates"] = activity_hist[p]["match_dates"][-50:]
                    activity_hist[p]["recent_results"] = activity_hist[p]["recent_results"][-50:]

                if len(player_form[p][surface]) > 50:
                    player_form[p][surface] = player_form[p][surface][-50:]

        except Exception as e:
            # Best effort: report the first few bad rows, count the rest.
            skipped += 1
            if skipped <= 5:
                print(f"Error processing match dated {r.get('tourney_date')}: {e}")

    if skipped:
        print(f"⚠️  Skipped {skipped:,} matches because of processing errors")

    return pd.DataFrame(rec)

# ╔══ 4. Build feature matrix & target variable ══════════════════════════════╗
def build_X_y(df):
    """Turn the winner/loser feature frame into a model matrix and target.

    Each match is randomly assigned a "player 1" (seeded, so the assignment is
    reproducible): for roughly half the rows player 1 is the winner, for the
    rest the loser.  Directional features are expressed from player 1's point
    of view and the target is 1 exactly when player 1 won.

    The input frame is not modified.

    Args:
        df: Frame produced by ``build_feature_frame`` (``w_*`` / ``l_*``
            columns plus ``date``, ``surface``, ``tlevel``, ``draw``, ``h2h``).

    Returns:
        Tuple ``(X, y, dates, encoder)``: the feature DataFrame with missing
        values replaced by 0, the integer target array, ``df.date`` and the
        fitted surface ``LabelEncoder``.
    """
    le = LabelEncoder()
    surf_enc = le.fit_transform(df.surface.fillna("Hard"))

    # Create random assignment for player perspective (fixes target variable issue)
    np.random.seed(42)
    p1 = np.random.rand(len(df)) > 0.5
    y = p1.astype(int)  # Target: 1 if assigned "player 1" was the actual winner

    # Confidence weights are kept as local Series so the caller's frame stays
    # untouched.
    w_conf = df.w_rd.apply(calculate_confidence_weight)
    l_conf = df.l_rd.apply(calculate_confidence_weight)

    def oriented(first, second):
        # "first - second" when player 1 is the winner, mirrored otherwise.
        return np.where(p1, first - second, second - first)

    # Non-numeric or non-positive draw sizes become NaN (later filled with 0)
    # instead of raising or producing -inf in the log.
    draw = pd.to_numeric(df.draw, errors="coerce")
    draw_size_log = np.log2(draw.where(draw > 0))

    rating_quads = list(zip(df.w_g, df.w_rd, df.l_g, df.l_rd))
    winner_view_prob = [glicko_win_prob(wg, wrd, lg, lrd) for wg, wrd, lg, lrd in rating_quads]
    loser_view_prob = [glicko_win_prob(lg, lrd, wg, wrd) for wg, wrd, lg, lrd in rating_quads]

    # Build comprehensive feature matrix (column order is part of the contract
    # with any previously trained model).
    X = pd.DataFrame({
        # Core rating differences
        "elo_diff": oriented(df.w_e, df.l_e),
        "surf_elo_diff": oriented(df.w_se, df.l_se),
        "glicko_diff": oriented(df.w_g, df.l_g),

        # Form and momentum
        "form_diff": oriented(df.w_form, df.l_form),
        "recent_form_diff": oriented(df.w_recent_form, df.l_recent_form),

        # Head-to-head and ranking (rank: lower number is better, so reversed)
        "h2h_adv": np.where(p1, df.h2h, -df.h2h),
        "rank_diff": oriented(df.l_rank, df.w_rank),
        "rank_pts_diff": oriented(df.w_pts, df.l_pts),

        # Physical and style characteristics
        "age_diff": oriented(df.w_age, df.l_age),
        "height_diff": oriented(df.w_ht, df.l_ht),
        "hand_adv": np.where(
            p1,
            ((df.w_hand == 'L') & (df.l_hand == 'R')).astype(int) -
            ((df.w_hand == 'R') & (df.l_hand == 'L')).astype(int),
            ((df.l_hand == 'L') & (df.w_hand == 'R')).astype(int) -
            ((df.l_hand == 'R') & (df.w_hand == 'L')).astype(int)),

        # Serving statistics (double faults reversed: fewer is better)
        "career_ace_diff": oriented(df.w_ace, df.l_ace),
        "career_df_diff": oriented(df.l_df, df.w_df),
        "career_1st_serve_diff": oriented(df.w_fs, df.l_fs),
        "career_1st_win_diff": oriented(df.w_fw, df.l_fw),

        # Tournament context
        "is_masters": (df.tlevel == 'M').astype(int),
        "is_grand_slam": (df.tlevel == 'G').astype(int),
        "draw_size_log": draw_size_log,
        "surface_encoded": surf_enc,

        # Advanced rating features
        "elo_momentum": np.where(p1,
                                (df.w_e - 1500) - (df.l_e - 1500),
                                (df.l_e - 1500) - (df.w_e - 1500)),
        "rd_diff": oriented(df.l_rd, df.w_rd),
        "min_rd": np.minimum(df.w_rd, df.l_rd),
        "max_rd": np.maximum(df.w_rd, df.l_rd),
        "rd_uncertainty_flag": ((df.w_rd > 100) | (df.l_rd > 100)).astype(int),

        # Confidence-weighted features
        "confidence_product": w_conf * l_conf,
        "confidence_weighted_elo": np.where(p1,
            (df.w_e - df.l_e) * w_conf * l_conf,
            (df.l_e - df.w_e) * l_conf * w_conf),
        "glicko_confidence_diff": np.where(p1,
            (df.w_g - df.l_g) * w_conf * l_conf,
            (df.l_g - df.w_g) * l_conf * w_conf),

        # Activity and freshness features
        "activity_diff": oriented(df.w_matches_90d, df.l_matches_90d),
        "freshness_penalty": oriented(np.log1p(df.w_days_since), np.log1p(df.l_days_since)),
        "rust_factor": np.maximum(df.w_days_since, df.l_days_since) / 30,
        "surface_experience_diff": oriented(df.w_surface_matches, df.l_surface_matches),

        # Form confidence features
        "form_confidence": oriented(df.w_recent_form * w_conf, df.l_recent_form * l_conf),

        # Enhanced win probability (Glicko-based)
        "glicko_win_prob": np.where(p1, winner_view_prob, loser_view_prob),

        # Distance of each ranking from 50, compared between the players
        "ranking_volatility": oriented(np.abs(df.w_rank - 50), np.abs(df.l_rank - 50)),
    }).fillna(0)

    return X, y, df.date, le


# ╔══ 5. Train & evaluate XGBoost with GPU support ═══════════════════════════╗
def train_evaluate_with_gpu(X, y, dates):
    """Train an XGBoost classifier on the oldest 80% of matches and evaluate it.

    Rows dated before the 0.8 quantile of ``dates`` form the training set; the
    remaining rows are used both for early stopping and for the reported
    metrics.  Because the same rows drive early stopping, the printed accuracy
    and AUC are somewhat optimistic.

    GPU training is attempted when ``check_gpu_availability()`` succeeds; if
    XGBoost then fails on the GPU, training is retried with the CPU settings.

    Args:
        X: Feature DataFrame.
        y: Target array aligned with ``X``.
        dates: Match dates aligned with ``X``.

    Returns:
        Tuple ``(model, feature_importance)`` with the fitted classifier and a
        DataFrame of features sorted by descending importance.
    """
    split = dates.quantile(0.8)
    train_mask = dates < split
    valid_mask = ~train_mask

    # Check GPU availability
    gpu_available = check_gpu_availability()

    # Settings common to both devices.
    shared = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "reg_alpha": 0.1,
        "reg_lambda": 0.1,
        "random_state": 42,
        "tree_method": 'hist',
        "early_stopping_rounds": 100,  # constructor argument, not a fit() argument
        "enable_categorical": False,
    }
    cpu_params = {**shared, "n_estimators": 1500, "learning_rate": 0.03, "max_depth": 6}
    # XGBoost 2.x selects the GPU through `device`; the older
    # `gpu_hist` / `predictor` / `gpu_id` settings are deprecated there.
    gpu_params = {**shared, "n_estimators": 2000, "learning_rate": 0.02, "max_depth": 8,
                  "device": "cuda"}

    def _fit(params):
        # Fit one classifier with the hold-out rows as the early-stopping set.
        clf = xgb.XGBClassifier(**params)
        clf.fit(X[train_mask], y[train_mask],
                eval_set=[(X[valid_mask], y[valid_mask])],
                verbose=False)
        return clf

    if gpu_available:
        print("🚀 Training with GPU acceleration")
    else:
        print("🖥️ Training with CPU")

    print(f"📊 Training on {train_mask.sum():,} samples...")
    try:
        model = _fit(gpu_params if gpu_available else cpu_params)
    except xgb.core.XGBoostError as e:
        if not gpu_available:
            raise
        # nvidia-smi working does not guarantee this XGBoost build can use
        # the GPU; fall back instead of aborting the pipeline.
        print(f"⚠️  GPU training failed ({e}); falling back to CPU")
        gpu_available = False
        model = _fit(cpu_params)

    # Evaluate model performance
    pred = model.predict(X[valid_mask])
    proba = model.predict_proba(X[valid_mask])[:, 1]
    acc = accuracy_score(y[valid_mask], pred)
    auc = roc_auc_score(y[valid_mask], proba)

    device = "GPU" if gpu_available else "CPU"
    print(f"\n🎯 {device} TRAINING RESULTS:")
    print(f"   Accuracy: {acc:.3%}")
    print(f"   AUC: {auc:.3f}")
    print(f"   Training samples: {train_mask.sum():,}")
    print(f"   Validation samples: {valid_mask.sum():,}")

    # Feature importance analysis
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print(f"\n🏆 TOP 20 MOST IMPORTANT FEATURES:")
    print(feature_importance.head(20).to_string(index=False))

    return model, feature_importance

# ╔══ 6. Main execution pipeline ═════════════════════════════════════════════╗
def main():
    """Run the full pipeline: download, featurize, train, and save artifacts.

    Writes ``tennis_feature_frame.pkl`` plus three timestamped files (model
    JSON, pickled surface encoder, feature-importance CSV) to the current
    working directory.
    """
    print("🎾 TENNIS ATP MATCH-WINNER MODEL v6.0 (Complete GPU Version)")
    print("🚀 Clean baseline with GPU training support")
    print("=" * 60)

    # Step 1: Download data
    csv_files = download_csvs()

    # Step 2: Build feature frame
    frame = build_feature_frame(csv_files)
    print(f"✅ Created feature frame with {len(frame):,} matches")

    # Persist the raw feature frame so it can be reused without re-parsing.
    print("💾 Saving feature frame for backtesting...")
    frame.to_pickle("tennis_feature_frame.pkl")
    print("✅ Feature frame saved successfully")

    # Step 3: Build feature matrix
    print("🧮 Building feature matrix...")
    X, y, dates, encoder = build_X_y(frame)
    print(f"✅ Feature matrix: {X.shape[0]:,} samples, {X.shape[1]} features")

    # Step 4: Train model
    print("🤖 Training model...")
    model, feature_importance = train_evaluate_with_gpu(X, y, dates)

    # Step 5: Save model
    print("💾 Saving model...")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    model_output_path = f"tennis_model_gpu_complete_{timestamp}.json"
    encoder_output_path = f"surface_encoder_complete_{timestamp}.pkl"
    importance_output_path = f"feature_importance_complete_{timestamp}.csv"

    model.save_model(model_output_path)
    # Context manager ensures the pickle is flushed and the handle closed.
    with open(encoder_output_path, "wb") as encoder_file:
        pickle.dump(encoder, encoder_file)
    feature_importance.to_csv(importance_output_path, index=False)

    print(f"\n🎾 COMPLETE GPU MODEL FINISHED!")
    print(f"📊 Features: {len(X.columns)}")
    print(f"💾 Model saved: {model_output_path}")
    print(f"💾 Encoder saved: {encoder_output_path}")
    print(f"💾 Feature importance saved: {importance_output_path}")
    print("=" * 60)
    print("🏆 Ready for production use!")

if __name__ == "__main__":
    main()


🎾 TENNIS ATP MATCH-WINNER MODEL v6.0 (Complete GPU Version)
🚀 Clean baseline with GPU training support
📥 Downloading ATP match data...


Downloading match CSVs: 100%|██████████| 171/171 [00:49<00:00,  3.45it/s]


🔧 Processing matches and building features...


Parsing matches: 100%|██████████| 138/138 [08:46<00:00,  3.82s/it]


✅ Created feature frame with 415,091 matches
💾 Saving feature frame for backtesting...
✅ Feature frame saved successfully
🧮 Building feature matrix...
✅ Feature matrix: 415,091 samples, 34 features
🤖 Training model...
⚠️  nvidia-smi not found, using CPU
🖥️ Training with CPU
📊 Training on 331,982 samples...

🎯 CPU TRAINING RESULTS:
   Accuracy: 68.108%
   AUC: 0.751
   Training samples: 331,982
   Validation samples: 83,109

🏆 TOP 20 MOST IMPORTANT FEATURES:
                feature  importance
        glicko_win_prob    0.314484
            glicko_diff    0.263709
 glicko_confidence_diff    0.115917
          rank_pts_diff    0.034850
              rank_diff    0.031640
                rd_diff    0.023684
          surf_elo_diff    0.019847
     ranking_volatility    0.014938
    rd_uncertainty_flag    0.013855
        form_confidence    0.012329
      freshness_penalty    0.012167
               age_diff    0.010255
          is_grand_slam    0.009328
                h2h_adv    0.00887

In [None]:
### ======================================================= ###
###   INTERACTIVE TENNIS MATCH PREDICTION CELL FOR COLAB    ###
###        (FIXED JSON Serialization Error)               ###
### ======================================================= ###

import pandas as pd
import xgboost as xgb
import pickle
import json
import os

# --- 1. Configuration: paths of the saved model and encoder to load (set either to None to auto-detect the latest files) ---
MANUAL_MODEL_PATH = "tennis_model_gpu_complete_20250707_204740.json"
MANUAL_ENCODER_PATH = "surface_encoder_complete_20250707_204740.pkl"


# --- 2. Helper Functions (with the fix) ---

def find_latest_files():
    """Pick the model and encoder files whose names sort last in the current directory.

    File names embed a ``YYYYMMDD_HHMMSS`` timestamp, so the lexicographically
    last name is the newest one.  The two files are chosen independently of
    each other.

    Returns:
        ``(model_filename, encoder_filename)``, or ``(None, None)`` when either
        kind of file is missing.
    """
    def last_by_name(prefix, suffix):
        candidates = sorted(
            name for name in os.listdir()
            if name.startswith(prefix) and name.endswith(suffix)
        )
        return candidates[-1] if candidates else None

    latest_model = last_by_name('tennis_model_gpu_complete', '.json')
    latest_encoder = last_by_name('surface_encoder_complete', '.pkl')

    if latest_model is None or latest_encoder is None:
        return None, None
    return latest_model, latest_encoder

def load_model_and_encoder(model_path, encoder_path):
    """Restore the saved XGBoost classifier and the pickled surface encoder.

    Args:
        model_path: Path of the model file passed to ``XGBClassifier.load_model``.
        encoder_path: Path of the pickle file holding the encoder.

    Returns:
        Tuple ``(model, encoder)``.
    """
    print(f"🔄 Loading model from: {model_path}")
    classifier = xgb.XGBClassifier()
    classifier.load_model(model_path)
    print("✅ Model loaded successfully.")

    print(f"🔄 Loading encoder from: {encoder_path}")
    # NOTE: unpickling executes code from the file; only load encoders that
    # this notebook (or another trusted source) produced.
    with open(encoder_path, 'rb') as encoder_file:
        surface_encoder = pickle.load(encoder_file)
    print("✅ Encoder loaded successfully.")

    return classifier, surface_encoder

def predict_match(model, input_data: dict):
    """Predict the favourite of a match from the structured JSON input.

    Args:
        model: Fitted classifier exposing ``get_booster().feature_names`` and
            ``predict_proba``.
        input_data: Dict with a ``"features"`` mapping (feature name -> value)
            and a ``"match_context"`` mapping containing ``"player_1"`` and
            ``"player_2"``.

    Returns:
        Dict with ``"favorite_player"`` and ``"predicted_win_probability"``
        (the favourite's win probability as a plain ``float`` rounded to four
        decimals, so the result is JSON-serializable).

    Raises:
        KeyError: If a required top-level key, player name, or any feature the
            model expects is missing.
    """
    features = input_data["features"]
    context = input_data["match_context"]
    # Read both names up front so a malformed context is rejected regardless
    # of which player ends up being the favourite.
    player_1 = context["player_1"]
    player_2 = context["player_2"]

    # Select and order the columns exactly as the model was trained.
    model_feature_names = model.get_booster().feature_names
    input_df = pd.DataFrame([features])[model_feature_names]

    win_prob_p1 = model.predict_proba(input_df)[0][1]

    if win_prob_p1 >= 0.5:
        favorite_player = player_1
        predicted_probability = win_prob_p1
    else:
        favorite_player = player_2
        predicted_probability = 1 - win_prob_p1

    # numpy.float32 is not JSON-serializable: convert to a Python float first.
    return {
        "predicted_win_probability": round(float(predicted_probability), 4),
        "favorite_player": favorite_player,
    }

# --- 3. Main Interactive Execution ---

# Example payload showing the expected structure; the numbers are illustrative
# only.  "surface_encoded" must be the integer the saved LabelEncoder assigns
# to the surface (classes sort alphabetically: Clay=0, Grass=1, Hard=2), so the
# Grass example below uses 1.
EXAMPLE_JSON_STRING = """
{
  "match_context": {
    "player_1": "Carlos Alcaraz",
    "player_2": "Jannik Sinner",
    "surface": "Grass",
    "tournament_level": "Unknown",
    "match_id": "alcaraz_vs_sinner_grass_simulation"
  },
  "features": {
    "glicko_confidence_diff": 0.15, "glicko_win_prob": 0.55, "glicko_diff": 38,
    "rank_diff": 1, "rank_pts_diff": -1130, "rd_diff": 0.08, "surf_elo_diff": 173,
    "ranking_volatility": -0.12, "form_confidence": 0.18, "freshness_penalty": -0.05,
    "age_diff": 1, "is_grand_slam": 0, "h2h_adv": 0.56, "rd_uncertainty_flag": 0,
    "min_rd": 45.2, "max_rd": 52.8, "surface_experience_diff": 8, "recent_form_diff": 0.22,
    "confidence_weighted_elo": 2267, "height_diff": 0, "draw_size_log": 4.09,
    "activity_diff": -0.08, "confidence_product": 0.82, "elo_momentum": 15.2,
    "elo_diff": 38, "rust_factor": 0.02, "form_diff": 0.18, "is_masters": 0,
    "surface_encoded": 1, "hand_adv": 0, "career_1st_win_diff": 0.024,
    "career_ace_diff": 0.011, "career_df_diff": -0.008, "career_1st_serve_diff": 0.003
  }
}
"""

# Interactive driver: load the model once, then read one JSON payload per
# input() call until the user types 'exit'.
try:
    # --- SETUP: Load the model and encoder ---
    print("--- MODEL SETUP ---")
    manual_requested = bool(MANUAL_MODEL_PATH and MANUAL_ENCODER_PATH)
    manual_present = (manual_requested
                      and os.path.isfile(MANUAL_MODEL_PATH)
                      and os.path.isfile(MANUAL_ENCODER_PATH))

    if manual_present:
        print("Using manually specified file paths.")
        model_path, encoder_path = MANUAL_MODEL_PATH, MANUAL_ENCODER_PATH
    else:
        if manual_requested:
            # Checked here so a stale manual path leads to auto-detection (or
            # the friendly error below) rather than a loader traceback.
            print("⚠️ Manually specified files were not found on disk.")
        print("Attempting to find latest files automatically...")
        model_path, encoder_path = find_latest_files()

    if not model_path or not encoder_path:
        raise FileNotFoundError("Could not find model/encoder files.")

    # The encoder is loaded alongside the model but is not used by the
    # prediction loop below; "surface_encoded" is supplied in the JSON.
    tennis_model, surface_encoder = load_model_and_encoder(model_path, encoder_path)
    print("-" * 20)

    # --- INTERACTIVE PREDICTION LOOP ---
    print("\n🎾 TENNIS PREDICTOR IS READY 🎾")
    print("Instructions:")
    print("1. Prepare your match data in the JSON format below.")
    print("2. Paste the entire JSON object into the input box and press Enter.")
    print("3. To finish, type 'exit' and press Enter.")
    print("\n--- JSON TEMPLATE (you can copy this) ---")
    print(EXAMPLE_JSON_STRING)
    print("-------------------------------------------\n")

    while True:
        # Prompt user for input
        print("\n👇 Paste your match data JSON here and press Enter (or type 'exit' to quit):")
        user_input_str = input()

        if user_input_str.strip().lower() == 'exit':
            print("👋 Exiting predictor. Goodbye!")
            break

        if not user_input_str.strip():
            print("⚠️ Input is empty. Please paste the JSON data or type 'exit'.")
            continue

        try:
            # Parse the user's string input into a Python dictionary
            input_data = json.loads(user_input_str)

            # Get the prediction
            prediction_result = predict_match(tennis_model, input_data)

            # Display the result
            print("\n" + "="*50)
            print("MATCH CONTEXT")
            print(f"  Player 1: {input_data['match_context']['player_1']}")
            print(f"  Player 2: {input_data['match_context']['player_2']}")
            print(f"  Surface: {input_data['match_context']['surface']}")
            print("="*50)
            print("\n🏆 MODEL PREDICTION OUTPUT 🏆")
            print(json.dumps(prediction_result, indent=2))
            print("="*50)

        except json.JSONDecodeError:
            print("\n❌ ERROR: Invalid JSON format. Please check your pasted text.")
            print("Common mistakes include missing commas, brackets {}, or quotes \"\".")
        except KeyError as e:
            print(f"\n❌ ERROR: The provided JSON is missing a required key: {e}")
            print("Please ensure 'match_context' and 'features' keys are present.")
        except Exception as e:
            print(f"\n❌ An unexpected error occurred: {e}")


except FileNotFoundError:
    print("\n" + "!"*50)
    print("🔥 ERROR: MODEL OR ENCODER FILE NOT FOUND! 🔥")
    print(f"The script was looking for '{MANUAL_MODEL_PATH}' and '{MANUAL_ENCODER_PATH}',")
    print("or for any 'tennis_model_gpu_complete*.json' / 'surface_encoder_complete*.pkl' files.")
    print("Please make sure these files exist in your Colab session directory.")
    print("You may need to run the training cell again.")
    print("!"*50)

--- MODEL SETUP ---
Using manually specified file paths.
🔄 Loading model from: tennis_model_gpu_complete_20250707_204740.json
✅ Model loaded successfully.
🔄 Loading encoder from: surface_encoder_complete_20250707_204740.pkl
✅ Encoder loaded successfully.
--------------------

🎾 TENNIS PREDICTOR IS READY 🎾
Instructions:
1. Prepare your match data in the JSON format below.
2. Paste the entire JSON object into the input box and press Enter.
3. To finish, type 'exit' and press Enter.

--- JSON TEMPLATE (you can copy this) ---

{
  "match_context": {
    "player_1": "Carlos Alcaraz",
    "player_2": "Jannik Sinner",
    "surface": "Grass",
    "tournament_level": "Unknown",
    "match_id": "alcaraz_vs_sinner_grass_simulation"
  },
  "features": {
    "glicko_confidence_diff": 0.15, "glicko_win_prob": 0.55, "glicko_diff": 38,
    "rank_diff": 1, "rank_pts_diff": -1130, "rd_diff": 0.08, "surf_elo_diff": 173,
    "ranking_volatility": -0.12, "form_confidence": 0.18, "freshness_penalty": -0.05,

In [None]:
import asyncio
import datetime
import csv
from playwright.async_api import async_playwright

# List of words to filter out from tournament names/categories.
# Added "wta" to the list. Case-insensitive.
FILTER_KEYWORDS = ["itf", "utr", "wta", "wheelchairs", "girls", "junior", "challenger"]

def _build_match_row(event: dict, date_str: str):
    """
    Turn one API event into a CSV row, or return None if it is filtered out.

    An event is dropped when its category/tournament text contains one of
    FILTER_KEYWORDS, or when it looks like a doubles match ("doubles" in the
    tournament text, or a "/" in either player name).
    """
    # `or {}` / `or 'N/A'` also cover keys that are present but null/empty,
    # which a plain `.get(key, default)` would let through as None.
    tournament = event.get('tournament') or {}
    category = (tournament.get('category') or {}).get('name') or 'N/A'
    tournament_name = tournament.get('name') or 'N/A'
    home_team = (event.get('homeTeam') or {}).get('name') or 'N/A'
    away_team = (event.get('awayTeam') or {}).get('name') or 'N/A'

    # Combine category and tournament name for easier filtering
    full_tournament_info = f"{category} {tournament_name}".lower()

    # 1. Skip if the combined info contains any filter keyword
    if any(keyword in full_tournament_info for keyword in FILTER_KEYWORDS):
        return None

    # 2. Skip if it's a doubles match
    if "doubles" in full_tournament_info or "/" in home_team or "/" in away_team:
        return None

    # An event without a start time is still written, with 'N/A' as its time,
    # instead of crashing the whole run inside fromtimestamp(None).
    start_timestamp = event.get('startTimestamp')
    if start_timestamp is None:
        match_time = 'N/A'
    else:
        # fromtimestamp() without a tz argument yields the machine's local time.
        match_time = datetime.datetime.fromtimestamp(start_timestamp).strftime('%H:%M')

    return [date_str, match_time, category, tournament_name, home_team, away_team]


async def get_and_write_matches(date_str: str, csv_writer):
    """
    Fetch the scheduled tennis events for one date and write the filtered ones to CSV.

    A headless Chromium page is opened on sofascore.com and the scheduled-events
    API is requested from inside that page. Events that survive the keyword and
    doubles filters are written as rows of
    [date, time, category, tournament, player 1, player 2].

    Args:
        date_str: Date inserted into the API URL (callers pass 'YYYY-MM-DD').
        csv_writer: Object with a ``writerow`` method, e.g. a ``csv.writer``.

    Returns:
        None. Failures are reported with ``print`` and end the call early.
    """
    print(f"\n--- Scraping Matches for {date_str} ---")

    matches_url = f"https://api.sofascore.com/api/v1/sport/tennis/scheduled-events/{date_str}"

    # JavaScript to execute in the browser to fetch API data
    js_to_fetch_data = f"""
        async () => {{
            try {{
                const response = await fetch('{matches_url}');
                if (!response.ok) {{
                    return {{'error': `API responded with status: ${{response.status}}`}};
                }}
                return await response.json();
            }} catch (e) {{
                return {{'error': e.toString()}};
            }}
        }}
    """

    match_data = None
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()

                print("Initializing browser session...")
                await page.goto("https://www.sofascore.com/tennis", wait_until="domcontentloaded")
                print("Session established. Fetching data...")

                match_data = await page.evaluate(js_to_fetch_data)
            finally:
                # Close the browser even when navigation or the fetch fails.
                await browser.close()
                print("Browser closed.")

    except Exception as e:
        print(f"[!!] An error occurred during Playwright execution: {e}")
        return

    # Anything other than a non-empty dict without an 'error' key is a failure.
    is_dict = isinstance(match_data, dict)
    fetch_error = match_data.get('error') if is_dict else None
    if not match_data or not is_dict or fetch_error:
        print(f"[!!] FAILED to fetch matches for {date_str}.")
        if fetch_error:
            print(f"    Error from browser fetch: {fetch_error}")
        elif match_data:
            print(f"    Unexpected response type: {type(match_data).__name__}")
        return

    events = match_data.get('events') or []
    if not events:
        print("No matches found for this date.")
        return

    print(f"Found {len(events)} total events. Filtering and writing to CSV...")

    count_written = 0
    for event in events:
        row = _build_match_row(event, date_str)
        if row is None:
            continue
        csv_writer.writerow(row)
        count_written += 1

    print(f"Wrote {count_written} filtered matches to the CSV file.")

async def main():
    """Scrape today's and tomorrow's matches into one freshly written CSV file."""
    output_filename = "sofascore_filtered_matches.csv"
    header_row = ['Date', 'Time', 'Category', 'Tournament', 'Player 1', 'Player 2']

    # Offsets 0 and 1 give today and tomorrow, in that order.
    first_day = datetime.date.today()
    date_strings = [
        (first_day + datetime.timedelta(days=offset)).strftime('%Y-%m-%d')
        for offset in range(2)
    ]

    with open(output_filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header_row)

        for day_string in date_strings:
            await get_and_write_matches(day_string, writer)

    print(f"\n✅ All done! Data saved to '{output_filename}'")

In [None]:
# Run the scraper. A bare top-level `await` works here because this is a
# notebook cell; a plain script would need asyncio.run(main()) instead.
await main()


--- Scraping Matches for 2025-07-10 ---
Initializing browser session...
Session established. Fetching data...
Browser closed.
Found 946 total events. Filtering and writing to CSV...
Wrote 4 filtered matches to the CSV file.

--- Scraping Matches for 2025-07-11 ---
Initializing browser session...
Session established. Fetching data...
Browser closed.
Found 329 total events. Filtering and writing to CSV...
Wrote 4 filtered matches to the CSV file.

✅ All done! Data saved to 'sofascore_filtered_matches.csv'


In [None]:
"""
extract_main_table.py
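Fetch the main ATP Elo ratings table from Tennis Abstract and save it as CSV.
The table HTML is handed to pd.read_html through StringIO, which avoids the
warning pandas emits when it is given a literal HTML string.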
"""

import requests
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO

# Page that holds the Elo ratings table (table#reportable) to export.
INDEX_URL = "https://tennisabstract.com/reports/atp_elo_ratings.html"
# CSV file the parsed table is written to, relative to the working directory.
OUTPUT_CSV = "atp_elo_directory.csv"

def fetch_and_save_directory():
    """
    Download the Elo ratings table at INDEX_URL and write it to OUTPUT_CSV.

    The table with id "reportable" is parsed into a DataFrame, and a
    ``ProfileURL`` column is added holding the first link of each data row,
    resolved against INDEX_URL. Rows without a link get an empty string.

    Raises:
        RuntimeError: if the page has no ``<table id="reportable">``.
        ValueError: if the number of HTML data rows differs from the number
            of rows pandas parsed, so the links cannot be aligned.
        Whatever ``requests`` raises for network failures or, through
        ``raise_for_status()``, for HTTP error responses.
    """
    # Imported here so this function carries its own dependency.
    from urllib.parse import urljoin

    # 1. Download page
    resp = requests.get(INDEX_URL, timeout=30)
    resp.raise_for_status()

    # 2. Parse the Elo table
    soup = BeautifulSoup(resp.text, "html.parser")
    table = soup.find("table", id="reportable")
    if table is None:
        raise RuntimeError("Could not find table#reportable")

    # 3. Read into DataFrame via StringIO
    df = pd.read_html(StringIO(str(table)), header=0)[0]

    # 4. Extract profile URLs
    # html.parser does not add a <tbody> the markup lacks, so fall back to
    # every row that has at least one data cell (this skips a <th> header row).
    if table.tbody is not None:
        rows = table.tbody.find_all("tr")
    else:
        rows = [tr for tr in table.find_all("tr") if tr.find("td")]

    hrefs = []
    for row in rows:
        a = row.find("a", href=True)
        hrefs.append(a["href"] if a is not None else "")

    if len(hrefs) != len(df):
        raise ValueError(
            f"Found {len(hrefs)} HTML rows but parsed {len(df)} table rows; "
            "cannot align profile URLs with the table."
        )

    # urljoin leaves absolute links untouched and resolves relative ones
    # against the page URL; a plain prefix would corrupt absolute links.
    df["ProfileURL"] = [urljoin(INDEX_URL, h) if h else "" for h in hrefs]

    # 5. Save to CSV
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"Saved {len(df)} players to {OUTPUT_CSV}")

# Run the export when this code is executed directly (script or notebook cell).
if __name__ == "__main__":
    fetch_and_save_directory()


Saved 487 players to atp_elo_directory.csv
