In [4]:
"""
AI CUP 2025 Spring – Table‑Tennis Smart Racket
================================================
End‑to‑end pipeline with **leak‑free group splits** (player_id),
richer swing‑level feature set (time + frequency domain) and
robust file‑level aggregation.  Produces:

  • local 80/20 player‑wise hold‑out AUCs that track the public LB
  • submission_fixed.csv ready for upload

Author : ChatGPT‑o3  (May 2025)
"""

# 1️⃣ Imports + basic helpers

# Cell 1  ▶ imports
from pathlib import Path
from datetime import datetime
import numpy as np, pandas as pd, math, warnings
from tqdm import tqdm

import lightgbm as lgb
from sklearn.model_selection import GroupKFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore")


In [5]:
# 2️⃣ Configuration (paths, global constants)

# Cell 2  ▶ config
TRAIN_DIR   = Path("39_Training_Dataset")
TEST_DIR    = Path("39_Test_Dataset")

TRAIN_TXT   = TRAIN_DIR / "train_data"
TEST_TXT    = TEST_DIR  / "test_data"

INFO_CSV    = TRAIN_DIR / "train_info.csv"
TEST_INFO   = TEST_DIR  / "test_info.csv"
SAMPLE_SUB  = TEST_DIR  / "sample_submission.csv"

RANDOM_SEED = 42         # reproducible splits

# Cell 2  ▶ config   (add this line at the bottom of the cell)
DROP_COLS = ["file_id", "swing_id",
             "player_id", "gender", "handed", "years", "level"]


In [6]:
# ▶ Stage 1 – inspect binary columns & decide orientation automatically
binary_cols = {
    "gender": "gender",                 # in train_info.csv
    "handed": "hold racket handed"      # organiser's exact header
}

info_df = pd.read_csv(INFO_CSV)

ORIENT = {}   # will be filled: {'gender': 0 or 1, 'handed': 0 or 1}

for key, col in binary_cols.items():
    if col not in info_df.columns:
        raise KeyError(f"❌ Column '{col}' not found in train_info.csv!")

    le = LabelEncoder().fit(info_df[col].astype(str))
    classes = list(le.classes_)
    counts  = info_df[col].value_counts().to_dict()

    print(f"\n{key.upper()}  → raw classes:", classes, "  counts:", counts)

    if len(classes) != 2:
        raise ValueError(f"{key} should have exactly 2 classes!")

    # --- choose orientation:     send probability of the *minority* class
    minority_cls = min(classes, key=lambda c: counts.get(c, 0))
    minority_idx = classes.index(minority_cls)
    ORIENT[key]  = minority_idx

    print(f"  ↳ Leaderboard expects prob('{minority_cls}'), "
          f"which is column {minority_idx} in LightGBM output.")
    print("  ✔ ORIENT updated.")

print("\nFinal ORIENT mapping  ➜ ", ORIENT)



GENDER  → raw classes: ['1', '2']   counts: {1: 1627, 2: 328}
  ↳ Leaderboard expects prob('1'), which is column 0 in LightGBM output.
  ✔ ORIENT updated.

HANDED  → raw classes: ['1', '2']   counts: {1: 1589, 2: 366}
  ↳ Leaderboard expects prob('1'), which is column 0 in LightGBM output.
  ✔ ORIENT updated.

Final ORIENT mapping  ➜  {'gender': 0, 'handed': 0}


In [10]:
# 3️⃣ Low-level math utilities

# Cell 3  ▶ math helpers
def _rms(x):      return float(np.sqrt((x**2).mean())) if len(x) else 0.0
def _skew(x, m, s):   return 0.0 if s == 0 else float(((x-m)**3).mean() / s**3)
def _kurt(x, m, s):   return 0.0 if s == 0 else float(((x-m)**4).mean() / s**4)

def _spectral_feats(sig):
    """Return (FFT-mag-mean, PSD-mean, spectral-entropy)."""
    if len(sig) < 4:     # guard against very short swings
        return 0.0, 0.0, 0.0
    fft  = np.fft.rfft(sig - sig.mean())
    mag  = np.abs(fft)
    psd  = (mag**2) / len(sig)
    p    = psd / psd.sum()
    ent  = -np.sum(p * np.log(p + 1e-12)) / math.log(len(p))
    return float(mag.mean()), float(psd.mean()), float(ent)

In [11]:
# 4️⃣ Swing-level feature extractor
# (≈ 60 features with both time- & frequency-domain stats)

# Cell 4  ▶ feature extraction
def extract_features(swing: np.ndarray) -> dict:
    Ax, Ay, Az, Gx, Gy, Gz = (swing[:, i].astype(float) for i in range(6))
    feats = {}
    for name, arr in zip(["Ax","Ay","Az","Gx","Gy","Gz"], [Ax,Ay,Az,Gx,Gy,Gz]):
        m, s = arr.mean(), arr.std()
        feats |= {
            f"{name}_mean": m,    f"{name}_std":  s,
            f"{name}_rms":  _rms(arr),
            f"{name}_min":  arr.min(),            f"{name}_max": arr.max(),
            f"{name}_skew": _skew(arr, m, s),     f"{name}_kurt": _kurt(arr, m, s),
        }

    for lbl, arr in [("acc", np.linalg.norm(swing[:, :3], axis=1)),
                     ("gyro",np.linalg.norm(swing[:, 3:], axis=1))]:
        m, s = arr.mean(), arr.std()
        fft_m, psd_m, ent = _spectral_feats(arr)
        feats |= {
            f"{lbl}_mean": m,    f"{lbl}_std": s,     f"{lbl}_rms": _rms(arr),
            f"{lbl}_min":  arr.min(),                f"{lbl}_max": arr.max(),
            f"{lbl}_skew": _skew(arr, m, s),         f"{lbl}_kurt": _kurt(arr, m, s),
            f"{lbl}_fft_mean": fft_m,  f"{lbl}_psd_mean": psd_m,
            f"{lbl}_entropy":  ent,
        }
    return feats

In [12]:
# ▶ Stage 2 – feature extraction sanity-check on ONE file
from itertools import islice

def extract_single_file_features(txt_dir, info_df):
    # --- pick the very first .txt file ---
    txt_path = next(iter(sorted(txt_dir.glob("*.txt"))))   # ← fix here
    fid = int(txt_path.stem)

    meta = info_df.loc[info_df["unique_id"] == fid].iloc[0]
    cps  = np.fromstring(meta["cut_point"].strip("[]"), sep=" ", dtype=int)
    data = np.loadtxt(txt_path, skiprows=1)

    rows = []
    for i in range(len(cps) - 1):
        swing = data[cps[i]:cps[i+1]]
        feats = extract_features(swing)
        feats.update(file_id=fid, swing_id=i)
        rows.append(feats)
    return pd.DataFrame(rows), fid

one_df, one_id = extract_single_file_features(TRAIN_TXT, info_df)

# ----------  summaries ----------
print(f"\nFile {one_id}  – swings extracted:", one_df.shape[0])
print("Columns (total =", one_df.shape[1], "):")
print(list(islice(one_df.columns, 0, 10)), "...")

print("\nHead:")
display(one_df.head())

print("\nNaN count across all cells:", one_df.isna().sum().sum())



File 1  – swings extracted: 27
Columns (total = 64 ):
['Ax_mean', 'Ax_std', 'Ax_rms', 'Ax_min', 'Ax_max', 'Ax_skew', 'Ax_kurt', 'Ay_mean', 'Ay_std', 'Ay_rms'] ...

Head:


Unnamed: 0,Ax_mean,Ax_std,Ax_rms,Ax_min,Ax_max,Ax_skew,Ax_kurt,Ay_mean,Ay_std,Ay_rms,...,gyro_rms,gyro_min,gyro_max,gyro_skew,gyro_kurt,gyro_fft_mean,gyro_psd_mean,gyro_entropy,file_id,swing_id
0,3856.721311,2129.387398,4405.518127,1698.0,7532.0,0.537805,1.711119,-1702.213115,2199.471271,2781.223357,...,35577.58591,1330.982344,56754.686168,-0.163876,1.848227,81598.214998,325062100.0,0.486607,1,0
1,3903.393443,2252.045775,4506.460977,-636.0,7486.0,-0.332064,2.206401,-2576.081967,2145.392717,3352.448093,...,41893.487426,7865.75114,56754.686168,-0.699487,2.808111,46798.293575,167862700.0,0.382594,1,1
2,3771.770492,2298.776256,4417.083304,-838.0,7596.0,-0.304744,2.446748,-2319.032787,2164.546543,3172.250747,...,39776.74491,8441.384247,56754.686168,-0.315995,2.546671,45460.399393,168347200.0,0.381986,1,2
3,2952.901639,2328.757783,3760.683568,-495.0,7662.0,0.003536,1.967154,-2401.95082,1997.532211,3124.020274,...,36145.882576,11451.118766,55909.919281,-0.145628,2.206373,54708.649532,145168200.0,0.559648,1,3
4,3850.0,2206.016743,4437.229977,-378.0,7533.0,-0.289517,2.507603,-2258.295082,2119.006954,3096.786584,...,39914.111505,12634.362548,56754.686168,-0.241722,2.322529,42877.287186,162303600.0,0.345134,1,4



NaN count across all cells: 0
