In [1]:
"""
AI CUP 2025 Spring – Table‑Tennis Smart Racket
================================================
End‑to‑end pipeline with **leak‑free group splits** (player_id),
richer swing‑level feature set (time + frequency domain) and
robust file‑level aggregation.  Produces:

  • local 80/20 player‑wise hold‑out AUCs that track the public LB
  • submission_fixed.csv ready for upload

Author : ChatGPT‑o3  (May 2025)
"""

# 1️⃣ Imports + basic helpers

# Cell 1  ▶ imports
from pathlib import Path
from datetime import datetime
import numpy as np, pandas as pd, math, warnings
from tqdm import tqdm

import lightgbm as lgb
from sklearn.model_selection import GroupKFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore")


In [2]:
# 2️⃣ Configuration (paths, global constants)

# Cell 2  ▶ config
TRAIN_DIR   = Path("39_Training_Dataset")
TEST_DIR    = Path("39_Test_Dataset")

TRAIN_TXT   = TRAIN_DIR / "train_data"
TEST_TXT    = TEST_DIR  / "test_data"

INFO_CSV    = TRAIN_DIR / "train_info.csv"
TEST_INFO   = TEST_DIR  / "test_info.csv"
SAMPLE_SUB  = TEST_DIR  / "sample_submission.csv"

RANDOM_SEED = 42         # reproducible splits

# Cell 2  ▶ config   (add this line at the bottom of the cell)
DROP_COLS = ["file_id", "swing_id",
             "player_id", "gender", "handed", "years", "level"]


In [3]:
# 3️⃣ Low-level math utilities

# Cell 3  ▶ math helpers
def _rms(x):      return float(np.sqrt((x**2).mean())) if len(x) else 0.0
def _skew(x, m, s):   return 0.0 if s == 0 else float(((x-m)**3).mean() / s**3)
def _kurt(x, m, s):   return 0.0 if s == 0 else float(((x-m)**4).mean() / s**4)

def _spectral_feats(sig):
    """Return (FFT-mag-mean, PSD-mean, spectral-entropy)."""
    if len(sig) < 4:     # guard against very short swings
        return 0.0, 0.0, 0.0
    fft  = np.fft.rfft(sig - sig.mean())
    mag  = np.abs(fft)
    psd  = (mag**2) / len(sig)
    p    = psd / psd.sum()
    ent  = -np.sum(p * np.log(p + 1e-12)) / math.log(len(p))
    return float(mag.mean()), float(psd.mean()), float(ent)

In [4]:
# 4️⃣ Swing-level feature extractor
# (≈ 60 features with both time- & frequency-domain stats)

# Cell 4  ▶ feature extraction
def extract_features(swing: np.ndarray) -> dict:
    Ax, Ay, Az, Gx, Gy, Gz = (swing[:, i].astype(float) for i in range(6))
    feats = {}
    for name, arr in zip(["Ax","Ay","Az","Gx","Gy","Gz"], [Ax,Ay,Az,Gx,Gy,Gz]):
        m, s = arr.mean(), arr.std()
        feats |= {
            f"{name}_mean": m,    f"{name}_std":  s,
            f"{name}_rms":  _rms(arr),
            f"{name}_min":  arr.min(),            f"{name}_max": arr.max(),
            f"{name}_skew": _skew(arr, m, s),     f"{name}_kurt": _kurt(arr, m, s),
        }

    for lbl, arr in [("acc", np.linalg.norm(swing[:, :3], axis=1)),
                     ("gyro",np.linalg.norm(swing[:, 3:], axis=1))]:
        m, s = arr.mean(), arr.std()
        fft_m, psd_m, ent = _spectral_feats(arr)
        feats |= {
            f"{lbl}_mean": m,    f"{lbl}_std": s,     f"{lbl}_rms": _rms(arr),
            f"{lbl}_min":  arr.min(),                f"{lbl}_max": arr.max(),
            f"{lbl}_skew": _skew(arr, m, s),         f"{lbl}_kurt": _kurt(arr, m, s),
            f"{lbl}_fft_mean": fft_m,  f"{lbl}_psd_mean": psd_m,
            f"{lbl}_entropy":  ent,
        }
    return feats

In [5]:
# 5️⃣ Dataset builders

# Cell 5  ▶ dataset builders
def parse_cutpoints(cp_str: str) -> np.ndarray:
    return np.fromstring(cp_str.strip("[]"), sep=" ", dtype=int)

def build_dataset(txt_dir: Path, info_df: pd.DataFrame, is_train=True) -> pd.DataFrame:
    rows = []
    for txt_file in tqdm(sorted(txt_dir.glob("*.txt")),
                         desc=f"Extract swings ({'train' if is_train else 'test'})"):
        fid  = int(txt_file.stem)
        meta = info_df.loc[info_df["unique_id"] == fid].iloc[0]
        cps  = parse_cutpoints(meta["cut_point"])
        if len(cps) < 2:  # corrupted sample
            continue

        data = np.loadtxt(txt_file, skiprows=1)
        for i in range(len(cps) - 1):
            swing = data[cps[i]:cps[i+1]]
            feats = extract_features(swing)
            feats.update(file_id=fid, swing_id=i)
            if is_train:
                feats |= {
                    "player_id": meta["player_id"],
                    "gender":    str(meta["gender"]),
                    "handed":    str(meta["hold racket handed"]),
                    "years":     str(meta["play years"]),
                    "level":     str(meta["level"]),
                }
            rows.append(feats)
    return pd.DataFrame(rows)

info_df    = pd.read_csv(INFO_CSV)
test_info  = pd.read_csv(TEST_INFO)

df_train = build_dataset(TRAIN_TXT, info_df,  is_train=True)
df_test  = build_dataset(TEST_TXT,  test_info, is_train=False)

print("train swings:", df_train.shape, "  test swings:", df_test.shape)

Extract swings (train): 100%|██████████████████████████████████████████████████████| 1955/1955 [00:30<00:00, 64.49it/s]
Extract swings (test): 100%|███████████████████████████████████████████████████████| 1430/1430 [00:21<00:00, 68.06it/s]


train swings: (52785, 69)   test swings: (38610, 64)


In [11]:
# 6️⃣  File-level aggregation helpers  ◆  DROP THIS WHOLE CELL IN

# ------------------------------------------------------------
# Which column of a (n,2) LightGBM prediction is the site’s
#   “positive” probability for each binary task?
#
#   0  → first column   (class 0)      – baseline orientation
#   1  → second column  (class 1)
#
# Feel free to flip any entry below and just re-run Cell 9.
# ------------------------------------------------------------
ORIENT = {"gender": 0,      # 0 = female  ; 1 = male
          "handed": 0,      # 0 = left    ; 1 = right
          "years":  "multi",
          "level":  "multi"}
# ------------------------------------------------------------

def _extract_pos(probs, pos_idx):
    """
    Return a 1-D array of *positive-class* probabilities irrespective
    of the shape coming from LightGBM.
      – (n,)             already 1-D
      – (n,1)            squeeze
      – (n,2)            take column `pos_idx`
    """
    if probs.ndim == 1:                 # (n,)
        return probs
    if probs.shape[1] == 1:             # (n,1)
        return probs[:, 0]
    return probs[:, pos_idx]            # (n,2)

def agg_binary(probs, label):
    """
    Aggregate all swings of one file → single probability.
    Uses the *most-confident swing* rule the organiser’s baseline uses.
    """
    pos_idx = ORIENT[label]
    pos = _extract_pos(probs, pos_idx)
    return pos.max()                    # most-confident swing

def agg_multiclass(probs):
    """
    Baseline rule for multi-class (years / level):
      – pick class with highest total weight across swings
      – inside that class, take the swing with highest confidence
    """
    cls_total = probs.sum(axis=0)
    best_cls  = cls_total.argmax()
    best_row  = probs[:, best_cls].argmax()
    return probs[best_row]


In [12]:
# 7️⃣ LightGBM training for a single label

# Cell 7  ▶ train one label (GroupKFold by player_id)
def train_label(df, label, n_splits=5):
    multiclass = label in {"years", "level"}
    le  = LabelEncoder().fit(df[label])
    y   = le.transform(df[label])
    X   = df.drop(columns=["file_id","swing_id","player_id",
                           "gender","handed","years","level"])
    g   = df["player_id"].values

    params = dict(objective = "multiclass" if multiclass else "binary",
                  metric    = "multi_logloss" if multiclass else "auc",
                  num_class = len(le.classes_) if multiclass else 1,
                  learning_rate = 0.05,
                  feature_pre_filter = False,
                  seed = RANDOM_SEED, verbosity = -1)

    boosters = []
    for tr, va in GroupKFold(n_splits).split(X, y, g):
        bst = lgb.train(
            params,
            lgb.Dataset(X.iloc[tr], label=y[tr]),
            num_boost_round=1500,
            valid_sets=[lgb.Dataset(X.iloc[va], label=y[va])],
            callbacks=[lgb.early_stopping(80, verbose=False)]
        )
        boosters.append(bst)
    return boosters, le

In [14]:
# 8️⃣ Hold-out split + training loop (shows local AUCs)

# Cell 8  ▶ train all labels & evaluate on unseen players
labels = ["gender","handed","years","level"]

train_players, val_players = train_test_split(
    info_df["player_id"].unique(), test_size=0.20, random_state=RANDOM_SEED)

train_mask = df_train["player_id"].isin(train_players)
val_mask   = df_train["player_id"].isin(val_players)

models, encoders, aucs_val = {}, {}, {}

for lbl in labels:
    print(f"🔹 {lbl}")
    boosters, le = train_label(df_train[train_mask].copy(), lbl)
    models[lbl], encoders[lbl] = boosters, le

    # ---------------- local validation ----------------
    Xval = df_train[val_mask].copy().reset_index(drop=True)
    preds = [bst.predict(Xval.drop(columns=DROP_COLS)) for bst in boosters]
    swing_pred = np.mean(preds, 0)

    file_probs = {}
    for fid, idxs in Xval.groupby("file_id").groups.items():
        probs = swing_pred[list(idxs)]

        if lbl in {"years", "level"}:
            file_probs[fid] = agg_multiclass(probs)
        else:                                         # pass lbl here ▼
            file_probs[fid] = agg_binary(probs, lbl)

    y_true = Xval.groupby("file_id")[lbl].first().loc[file_probs.keys()]
    if lbl in {"gender","handed"}:
        auc = roc_auc_score(le.transform(y_true), list(file_probs.values()))
    else:
        # -- multiclass branch (years, level) -----------------------
        y_ohe = pd.get_dummies(le.transform(y_true))
        auc = roc_auc_score(
                y_ohe,
                np.vstack(list(file_probs.values())),   # ← wrap in list( … )
                multi_class="ovr"
              )
    aucs_val[lbl] = auc
    print(f"   AUC_val: {auc:.4f}")

print("\nOverall mean hold-out AUC:", np.mean(list(aucs_val.values())))

🔹 gender
   AUC_val: 0.9492
🔹 handed
   AUC_val: 0.9881
🔹 years
   AUC_val: 0.6662
🔹 level
   AUC_val: 0.7728

Overall mean hold-out AUC: 0.8440594370102825


In [17]:
# ▶ OOF evaluation exactly like the leaderboard
def oof_score(df, label_set=labels, n_splits=5):
    oof_probs, oof_true = {}, {}
    for lbl in label_set:
        print(f"OOF {lbl}")
        multiclass = lbl in {"years","level"}
        le  = LabelEncoder().fit(df[lbl])
        y   = le.transform(df[lbl])
        X   = df.drop(columns=DROP_COLS)
        g   = df["player_id"].values

        params = dict(objective="multiclass" if multiclass else "binary",
                      metric   ="multi_logloss" if multiclass else "auc",
                      num_class=len(le.classes_) if multiclass else 1,
                      learning_rate=0.05, seed=RANDOM_SEED, verbosity=-1)

        pred_holder = np.zeros((len(df), len(le.classes_) if multiclass else 1))

        for tr, va in GroupKFold(n_splits).split(X, y, g):
            bst = lgb.train(params,
                            lgb.Dataset(X.iloc[tr], label=y[tr]),
                            num_boost_round=1500,
                            valid_sets=[lgb.Dataset(X.iloc[va], label=y[va])],
                            callbacks=[lgb.early_stopping(80, verbose=False)])
            preds = bst.predict(X.iloc[va])
            if preds.ndim == 1:     # binary -> (n,) -> (n,1)
                preds = preds[:, None]
            pred_holder[va] = preds

        # -------- file-level aggregation ----------
        fp, tp = {}, {}
        for fid, grp in df.groupby("file_id"):
            probs = pred_holder[grp.index]
            if multiclass:
                fp[fid] = agg_multiclass(probs)
            else:                                       # pass lbl here
                fp[fid] = agg_binary(probs, lbl)
            tp[fid] = le.transform([grp[lbl].iloc[0]])[0]

        # -------- AUC ----------
        if multiclass:
            y_ohe = pd.get_dummies(list(tp.values()))
            auc = roc_auc_score(y_ohe,
                                np.vstack(list(fp.values())),
                                multi_class="ovr")
        else:
            auc = roc_auc_score(list(tp.values()), list(fp.values()))

        print(f"   OOF AUC = {auc:.4f}")
        oof_probs[lbl], oof_true[lbl] = fp, tp
    return

In [18]:
oof_score(df_train, label_set=['gender','handed'])
# should print 0.90+  and 0.99

OOF gender
   OOF AUC = 0.4714
OOF handed
   OOF AUC = 0.9957


In [19]:
oof_score(df_train)          # takes ~3-4 min on laptop, 1 GPU not needed

OOF gender
   OOF AUC = 0.4714
OOF handed
   OOF AUC = 0.9957
OOF years
   OOF AUC = 0.4904
OOF level
   OOF AUC = 0.6397


In [20]:
# 9️⃣ Predict test set & assemble submission rows (key-aligned)

# Cell 9  ▶ test prediction → submission DataFrame
def file_level_predictions(df, boosters, label):
    multiclass = label in {"years", "level"}
    X = df.drop(columns=DROP_COLS, errors="ignore")

    swing_pred = np.mean([bst.predict(X) for bst in boosters], axis=0)
    agg = agg_multiclass if multiclass else agg_binary
    out = {fid: agg(swing_pred[grp.index])
           for fid, grp in df.groupby("file_id")}

    out = pd.Series(out)                       # dict ➜ Series (index = file_id)
    full_idx = df["file_id"].unique()          # one entry per file in *this* set
    out = out.reindex(full_idx)                # adds NaN where a file is missing

    if multiclass:
        # --------- vector default ----------
        stack = np.vstack([v for v in out.dropna()])
        default = stack.mean(0) if len(stack) else np.full_like(stack[0], 1/stack.shape[1])

        na_mask = out.isna()
        if na_mask.any():
            out.loc[na_mask] = [default] * na_mask.sum()
    else:
        # --------- scalar default ----------
        out = out.fillna(out.mean())

    return out

sub_parts = {}
for lbl in labels:
    sub_parts[lbl] = file_level_predictions(df_test.copy(), models[lbl], lbl)

# build final submission aligned to sample_submission.csv
sub_order = pd.read_csv(SAMPLE_SUB)["unique_id"]
sub = pd.DataFrame(index=sub_order)

sub["gender"]               = sub_parts["gender"][sub_order].values
sub["hold racket handed"]   = sub_parts["handed"][sub_order].values
sub[["play years_0","play years_1","play years_2"]] = \
    np.vstack(sub_parts["years"][sub_order].values)
sub[["level_2","level_3","level_4","level_5"]]     = \
    np.vstack(sub_parts["level"][sub_order].values)

sub = sub.reset_index().rename(columns={"index":"unique_id"})
sub.head()

TypeError: agg_binary() missing 1 required positional argument: 'label'

In [21]:
import matplotlib.pyplot as plt
plt.figure()
sub[['gender','hold racket handed']].hist(bins=40, layout=(1,2), figsize=(10,3))
plt.show()

NameError: name 'sub' is not defined

<Figure size 640x480 with 0 Axes>

In [13]:
for lbl in labels:
    print(lbl, "NaN rows →", sub_parts[lbl].isna().sum())


gender NaN rows → 0
handed NaN rows → 0
years NaN rows → 0
level NaN rows → 0


In [14]:
# ▶ local diagnostic
bad_rows = sub[sub.isna().any(axis=1)]          # ← axis=1 instead of 1
print("rows with NaN :", len(bad_rows))

print("gender  mean/min/max :", sub['gender'].mean(),
      sub['gender'].min(), sub['gender'].max())

for c in ['level_2','level_3','level_4','level_5']:
    print(c, "missing ?", sub[c].isna().sum())


rows with NaN : 0
gender  mean/min/max : 0.31720518091418004 0.05864428333135692 0.7974459477326536
level_2 missing ? 0
level_3 missing ? 0
level_4 missing ? 0
level_5 missing ? 0


In [15]:
# 🔟 Save CSV for upload

# Cell 10  ▶ write CSV
fname = f"submission_fixed_{datetime.now():%Y%m%d_%H%M%S}.csv"
sub.to_csv(fname, index=False, float_format="%.10f")
print("✅ saved", fname)

✅ saved submission_fixed_20250512_161515.csv


In [16]:
# ▶ 1. no NaNs
assert sub.isna().sum().sum() == 0

# ▶ 2. row order identical to sample_submission.csv
sample = pd.read_csv(SAMPLE_SUB)
assert (sub.columns == sample.columns).all()
assert (sub['unique_id'] == sample['unique_id']).all()

# ▶ 3. each column has some spread (not all 0 / all 1)
print(sub.describe().T[['min','max']])

                            min          max
unique_id           1968.000000  3411.000000
gender                 0.058644     0.797446
hold racket handed     0.090978     0.700411
play years_0           0.116272     0.273286
play years_1           0.273394     0.662802
play years_2           0.219988     0.586849
level_2                0.173141     0.725459
level_3                0.025560     0.531704
level_4                0.026908     0.132178
level_5                0.212171     0.761619
