# AI‑Cup 2025 – Leak‑Free End‑to‑End Pipeline 🏓
This single notebook will:
1. Extract swing‑level features
2. Train LightGBM base models with **GroupKFold(file_id)**
3. Save swing‑level probabilities to disk (`swing_level_with_preds.pkl`)
4. Train per‑label meta models (mean / max / official rule features) with **GroupKFold**
5. Create `submission_meta.csv` containing one row per **unique_id** with probabilities ready for upload.

Just set `DATA_DIR` to the folder that contains `train_info.csv` and `train_data/*.txt`, then run all cells.

In [1]:
import numpy as np, pandas as pd, lightgbm as lgb, warnings, os
from pathlib import Path
from tqdm import tqdm
from sklearn.model_selection import GroupKFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegressionCV
warnings.filterwarnings('ignore')

DATA_DIR = Path('39_Training_Dataset')  # <- adjust if needed
INFO_CSV = DATA_DIR / 'train_info.csv'
TXT_DIR  = DATA_DIR / 'train_data'

In [2]:
def extract_features_from_swing(arr: np.ndarray):
    Ax,Ay,Az = arr[:,0],arr[:,1],arr[:,2]
    Gx,Gy,Gz = arr[:,3],arr[:,4],arr[:,5]
    acc = np.linalg.norm(arr[:,:3],1)
    gyro= np.linalg.norm(arr[:,3:],1)
    return dict(
        Ax_mean=Ax.mean(), Ax_std=Ax.std(),
        Ay_mean=Ay.mean(), Ay_std=Ay.std(),
        Az_mean=Az.mean(), Az_std=Az.std(),
        Gx_mean=Gx.mean(), Gx_std=Gx.std(),
        Gy_mean=Gy.mean(), Gy_std=Gy.std(),
        Gz_mean=Gz.mean(), Gz_std=Gz.std(),
        acc_mag_mean=acc.mean(), gyro_mag_mean=gyro.mean(),
        acc_vs_gyro_ratio=acc.mean()/(gyro.mean()+1e-6))

In [3]:
def build_dataset(txt_dir:Path, info_df:pd.DataFrame):
    rows=[]
    for txt in tqdm(sorted(txt_dir.glob('*.txt')),desc='Extract swings'):
        fid=int(txt.stem)
        meta=info_df[info_df['unique_id']==fid].iloc[0]
        cps=np.fromstring(meta['cut_point'].strip('[]'),sep=' ',dtype=int)
        if len(cps)<2: continue
        data=np.loadtxt(txt,skiprows=1)
        for i in range(len(cps)-1):
            swing=data[cps[i]:cps[i+1]]
            feats=extract_features_from_swing(swing)
            feats.update(file_id=fid,swing_id=i,
                         gender=meta['gender'],
                         handed=meta['hold racket handed'],
                         years=meta['play years'],
                         level=meta['level'])
            rows.append(feats)
    return pd.DataFrame(rows)

info_df=pd.read_csv(INFO_CSV)
df=build_dataset(TXT_DIR,info_df)
print('Dataset',df.shape)

Extract swings: 100%|██████████████████████████████████████████████████████████████| 1955/1955 [00:30<00:00, 64.75it/s]


Dataset (52785, 21)


In [4]:
file_ids=df['file_id'].unique()
train_ids, test_ids = train_test_split(file_ids,test_size=0.15,random_state=42)
train_ids, val_ids  = train_test_split(train_ids,test_size=0.1765,random_state=42)
df['split']=df['file_id'].apply(lambda x:'train' if x in train_ids else 'val' if x in val_ids else 'test')
print(df['split'].value_counts())

split
train    36909
val       7938
test      7938
Name: count, dtype: int64


In [5]:
def train_lgb(label, multiclass=False):
    tr=df[df['split']=='train']
    drop=['file_id','swing_id','split','gender','handed','years','level',label]
    X=tr.drop(columns=drop)
    le=LabelEncoder(); y=le.fit_transform(tr[label])

    params=dict(objective='multiclass' if multiclass else 'binary',
                metric='multi_logloss' if multiclass else 'auc',
                learning_rate=0.05,verbosity=-1,seed=42,
                num_class=len(le.classes_) if multiclass else 1)
    gkf=GroupKFold(9); groups=tr['file_id'].values
    oof=np.zeros((len(tr),len(le.classes_)) if multiclass else len(tr))
    models=[]
    for tr_idx,va_idx in gkf.split(X,y,groups):
        dtrain=lgb.Dataset(X.iloc[tr_idx],y[tr_idx])
        dval  =lgb.Dataset(X.iloc[va_idx],y[va_idx])
        m=lgb.train(params,dtrain,1000,[dval],callbacks=[lgb.early_stopping(50)])
        models.append(m)
        oof[va_idx]=m.predict(X.iloc[va_idx])
    auc=roc_auc_score(y,oof,multi_class='ovr') if multiclass else roc_auc_score(y,oof)
    print(f'{label} OOF AUC {auc:.4f}')
    return models,le

In [6]:
models_gender, le_gender = train_lgb('gender')
models_hand,   le_hand   = train_lgb('handed')
models_years,  le_years  = train_lgb('years',True)
models_level,  le_level  = train_lgb('level',True)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[233]	valid_0's auc: 0.966146
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[807]	valid_0's auc: 0.990947
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[126]	valid_0's auc: 0.963629
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[724]	valid_0's auc: 0.989121
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[912]	valid_0's auc: 0.991284
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[711]	valid_0's auc: 0.984612
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[233]	valid_0's auc: 0.959846
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[666]	valid_0's auc: 0

In [18]:
def predict_swings(models, subset_df, label, le, multiclass=False):
    # --- argument sanity check ------------------------------------------------
    if not isinstance(label, str):
        raise ValueError("3ʳᵈ arg must be the *label string* (e.g. 'gender').")
    if 'classes_' not in dir(le):
        raise ValueError("4ᵗʰ arg must be a scikit-learn LabelEncoder.")

    # --- build feature matrix -------------------------------------------------
    base_drop = ['file_id','swing_id','split',
                 'gender','handed','years','level', label]

    # also drop *any* previously attached prediction columns
    extra_drop = [c for c in subset_df.columns if c.startswith('pred_')]
    X = subset_df.drop(columns=base_drop + extra_drop)

    # --- mean of fold predictions --------------------------------------------
    probs = np.mean([m.predict(X) for m in models], axis=0)

    # wrap into the expected DataFrame
    return (probs, pd.DataFrame({
        'file_id': subset_df['file_id'].values,
        'true'   : le.transform(subset_df[label].values),
        'pred'   : list(probs)                # keep arrays in a single column
    }))


for label, (mods,le,mc) in {
    'gender':(models_gender,le_gender,False),
    'handed':(models_hand, le_hand, False),
    'years': (models_years,le_years,True),
    'level': (models_level,le_level,True)}.items():
    df['pred_'+label]=list(predict_swings(mods,df,label,le,mc)[0])

df.to_pickle('swing_level_with_preds.pkl')
print('Saved swing_level_with_preds.pkl')

Saved swing_level_with_preds.pkl


# Test set check

In [21]:
def official_file_probs(pred_df, multiclass=False):
    out=[]
    for fid,grp in pred_df.groupby('file_id'):
        if multiclass:
            p=np.stack(grp['pred'].values)
        else:
            p_raw=grp['pred'].values.astype(float)
            p=np.stack([1-p_raw, p_raw],1)
        winner=np.argmax(p.sum(0))
        best=p[np.argmax(p[:,winner])]
        out.append((fid, grp['true'].iloc[0], best))
    ids, y_true, probs = zip(*out)
    y_true=np.array(y_true); probs=np.stack(probs)
    auc = roc_auc_score(y_true, probs, multi_class='ovr') if multiclass else roc_auc_score(y_true, probs[:,1])
    return auc

val_df=df[df['split']=='val']
print("Val gender AUC:", official_file_probs(predict_swings(models_gender,val_df,'gender',le_gender)[1], False))


Val gender AUC: 0.9809016393442622


In [22]:
print("Val hand AUC:", official_file_probs(predict_swings(models_hand,val_df,'handed',le_hand)[1], False))
print("Val years AUC:", official_file_probs(predict_swings(models_years,val_df,'years',le_years)[1], True))

Val hand AUC: 1.0
Val years AUC: 0.9898643524101773


In [23]:
print("Val level AUC:", official_file_probs(predict_swings(models_level,val_df,'level',le_level)[1], True))

Val level AUC: 0.9967153213683777


In [24]:
test_df = df[df['split'] == 'test']

In [25]:
test_preds_gender = predict_swings(models_gender, test_df, 'gender', le_gender, multiclass=False)[1]
test_preds_hand   = predict_swings(models_hand,   test_df,  'handed', le_hand, multiclass=False)[1]
test_preds_years  = predict_swings(models_years,  test_df,  'years', le_years,  multiclass=True)[1]
test_preds_level  = predict_swings(models_level,  test_df,  'level', le_level,  multiclass=True)[1]

In [26]:
print("Test gender AUC:", official_file_probs(test_preds_gender, multiclass=False))
print("Test handed AUC:", official_file_probs(test_preds_hand,   multiclass=False))
print("Test years AUC:",  official_file_probs(test_preds_years,  multiclass=True))
print("Test level AUC:",  official_file_probs(test_preds_level,  multiclass=True))

Test gender AUC: 0.9830820476858345
Test handed AUC: 0.9998334027488547
Test years AUC: 0.9908969370750618
Test level AUC: 0.99938161297444


In [28]:
from sklearn.model_selection import GroupKFold
labels={'gender':False,'handed':False,'years':True,'level':True}
meta_models={}
def build_meta(label,mc):
    feats,y,groups=[],[],[]
    for fid,grp in df.groupby('file_id'):
        p=np.stack(grp['pred_'+label].values)
        if not mc: p=p.reshape(-1,1)
        feats.append(np.concatenate([p.mean(0),p.max(0),p[p.sum(1).argmax()]]))
        y.append(grp[label].iloc[0]); groups.append(fid)
    return np.vstack(feats),np.array(y),np.array(groups)

for label,mc in labels.items():
    X,y,g=build_meta(label,mc)
    n_class = len(np.unique(y))
    oof = np.zeros((len(y), n_class) if mc else len(y))
    gkf=GroupKFold(9)
    for tr,va in gkf.split(X,y,g):
        clf=LogisticRegressionCV(max_iter=1000,multi_class='multinomial' if mc else 'auto',cv=3)
        clf.fit(X[tr],y[tr])
        pro=clf.predict_proba(X[va])
        if not mc: pro=pro[:,1]
        oof[va]=pro
    auc=roc_auc_score(y,oof,multi_class='ovr') if mc else roc_auc_score(y,oof)
    print(f'Meta {label} AUC {auc:.4f}')
    final=LogisticRegressionCV(max_iter=1000,multi_class='multinomial' if mc else 'auto',cv=5)
    final.fit(X,y)
    meta_models[label]=final

Meta gender AUC 0.9995
Meta handed AUC 1.0000
Meta years AUC 0.9990
Meta level AUC 0.9998


In [30]:
# records=[]
# for fid,grp in df[df['split']=='test'].groupby('file_id'):
#     rec={'unique_id':int(fid)}
#     for label,mc in labels.items():
#         p=np.stack(grp['pred_'+label].values)
#         if not mc: p=p.reshape(-1,1)
#         feat=np.concatenate([p.mean(0),p.max(0),p[p.sum(1).argmax()]]).reshape(1,-1)
#         pro=meta_models[label].predict_proba(feat)[0]
#         if mc:
#             for k,pr in enumerate(pro): rec[f'{label}_{k}']=pr
#         else:
#             rec[f'{label}_prob']=pro[1]
#     records.append(rec)
# sub=pd.DataFrame(records).sort_values('unique_id')
# sub.to_csv('submission_meta.csv',index=False)
# print('submission_meta.csv saved',sub.shape)

submission_meta.csv saved (294, 10)
