In [1]:
# ================================================================
#  Strictly Leak-Free Meta-Model Pipeline  (AI-Cup 2025 üèì)
#  ‚Äì builds swing-level LightGBM models with GroupKFold
#  ‚Äì trains both: (a) full-data meta model, and (b) 9-fold ensemble meta model
#  ‚Äì generates 2 separate competition submissions
# ================================================================

import numpy as np, pandas as pd, lightgbm as lgb, warnings
from pathlib import Path
from tqdm import tqdm
from datetime import datetime
from sklearn.model_selection import GroupKFold, train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore")

# 1. Feature Extraction
DATA_DIR = Path("39_Training_Dataset")
INFO_CSV = DATA_DIR / "train_info.csv"
TXT_DIR  = DATA_DIR / "train_data"

def extract_features_from_swing(a: np.ndarray) -> dict:
    Ax,Ay,Az = a[:,0],a[:,1],a[:,2]
    Gx,Gy,Gz = a[:,3],a[:,4],a[:,5]
    acc  = np.linalg.norm(a[:,:3],1)
    gyro = np.linalg.norm(a[:,3:],1)
    return dict(
        Ax_mean=Ax.mean(), Ax_std=Ax.std(),
        Ay_mean=Ay.mean(), Ay_std=Ay.std(),
        Az_mean=Az.mean(), Az_std=Az.std(),
        Gx_mean=Gx.mean(), Gx_std=Gx.std(),
        Gy_mean=Gy.mean(), Gy_std=Gy.std(),
        Gz_mean=Gz.mean(), Gz_std=Gz.std(),
        acc_mag_mean =acc.mean(),
        gyro_mag_mean=gyro.mean(),
        acc_vs_gyro_ratio=acc.mean()/(gyro.mean()+1e-6)
    )

def build_dataset(txt_dir: Path, info_df: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for txt in tqdm(sorted(txt_dir.glob("*.txt")), desc="Extract swings"):
        fid = int(txt.stem)
        meta = info_df[info_df['unique_id']==fid].iloc[0]
        cps = np.fromstring(meta['cut_point'].strip('[]'), sep=' ', dtype=int)
        if len(cps) < 2: continue
        data = np.loadtxt(txt, skiprows=1)
        for i in range(len(cps)-1):
            swing = data[cps[i]:cps[i+1]]
            d = extract_features_from_swing(swing)
            d.update(file_id=fid, swing_id=i,
                     gender=meta['gender'],
                     handed=meta['hold racket handed'],
                     years=meta['play years'],
                     level=meta['level'])
            rows.append(d)
    return pd.DataFrame(rows)

In [2]:
info_df = pd.read_csv(INFO_CSV)
df = build_dataset(TXT_DIR, info_df)
print("Dataset", df.shape)

# 2. Treat ALL data as train (no holdout split)
df['split'] = 'train'

Extract swings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1955/1955 [00:29<00:00, 65.73it/s]


Dataset (52785, 21)


In [3]:
# 3. OOF Model Helpers
def get_oof_models(df, label, le, multiclass=False):
    drop = ['file_id','swing_id','split','gender','handed','years','level',label]
    X = df.drop(columns=drop)
    y = le.transform(df[label])
    gid = df['file_id'].values
    boosters, meta_models = [], []
    oof_dict = {}

    for tr, va in GroupKFold(9).split(X, y, gid):
        dtr, dva = lgb.Dataset(X.iloc[tr], label=y[tr]), lgb.Dataset(X.iloc[va], label=y[va])
        booster = lgb.train(
            dict(objective='multiclass' if multiclass else 'binary',
                 metric='multi_logloss' if multiclass else 'auc',
                 num_class=len(le.classes_) if multiclass else 1,
                 learning_rate=0.05, verbosity=-1, seed=42),
            dtr, num_boost_round=1000, valid_sets=[dva], callbacks=[lgb.early_stopping(50)]
        )
        boosters.append(booster)
        probs = booster.predict(X.iloc[va])
        for fid, p in zip(df.iloc[va]['file_id'].values, probs):
            oof_dict.setdefault(fid, []).append(p)
    return boosters, oof_dict

def convert_to_meta_feats(preds_per_file, df_all, label, multiclass):
    feats, y, groups = [], [], []
    for fid, swings in preds_per_file.items():
        swings = np.array(swings)
        if not multiclass:
            swings = swings.reshape(-1,1)
        feats.append(np.concatenate([
            swings.mean(0), swings.max(0), swings[swings.sum(1).argmax()]
        ]))
        y.append(df_all[df_all['file_id']==fid][label].iloc[0])
        groups.append(fid)
    return np.vstack(feats), np.array(y), np.array(groups)

def meta_feats_from_booster(booster, swings_df, multiclass):
    probs = booster.predict(swings_df)
    if not multiclass:
        probs = probs.reshape(-1,1)
    return np.concatenate([probs.mean(0), probs.max(0), probs[probs.sum(1).argmax()]])

def predict_ensemble_meta(boosters, meta_models, test_df, label, le, multiclass):
    feats = []
    for fid, grp in test_df.groupby("file_id"):
        pred_list = [meta_feats_from_booster(b, grp.drop(columns=['file_id','swing_id']), multiclass) for b in boosters]
        X = np.vstack(pred_list)
        X_mean = X.mean(axis=0).reshape(1,-1)
        proba = np.mean([m.predict_proba(X_mean) for m in meta_models], axis=0)
        feats.append((fid, proba))
    return dict(feats)

In [4]:
# 4. Train both full model and ensemble
labels_cfg = {'gender':False, 'handed':False, 'years':True, 'level':True}
meta_models = {}
full_models = {}
ensemble_meta = {}
ensemble_boost = {}

for label, multiclass in labels_cfg.items():
    print(f"\n‚è≥ Training: {label}")
    df[label] = df[label].astype(str)
    le = LabelEncoder().fit(df[label])

    boosters, oof_dict = get_oof_models(df, label, le, multiclass)
    X_meta, y_meta, g_meta = convert_to_meta_feats(oof_dict, df, label, multiclass)

    metas = []
    for tr, va in GroupKFold(9).split(X_meta, y_meta, g_meta):
        clf = LogisticRegressionCV(max_iter=1000, multi_class='multinomial', cv=3)
        clf.fit(X_meta[tr], y_meta[tr])
        metas.append(clf)
    ensemble_boost[label] = boosters
    ensemble_meta[label]  = metas

    final_meta = LogisticRegressionCV(max_iter=1000, multi_class='multinomial', cv=5)
    final_meta.fit(X_meta, y_meta)
    meta_models[label] = final_meta

    drop = ['file_id','swing_id','split','gender','handed','years','level',label]
    booster_full = lgb.train(
        dict(objective='multiclass' if multiclass else 'binary',
             metric='multi_logloss' if multiclass else 'auc',
             num_class=len(le.classes_) if multiclass else 1,
             learning_rate=0.05, verbosity=-1, seed=42),
        lgb.Dataset(df.drop(columns=drop), label=le.transform(df[label])),
        num_boost_round=800
    )
    full_models[label] = booster_full


‚è≥ Training: gender
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[499]	valid_0's auc: 0.981392
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[399]	valid_0's auc: 0.980582
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[268]	valid_0's auc: 0.973297
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	valid_0's auc: 0.982325
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	valid_0's auc: 0.984025
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's auc: 0.981611
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[942]	valid_0's auc: 0.986017
Training until validation scores don't improve for 50 rou

# Submission

In [12]:
# 5. Submission Generation
TEST_DATA_DIR = Path("39_Test_Dataset")
test_info     = pd.read_csv(TEST_DATA_DIR / "test_info.csv")
test_txt_dir  = TEST_DATA_DIR / "test_data"
sample_sub    = pd.read_csv(TEST_DATA_DIR / "sample_submission.csv")

def build_test_df(txt_dir, info_df):
    rows = []
    for fid in tqdm(info_df['unique_id'].values, desc="Loading test swings"):
        path = txt_dir / f"{fid}.txt"
        if not path.exists(): continue
        cps = np.fromstring(info_df[info_df['unique_id']==fid]['cut_point'].values[0].strip('[]'), sep=' ', dtype=int)
        if len(cps) < 2: continue
        data = np.loadtxt(path, skiprows=1)
        for i in range(len(cps)-1):
            swing = data[cps[i]:cps[i+1]]
            d = extract_features_from_swing(swing)
            d.update(file_id=fid, swing_id=i)
            rows.append(d)
    return pd.DataFrame(rows)

df_test = build_test_df(test_txt_dir, test_info)
submission_ids = sample_sub['unique_id'].values
sample_cols = sample_sub.columns.tolist()[1:]

Loading test swings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1430/1430 [00:08<00:00, 160.10it/s]


In [23]:
def make_submission(model_dict, meta_dict, name):
    out = []

    for label, multiclass in labels_cfg.items():
        if name == "full":
            feats = [meta_feats_from_booster(model_dict[label],
                     grp.drop(columns=['file_id','swing_id']), multiclass)
                     for _, grp in df_test.groupby("file_id")]
            prob = meta_dict[label].predict_proba(np.vstack(feats))
        else:
            preds = predict_ensemble_meta(model_dict[label], meta_dict[label],
                                          df_test, label, None, multiclass)
            prob = np.vstack([preds[fid] for fid in test_info['unique_id'].values])

        # Map output probabilities to proper columns
        if label == "gender":
            out.append(prob[:, 1].reshape(-1, 1))  # probability of male
        elif label == "handed":
            out.append(prob[:, 1].reshape(-1, 1))  # probability of right-handed
        elif label == "years":
            out.append(prob[:, [0, 1, 2]])  # years 0,1,2
        elif label == "level":
            out.append(prob[:, [0, 1, 2, 3]])  # level 2,3,4,5

    # Match submission column order
    final = pd.DataFrame(np.hstack(out), columns=sample_cols)
    final.insert(0, "unique_id", submission_ids)
    fname = f"submission_meta_{name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    # final.to_csv(fname, index=False)
    final.to_csv(fname, index=False, float_format="%.10f")
    print(f"‚úÖ Saved {fname}")
    print(final.describe())

make_submission(full_models, meta_models, name="full")

‚úÖ Saved submission_meta_full_20250502_091626.csv
         unique_id       gender  hold racket handed  play years_0  \
count  1430.000000  1430.000000         1430.000000   1430.000000   
mean   2689.603497     0.146825            0.150078      0.106361   
std     417.642283     0.314767            0.346843      0.235696   
min    1968.000000     0.001447            0.001703      0.000059   
25%    2329.250000     0.001786            0.001703      0.000670   
50%    2687.500000     0.003814            0.001703      0.002360   
75%    3051.750000     0.043252            0.001715      0.043139   
max    3411.000000     0.999954            0.979594      0.994757   

       play years_1  play years_2      level_2      level_3      level_4  \
count   1430.000000   1430.000000  1430.000000  1430.000000  1430.000000   
mean       0.789221      0.104418     0.114536     0.251808     0.018177   
std        0.332664      0.254019     0.275714     0.334030     0.066671   
min        0.000530    

In [24]:
make_submission(ensemble_boost, ensemble_meta, name="ensemble")

‚úÖ Saved submission_meta_ensemble_20250502_091830.csv
         unique_id       gender  hold racket handed  play years_0  \
count  1430.000000  1430.000000         1430.000000   1430.000000   
mean   2689.603497     0.140542            0.150203      0.107118   
std     417.642283     0.300785            0.342585      0.243131   
min    1968.000000     0.001213            0.003146      0.000020   
25%    2329.250000     0.001635            0.003147      0.000381   
50%    2687.500000     0.004099            0.003160      0.001425   
75%    3051.750000     0.047647            0.003753      0.040092   
max    3411.000000     0.999443            0.969929      0.997174   

       play years_1  play years_2      level_2      level_3      level_4  \
count   1430.000000   1430.000000  1430.000000  1430.000000  1430.000000   
mean       0.793066      0.099817     0.111643     0.249593     0.017528   
std        0.335404      0.250923     0.272579     0.335429     0.055528   
min        0.000458

In [25]:
# Reload all 3 files
sample_sub = pd.read_csv(TEST_DATA_DIR / "sample_submission.csv")
# submission_full = pd.read_csv(f"submission_meta_full_{ts}.csv")
# submission_avg  = pd.read_csv(f"submission_meta_ensemble_{ts}.csv")
submission_full = pd.read_csv(f"submission_meta_full_20250502_091626.csv")
submission_avg  = pd.read_csv(f"submission_meta_ensemble_20250502_091830.csv")

# === 1. Compare column names and order ===
print("üîé Checking columns match sample_submission.csv...\n")

for name, df in {
    "Full Submission": submission_full,
    "Ensemble Submission": submission_avg
}.items():
    if df.columns.tolist() != sample_sub.columns.tolist():
        print(f"‚ùå {name} column mismatch!")
        missing = set(sample_sub.columns) - set(df.columns)
        extra   = set(df.columns) - set(sample_sub.columns)
        print(f"  Missing: {missing}" if missing else "", 
              f"  Extra: {extra}" if extra else "")
    else:
        print(f"‚úÖ {name} columns match exactly.\n")

# === 2. Describe all side-by-side ===
print("\nüìä Descriptive Statistics (first 3 columns shown per file):")
print("\nüü¶ sample_submission.csv")
print(sample_sub.describe().iloc[:, :3])

print("\nüü© submission_meta_full")
print(submission_full.describe().iloc[:, :3])

print("\nüü® submission_meta_ensemble")
print(submission_avg.describe().iloc[:, :3])

FileNotFoundError: [Errno 2] No such file or directory: 'submission_meta_full_20250502_090215.csv'