In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelBinarizer

# === Feature Engineering per swing ===
def extract_features_from_swing(swing):
    # swing: np.array of shape (N, 6)
    Ax, Ay, Az = swing[:, 0], swing[:, 1], swing[:, 2]
    Gx, Gy, Gz = swing[:, 3], swing[:, 4], swing[:, 5]
    
    acc_mag = np.linalg.norm(swing[:, :3], axis=1)
    gyro_mag = np.linalg.norm(swing[:, 3:], axis=1)

    feats = {
        'Ax_mean': Ax.mean(), 'Ax_std': Ax.std(),
        'Ay_mean': Ay.mean(), 'Ay_std': Ay.std(),
        'Az_mean': Az.mean(), 'Az_std': Az.std(),
        'Gx_mean': Gx.mean(), 'Gx_std': Gx.std(),
        'Gy_mean': Gy.mean(), 'Gy_std': Gy.std(),
        'Gz_mean': Gz.mean(), 'Gz_std': Gz.std(),
        'acc_mag_mean': acc_mag.mean(),
        'gyro_mag_mean': gyro_mag.mean(),
        'acc_vs_gyro_ratio': acc_mag.mean() / (gyro_mag.mean() + 1e-6),
        'acc_early_mean': acc_mag[:len(acc_mag)//2].mean(),
        'acc_late_mean': acc_mag[len(acc_mag)//2:].mean(),
        'acc_early_late_ratio': acc_mag[:len(acc_mag)//2].mean() / (acc_mag[len(acc_mag)//2:].mean() + 1e-6)
    }
    return feats


In [2]:
from tqdm import tqdm

def process_file_to_swing_features(file_path: Path, cut_points: list[int], mode: int) -> pd.DataFrame:
    with open(file_path, 'r') as f:
        lines = f.readlines()[1:]  # skip header

    data = [list(map(int, line.strip().split())) for line in lines if len(line.strip().split()) == 6]
    data = np.array(data)

    swings = [data[cut_points[i]:cut_points[i+1]] for i in range(len(cut_points) - 1)]

    swing_rows = []
    for i, swing in enumerate(swings):
        row = extract_features_from_swing(swing)
        row['swing_id'] = i
        row['file_id'] = int(file_path.stem)
        # one-hot encode mode_1 ~ mode_10
        for m in range(1, 11):
            row[f'mode_{m}'] = 1 if mode == m else 0
        swing_rows.append(row)

    return pd.DataFrame(swing_rows)


In [3]:
import numpy as np

def build_full_swing_dataset(data_folder: Path, info_df: pd.DataFrame) -> pd.DataFrame:
    all_rows = []
    for file_path in tqdm(sorted(data_folder.glob("*.txt")), desc="⛏ Extracting swings"):
        uid = int(file_path.stem)
        row = info_df[info_df['unique_id'] == uid]
        if row.empty or pd.isna(row['cut_point'].values[0]):
            continue
        cut_points_str = row['cut_point'].values[0]
        cut_points = np.fromstring(cut_points_str.strip("[]"), sep=' ', dtype=int).tolist()
        if len(cut_points) < 2:
            continue
        mode = int(row['mode'].values[0])
        df = process_file_to_swing_features(file_path, cut_points, mode)
        df['gender'] = row['gender'].values[0]
        df['hold racket handed'] = row['hold racket handed'].values[0]
        df['play years'] = row['play years'].values[0]
        df['level'] = row['level'].values[0]
        all_rows.append(df)
    return pd.concat(all_rows, ignore_index=True)

In [4]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

def train_lightgbm_per_label(df, target_col, is_multiclass=False, n_splits=9):
    print(f"\n📚 Training LightGBM for {target_col}...")
    
    drop_cols = ['file_id', 'swing_id', 'gender', 'hold racket handed', 'play years', 'level']
    X = df.drop(columns=drop_cols)

    y_raw = df[target_col].values
    le = LabelEncoder()
    y = le.fit_transform(y_raw)

    if is_multiclass:
        params = {
            'objective': 'multiclass',
            'metric': 'multi_logloss',
            'learning_rate': 0.05,
            'verbosity': -1,
            'num_class': len(np.unique(y)),
            'seed': 42
        }
    else:
        params = {
            'objective': 'binary',
            'metric': 'auc',
            'learning_rate': 0.05,
            'verbosity': -1,
            'seed': 42
        }

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros((len(X), len(np.unique(y))) if is_multiclass else len(X))
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"▶ Fold {fold}")
        dtrain = lgb.Dataset(X.iloc[train_idx], label=y[train_idx])
        dval = lgb.Dataset(X.iloc[val_idx], label=y[val_idx])

        model = lgb.train(
            params,
            dtrain,
            num_boost_round=1000,
            valid_sets=[dtrain, dval],
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
        )

        preds = model.predict(X.iloc[val_idx])
        oof_preds[val_idx] = preds

    # Evaluate
    if is_multiclass:
        score = roc_auc_score(y, oof_preds, multi_class='ovr', average='macro')
    else:
        score = roc_auc_score(y, oof_preds)

    print(f"✅ {target_col} AUC = {score:.5f}")
    return model, le


In [5]:
from pathlib import Path
import pandas as pd

# Load official cut points
train_info = pd.read_csv("39_Training_Dataset/train_info.csv")
data_folder = Path("39_Training_Dataset/train_data")

# Generate swing-level dataframe (~27 rows per file)
swing_df = build_full_swing_dataset(data_folder, train_info)

# (Optional) Save to CSV for reuse
# swing_df.to_csv("swing_df.csv", index=False)


⛏ Extracting swings: 100%|█████████████████████████████████████████████████████████| 1955/1955 [00:22<00:00, 87.31it/s]


In [7]:
# swing_df = build_full_swing_dataset(...)
model_gender, le_gender = train_lightgbm_per_label(swing_df, 'gender', is_multiclass=False, n_splits=9)
model_hand, le_hand = train_lightgbm_per_label(swing_df, 'hold racket handed', is_multiclass=False, n_splits=9)
model_years, le_years = train_lightgbm_per_label(swing_df, 'play years', is_multiclass=True, n_splits=9)
model_level, le_level = train_lightgbm_per_label(swing_df, 'level', is_multiclass=True, n_splits=9)


📚 Training LightGBM for gender...
▶ Fold 0
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.985913	valid_1's auc: 0.974849
[200]	training's auc: 0.993991	valid_1's auc: 0.983132
[300]	training's auc: 0.996925	valid_1's auc: 0.986008
[400]	training's auc: 0.998509	valid_1's auc: 0.987339
[500]	training's auc: 0.999259	valid_1's auc: 0.988258
[600]	training's auc: 0.999649	valid_1's auc: 0.988812
[700]	training's auc: 0.999858	valid_1's auc: 0.989217
[800]	training's auc: 0.999942	valid_1's auc: 0.989546
[900]	training's auc: 0.999981	valid_1's auc: 0.989894
[1000]	training's auc: 0.999994	valid_1's auc: 0.990058
Did not meet early stopping. Best iteration is:
[998]	training's auc: 0.999993	valid_1's auc: 0.990059
▶ Fold 1
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.98607	valid_1's auc: 0.978756
[200]	training's auc: 0.993843	valid_1's auc: 0.985315
[300]	training's auc: 0.99695	valid_1's auc: 0.988035
[400]	tr

In [8]:
def build_swing_prediction_table(model, df, label_encoder, target_col, is_multiclass=False):
    """
    Returns a DataFrame with one row per swing prediction, including:
    - file_id
    - true_label
    - prediction vector (as list if multiclass, float if binary)
    """
    drop_cols = ['file_id', 'swing_id', 'gender', 'hold racket handed', 'play years', 'level']
    # Always drop the target column if it’s not already listed
    if target_col not in drop_cols:
        drop_cols.append(target_col)
    X = df.drop(columns=drop_cols)
    
    # Safety check to match feature count
    if X.shape[1] != model.num_feature():
        raise ValueError(f"[❌] Mismatch in feature count: X has {X.shape[1]} features, model expects {model.num_feature()}.")

    y = label_encoder.transform(df[target_col])
    preds = model.predict(X)

    swing_preds = []
    for i, file_id in enumerate(df['file_id']):
        row = {
            'file_id': file_id,
            'true_label': y[i],
            'pred': preds[i] if not is_multiclass else preds[i].tolist()
        }
        swing_preds.append(row)

    return pd.DataFrame(swing_preds)


In [9]:
from sklearn.model_selection import StratifiedKFold

def train_aggregator_model(pred_df, is_multiclass=False, n_splits=9):
    """
    Group swing-level predictions into (file_id, [27 preds])
    Train a LightGBM meta-model using CV to avoid overfitting
    """
    grouped = pred_df.groupby('file_id')

    X_meta = []
    y_meta = []

    for file_id, group in grouped:
        if is_multiclass:
            swing_preds = np.stack(group['pred'].values)
            features = swing_preds.flatten()
        else:
            features = group['pred'].values
        X_meta.append(features)
        y_meta.append(group['true_label'].iloc[0])

    X_meta = np.array(X_meta)
    y_meta = np.array(y_meta)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    all_preds = np.zeros((len(X_meta), len(np.unique(y_meta))) if is_multiclass else len(X_meta))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_meta, y_meta)):
        X_train, X_val = X_meta[train_idx], X_meta[val_idx]
        y_train, y_val = y_meta[train_idx], y_meta[val_idx]

        model_cv = lgb.LGBMClassifier(
            objective='multiclass' if is_multiclass else 'binary',
            num_class=len(np.unique(y_meta)) if is_multiclass else None,
            random_state=42
        )
        model_cv.fit(X_train, y_train)

        preds_val = model_cv.predict_proba(X_val)
        if is_multiclass:
            all_preds[val_idx] = preds_val
        else:
            all_preds[val_idx] = preds_val[:, 1]

    if is_multiclass:
        score = roc_auc_score(y_meta, all_preds, multi_class='ovr')
    else:
        score = roc_auc_score(y_meta, all_preds)

    print(f"✅ Aggregation model CV AUC = {score:.5f}")

    # Train final model on all data (used for inference later)
    final_model = lgb.LGBMClassifier(
        objective='multiclass' if is_multiclass else 'binary',
        num_class=len(np.unique(y_meta)) if is_multiclass else None,
        random_state=42
    )
    final_model.fit(X_meta, y_meta)

    return final_model


In [11]:
def train_pipeline_for_label(swing_df, model, label_encoder, target_col, is_multiclass, n_splits=9):
    swing_preds = build_swing_prediction_table(
        model=model,
        df=swing_df,
        label_encoder=label_encoder,
        target_col=target_col,
        is_multiclass=is_multiclass,
    )
    meta_model = train_aggregator_model(
        pred_df=swing_preds,
        is_multiclass=is_multiclass,
        n_splits = n_splits,
    )
    return swing_preds, meta_model

In [12]:
# --- 1. Gender (already done) ---
swing_pred_gender, meta_gender_model = train_pipeline_for_label(swing_df, model_gender, le_gender, target_col='gender', is_multiclass=False)



✅ Aggregation model CV AUC = 1.00000




In [13]:
# --- 2. Hold Racket Handed ---
swing_pred_hand, meta_hand_model = train_pipeline_for_label(swing_df, model_hand, le_hand, target_col='hold racket handed', is_multiclass=False)



✅ Aggregation model CV AUC = 1.00000




In [14]:
# --- 3. Play Years ---
swing_pred_years, meta_years_model = train_pipeline_for_label(swing_df, model_years, le_years, target_col='play years', is_multiclass=True)



✅ Aggregation model CV AUC = 1.00000


In [15]:
# 4. Build swing-level predictions and train meta-model for level
swing_pred_level, meta_level_model = train_pipeline_for_label(
    swing_df, model_level, le_level, target_col='level', is_multiclass=True
)



✅ Aggregation model CV AUC = 1.00000


In [16]:
def predict_file(file_path, cut_point_str, mode, model_swing, model_meta, is_multiclass=False):
    from numpy import fromstring

    # Step 1: parse cut points
    cut_points = np.fromstring(cut_point_str.strip("[]"), sep=' ', dtype=int).tolist()

    # Step 2: load sensor data
    with open(file_path, 'r') as f:
        lines = f.readlines()[1:]
    data = np.array([list(map(int, line.strip().split())) for line in lines if len(line.strip().split()) == 6])

    # Step 3: per-swing feature extraction
    swings = [data[cut_points[i]:cut_points[i+1]] for i in range(len(cut_points) - 1)]
    feature_rows = []
    for swing in swings:
        row = extract_features_from_swing(swing)
        for m in range(1, 11):
            row[f'mode_{m}'] = 1 if mode == m else 0
        feature_rows.append(row)

    df = pd.DataFrame(feature_rows)
    X = df.values

    # Step 4: swing-level prediction
    preds = model_swing.predict(X)
    if is_multiclass:
        swing_preds = np.stack(preds)
        meta_input = swing_preds.flatten().reshape(1, -1)
    else:
        swing_preds = preds  # shape (27,)
        meta_input = np.array(swing_preds).reshape(1, -1)

    # Step 5: wrap meta_input in DataFrame with feature names to suppress warning
    meta_input_df = pd.DataFrame(meta_input, columns=[f'swing_{i}' for i in range(meta_input.shape[1])])
    prob = model_meta.predict_proba(meta_input_df)[0]

    return prob if is_multiclass else prob[1]


In [17]:
def run_test_pipeline():
    test_info = pd.read_csv("39_Test_Dataset/test_info.csv")
    test_folder = Path("39_Test_Dataset/test_data")

    gender_preds = []
    hand_preds = []
    play_years_preds = []
    level_preds = []  # ← Add this line

    for file_path in tqdm(sorted(test_folder.glob("*.txt")), desc="🎯 Predicting test"):
        uid = int(file_path.stem)
        row = test_info[test_info['unique_id'] == uid]
        cut_str = row['cut_point'].values[0]
        mode = row['mode'].values[0]

        pred_gender = predict_file(file_path, cut_str, mode, model_gender, meta_gender_model, is_multiclass=False)
        pred_hand = predict_file(file_path, cut_str, mode, model_hand, meta_hand_model, is_multiclass=False)
        pred_years = predict_file(file_path, cut_str, mode, model_years, meta_years_model, is_multiclass=True)
        pred_level = predict_file(file_path, cut_str, mode, model_level, meta_level_model, is_multiclass=True)

        gender_preds.append(pred_gender)
        hand_preds.append(pred_hand)
        play_years_preds.append(pred_years)
        level_preds.append(pred_level)  # ← This now works

    return test_info['unique_id'].values, gender_preds, hand_preds, play_years_preds, level_preds


In [18]:
from datetime import datetime

def save_submission(uids, gender_preds, hand_preds, play_years_preds, level_preds):
    submission = pd.DataFrame()
    submission['unique_id'] = uids
    submission['gender'] = gender_preds
    submission['hold racket handed'] = hand_preds

    # play years (no shift needed)
    for i in range(len(play_years_preds[0])):
        submission[f'play years_{i}'] = [p[i] for p in play_years_preds]

    # level (shift class index by +2)
    for i in range(len(level_preds[0])):
        submission[f'level_{i+2}'] = [p[i] for p in level_preds]

    # Pad missing columns
    sample = pd.read_csv("39_Test_Dataset/sample_submission.csv")
    for col in sample.columns:
        if col not in submission.columns:
            submission[col] = 0.0

    submission = submission[sample.columns]

    # Add timestamp to filename
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    filename = f"submission_{timestamp}.csv"

    submission.to_csv(filename, index=False, float_format='%.10f')
    print(f"✅ {filename} saved.")


In [19]:
uids, g_preds, h_preds, y_preds, l_preds = run_test_pipeline()

🎯 Predicting test: 100%|██████████████████████████████████████████████████████████| 1430/1430 [02:24<00:00,  9.89it/s]


In [20]:
save_submission(uids, g_preds, h_preds, y_preds, l_preds)

✅ submission_20250430_1047.csv saved.
