In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelBinarizer

# === Feature Engineering per swing ===
def extract_features_from_swing(swing):
    """Summarise one swing segment as a flat dict of scalar features.

    Parameters
    ----------
    swing : np.ndarray
        2-D array indexed as ``swing[:, k]``. Columns 0-2 are read as the
        accelerometer axes (Ax, Ay, Az) and columns 3-5 as the gyroscope
        axes (Gx, Gy, Gz).

    Returns
    -------
    dict
        Per-axis mean/std, mean accelerometer and gyroscope magnitude, their
        ratio, and the mean accelerometer magnitude over the first and second
        half of the segment together with the ratio of the two. Key insertion
        order is stable, so it fixes the column order of any DataFrame built
        from these dicts.
    """
    feats = {}
    # Per-axis statistics, emitted as <axis>_mean followed by <axis>_std.
    for col, axis_name in enumerate(('Ax', 'Ay', 'Az', 'Gx', 'Gy', 'Gz')):
        channel = swing[:, col]
        feats[f'{axis_name}_mean'] = channel.mean()
        feats[f'{axis_name}_std'] = channel.std()

    # Euclidean magnitude per sample for each sensor.
    acc_mag = np.linalg.norm(swing[:, :3], axis=1)
    gyro_mag = np.linalg.norm(swing[:, 3:], axis=1)

    acc_mean = acc_mag.mean()
    gyro_mean = gyro_mag.mean()
    feats['acc_mag_mean'] = acc_mean
    feats['gyro_mag_mean'] = gyro_mean
    # The 1e-6 term keeps the ratio finite when the denominator is zero.
    feats['acc_vs_gyro_ratio'] = acc_mean / (gyro_mean + 1e-6)

    # Split the segment at its midpoint (integer division).
    midpoint = len(acc_mag) // 2
    early_mean = acc_mag[:midpoint].mean()
    late_mean = acc_mag[midpoint:].mean()
    feats['acc_early_mean'] = early_mean
    feats['acc_late_mean'] = late_mean
    feats['acc_early_late_ratio'] = early_mean / (late_mean + 1e-6)
    return feats


In [2]:
from tqdm import tqdm

def process_file_to_swing_features(file_path: Path, cut_points: list[int], mode: int) -> pd.DataFrame:
    """Turn one raw sensor file into a DataFrame with one feature row per swing.

    The first line of the file is skipped as a header. Every remaining line
    that splits into exactly six whitespace-separated tokens is parsed as six
    integers; other lines are ignored. Swing ``i`` is the slice of samples
    between ``cut_points[i]`` and ``cut_points[i + 1]``.

    Each row holds the features from ``extract_features_from_swing`` plus
    ``swing_id`` (position of the swing in the file), ``file_id`` (the integer
    file stem) and the one-hot columns ``mode_1`` .. ``mode_10``.
    """
    with open(file_path, 'r') as handle:
        next(handle, None)  # skip header
        token_rows = [raw_line.split() for raw_line in handle]

    samples = np.array([[int(tok) for tok in tokens] for tokens in token_rows if len(tokens) == 6])

    records = []
    for swing_idx, (start, stop) in enumerate(zip(cut_points[:-1], cut_points[1:])):
        record = extract_features_from_swing(samples[start:stop])
        record.update({'swing_id': swing_idx, 'file_id': int(file_path.stem)})
        # one-hot encode mode_1 ~ mode_10
        record.update({f'mode_{m}': int(mode == m) for m in range(1, 11)})
        records.append(record)

    return pd.DataFrame(records)


In [3]:
import numpy as np

def build_full_swing_dataset(data_folder: Path, info_df: pd.DataFrame) -> pd.DataFrame:
    """Build the swing-level training table for every ``*.txt`` file in a folder.

    For each file (visited in sorted path order) the row of ``info_df`` whose
    ``unique_id`` equals the integer file stem is looked up. Files with no
    matching row, a missing ``cut_point`` value, or fewer than two cut points
    are skipped. The remaining files are converted to per-swing feature rows,
    and the file's ``gender``, ``hold racket handed``, ``play years`` and
    ``level`` values are copied onto every row.

    Returns the concatenation of all per-file frames with a fresh index.
    """
    label_columns = ('gender', 'hold racket handed', 'play years', 'level')
    per_file_frames = []

    txt_files = sorted(data_folder.glob("*.txt"))
    for txt_path in tqdm(txt_files, desc="⛏ Extracting swings"):
        file_uid = int(txt_path.stem)
        meta = info_df[info_df['unique_id'] == file_uid]
        if meta.empty:
            continue

        raw_cuts = meta['cut_point'].values[0]
        if pd.isna(raw_cuts):
            continue

        # cut_point is stored as text such as "[0 61 122 ...]".
        boundaries = np.fromstring(raw_cuts.strip("[]"), sep=' ', dtype=int).tolist()
        if len(boundaries) < 2:
            continue

        swing_mode = int(meta['mode'].values[0])
        swing_feats = process_file_to_swing_features(txt_path, boundaries, swing_mode)
        for label in label_columns:
            swing_feats[label] = meta[label].values[0]
        per_file_frames.append(swing_feats)

    return pd.concat(per_file_frames, ignore_index=True)

In [7]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold

def train_lightgbm_per_label(df, target_col, is_multiclass=False, n_splits=9, return_models=False):
    """Train LightGBM on swing rows with file-grouped CV and collect out-of-fold predictions.

    Parameters
    ----------
    df : pd.DataFrame
        Swing-level table. Must contain ``file_id``, ``swing_id`` and the four
        label columns (``gender``, ``hold racket handed``, ``play years``,
        ``level``); every other column is used as a feature.
    target_col : str
        Label column to predict. It is encoded with a ``LabelEncoder``.
    is_multiclass : bool
        Selects the ``multiclass`` objective (``multi_logloss`` metric) instead
        of ``binary`` (``auc`` metric).
    n_splits : int
        Number of ``GroupKFold`` folds. Folds are grouped by ``file_id`` so all
        swings of a file fall into the same fold.
    return_models : bool
        When True, the boosters trained in each fold are returned as a third
        element. The default keeps the original two-element return value.

    Returns
    -------
    (oof_df, le) or (oof_df, le, models)
        ``oof_df`` has one row per swing with columns ``file_id``,
        ``true_label`` (encoded) and ``pred`` (float for binary, per-class
        probability array for multiclass). ``le`` is the fitted encoder.
        ``models`` is the list of per-fold boosters in fold order.

    Notes
    -----
    The validation fold is also used for early stopping, so the printed
    out-of-fold AUC is not an independent estimate.
    """
    print(f"\n📚 Training LightGBM for {target_col}...")

    drop_cols = ['file_id', 'swing_id', 'gender', 'hold racket handed', 'play years', 'level']
    X_full = df.drop(columns=drop_cols)
    # file_id doubles as the CV grouping key and as the id stored with each prediction.
    file_ids = df['file_id'].values

    le = LabelEncoder()
    y = le.fit_transform(df[target_col].values)

    params = {
        'objective': 'multiclass' if is_multiclass else 'binary',
        'metric': 'multi_logloss' if is_multiclass else 'auc',
        'learning_rate': 0.05,
        'verbosity': -1,
        'num_class': len(np.unique(y)) if is_multiclass else 1,
        'seed': 42
    }

    oof_preds = []
    fold_models = []
    gkf = GroupKFold(n_splits=n_splits)
    for fold, (train_idx, val_idx) in enumerate(gkf.split(X_full, y, groups=file_ids)):
        print(f"▶ Fold {fold}")
        X_val = X_full.iloc[val_idx]
        dtrain = lgb.Dataset(X_full.iloc[train_idx], label=y[train_idx])
        dval = lgb.Dataset(X_val, label=y[val_idx])

        model = lgb.train(
            params,
            dtrain,
            num_boost_round=1000,
            valid_sets=[dtrain, dval],
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
        )
        # Keep the booster: without it no swing-level model survives this call.
        fold_models.append(model)

        preds = model.predict(X_val)
        oof_preds.extend(
            {
                'file_id': file_ids[idx],
                'true_label': y[idx],
                'pred': pred if is_multiclass else float(pred),
            }
            for idx, pred in zip(val_idx, preds)
        )

    # Evaluate on the pooled out-of-fold predictions.
    pred_probs = np.array([row['pred'] for row in oof_preds])
    true_labels = np.array([row['true_label'] for row in oof_preds])
    if is_multiclass:
        score = roc_auc_score(true_labels, pred_probs, multi_class='ovr')
    else:
        score = roc_auc_score(true_labels, pred_probs)
    print(f"✅ {target_col} AUC = {score:.5f}")

    oof_df = pd.DataFrame(oof_preds)
    if return_models:
        return oof_df, le, fold_models
    return oof_df, le


In [8]:
from pathlib import Path
import pandas as pd

# Load the per-file metadata table. build_full_swing_dataset reads its
# unique_id, cut_point, mode and label columns.
train_info = pd.read_csv("39_Training_Dataset/train_info.csv")
data_folder = Path("39_Training_Dataset/train_data")

# Generate the swing-level dataframe: one row per segment between consecutive
# cut points of each file (~27 rows per file).
swing_df = build_full_swing_dataset(data_folder, train_info)

# (Optional) Save to CSV for reuse
# swing_df.to_csv("swing_df.csv", index=False)


⛏ Extracting swings: 100%|█████████████████████████████████████████████████████████| 1955/1955 [00:23<00:00, 84.40it/s]


In [9]:
# swing_df = build_full_swing_dataset(...)
# NOTE(review): train_lightgbm_per_label returns (oof_predictions_df, label_encoder),
# so each `model_*` name below is bound to a DataFrame of out-of-fold predictions,
# not to a trained booster. run_test_pipeline later passes these names to
# predict_file, which calls `.predict(...)` on them; a DataFrame has no such
# method, so that path needs real swing-level models.
# The same four trainings are repeated in the per-label cells further down.
model_gender, le_gender = train_lightgbm_per_label(swing_df, 'gender', is_multiclass=False, n_splits=9)
model_hand, le_hand = train_lightgbm_per_label(swing_df, 'hold racket handed', is_multiclass=False, n_splits=9)
model_years, le_years = train_lightgbm_per_label(swing_df, 'play years', is_multiclass=True, n_splits=9)
model_level, le_level = train_lightgbm_per_label(swing_df, 'level', is_multiclass=True, n_splits=9)


📚 Training LightGBM for gender...
▶ Fold 0
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.986237	valid_1's auc: 0.975935
[200]	training's auc: 0.994201	valid_1's auc: 0.980255
Early stopping, best iteration is:
[212]	training's auc: 0.994729	valid_1's auc: 0.980622
▶ Fold 1
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.985921	valid_1's auc: 0.978479
[200]	training's auc: 0.994031	valid_1's auc: 0.982755
[300]	training's auc: 0.997003	valid_1's auc: 0.983362
Early stopping, best iteration is:
[302]	training's auc: 0.997045	valid_1's auc: 0.983439
▶ Fold 2
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.98609	valid_1's auc: 0.971258
[200]	training's auc: 0.993976	valid_1's auc: 0.975809
[300]	training's auc: 0.997053	valid_1's auc: 0.976644
Early stopping, best iteration is:
[273]	training's auc: 0.996417	valid_1's auc: 0.977043
▶ Fold 3
Training until validation scores don'

In [10]:
def build_swing_prediction_table(model, df, label_encoder, target_col, is_multiclass=False):
    """Score every swing in ``df`` with ``model`` and tabulate the results.

    The id and label columns (and ``target_col`` if it is not already among
    them) are removed from ``df`` to form the feature matrix. If the number of
    remaining columns differs from ``model.num_feature()`` a ``ValueError`` is
    raised before predicting.

    Returns a DataFrame with one row per swing and the columns:
    - file_id
    - true_label (``target_col`` passed through ``label_encoder.transform``)
    - pred (a list of values when ``is_multiclass`` is true, otherwise the
      raw prediction value)
    """
    excluded = ['file_id', 'swing_id', 'gender', 'hold racket handed', 'play years', 'level']
    # The target must never leak into the features, even for a new label column.
    if target_col not in excluded:
        excluded = excluded + [target_col]
    features = df.drop(columns=excluded)

    # Guard against feeding the model a differently shaped feature matrix.
    if features.shape[1] != model.num_feature():
        raise ValueError(f"[❌] Mismatch in feature count: X has {features.shape[1]} features, model expects {model.num_feature()}.")

    encoded_labels = label_encoder.transform(df[target_col])
    raw_preds = model.predict(features)

    records = [
        {
            'file_id': fid,
            'true_label': encoded_labels[pos],
            'pred': raw_preds[pos].tolist() if is_multiclass else raw_preds[pos],
        }
        for pos, fid in enumerate(df['file_id'])
    ]
    return pd.DataFrame(records)


In [11]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

def train_aggregator_model(pred_df, is_multiclass=False, n_splits=9):
    """Train a file-level logistic-regression meta-model on swing-level predictions.

    Swing predictions are grouped by ``file_id`` and laid out side by side as
    one feature vector per file (for multiclass, the per-swing probability
    vectors are flattened). The label of a file is the ``true_label`` of its
    first swing. A stratified ``n_splits``-fold CV estimate of the AUC is
    printed, then a final model is fitted on all files and returned.

    Parameters
    ----------
    pred_df : pd.DataFrame
        Columns ``file_id``, ``true_label`` and ``pred`` (float for binary, a
        per-class probability sequence for multiclass).
    is_multiclass : bool
        Whether ``pred`` holds probability vectors and AUC is computed
        one-vs-rest.
    n_splits : int
        Number of outer ``StratifiedKFold`` folds used for the printed score.

    Returns
    -------
    LogisticRegressionCV
        The meta-model fitted on every file.

    Raises
    ------
    ValueError
        If ``pred_df`` is empty, or if files contribute different numbers of
        swing predictions. The feature layout is positional, so every file
        needs the same width.
    """
    X_rows, y_meta = [], []
    for _, group in pred_df.groupby('file_id'):
        if is_multiclass:
            features = np.stack(group['pred'].values).flatten()  # (n_swings * C,)
        else:
            features = np.array(group['pred'].values)  # (n_swings,)
        X_rows.append(features)
        y_meta.append(group['true_label'].iloc[0])

    if not X_rows:
        raise ValueError("pred_df contains no swing predictions to aggregate.")

    # A ragged matrix would otherwise fail deep inside numpy/sklearn with an
    # unhelpful message, so check the widths explicitly.
    widths = sorted({len(features) for features in X_rows})
    if len(widths) != 1:
        raise ValueError(
            "Every file must contribute the same number of swing predictions; "
            f"found meta-feature widths {widths}."
        )

    X_meta = np.array(X_rows)
    y_meta = np.array(y_meta)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    all_preds = np.zeros((len(X_meta), len(np.unique(y_meta))) if is_multiclass else len(X_meta))

    for train_idx, val_idx in skf.split(X_meta, y_meta):
        if is_multiclass:
            model_cv = LogisticRegressionCV(
                cv=5,
                max_iter=1000,
                random_state=42,
                solver='lbfgs'
            )
        else:
            model_cv = LogisticRegressionCV(cv=3, max_iter=1000)

        model_cv.fit(X_meta[train_idx], y_meta[train_idx])
        preds_val = model_cv.predict_proba(X_meta[val_idx])
        # Binary AUC needs only the positive-class column.
        all_preds[val_idx] = preds_val if is_multiclass else preds_val[:, 1]

    if is_multiclass:
        score = roc_auc_score(y_meta, all_preds, multi_class='ovr')
    else:
        score = roc_auc_score(y_meta, all_preds)

    print(f"✅ Aggregation model CV AUC = {score:.5f}")

    # Train final model on full set.
    # `multi_class='multinomial'` is omitted: with solver='lbfgs' and more than
    # two classes scikit-learn already fits a multinomial model, and the
    # parameter is deprecated in recent releases.
    if is_multiclass:
        final_model = LogisticRegressionCV(cv=5, max_iter=1000, solver='lbfgs')
    else:
        final_model = LogisticRegressionCV(cv=5, max_iter=1000)
    final_model.fit(X_meta, y_meta)

    return final_model


In [12]:
def train_pipeline_for_label(swing_df, label_encoder, oof_df, target_col, is_multiclass):
    """Fit the file-level aggregator for one label from its out-of-fold predictions.

    ``swing_df``, ``label_encoder`` and ``target_col`` are accepted but not
    used here. The aggregator is always trained with 9 outer folds.

    Returns ``(oof_df, meta_model)``, where ``oof_df`` is the same object that
    was passed in.
    """
    aggregator_kwargs = {
        'pred_df': oof_df,
        'is_multiclass': is_multiclass,
        'n_splits': 9,
    }
    fitted_aggregator = train_aggregator_model(**aggregator_kwargs)
    return oof_df, fitted_aggregator

In [13]:
# --- 1. Gender ---
# This retrains the swing-level gender model from scratch to obtain its
# out-of-fold predictions, then fits the file-level aggregator on them.
# train_pipeline_for_label returns the OOF frame it was given, so
# swing_pred_gender is the same object as oof_gender_df.
oof_gender_df, le_gender = train_lightgbm_per_label(swing_df, 'gender', is_multiclass=False)
swing_pred_gender, meta_gender_model = train_pipeline_for_label(swing_df, le_gender, oof_gender_df, 'gender', is_multiclass=False)


📚 Training LightGBM for gender...
▶ Fold 0
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.986237	valid_1's auc: 0.975935
[200]	training's auc: 0.994201	valid_1's auc: 0.980255
Early stopping, best iteration is:
[212]	training's auc: 0.994729	valid_1's auc: 0.980622
▶ Fold 1
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.985921	valid_1's auc: 0.978479
[200]	training's auc: 0.994031	valid_1's auc: 0.982755
[300]	training's auc: 0.997003	valid_1's auc: 0.983362
Early stopping, best iteration is:
[302]	training's auc: 0.997045	valid_1's auc: 0.983439
▶ Fold 2
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.98609	valid_1's auc: 0.971258
[200]	training's auc: 0.993976	valid_1's auc: 0.975809
[300]	training's auc: 0.997053	valid_1's auc: 0.976644
Early stopping, best iteration is:
[273]	training's auc: 0.996417	valid_1's auc: 0.977043
▶ Fold 3
Training until validation scores don'

In [15]:
# --- 2. Hold Racket Handed ---
# Swing-level out-of-fold predictions first, then the file-level aggregator.
# swing_pred_hand is the same object as oof_hand_df (returned unchanged).
oof_hand_df, le_hand = train_lightgbm_per_label(swing_df, 'hold racket handed', is_multiclass=False)
swing_pred_hand, meta_hand_model = train_pipeline_for_label(
    swing_df, le_hand, oof_hand_df, target_col='hold racket handed', is_multiclass=False
)


📚 Training LightGBM for hold racket handed...
▶ Fold 0
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.999989	valid_1's auc: 0.999846
[200]	training's auc: 1	valid_1's auc: 0.999895
[300]	training's auc: 1	valid_1's auc: 0.999911
Early stopping, best iteration is:
[320]	training's auc: 1	valid_1's auc: 0.999914
▶ Fold 1
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.999983	valid_1's auc: 0.999885
[200]	training's auc: 1	valid_1's auc: 0.999929
[300]	training's auc: 1	valid_1's auc: 0.99994
[400]	training's auc: 1	valid_1's auc: 0.999945
Early stopping, best iteration is:
[448]	training's auc: 1	valid_1's auc: 0.999951
▶ Fold 2
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.999984	valid_1's auc: 0.999788
[200]	training's auc: 1	valid_1's auc: 0.999874
[300]	training's auc: 1	valid_1's auc: 0.999899
Early stopping, best iteration is:
[347]	training's auc: 1	valid_1's auc: 0.9

In [16]:
# --- 3. Play Years ---
# Multiclass target: each OOF `pred` entry is a per-class probability vector,
# which the aggregator flattens across the swings of a file.
# swing_pred_years is the same object as oof_years_df (returned unchanged).
oof_years_df, le_years = train_lightgbm_per_label(swing_df, 'play years', is_multiclass=True)
swing_pred_years, meta_years_model = train_pipeline_for_label(
    swing_df, le_years, oof_years_df, target_col='play years', is_multiclass=True
)



📚 Training LightGBM for play years...
▶ Fold 0
Training until validation scores don't improve for 50 rounds
[100]	training's multi_logloss: 0.310237	valid_1's multi_logloss: 0.380484
[200]	training's multi_logloss: 0.229398	valid_1's multi_logloss: 0.333794
[300]	training's multi_logloss: 0.179458	valid_1's multi_logloss: 0.306201
[400]	training's multi_logloss: 0.14578	valid_1's multi_logloss: 0.293546
[500]	training's multi_logloss: 0.1189	valid_1's multi_logloss: 0.284101
[600]	training's multi_logloss: 0.0987523	valid_1's multi_logloss: 0.278558
[700]	training's multi_logloss: 0.0827072	valid_1's multi_logloss: 0.276262
Early stopping, best iteration is:
[686]	training's multi_logloss: 0.0845678	valid_1's multi_logloss: 0.275634
▶ Fold 1
Training until validation scores don't improve for 50 rounds
[100]	training's multi_logloss: 0.308823	valid_1's multi_logloss: 0.397854
[200]	training's multi_logloss: 0.22669	valid_1's multi_logloss: 0.354691
[300]	training's multi_logloss: 0.178



In [17]:
# --- 4. Level ---
# Multiclass target, handled like play years: swing-level OOF probabilities
# first, then the file-level aggregator.
# swing_pred_level is the same object as oof_level_df (returned unchanged).
oof_level_df, le_level = train_lightgbm_per_label(swing_df, 'level', is_multiclass=True)
swing_pred_level, meta_level_model = train_pipeline_for_label(
    swing_df, le_level, oof_level_df, target_col='level', is_multiclass=True
)


📚 Training LightGBM for level...
▶ Fold 0
Training until validation scores don't improve for 50 rounds
[100]	training's multi_logloss: 0.229512	valid_1's multi_logloss: 0.367448
[200]	training's multi_logloss: 0.150961	valid_1's multi_logloss: 0.335793
[300]	training's multi_logloss: 0.109736	valid_1's multi_logloss: 0.32517
[400]	training's multi_logloss: 0.0821011	valid_1's multi_logloss: 0.321711
Early stopping, best iteration is:
[390]	training's multi_logloss: 0.0843555	valid_1's multi_logloss: 0.321444
▶ Fold 1
Training until validation scores don't improve for 50 rounds
[100]	training's multi_logloss: 0.23255	valid_1's multi_logloss: 0.314792
[200]	training's multi_logloss: 0.153932	valid_1's multi_logloss: 0.271077
[300]	training's multi_logloss: 0.111734	valid_1's multi_logloss: 0.252128
[400]	training's multi_logloss: 0.0836861	valid_1's multi_logloss: 0.242174
[500]	training's multi_logloss: 0.0634505	valid_1's multi_logloss: 0.235155
[600]	training's multi_logloss: 0.04845



In [22]:
def predict_file(file_path, cut_point_str, mode, model_swing, model_meta, is_multiclass=False):
    """Predict one label for a single sensor file via swing model + meta-model.

    Parameters
    ----------
    file_path : path-like
        Raw sensor file. The first line is skipped as a header; remaining
        lines with exactly six whitespace-separated integers are used.
    cut_point_str : str
        Cut points as text, e.g. ``"[0 61 122]"``. Consecutive pairs delimit
        the swings.
    mode : int
        Mode id, one-hot encoded into ``mode_1`` .. ``mode_10``.
    model_swing : object or list/tuple of objects
        A swing-level model exposing ``predict(X)``. A list or tuple of such
        models (for example one per CV fold) is also accepted, in which case
        their predictions are averaged.
    model_meta : object
        File-level model exposing ``predict_proba``. It must have been fitted
        on the same number of swings per file as this file yields.
    is_multiclass : bool
        Return the full probability vector instead of the positive-class
        probability.

    Returns
    -------
    np.ndarray or float
        Class probabilities (multiclass) or the probability of class index 1.

    Raises
    ------
    ValueError
        If fewer than two cut points are given, so no swing can be formed.
    """
    # Step 1: parse cut points
    cut_points = np.fromstring(cut_point_str.strip("[]"), sep=' ', dtype=int).tolist()
    if len(cut_points) < 2:
        raise ValueError(
            f"Need at least two cut points to delimit a swing, got {len(cut_points)} for {file_path}."
        )

    # Step 2: load sensor data
    with open(file_path, 'r') as f:
        lines = f.readlines()[1:]
    token_rows = (line.split() for line in lines)
    data = np.array([[int(tok) for tok in tokens] for tokens in token_rows if len(tokens) == 6])

    # Step 3: per-swing feature extraction
    feature_rows = []
    for start, stop in zip(cut_points[:-1], cut_points[1:]):
        row = extract_features_from_swing(data[start:stop])
        for m in range(1, 11):
            row[f'mode_{m}'] = 1 if mode == m else 0
        feature_rows.append(row)

    X = pd.DataFrame(feature_rows).values

    # Step 4: swing-level prediction, shape (n_swings,) or (n_swings, C)
    if isinstance(model_swing, (list, tuple)):
        preds = np.mean([np.asarray(m.predict(X)) for m in model_swing], axis=0)
    else:
        preds = np.asarray(model_swing.predict(X))

    # Row-major flattening matches the layout used by train_aggregator_model:
    # swing 0's values first, then swing 1's, and so on.
    meta_input = preds.reshape(1, -1)

    # Step 5: file-level prediction. The meta-model is fitted on a plain
    # ndarray, so pass an ndarray here too; wrapping it in a DataFrame with
    # invented column names would trigger scikit-learn's feature-name warning.
    prob = model_meta.predict_proba(meta_input)[0]

    return prob if is_multiclass else prob[1]


In [None]:
def run_test_pipeline():
    """Predict all four labels for every file in the test folder.

    Reads ``39_Test_Dataset/test_info.csv`` and iterates over
    ``39_Test_Dataset/test_data/*.txt`` in sorted path order. It relies on the
    notebook-level names ``model_gender``, ``model_hand``, ``model_years``,
    ``model_level`` and the matching ``meta_*_model`` objects.

    Returns
    -------
    tuple
        ``(uids, gender_preds, hand_preds, play_years_preds, level_preds)``.
        ``uids`` is an array of file ids in the order the files were
        processed, so position ``i`` of every prediction list belongs to
        ``uids[i]``.

    Raises
    ------
    LookupError
        If a test file has no row in ``test_info.csv``.
    """
    test_info = pd.read_csv("39_Test_Dataset/test_info.csv")
    test_folder = Path("39_Test_Dataset/test_data")

    # Ids are collected inside the loop. The CSV row order need not match the
    # sorted file order, so returning test_info['unique_id'] could pair
    # predictions with the wrong ids.
    uids = []
    gender_preds = []
    hand_preds = []
    play_years_preds = []
    level_preds = []

    for file_path in tqdm(sorted(test_folder.glob("*.txt")), desc="🎯 Predicting test"):
        uid = int(file_path.stem)
        row = test_info[test_info['unique_id'] == uid]
        if row.empty:
            raise LookupError(f"No row with unique_id == {uid} in test_info.csv for {file_path}.")
        cut_str = row['cut_point'].values[0]
        mode = row['mode'].values[0]

        pred_gender = predict_file(file_path, cut_str, mode, model_gender, meta_gender_model, is_multiclass=False)
        pred_hand = predict_file(file_path, cut_str, mode, model_hand, meta_hand_model, is_multiclass=False)
        pred_years = predict_file(file_path, cut_str, mode, model_years, meta_years_model, is_multiclass=True)
        pred_level = predict_file(file_path, cut_str, mode, model_level, meta_level_model, is_multiclass=True)

        uids.append(uid)
        gender_preds.append(pred_gender)
        hand_preds.append(pred_hand)
        play_years_preds.append(pred_years)
        level_preds.append(pred_level)

    return np.array(uids), gender_preds, hand_preds, play_years_preds, level_preds


In [None]:
from datetime import datetime

def save_submission(uids, gender_preds, hand_preds, play_years_preds, level_preds):
    """Write the predictions to a timestamped submission CSV.

    ``play_years_preds`` and ``level_preds`` are sequences of per-class
    probability vectors. Class ``i`` of play years is written to
    ``play years_{i}``; class ``i`` of level is written to ``level_{i+2}``.
    The column set and order are taken from
    ``39_Test_Dataset/sample_submission.csv``; columns present there but not
    produced here are filled with 0.0, and columns not present there are
    dropped. The file is named ``submission_<YYYYmmdd_HHMM>.csv`` and floats
    are written with ten decimals.
    """
    columns = {
        'unique_id': uids,
        'gender': gender_preds,
        'hold racket handed': hand_preds,
    }

    # play years (no shift needed)
    n_year_classes = len(play_years_preds[0])
    for cls in range(n_year_classes):
        columns[f'play years_{cls}'] = [probs[cls] for probs in play_years_preds]

    # level (shift class index by +2)
    n_level_classes = len(level_preds[0])
    for cls in range(n_level_classes):
        columns[f'level_{cls+2}'] = [probs[cls] for probs in level_preds]

    predictions = pd.DataFrame(columns)

    # Conform to the sample layout: pad absent columns with 0.0 and reorder.
    template = pd.read_csv("39_Test_Dataset/sample_submission.csv")
    predictions = predictions.reindex(columns=template.columns, fill_value=0.0)

    # Add timestamp to filename
    stamp = datetime.now().strftime("%Y%m%d_%H%M")
    filename = f"submission_{stamp}.csv"

    predictions.to_csv(filename, index=False, float_format='%.10f')
    print(f"✅ {filename} saved.")


In [None]:
# Run inference on the test set. run_test_pipeline reads the model_* and
# meta_*_model names from the notebook namespace, so the training cells must
# have been executed first.
uids, g_preds, h_preds, y_preds, l_preds = run_test_pipeline()

In [None]:
# Write the predictions to a timestamped submission CSV in the working directory.
save_submission(uids, g_preds, h_preds, y_preds, l_preds)

In [23]:
# NOTE(review): `a` is not defined anywhere in this notebook, so this cell raises
# NameError. It looks like leftover scratch input (or a deliberate stop for
# "Run All") -- confirm and remove before sharing.
a

NameError: name 'a' is not defined