### feature_utils.py

In [9]:
import csv
import logging
import math
from pathlib import Path
import numpy as np
from scipy.fft import rfft
import scipy.stats
import pandas as pd
from sklearn.metrics import roc_auc_score
logging.basicConfig(
    filename="log.txt",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
# >>> 常數重整 <<<
SEG_DIM      = 335        # extract_features() 單段輸出
FATIGUE_DIM  = 102
FINAL_DIM    = SEG_DIM + FATIGUE_DIM   # 437
# <<< 常數重整 <<<


# >>> 官方 27→1 聚合規則 <<<
import numpy as np

def aggregate_group_prob(proba_mat: np.ndarray, group_size: int = 27) -> np.ndarray:
    """
    將 (N × C) 機率矩陣依官方規則壓縮成 (N/27 × C)：
    1. 先把 27 次機率「逐類別加總」
    2. 找加總值最大的類別 major_cls
    3. 在這 27 次裡挑出 major_cls 機率最高的那一次
    4. 取那一次的整排機率當此檔最終輸出
    """
    if proba_mat.size == 0:
        return np.array([[0.]])

    # 如果長度不足一組 27，就直接用同一規則處理整段
    if len(proba_mat) < group_size:
        proba_mat = proba_mat.reshape(1, len(proba_mat), -1)
    else:
        total      = (len(proba_mat) // group_size) * group_size
        proba_mat  = proba_mat[:total].reshape(-1, group_size, proba_mat.shape[1])

    agg_list = []
    for grp in proba_mat:                     # grp.shape = (27, C)
        sum_prob   = grp.sum(axis=0)          # (C,)
        major_cls  = sum_prob.argmax()
        best_idx   = grp[:, major_cls].argmax()
        agg_list.append(grp[best_idx])        # (C,)

    return np.stack(agg_list, axis=0)
# <<< 官方 27→1 聚合規則 >>>


    
def FFT_data(input_data, swinging_times):   
    txtlength = swinging_times[-1] - swinging_times[0]
    a_mean = [0] * txtlength
    g_mean = [0] * txtlength
       
    for num in range(len(swinging_times)-1):
        a = []
        g = []
        for swing in range(swinging_times[num], swinging_times[num+1]):
            a.append(math.sqrt(math.pow((input_data[swing][0] + input_data[swing][1] + input_data[swing][2]), 2)))
            g.append(math.sqrt(math.pow((input_data[swing][3] + input_data[swing][4] + input_data[swing][5]), 2)))
        a_mean[num] = (sum(a) / len(a))
        g_mean[num] = (sum(g) / len(g))
    
    return a_mean, g_mean

def feature(input_data, swinging_now, swinging_times, n_fft, a_fft, g_fft, a_fft_imag, g_fft_imag, writer):
    # Convert input data to numpy array
    arr = np.array(input_data)
    
    if swinging_times == 0:  # Handle case where there are no swings
        swinging_times = 1
        
    # Calculate frequency domain features
    cut = int(n_fft / swinging_times)
    idx_start = cut * swinging_now
    idx_end = min(cut * (swinging_now + 1), len(a_fft))  # Ensure we don't go out of bounds
    
    # Rest of the function remains the same...
    # Calculate acceleration and gyroscope vectors
    a_vec = np.sqrt(np.sum(arr[:, :3]**2, axis=1))
    g_vec = np.sqrt(np.sum(arr[:, 3:6]**2, axis=1))
    
    a_stats = [a_vec.max(), a_vec.mean(), a_vec.min()]
    g_stats = [g_vec.max(), g_vec.mean(), g_vec.min()]
    
    # Calculate kurtosis and skewness
    a_centered = a_vec - a_vec.mean()
    g_centered = g_vec - g_vec.mean()
    
    a_moments = {
        'skew': np.mean(a_centered**3) / (np.std(a_centered)**3),
        'kurt': np.mean(a_centered**4) / (np.var(a_centered)**2)
    }
    g_moments = {
        'skew': np.mean(g_centered**3) / (np.std(g_centered)**3),
        'kurt': np.mean(g_centered**4) / (np.var(g_centered)**2)
    }
    
    # Get FFT slices safely
    a_fft_slice = a_fft[idx_start:idx_end]
    g_fft_slice = g_fft[idx_start:idx_end]
    a_fft_imag_slice = a_fft_imag[idx_start:idx_end]
    g_fft_imag_slice = g_fft_imag[idx_start:idx_end]
    
    # Handle empty slices
    if len(a_fft_slice) == 0:
        a_fft_slice = np.array([0])
        g_fft_slice = np.array([0])
        a_fft_imag_slice = np.array([0])
        g_fft_imag_slice = np.array([0])
    
    # Calculate PSD using vectorized operations
    a_psd = np.power(a_fft_slice, 2) + np.power(a_fft_imag_slice, 2)
    g_psd = np.power(g_fft_slice, 2) + np.power(g_fft_imag_slice, 2)
    
    # Calculate entropy with safety checks
    e1 = np.sqrt(a_psd)
    e3 = np.sqrt(g_psd)
    e2 = np.sum(e1) + 1e-10  # Avoid division by zero
    e4 = np.sum(e3) + 1e-10
    
    p_a = e1 / e2
    p_g = e3 / e4
    entropy_a = np.sum(p_a * np.log(p_a + 1e-10)) / max(cut, 1)
    entropy_g = np.sum(p_g * np.log(p_g + 1e-10)) / max(cut, 1)
    
    # Calculate basic statistics
    mean = np.mean(arr, axis=0)
    std = np.std(arr, axis=0)
    rms = np.sqrt(np.mean(arr**2, axis=0))
    
    # Combine all features
    output = np.concatenate([
        mean, std, rms,
        a_stats, g_stats,
        [np.mean(a_fft_slice), np.mean(g_fft_slice)],
        [np.mean(a_psd), np.mean(g_psd)],
        [a_moments['kurt'], g_moments['kurt']],
        [a_moments['skew'], g_moments['skew']],
        [entropy_a, entropy_g]
    ]).tolist()
    
    writer.writerow(output)

def extract_features(df):
    features = []
    
    # 速度相關特徵
    speed_cols = ['Ax', 'Ay', 'Az']
    speeds = df[speed_cols].values
    
    # 基礎統計特徵
    features.extend([
        np.mean(speeds, axis=0),
        np.std(speeds, axis=0),
        np.max(speeds, axis=0),
        np.min(speeds, axis=0),
        np.percentile(speeds, 25, axis=0),
        np.percentile(speeds, 75, axis=0),
        scipy.stats.skew(speeds, axis=0),
        scipy.stats.kurtosis(speeds, axis=0)
    ])
    # --- Δ / Δ² 特徵：捕捉加速度變化趨勢 ---
    delta1 = np.diff(speeds, axis=0, n=1)   # 一階差分
    delta2 = np.diff(speeds, axis=0, n=2)   # 二階差分
    if delta1.size == 0:
        delta1 = np.zeros((1, 3))
    if delta2.size == 0:
        delta2 = np.zeros((1, 3))
    features.extend([
        np.mean(delta1, axis=0), np.std(delta1, axis=0),
        np.mean(delta2, axis=0), np.std(delta2, axis=0),
    ])
    # 速度交互特徵
    speed_means = np.mean(speeds, axis=0)
    features.extend([
        speed_means[0] * speed_means[1],  # x * y
        speed_means[1] * speed_means[2],  # y * z
        speed_means[0] * speed_means[2]   # x * z
    ])
    
    # 加速度特徵
    acc = np.diff(speeds, axis=0)
    features.extend([
        np.mean(acc, axis=0),
        np.std(acc, axis=0),
        np.max(acc, axis=0),
        np.min(acc, axis=0)
    ])
    
    # 位置相關特徵
    pos_cols   = ['Gx', 'Gy', 'Gz']
    positions = df[pos_cols].values
    
    # 位置統計特徵
    features.extend([
        np.mean(positions, axis=0),
        np.std(positions, axis=0),
        np.max(positions, axis=0),
        np.min(positions, axis=0),
        np.percentile(positions, 25, axis=0),
        np.percentile(positions, 75, axis=0)
    ])
    
    # 位置變化特徵
    pos_diff = np.diff(positions, axis=0)
    features.extend([
        np.mean(pos_diff, axis=0),
        np.std(pos_diff, axis=0)
    ])
    
    # 時序特徵
    for col in speed_cols + pos_cols:
        #
        ts = df[col].values
        # FFT特徵
        n = len(ts)
        window = np.hanning(n)                    # 1. 建立窗函數
        ts_windowed = ts * window                 # 2. 乘上窗
        n_fft = 1 << (n - 1).bit_length()         # 3. 計算下個 2 的冪次
        ts_padded = np.pad(ts_windowed,           # 4. 以零補齊
                        (0, n_fft - n),
                        mode='constant')
        fft_features = np.abs(np.fft.fft(ts_padded))[:5]
        fft_features = np.abs(np.fft.fft(ts))[:5]  # 取前5個頻率分量
        features.extend(fft_features)
        
        # 自相關特徵
        acf = np.correlate(ts, ts, mode='full') / len(ts)
        features.extend(acf[len(acf)//2:len(acf)//2+3])  # 取中心點後3個值
    
    # 三維向量特徵
    speed_magnitudes = np.linalg.norm(speeds, axis=1)
    pos_magnitudes = np.linalg.norm(positions, axis=1)
    
    features.extend([
        np.mean(speed_magnitudes),
        np.std(speed_magnitudes),
        np.max(speed_magnitudes),
        np.min(speed_magnitudes),
        np.mean(pos_magnitudes),
        np.std(pos_magnitudes),
        np.max(pos_magnitudes),
        np.min(pos_magnitudes)
    ])
    
    # --- 把 features 轉成 1-D list ---
    flat = []
    for f in features:
        if isinstance(f, np.ndarray):
            flat.extend(f.ravel())        # array(3,) → 3 個值
        else:                             # Python float / int / scalar
            flat.append(float(f))

    flat = np.asarray(flat, dtype=np.float32)

    # ---- 保證長度一致：不足補 0，多餘截斷 ----
    if flat.size < SEG_DIM:
        flat = np.pad(flat, (0, SEG_DIM - flat.size))
    elif flat.size > SEG_DIM:
        flat = flat[:SEG_DIM]


    return flat

# >>> 新增 疲勞/穩定度特徵 Helper Function <<<
def compute_fatigue_features(segments):
    """
    6 個通道 × 17 指標 ＝ 102 維
    指標 = 4 條序列(max/mean/std/p90) 的
           slope‧intercept‧R²‧(last-first)  +  std_ratio
    """
    import numpy as np
    chs  = ['Ax', 'Ay', 'Az', 'Gx', 'Gy', 'Gz']
    rng  = np.arange(len(segments))          # 0…26
    eps  = 1e-6                              # 極小值，避免除零
    feat = []

    for c in chs:
        max_s  = [seg[c].abs().max()              for seg in segments]
        mean_s = [seg[c].abs().mean()             for seg in segments]
        std_s  = [seg[c].std()                    for seg in segments]
        p90_s  = [np.percentile(seg[c].abs(), 90) for seg in segments]

        for series in (max_s, mean_s, std_s, p90_s):
            # 若序列全相同 → 標準差為 0，直接設定斜率、R² 為 0
            if np.std(series) < eps:
                k, b, r2 = 0.0, series[0], 0.0
            else:
                k, b = np.polyfit(rng, series, 1)
                r2   = np.corrcoef(rng, series)[0, 1]**2
            feat.extend([k, b, r2, series[-1] - series[0]])

        std_ratio = (np.mean(std_s[:13]) + eps) / (np.mean(std_s[13:]) + eps)
        feat.append(std_ratio)

    # 確保沒有 nan / inf
    return np.nan_to_num(np.array(feat), nan=0.0, posinf=0.0, neginf=0.0)
# <<< 新增 疲勞/穩定度特徵 Helper Function >>>



def generate_features(raw_dir: str, info_csv:str,  out_dir: str):
    # 讀取 cut_point；用 unique_id 當 index 方便隨查
    info_df = pd.read_csv(info_csv).set_index("unique_id")

    Path(out_dir).mkdir(exist_ok=True)
    pathlist_txt = Path(raw_dir).glob('*.txt')

    for file in pathlist_txt:
        # 讀取文件
        data = []
        with open(file, 'r') as f:
            for line in f.readlines()[1:]:  # Skip header
                if line.strip():  # Skip empty lines
                    values = line.strip().split()
                    if len(values) >= 6:
                        # 只取前6個值，並轉換為整數
                        row = [int(x) for x in values[:6]]
                        data.append(row)

        if not data:
            print(f"Warning: No valid data found in {file}")
            logging.warning(f"No valid data found in {file}")
            continue

        # 創建DataFrame並命名列
        df = pd.DataFrame(data, columns=[
            'Ax', 'Ay', 'Az',    # Accelerometer
            'Gx', 'Gy', 'Gz'     # Gyroscope
        ])
        
        try:
            # --- 依 cut_point 精準切 27 段；缺切點則回退等分 ---
            uid        = int(file.stem)
            try:
               cuts_raw = info_df.loc[uid, "cut_point"]          # "[0 35 ... 998]"
               cuts     = np.fromstring(cuts_raw.strip("[]"), sep=" ", dtype=int)
               assert len(cuts) == 28
               segments  = [df.iloc[cuts[i]:cuts[i+1]] for i in range(27)]
            except Exception as _:
               # 若該檔案沒有 cut_point 或解析失敗 → 回退均分
               idx_splits = np.array_split(np.arange(len(df)), 27)
               segments   = [df.iloc[idx] for idx in idx_splits]
            seg_feats = [extract_features(seg) for seg in segments]

            # 🟢 確認所有段落向量長度一致，否則捨棄異常段
            base_len = len(seg_feats[0])
            seg_feats = [v for v in seg_feats if len(v) == base_len]
            if len(seg_feats) == 0:                 # 全部失敗 → 填 0
                seg_feats = [np.zeros(base_len)]
            # 這裡示範取平均；想取最大值可改 np.max(seg_feats, axis=0)
            fatigue_vec = compute_fatigue_features(segments)
            features = np.concatenate([np.mean(seg_feats, axis=0), fatigue_vec])
            assert features.shape[0] == FINAL_DIM, \
                f"feature dim={features.shape[0]}, expect {FINAL_DIM}"
            
            # 保存特徵
            features_df = pd.DataFrame([features])
            output_path = Path(out_dir) / f"{file.stem}.csv"
            features_df.to_csv(output_path, index=False)
        except Exception as e:
            print(f"Error processing {file}: {str(e)}")
            logging.error(f"Error processing {file}: {str(e)}")
            continue



### model_utils.py

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
import logging
import joblib
#  lightgbm as lgb
from pathlib import Path
import numpy as np, pandas as pd
import optuna
from optuna.trial import TrialState
# from sklearn.calibration import label_binarize
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

TARGETS = {
    "gender":             {"type": "bin", "name": "gender"},
    "hold racket handed": {"type": "bin", "name": "hold"},
    "play years":         {"type": "multi", "num_class": 3, "name": "play_years"},
    "level":              {"type": "multi", "num_class": 4, "name": "level"},
}

def build_scaler(X: np.ndarray):
    sc = MinMaxScaler()
    return sc.fit(X)

def build_model(y, meta):
    """
    回傳一個 RNN 模型
    y: 1D numpy array (labels)
    meta: dict，含 'num_classes' 或 binary
    """
    from tensorflow.keras.metrics import AUC as AUCMetric
    # 1. 轉為 one-hot（若 multi-class）
    num_classes = meta.get("num_classes", None)
    if num_classes and num_classes > 2:
        y_cat = to_categorical(y, num_classes)
        loss = "categorical_crossentropy"
        output_units = num_classes
        activation = "softmax"
    else:
        y_cat = y
        loss = "binary_crossentropy"
        output_units = 1
        activation = "sigmoid"

    # 2. 建模型
    model = Sequential([
        # 輸入 shape 會在 main.py 傳入
        LSTM(64, return_sequences=True, input_shape=(None, meta["n_features"])),
        BatchNormalization(),
        Dropout(0.3),
        LSTM(32),
        BatchNormalization(),
        Dropout(0.3),
        Dense(output_units, activation=activation)
    ])
    model.compile(optimizer="adam", loss=loss, metrics=[AUCMetric(name="auc")])
    # 包成一個 dict，保留給外部呼叫
    return {"model": model, "y_cat": y_cat}


def cv_evaluate(model, X, y, groups, target_info, early_stopping_rounds=30):
    gkf = GroupKFold(n_splits=5)
    scores = []

    for tr_idx, val_idx in gkf.split(X, y, groups):
        X_tr, X_val = X[tr_idx], X[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]
        
        eval_set = [(X_val, y_val)]
        
        # Use different metrics for binary and multiclass
        eval_metric = 'auc' if target_info["type"] == "bin" else 'multi_logloss'
        
        model.fit(X_tr, y_tr,
                 eval_set=eval_set,
                 eval_metric=eval_metric,
                 callbacks=[lgb.early_stopping(early_stopping_rounds)])
                 
        proba = model.predict_proba(X_val)

        # ---------- Binary ----------
        if target_info["type"] == "bin":
            pos_prob = proba[:, 1] if proba.ndim == 2 else proba.ravel()
            scores.append(roc_auc_score(y_val, pos_prob))
            continue

        # ---------- Multi-class ----------
        present = np.unique(y_val)             
        if len(present) == 1:                       
            continue                                

        col_of = {c: i for i, c in enumerate(model.classes_)}
        
        if len(present) == 2:
            pos_cls   = present[1]
            pos_prob  = proba[:, col_of[pos_cls]]
            y_bin = (y_val == pos_cls).astype(int)   
            score = roc_auc_score(y_bin, pos_prob)
            scores.append(score)
            continue

        proba_use = proba[:, [col_of[c] for c in present]]

        if proba_use.ndim > 1:
            proba_use = proba_use / proba_use.sum(axis=1, keepdims=True)

        score = roc_auc_score(
            y_val, proba_use,
            labels=present, average="micro", multi_class="ovr"
        )
        scores.append(score)

    return np.mean(scores) if scores else 0.5

def save_model(model, scaler, feature_names, col):      # ← 多一個參數
    Path("models").mkdir(exist_ok=True, parents=True)
    bundle = {                                         # ★ 多存 feature_names
        "model": model,
        "scaler": scaler,
        "feature_names": feature_names,
    }
    joblib.dump(bundle, f"./models/{col}.pkl")

def load_model(name):
    return joblib.load(f"models/{name}.pkl")

def tune_lgb_params(X_train, y_train, groups, target_info, n_trials: int = 30):
    """使用 Optuna 做 GroupKFold 的 LGB 超參數優化，回傳 best_params"""
    from sklearn.model_selection import GroupKFold
    from sklearn.metrics import roc_auc_score
    def objective(trial):
        param = {
            "objective": "binary" if target_info["type"]=="bin" else "multiclass",
            "metric":    "auc"    if target_info["type"]=="bin" else "multi_logloss",
            "learning_rate": trial.suggest_loguniform("learning_rate", 1e-4, 1e-1),
            "num_leaves":    trial.suggest_int("num_leaves", 16, 128),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
            "subsample":         trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree":  trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "reg_alpha":         trial.suggest_loguniform("reg_alpha", 1e-3, 10),
            "reg_lambda":        trial.suggest_loguniform("reg_lambda", 1e-3, 10),
            "n_estimators": 1000,
            "random_state": 42,
            "n_jobs":       -1,
        }
        cv = GroupKFold(n_splits=3)
        scores = []
        for tr_idx, val_idx in cv.split(X_train, y_train, groups):
            X_tr, X_val = X_train[tr_idx], X_train[val_idx]
            y_tr, y_val = y_train[tr_idx], y_train[val_idx]
            if target_info["type"] == "multi" and len(np.unique(y_tr)) < target_info["num_class"]:
                return 0.0
            try:
                clf = lgb.LGBMClassifier(**param)
                clf.fit(
                    X_tr, y_tr,
                    eval_set=[(X_val, y_val)],
                    eval_metric=param["metric"],
                    callbacks=[lgb.early_stopping(30)]
                )
                proba = clf.predict_proba(X_val)
            except Exception:
                # 包含 unseen labels 或其他 fit 錯誤都視為此 trial 失敗
                return 0.0
            try:
                if target_info["type"] == "bin":
                    score = roc_auc_score(y_val, proba[:, 1])
                else:
                    import pandas as pd
                    y_ohe = pd.get_dummies(y_val)
                    score = roc_auc_score(
                        y_ohe, proba,
                        multi_class="ovr", average="micro"
                    )
                if np.isnan(score):
                    return 0.5
                scores.append(score)
            except ValueError:
                # Handle the case where all labels are the same
                return 0.5
        return sum(scores) / len(scores)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)
    if any(t.state == TrialState.COMPLETE for t in study.trials):
        return study.best_trial.params
    else:
        return {}

### train_val_utils.py

In [8]:
import logging
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
# import lightgbm as lgb

logging.basicConfig(
    filename="log.txt",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
from sklearn.linear_model import LogisticRegression

def stacking_predict(models_dict, X_val, meta_list):
    """取所有基模型機率當特徵，訓練簡單 LR 做二階融合"""
    # 第一層預測
    layer1 = []
    for name, bundle in models_dict.items():
        proba = bundle["model"].predict_proba(X_val)
        layer1.append(proba)
    X_stack = np.hstack(layer1)
    # 用真實 y 建 LR
    true_y = np.column_stack([meta_list[t] for t in models_dict.keys()])
    # 這裡示範單目標，實際可延伸
    lr = LogisticRegression(max_iter=500)
    lr.fit(X_stack, true_y.ravel())
    return lr, X_stack

def train_validate_split(X, y, groups, test_size=0.2, random_state=42):
    from sklearn.model_selection import GroupShuffleSplit

    # Log the length of groups before splitting
    # print(f"Total number of unique groups: {len(groups)}")

    # Debug: Print the first few values of groups
    # print(f"First 10 values in groups: {groups[:10]}")

    # Check unique values in groups
    unique_groups, group_counts = np.unique(groups, return_counts=True)
    # print(f"Number of unique groups: {len(unique_groups)}")
    # logging.info(f"Number of unique groups: {len(unique_groups)}")
    # print(f"Group counts: {dict(zip(unique_groups, group_counts))}")
    # logging.info(f"Group counts: {dict(zip(unique_groups, group_counts))}")


    splitter = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_idx, val_idx = next(splitter.split(X, y, groups))

    # Ensure unique_id is not repeated
    train_unique_ids = np.unique(groups[train_idx])
    val_unique_ids = np.unique(groups[val_idx])

    if set(train_unique_ids).intersection(set(val_unique_ids)):
        raise ValueError("Data leakage detected: Some unique_ids are in both training and validation sets.")

    # Save unique_ids for debugging
    np.savetxt("train_ids_from_split.txt", train_unique_ids, fmt="%d")
    np.savetxt("val_ids_from_split.txt", val_unique_ids, fmt="%d")

    return {
        'X_train': X[train_idx],
        'y_train': y[train_idx],
        'groups_train': groups[train_idx],
        'X_val': X[val_idx],
        'y_val': y[val_idx],
        'groups_val': groups[val_idx]
    }

def check_data_leakage(train_ids, val_ids):
    """Check for data leakage between training and validation datasets."""
    train_set = set(train_ids)
    val_set = set(val_ids)

    # Find intersection
    leakage = train_set.intersection(val_set)
    if leakage:
        print("Data leakage detected! Overlapping IDs:", leakage)
        logging.error(f"Data leakage detected! Overlapping IDs: {leakage}")
        return True
    else:
        print("No data leakage detected.")
        logging.info("No data leakage detected.")
        return False

# def evaluate_model(model, data_dict, target_info):
#     """Evaluate model performance on validation set"""
#     model.fit(
#         data_dict['X_train'], 
#         data_dict['y_train'],
#         eval_set=[(data_dict['X_val'], data_dict['y_val'])],
#         eval_metric='auc' if target_info["type"] == "bin" else 'multi_logloss',
#         callbacks=[lgb.early_stopping(50)]
#     )
    
#     proba = model.predict_proba(data_dict['X_val'])
    
#     if target_info["type"] == "bin":
#         score = roc_auc_score(data_dict['y_val'], proba[:, 1])
#     else:
#         # Convert validation labels to one-hot encoding
#         classes = np.unique(data_dict['y_train'])
#         y_val_onehot = np.zeros((len(data_dict['y_val']), len(classes)))
#         for i, cls in enumerate(classes):
#             y_val_onehot[:, i] = (data_dict['y_val'] == cls).astype(int)
            
#         score = roc_auc_score(
#             y_val_onehot,
#             proba,
#             multi_class="ovr", 
#             average="micro"
#         )
    
#     return score, model


def evaluate_validation_set(data_dict, models_dict, target_info):
    """Calculate ROC AUC scores for validation data using same logic as evaluate_predictions"""
    scores = {}
    
    for target_name, meta in target_info.items():
        model = models_dict[target_name]['model']
        scaler = models_dict[target_name]['scaler']
        X_val_scaled = scaler.transform(data_dict['X_val'])
        proba = model.predict_proba(X_val_scaled)
        
        if meta["type"] == "bin":
            # 二元分類 - 轉換為 0/1
            true_vals = (data_dict['y_val'][target_name] == 1).astype(int)
            pred_vals = proba[:, 1]  # 使用正類的概率
            score = roc_auc_score(true_vals, pred_vals)
        else:
            # 多分類 - 使用 one-hot 編碼
            if target_name == "play years":
                true_vals = pd.get_dummies(data_dict['y_val'][target_name])
                pred_vals = pd.DataFrame(proba, columns=range(3))
            else:  # level
                true_vals = pd.get_dummies(data_dict['y_val'][target_name])
                pred_vals = pd.DataFrame(proba, columns=[2,3,4,5])
            
            score = roc_auc_score(
                true_vals, pred_vals,
                multi_class="ovr",
                average="micro"
            )
            
        scores[target_name] = score
        print(f"{target_name} ROC AUC: {score:.4f}")
        logging.info(f"{target_name} ROC AUC: {score:.4f}")
    
    avg_score = np.mean(list(scores.values()))
    print(f"\nAverage ROC AUC: {avg_score:.4f}")
    logging.info(f"Average ROC AUC: {avg_score:.4f}")
    return scores, avg_score

def check_unique_id_overlap(train_file, val_file):
    """Check if any unique_id exists in both train and validation files."""
    with open(train_file, 'r') as f:
        train_ids = set(map(int, f.readlines()))

    with open(val_file, 'r') as f:
        val_ids = set(map(int, f.readlines()))

    overlap = train_ids.intersection(val_ids)
    if overlap:
        print("Data leakage detected! Overlapping unique_ids:", overlap)
        logging.error(f"Data leakage detected! Overlapping unique_ids: {overlap}")
    else:
        print("No data leakage detected.")
        logging.info("No data leakage detected.")

# Example usage
check_unique_id_overlap("train_ids_from_split.txt", "val_ids_from_split.txt")

No data leakage detected.


### main.py
prepare_train

In [12]:
import argparse, pandas as pd, numpy as np
from pathlib import Path
# from feature_utils import generate_features, aggregate_group_prob
# from model_utils import TARGETS, build_scaler, build_model
# from model_utils import cv_evaluate, save_model, load_model, tune_lgb_params
# from train_val_utils import train_validate_split, evaluate_model, evaluate_validation_set
import warnings
import logging
from datetime import datetime

# Configure logging
logging.basicConfig(
    filename="log.txt",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Log the start of the script
logging.info("Script started.")

warnings.filterwarnings(
    "ignore",
    message="X does not have valid feature names, but LGBMClassifier was fitted with feature names",
    category=UserWarning,
)

def load_features(feat_dir):
    seqs, uids = [],[]
    feat_names = None
    for p in Path(feat_dir).glob("*.csv"):
        df = pd.read_csv(p)
        if feat_names is None:                      # 只抓一次欄位
            feat_names = df.columns.tolist()
        seqs.append(df.values)          # ← 每個 unique_id 一個時序陣列
        uids.append(int(p.stem))
    return seqs, np.array(uids), feat_names

def prepare_train():
    # 1. 產生特徵
    # generate_features("./train_data", "train_info.csv", "tabular_data_train")

    # 2. 讀取 info & 特徵
    info = pd.read_csv("train_info.csv")
    seqs, uid_idx, feat_names = load_features("tabular_data_train")
    groups = info.set_index("unique_id").loc[uid_idx, "player_id"].values

    # 3. 數據標準化：先攤平成 2D，再 scale，每個序列再分回
    #    seqs: list of arrays, 每個 shape=(t_i, f)
    all_flat = np.vstack(seqs)                      # (sum_i t_i, f)
    scaler = build_scaler(all_flat)
    seqs_scaled = [scaler.transform(s) for s in seqs]

    # 4. 補長／裁切到同一長度
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    seq_len = max(s.shape[0] for s in seqs_scaled)
    n_features = seqs_scaled[0].shape[1]
    X_seq = pad_sequences(
        seqs_scaled,
        maxlen=seq_len,
        dtype="float32",
        padding="post",
        truncating="post",
        value=0.0
    )  # shape = (n_sequences, seq_len, n_features)

    # 5. 把這兩個放進 meta，供 build_model 使用
    for col, meta in TARGETS.items():
        meta["seq_len"]    = seq_len
        meta["n_features"] = n_features
    # # 儲存訓練/驗證集的分割
    # all_targets = {}
    # holdout_data = None
    
    # 4. 對每個 target 建模
    for col, meta in TARGETS.items():
        print(f"\nTraining for {col}:")
        logging.info(f"\nTraining for {col}:")
        y = info.set_index("unique_id").loc[uid_idx, col].values
        
        # 拆分訓練集和驗證集
        # data_dict = train_validate_split(X_scaled, y, groups)
            # 👉 使用 Optuna 找最佳超參數
        # best_params = tune_lgb_params(
        #     data_dict["X_train"], data_dict["y_train"], data_dict["groups_train"], meta, n_trials=20
        # )
        # print(f"Best params for {col}: {best_params}")
        # logging.info(f"Best params for {col}: {best_params}")
        # build_model 現在回傳 dict
        out = build_model(y, meta)
        model = out["model"]
        y_cat = out["y_cat"]
        # 拆 train / val split
        split = int(len(X_seq) * 0.8)
        X_tr, X_val = X_seq[:split], X_seq[split:]
        y_tr = y_cat[:split] if y_cat.ndim > 1 else y[:split]
        y_val = y_cat[split:] if y_cat.ndim > 1 else y[split:]        # 開始訓練
        history = model.fit(
            X_tr, y_tr,
            validation_data=(X_val, y_val),
            epochs=50,
            batch_size=32,
            # callbacks=[],  # 若要 early stopping 可加
        )
        # 驗證結果
        val_auc = history.history["val_auc"][-1]
        print(f"{col} validation AUC: {val_auc:.4f}")
        logging.info(f"{col} validation AUC: {val_auc:.4f}")

        # mdl = build_model(y, meta)

        # if holdout_data is None:
        #     holdout_data = {
        #         'X_val': data_dict['X_val'],
        #         'X_train': data_dict['X_train'],
        #         'y_val': {},
        #         'y_train': {},
        #         'groups_val': data_dict['groups_val']
        #     }
        
        # holdout_data['y_val'][col] = data_dict['y_val']
        # holdout_data['y_train'][col] = data_dict['y_train']
        
        # # 訓練和驗證
        # # mdl = build_model(y, meta)
        # val_score, trained_model = evaluate_model(mdl, data_dict, meta)
        # print(f"Training Score for {col}: {val_score:.4f}")
        # logging.info(f"Training Score for {col}: {val_score:.4f}")
        # save_model(trained_model, scaler, feat_names, col)
        # all_targets[col] = {'model': trained_model, 'scaler': scaler}
        

    # print("\nEvaluating validation set:")
    # scores, avg_score = evaluate_validation_set(holdout_data, all_targets, TARGETS)
    # np.save("split_uid_val.npy", 
    #         np.unique(uid_idx[np.isin(groups, holdout_data['groups_val'])]))
    print("\n✅ Models saved to ./models/")

prepare_train()



Training for gender:
Epoch 1/50


  super().__init__(**kwargs)


KeyboardInterrupt: 

### predict_train.py

In [49]:
import logging
import warnings
import pandas as pd
import numpy as np
from pathlib import Path
from feature_utils import aggregate_group_prob
from model_utils import TARGETS, load_model
from train_val_utils import train_validate_split
from sklearn.metrics import roc_auc_score

logging.basicConfig(
    filename="log.txt",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
# Log the start of the script

warnings.filterwarnings(
    "ignore",
    message="X does not have valid feature names, but LGBMClassifier was fitted with feature names",
    category=UserWarning,
)

def evaluate_predictions(pred_file: str, info: pd.DataFrame):
    pred = pd.read_csv(pred_file)
    data = pd.merge(info, pred, on="unique_id", suffixes=("_true", "_pred"))

    # 二元任務
    gender_true = (data["gender_true"] == 1).astype(int)
    hold_true = (data["hold racket handed_true"] == 1).astype(int)

    gender_auc = roc_auc_score(gender_true, data["gender_pred"])
    hold_auc = roc_auc_score(hold_true, data["hold racket handed_pred"])

    # 多元任務
    play_year_true = pd.get_dummies(data["play years"])
    play_year_pred = data[[f"play years_{i}" for i in range(3)]]
    play_year_auc = roc_auc_score(
        play_year_true, play_year_pred,
        multi_class="ovr", average="micro"
    )

    level_true = pd.get_dummies(data["level"])
    level_pred = data[[f"level_{i}" for i in [2, 3, 4, 5]]]
    level_auc = roc_auc_score(
        level_true, level_pred,
        multi_class="ovr", average="micro"
    )

    final_score = (gender_auc + hold_auc + play_year_auc + level_auc) / 4
    print(f"Gender ROC AUC       : {gender_auc:.4f}")
    print(f"Hold Racket ROC AUC  : {hold_auc:.4f}")
    print(f"Play Years ROC AUC   : {play_year_auc:.4f}")
    print(f"Level ROC AUC        : {level_auc:.4f}")
    print(f"Final Score          : {final_score:.4f}")
    return {
        'gender': gender_auc,
        'hold': hold_auc,
        'play_years': play_year_auc,
        'level': level_auc,
        'final': final_score
    }

def predict_train():
    # 讀取原始數據和特徵
    info = pd.read_csv("train_info.csv")
    all_features = []
    uid_idx = []
    
    for p in sorted(Path("tabular_data_train").glob("*.csv")):
        uid = int(p.stem)
        df = pd.read_csv(p)
        all_features.append(df.values)
        uid_idx.extend([uid] * len(df))
    
    X = np.vstack(all_features)
    uid_idx = np.array(uid_idx)
    groups = info.set_index("unique_id").loc[uid_idx, "player_id"].values
    
    # 分割驗證集
    y = info.set_index("unique_id").loc[uid_idx, list(TARGETS.keys())].values
    data_dict = train_validate_split(X, y, groups)
    val_mask = np.isin(groups, np.unique(data_dict['groups_val']))
    
    # 只預測驗證集數據
    val_uids = np.unique(uid_idx[val_mask])
    sub_rows = []
    
    for uid in val_uids:
        idx = np.where(uid_idx == uid)[0]
        X_current = X[idx]
        row = {"unique_id": uid}

        for col, meta in TARGETS.items():
            bundle = load_model(col)
            scaler = bundle["scaler"]
            model = bundle["model"]

            X_scaled = scaler.transform(X_current)
            proba = model.predict_proba(X_scaled)
            grp = aggregate_group_prob(proba)[0]

            if meta["type"] == "bin":
                pos_idx = np.where(model.classes_ == 1)[0][0]
                row[col] = grp[pos_idx]
                continue

            needed_labels = [0, 1, 2] if col == "play years" else [2, 3, 4, 5]
            for lbl in needed_labels:
                row[f"{col}_{lbl}"] = 0.0

            for idx, lbl in enumerate(model.classes_):
                row[f"{col}_{lbl}"] = grp[idx]

        sub_rows.append(row)

    sub_cols = ["unique_id", "gender", "hold racket handed",
                "play years_0", "play years_1", "play years_2",
                "level_2", "level_3", "level_4", "level_5"]
    
    submission = pd.DataFrame(sub_rows)[sub_cols]
    submission.to_csv("val_pred.csv", index=False, float_format="%.8f")
    print("✅  val_pred.csv ready!")
    
    # 評估驗證集分數
    val_info = info[info['unique_id'].isin(val_uids)]
    scores = evaluate_predictions("val_pred.csv", val_info)
    return scores

if __name__ == '__main__':
    logging.info("Predict validation data started.")
    scores = predict_train()
    logging.info("Predict validation data finished successfully.")

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 437 and the array at index 659 has size 335

## Predict_test

In [None]:
# from feature_utils import aggregate_group_prob
# from model_utils import TARGETS, load_model
import pandas as pd
def predict_test():
    # 1. 產生特徵
    generate_features("./test_data", "test_info.csv", "tabular_data_test")

    # 2. 預先加載所有模型
    models = {col: load_model(col) for col in TARGETS.keys()}
    
    # 3. 批量讀取所有測試數據
    test_files = sorted(Path("tabular_data_test").glob("*.csv"))
    all_uids = []
    all_features = []
    
    for p in test_files:
        uid = int(p.stem)
        df = pd.read_csv(p)          # ← 不要 .values
        all_uids.append(uid)
        all_features.append(df)      # ← 存 DataFrame
    
    sub_rows = []
    for idx, uid in enumerate(all_uids):
        X_df = all_features[idx]
        row = {"unique_id": uid}

        for col, meta in TARGETS.items():
            bundle = models[col]
            scaler = bundle["scaler"]
            model = bundle["model"]

            # === 對齊欄位：補缺 → 排序 ===
            feat_order = bundle["feature_names"]
            X_aligned  = X_df.reindex(columns=feat_order, fill_value=0.0)

            X_scaled = scaler.transform(X_aligned.values)
            proba    = model.predict_proba(X_scaled)
            grp      = aggregate_group_prob(proba)[0]

            if meta["type"] == "bin":
                pos_idx = np.where(model.classes_ == 1)[0][0]
                row[col] = grp[pos_idx]
                continue

            needed_labels = [0, 1, 2] if col == "play years" else [2, 3, 4, 5]
            for lbl in needed_labels:
                row[f"{col}_{lbl}"] = 0.0

            for idx, lbl in enumerate(model.classes_):
                row[f"{col}_{lbl}"] = grp[idx]

        sub_rows.append(row)

    sub_cols = ["unique_id", "gender", "hold racket handed",
                "play years_0","play years_1","play years_2",
                "level_2","level_3","level_4","level_5"]
    df_temp = pd.DataFrame(sub_rows)
    df_temp = df_temp.reindex(columns=sub_cols, fill_value=0.0)
    submission = df_temp[sub_cols]
    # 使用 DataFrame 批量處理
    submission.to_csv("submission.csv", index=False, float_format="%.8f")
    print("✅  submission.csv ready!")
    
predict_test()