在 .ipynb 介面最上方的工具列（"+Code"、"+Markdown"……）最右邊有一個 "Outline" 按鈕，點開後就可以看到這份筆記本的目錄！

## 預處理

In [None]:
import os  # 處理檔案路徑
import numpy as np  # 處理數值資料
import pandas as pd  # 處理表格型資料
from tqdm import tqdm  # 讓迴圈加上進度條
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from decimal import Decimal, ROUND_HALF_UP
from sklearn.metrics import accuracy_score

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
DATA_DIR = "/content/drive/MyDrive/39_Training_Dataset/39_Training_Dataset/train_data" # Folder that holds the per-recording .txt files
INFO_PATH = "/content/drive/MyDrive/39_Training_Dataset/39_Training_Dataset/train_info.csv" # Path of the .csv file with the labels

In [None]:
# 前處理：切割每筆資料的 27 次揮拍並提取統計特徵
from scipy.stats import kurtosis, skew
from scipy.signal import welch
from scipy.stats import entropy

def segment_swing(data, cut_points):
    """Split one recording into its 27 consecutive swings.

    Swing ``k`` is the slice ``data[cut_points[k]:cut_points[k + 1]]``, so
    ``cut_points`` must hold at least 28 entries; with fewer an
    ``IndexError`` is raised.  Returns a list of 27 slices of ``data``.
    """
    swings = []
    for k in range(27):
        start, stop = cut_points[k], cut_points[k + 1]
        swings.append(data[start:stop])
    return swings

def extract_features(segment):
    """Compute the 42 summary features of a single swing.

    ``segment`` is indexed as a 2-D array whose columns 0-2 are read as the
    accelerometer axes (Ax, Ay, Az) and columns 3-5 as the gyroscope axes
    (Gx, Gy, Gz).

    The returned list is laid out as follows (downstream code relies on
    this order):

    * 0-5   per-axis mean (acc, then gyro)
    * 6-11  per-axis variance (acc, then gyro)
    * 12-17 per-axis RMS (acc, then gyro)
    * 18-23 max / min / mean of the vector magnitude (acc, then gyro)
    * 24-41 for each axis index 0..2: skew(acc), kurtosis(acc), skew(gyro),
      kurtosis(gyro), spectral entropy(acc), spectral entropy(gyro)
    """
    accel = segment[:, 0:3]
    angular = segment[:, 3:6]
    sensors = (accel, angular)

    feats = []

    # Per-axis statistics: each one is emitted for acc first, then gyro.
    per_axis_stats = (
        lambda block: block.mean(axis=0),
        lambda block: block.var(axis=0),
        lambda block: np.sqrt((block ** 2).mean(axis=0)),  # root mean square
    )
    for stat in per_axis_stats:
        for block in sensors:
            feats.extend(stat(block))

    # Direction-independent magnitude of each sample.
    for block in sensors:
        magnitude = np.linalg.norm(block, axis=1)
        feats.extend([magnitude.max(), magnitude.min(), magnitude.mean()])

    # Shape and frequency descriptors, six values per axis index.
    for axis in range(3):
        pair = (accel[:, axis], angular[:, axis])

        for series in pair:
            feats.append(skew(series))
            feats.append(kurtosis(series))

        # Spectral entropy of the Welch power spectrum, normalised to sum
        # to one (the epsilon guards against an all-zero spectrum).
        for series in pair:
            _, psd = welch(series, nperseg=min(len(series), 64))
            psd = psd / (psd.sum() + 1e-8)
            feats.append(entropy(psd))

    return feats

def preprocess_dataset(info_csv_path, data_dir, is_train=True):
    """Build one feature row per recording listed in the info CSV.

    For every row of ``info_csv_path`` the file ``<data_dir>/<unique_id>.txt``
    is loaded, cut into 27 swings at the row's ``cut_point`` values and
    turned into a flat feature vector (the per-swing features concatenated).

    Parameters
    ----------
    info_csv_path : str
        CSV with at least the columns ``unique_id`` and ``cut_point``; when
        ``is_train`` is true it must also provide ``gender``,
        ``hold racket handed``, ``play years`` and ``level``.
    data_dir : str
        Folder containing the ``<unique_id>.txt`` recordings.
    is_train : bool
        When true, the encoded labels are added next to ``unique_id``.

    Returns
    -------
    pandas.DataFrame
        Metadata columns followed by feature columns ``f0 .. f{n-1}``.  If no
        recording could be processed, an empty frame that only carries the
        metadata columns is returned.

    Notes
    -----
    Rows whose file is missing or that do not have exactly 28 cut points are
    skipped silently.  Recordings that cannot be parsed are skipped as well,
    and a single ``RuntimeWarning`` summarises them.
    """
    import warnings

    df_info = pd.read_csv(info_csv_path)
    feature_list = []
    meta_list = []
    skipped = []

    for _, row in df_info.iterrows():
        uid = row["unique_id"]
        txt_path = os.path.join(data_dir, f"{uid}.txt")
        if not os.path.exists(txt_path):
            continue
        try:
            data = np.loadtxt(txt_path)
            # Accept both "[1 2 3]" and "[1, 2, 3]" style lists.
            tokens = str(row["cut_point"]).strip("[]").replace(",", " ").split()
            cut_points = [int(tok) for tok in tokens]
            if len(cut_points) != 28:
                continue  # 27 swings need 28 boundaries
            swings = segment_swing(data, cut_points)
            flatten_feats = np.concatenate([extract_features(s) for s in swings])

            if is_train:
                meta = {
                    "unique_id": uid,
                    "gender": 1 if row["gender"] == 2 else 0,
                    "hand": 1 if row["hold racket handed"] == 2 else 0,
                    "years": row["play years"],
                    "level": row["level"] - 2,  # shift so the smallest level (2) becomes 0
                }
            else:
                meta = {"unique_id": uid}
        except (OSError, ValueError, IndexError, TypeError):
            # Malformed or truncated recording: keep going, but remember it.
            skipped.append(uid)
            continue

        # Append both lists only after everything succeeded so that the
        # feature rows and the metadata rows can never get out of step.
        feature_list.append(flatten_feats)
        meta_list.append(meta)

    if skipped:
        warnings.warn(
            f"preprocess_dataset: skipped {len(skipped)} unreadable "
            f"recording(s), e.g. {skipped[:5]}",
            RuntimeWarning,
            stacklevel=2,
        )

    if not feature_list:
        label_cols = ["gender", "hand", "years", "level"] if is_train else []
        return pd.DataFrame(columns=["unique_id"] + label_cols)

    n_features = len(feature_list[0])
    df_feat = pd.DataFrame(feature_list, columns=[f"f{i}" for i in range(n_features)])
    df_meta = pd.DataFrame(meta_list)
    return pd.concat([df_meta, df_feat], axis=1)

# Run the preprocessing for both splits and cache the feature tables as CSV.
_test_info_path = "/content/drive/MyDrive/39_Test_Dataset/39_Test_Dataset/test_info.csv"
_test_data_dir = "/content/drive/MyDrive/39_Test_Dataset/39_Test_Dataset/test_data"

train_df = preprocess_dataset(INFO_PATH, DATA_DIR, is_train=True)
test_df = preprocess_dataset(_test_info_path, _test_data_dir, is_train=False)

train_df.to_csv("train_features.csv", index=False)
test_df.to_csv("test_features.csv", index=False)

print("✅ train_features.csv和test_features.csv 已完成前處理")


  feats += [skew(g), kurtosis(g)]


✅ train_features.csv和test_features.csv 已完成前處理


## Optional. 權重處理

In [None]:
# Load the cached feature tables.
train_df = pd.read_csv("train_features.csv")
test_df = pd.read_csv("test_features.csv")
uids = test_df["unique_id"].tolist()

# Separate the feature matrix from the four label columns.
_label_columns = ["gender", "hand", "years", "level"]
X = train_df.drop(columns=["unique_id"] + _label_columns).fillna(0)
y_gender, y_hand, y_years, y_level = (train_df[name] for name in _label_columns)

# Class counts per task as (caption, label series, class value).
# Ratios observed on the training set:
#   gender: almost 5x   -> severely imbalanced (oversampling / SMOTE suggested)
#   hand:   about 4.34x -> clearly imbalanced
#   years:  about 2.2x  -> mildly imbalanced
#   level:  about 6.6x  -> clearly imbalanced
_class_reports = [
    ("👨 男生有幾筆：", y_gender, 0),
    ("👩 女生有幾筆：", y_gender, 1),
    ("🫱 右手持拍有幾筆：", y_hand, 0),
    ("🫲 左手持拍有幾筆：", y_hand, 1),
    ("低球齡有幾筆：", y_years, 0),
    ("中球齡有幾筆：", y_years, 1),
    ("高球齡有幾筆：", y_years, 2),
    ("等級0：", y_level, 0),
    ("等級1：", y_level, 1),
    ("等級2：", y_level, 2),
    ("等級3：", y_level, 3),
]
for _caption, _labels, _value in _class_reports:
    print(_caption, np.sum(_labels == _value))

# Spot check of the labels of one sample (row 1390).
for _labels in (y_gender, y_hand, y_years):
    print(_labels[1390])

👨 男生有幾筆： 1627
👩 女生有幾筆： 328
🫱 右手持拍有幾筆： 1589
🫲 左手持拍有幾筆： 366
低球齡有幾筆： 387
中球齡有幾筆： 868
高球齡有幾筆： 700
等級0： 715
等級1： 201
等級2： 136
等級3： 903
0
1
0


## 主程式

In [None]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight
from decimal import Decimal, ROUND_HALF_UP

# Hold out 20% of the samples for validation.  One call splits the features
# and all four label series with the same shuffle, so they stay aligned by
# construction instead of relying on four calls sharing a random_state.
(
    X_train, X_val,
    y_gender_train, y_gender_val,
    y_hand_train, y_hand_val,
    y_years_train, y_years_val,
    y_level_train, y_level_val,
) = train_test_split(
    X, y_gender, y_hand, y_years, y_level, test_size=0.2, random_state=42
)
# NOTE(review): the split is random per recording; if several recordings
# belong to the same player the validation scores below may be optimistic.
# Confirm, and consider a group-wise split.


# Model training

# "balanced" per-sample weights compensate for the class imbalance.
gender_weights = compute_sample_weight(class_weight="balanced", y=y_gender_train)
hand_weights = compute_sample_weight(class_weight="balanced", y=y_hand_train)
years_weights = compute_sample_weight(class_weight="balanced", y=y_years_train)
level_weights = compute_sample_weight(class_weight="balanced", y=y_level_train)

# Negative/positive ratios that could be used as XGBoost's scale_pos_weight.
# They are currently not passed to any model (sample weights are used
# instead) and are kept only for experimentation.
scale_pos_gender = (y_gender_train == 0).sum() / (y_gender_train == 1).sum()
scale_pos_hand = (y_hand_train == 0).sum() / (y_hand_train == 1).sum()

# Gender model (weighted)
xgb_gender = XGBClassifier(eval_metric='logloss')
xgb_gender.fit(X_train, y_gender_train, sample_weight=gender_weights)
y_pred_gender = xgb_gender.predict(X_val)
print("🎯 Gender acc:", accuracy_score(y_gender_val, y_pred_gender))

# Hand model on all features (weighted)
xgb_hand = XGBClassifier(eval_metric='logloss')
xgb_hand.fit(X_train, y_hand_train, sample_weight=hand_weights)
y_pred_hand = xgb_hand.predict(X_val)
print("🎯 Hand acc:", accuracy_score(y_hand_val, y_pred_hand))

# Column positions of the X-axis features of every swing.  Each swing
# contributes FEATS_PER_SWING consecutive columns; within a swing the offsets
# 0 / 3 / 6 / 9 are Ax mean, Gx mean, Ax variance and Gx variance.
N_SWINGS = 27
FEATS_PER_SWING = 42
x_axis_indices = [
    swing * FEATS_PER_SWING + offset
    for swing in range(N_SWINGS)
    for offset in (0, 3, 6, 9)
]

X_xaxis_train = X_train.iloc[:, x_axis_indices]
X_xaxis_val = X_val.iloc[:, x_axis_indices]

# Hand model restricted to the X-axis features
xgb_hand_x = XGBClassifier(eval_metric='logloss')
xgb_hand_x.fit(X_xaxis_train, y_hand_train, sample_weight=hand_weights)
print("X軸特徵 Only acc:", accuracy_score(y_hand_val, xgb_hand_x.predict(X_xaxis_val)))

# Years model (3 classes)
xgb_years = XGBClassifier(eval_metric='mlogloss', num_class=3)
xgb_years.fit(X_train, y_years_train, sample_weight=years_weights)
y_pred_years = xgb_years.predict(X_val)
print("🎯 Years acc:", accuracy_score(y_years_val, y_pred_years))

# Level model (4 classes)
xgb_level = XGBClassifier(eval_metric='mlogloss', num_class=4)
xgb_level.fit(X_train, y_level_train, sample_weight=level_weights)
y_pred_level = xgb_level.predict(X_val)
print("🎯 Level acc:", accuracy_score(y_level_val, y_pred_level))


🎯 Gender acc: 0.9718670076726342
🎯 Hand acc: 0.9974424552429667
X軸特徵 Only acc: 1.0
🎯 Years acc: 0.9028132992327366
🎯 Level acc: 0.9104859335038363


In [None]:
# Inference on the test set: every column except the id is a feature.
X_test = test_df.drop(columns=["unique_id"], errors="ignore").fillna(0)
X_xaxis_test = X_test.iloc[:, x_axis_indices]

# Binary tasks: keep only the probability of class 1.
_gender_proba = xgb_gender.predict_proba(X_test)
gender_pred = _gender_proba[:, 1]

# The hand prediction comes from the model trained on the X-axis subset.
_hand_proba = xgb_hand_x.predict_proba(X_xaxis_test)
hand_pred = _hand_proba[:, 1]

# Multi-class tasks: keep the full probability matrix (one column per class).
years_pred = xgb_years.predict_proba(X_test)
level_pred = xgb_level.predict_proba(X_test)

In [None]:
# Debug aid: show the test feature matrix to eyeball value ranges.
print(X_test)

               f0           f1           f2           f3           f4  \
0     1394.294737  1149.168421  -691.821053 -2895.084211 -4766.031579   
1     3150.474747  -982.111111  -496.494949  -364.474747  1501.060606   
2     3945.687500 -1513.137500   146.212500  -842.912500  2293.300000   
3     3232.700000 -2123.400000  -573.957143    52.771429  2189.971429   
4     3976.363636 -1927.500000 -1202.227273  -356.409091  1199.238636   
...           ...          ...          ...          ...          ...   
1425  3301.483516 -2525.890110   284.362637   589.208791  2436.340659   
1426  2678.644444 -1519.500000  -750.533333 -1111.344444  1201.344444   
1427  2989.430108  -344.268817  -628.301075 -4936.344086  2537.591398   
1428  4225.342857 -3032.514286 -1703.400000  -642.266667  1176.333333   
1429  3506.988764 -3023.134831  -860.595506  -111.011236  1719.550562   

               f5            f6            f7            f8            f9  \
0     3710.505263  2.211002e+04  2.145752e+05 

In [None]:
# 輸出 submission
def format_float(val):
    """Return ``val`` as a fixed-point string with exactly six decimals.

    The value is converted to ``Decimal`` and rounded with ``ROUND_HALF_UP``.
    For floats the rounding is applied to the exact binary value, so a number
    that merely prints as ``...5`` may still round down.

    Parameters
    ----------
    val : float, int, str, Decimal or NumPy scalar
        NumPy scalars that are not Python floats (e.g. ``numpy.float32``)
        are converted with ``float()`` first, because ``Decimal`` rejects
        them with a ``TypeError``.

    Returns
    -------
    str
        For example ``"0.500000"``.
    """
    if not isinstance(val, (Decimal, str, int, float)):
        val = float(val)
    return str(Decimal(val).quantize(Decimal('0.000001'), rounding=ROUND_HALF_UP))

# Submission layout: the id first, then one probability column per output.
# The class-k column of each multi-class model maps to the k-th name below.
_submission_columns = [
    ("unique_id", uids),
    ("gender", gender_pred),
    ("hold racket handed", hand_pred),
    ("play years_0", years_pred[:, 0]),
    ("play years_1", years_pred[:, 1]),
    ("play years_2", years_pred[:, 2]),
    ("level_2", level_pred[:, 0]),
    ("level_3", level_pred[:, 1]),
    ("level_4", level_pred[:, 2]),
    ("level_5", level_pred[:, 3]),
]
submission_df = pd.DataFrame(dict(_submission_columns))

# Every column after the id is rendered as a six-decimal string.
_probability_columns = submission_df.columns[1:]
for _name in _probability_columns:
    submission_df[_name] = submission_df[_name].apply(format_float)

submission_df.to_csv("submission_xgb_final.csv", index=False)
print("✅ submission_xgb_final.csv saved.")



✅ submission_xgb_final.csv saved.


In [None]:
def generate_ground_truth_csv(filename='ground_truth.csv', num_samples=1430, seed=42):
    """Write a CSV of randomly generated labels for testing the scorer.

    The labels are drawn uniformly at random, so the file is only suitable
    for checking that a submission can be scored, not for measuring model
    quality.

    Parameters
    ----------
    filename : str
        Output path of the CSV file.
    num_samples : int
        Number of rows; ``unique_id`` runs from 1001 to ``1000 + num_samples``.
    seed : int
        Seed of the random generator, making the output reproducible.
    """
    # A private RandomState yields the same values as seeding the global
    # NumPy generator, without disturbing other code that uses np.random.
    rng = np.random.RandomState(seed)

    data = {
        'unique_id': [1000 + i for i in range(1, num_samples + 1)],
        'gender': rng.randint(0, 2, size=num_samples),  # 0 or 1
        'hold racket handed': rng.randint(0, 2, size=num_samples),  # 0 or 1
        'play years': rng.randint(0, 3, size=num_samples),  # 0, 1, 2
        'level': rng.randint(2, 6, size=num_samples),  # 2, 3, 4, 5
    }

    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    # Report the file that was actually written, not a hard-coded name.
    print(f"✅ {filename} 已成功生成，共 {num_samples} 筆資料。")

# Generate the synthetic ground-truth file with the default settings.
generate_ground_truth_csv()

✅ ground_truth.csv 已成功生成，共 1430 筆資料。


In [None]:
from sklearn.metrics import roc_auc_score


def evaluate_submission(submission_path, ground_truth_path):
    """Score a submission CSV against a ground-truth CSV with ROC AUC.

    The binary tasks (``gender``, ``hold racket handed``) use the plain ROC
    AUC; the multi-class tasks (``play years``, ``level``) use the
    macro-averaged one-vs-rest ROC AUC.  The final score is the unweighted
    mean of the four values, which are also printed.

    Parameters
    ----------
    submission_path : str
        CSV with the probability columns ``gender``, ``hold racket handed``,
        ``play years_0..2`` and ``level_2..5``.
    ground_truth_path : str
        CSV with the label columns ``gender``, ``hold racket handed``,
        ``play years`` and ``level``.

    Returns
    -------
    float
        The final score.

    Raises
    ------
    ValueError
        If the two files do not have the same number of rows.
    """
    import warnings

    # Load both tables.
    pred = pd.read_csv(submission_path)
    true = pd.read_csv(ground_truth_path)

    if len(pred) != len(true):
        raise ValueError(
            f"submission has {len(pred)} rows but ground truth has {len(true)}"
        )

    # Align the rows by unique_id when both files describe the same ids;
    # otherwise fall back to comparing row by row.
    id_col = "unique_id"
    if id_col in pred.columns and id_col in true.columns:
        pred_ids, true_ids = pred[id_col], true[id_col]
        same_ids = (
            pred_ids.is_unique
            and true_ids.is_unique
            and set(pred_ids) == set(true_ids)
        )
        if same_ids:
            true = true.set_index(id_col).loc[pred_ids].reset_index()
        else:
            warnings.warn(
                "unique_id values of the submission and the ground truth do "
                "not match; rows are compared by position.",
                RuntimeWarning,
                stacklevel=2,
            )

    # Binary tasks
    gender_auc = roc_auc_score(true['gender'], pred['gender'])
    hand_auc = roc_auc_score(true['hold racket handed'], pred['hold racket handed'])

    # Multi-class tasks: one-vs-rest, macro average
    years_true = true['play years']
    years_pred = pred[['play years_0', 'play years_1', 'play years_2']]
    years_auc = roc_auc_score(years_true, years_pred, multi_class='ovr', average='macro')

    level_true = true['level']
    level_pred = pred[['level_2', 'level_3', 'level_4', 'level_5']]
    level_auc = roc_auc_score(level_true, level_pred, multi_class='ovr', average='macro')

    # Final score: mean of the four task scores
    final_score = (gender_auc + hand_auc + years_auc + level_auc) / 4

    print(f" ROC AUC - Gender: {gender_auc:.4f}")
    print(f" ROC AUC - Hand:   {hand_auc:.4f}")
    print(f" ROC AUC - Years:  {years_auc:.4f}")
    print(f" ROC AUC - Level:  {level_auc:.4f}")
    print(f"\n Final Score:     {final_score:.4f}")

    return final_score

# Score the saved submission against ground_truth.csv. When that file holds
# the randomly generated labels, a score near 0.5 is expected and the run only
# confirms that the submission format can be scored.
evaluate_submission("submission_xgb_final.csv", "ground_truth.csv")


 ROC AUC - Gender: 0.4987
 ROC AUC - Hand:   0.5139
 ROC AUC - Years:  0.5029
 ROC AUC - Level:  0.5045

 Final Score:     0.5050


np.float64(0.5050118588045522)