In [19]:
import numpy as np
import pandas as pd
import os
import re
from scipy.stats import skew, kurtosis
from scipy.fft import rfft, rfftfreq

# 參數設定
DATA_DIR = './train_data/'
INFO_FILE = './train_info.csv'
OUTPUT_DIR = './features_by_segment/train/'

def spectral_features(segment: np.ndarray, fs: float = 100.0) -> list:
    fft_vals = np.abs(rfft(segment, axis=0))
    freqs    = rfftfreq(segment.shape[0], d=1/fs)
    bands    = [(0,20),(20,50),(50,None)]
    feats    = []
    for lo, hi in bands:
        if hi is None:
            idx = np.where(freqs >= lo)[0]
        else:
            idx = np.where((freqs>=lo)&(freqs<hi))[0]
        if idx.size>0:
            feats.extend(fft_vals[idx].mean(axis=0).tolist())
        else:
            feats.extend([0.0]*segment.shape[1])
    return feats

# 提取單一分段特徵函式
def extract_segment_features(segment: np.ndarray) -> list:
    # segment shape: (n_steps, 6)
    feats = []
    feats.extend(segment.mean(axis=0))
    feats.extend(segment.std(axis=0))
    feats.extend(segment.max(axis=0))
    feats.extend(segment.min(axis=0))
    # 偏度、峭度
    feats.extend(skew(segment, axis=0))
    feats.extend(kurtosis(segment, axis=0))
    # 百分位數
    pctls = np.percentile(segment, [10,25,50,75,90], axis=0)  # shape=(5,6)
    feats.extend(pctls.flatten())
    # 一階差分
    diff = np.diff(segment, axis=0)
    feats.extend(diff.mean(axis=0))
    feats.extend(diff.std(axis=0))
    # (5) —— 新增 —— 频域能量特征
    feats.extend(spectral_features(segment, fs=100.0))
    return feats

# 生成分段級資料，一筆 row 對應一個 (unique_id, segment_id)
# 依據 cut_point 切成多段，然後對每段計算統計特徵
def generate_features(data_dir=DATA_DIR,
                      info_file=INFO_FILE,
                      output_dir=OUTPUT_DIR):
    os.makedirs(output_dir, exist_ok=True)
    info_df = pd.read_csv(info_file, dtype={'unique_id': str, 'cut_point': str})

    all_rows = []
    for _, row in info_df.iterrows():
        uid = row['unique_id']
        # 解析所有切點
        points = list(map(int, re.findall(r"\d+", row['cut_point'])))
        # 邊界：起點 0、所有 points、最後一筆 length
        file_path = os.path.join(data_dir, f"{uid}.txt")
        df = pd.read_csv(file_path, sep='\s+', header=None, dtype=float)
        data = df.values
        boundaries = [0] + points + [data.shape[0]]
        # 逐段提取特徵
        for seg_id in range(len(boundaries) - 1):
            start = boundaries[seg_id]
            end   = boundaries[seg_id + 1]
            seg   = data[start:end]
            if seg.size == 0:
                feats = [0] * (4 * 6)
            else:
                feats = extract_segment_features(seg)
            # 一行：unique_id, segment index, features...
            all_rows.append([uid, seg_id] + feats)

    # 組欄位名稱
    stats = ['mean','std','max','min','skew','kurtosis']
    pctls = ['p10','p25','p50','p75','p90']
    diffs = ['dmean','dstd']
    bands = ['b0_20','b20_50','b50_Nyquist']
    cols = ['unique_id','segment_id']
    for name_group, group in [('stat', stats), ('pct', pctls), ('diff', diffs), ('spec', bands)]:
        for stat in group:
            for axis in range(6):
                cols.append(f"{stat}_{axis}")

    features_df = pd.DataFrame(all_rows, columns=cols)
    features_df.to_csv(os.path.join(output_dir, 'features.csv'), index=False)

# 執行特徵生成
generate_features()

In [None]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupShuffleSplit
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow as tf
from sklearn.metrics import roc_auc_score

class SubmissionAUC(tf.keras.callbacks.Callback):
    def __init__(self, validation_data):
        super().__init__()
        self.X_val, (self.yg_val, self.yh_val, self.yy_val, self.yl_val) = validation_data

    def on_epoch_end(self, epoch, logs=None):
        p_g, p_h, p_y, p_l = self.model.predict(self.X_val, verbose=0)
        y_true_g = np.argmax(self.yg_val, axis=1)  # 0=男、1=女
        y_true_h = np.argmax(self.yh_val, axis=1)  # 0=右手、1=左手

        auc_g = roc_auc_score(y_true_g, p_g[:,1])
        auc_h = roc_auc_score(y_true_h, p_h[:,1])
        # 多類別要用 micro-ovr
        auc_y = roc_auc_score(self.yy_val, p_y, average='micro', multi_class='ovr')
        auc_l = roc_auc_score(self.yl_val, p_l, average='micro', multi_class='ovr')
        sub_auc = (auc_g + auc_h + auc_y + auc_l) / 4
        logs = logs or {}
        logs['val_sub_auc'] = sub_auc
        print(f' — val_sub_auc: {sub_auc:.4f}')

# ------------------ 1. 特徵生成 ------------------


# ------------------ 2. 載入資料並合併 ------------------
# 1. 載入分段特徵與標籤
feat_path = './features_by_segment/train/features.csv'
info_path = './train_info.csv'
feat_df = pd.read_csv(feat_path, dtype={'unique_id': str})
info_df = pd.read_csv(info_path, dtype={'unique_id': str})

df = pd.merge(
    feat_df,
    info_df[['unique_id','gender','hold racket handed','play years','level']],
    on='unique_id'
)


# 3. 構造特徵矩陣 X 與 groups
exclude_cols = ['unique_id','segment_id','gender','hold racket handed','play years','level']
feature_cols = [c for c in df.columns if c not in exclude_cols]

feats_per_seg = len(feature_cols)
num_segments = df['segment_id'].nunique()
unique_ids = df['unique_id'].unique()

X = np.stack([
    df[df['unique_id']==uid]
      .sort_values('segment_id')[feature_cols]
      .values
    for uid in unique_ids
])  # shape = (n_ids, num_segments, feats_per_seg)
# 標籤
info_df = info_df.set_index('unique_id')
label_df = info_df.loc[unique_ids]
y_gender = to_categorical(label_df['gender'].values - 1, num_classes=2)
y_hand   = to_categorical(label_df['hold racket handed'].values - 1, num_classes=2)
y_years  = to_categorical(label_df['play years'].values, num_classes=3)
y_level  = to_categorical(label_df['level'].values - 2, num_classes=4)

# 分割訓練/驗證
gss = GroupShuffleSplit(test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, groups=unique_ids))
X_tr, X_val = X[train_idx], X[val_idx]
yg_tr, yg_val = y_gender[train_idx], y_gender[val_idx]
yh_tr, yh_val = y_hand[train_idx], y_hand[val_idx]
yy_tr, yy_val = y_years[train_idx], y_years[val_idx]
yl_tr, yl_val = y_level[train_idx], y_level[val_idx]

# —— 1. 基本形状检查 —— #
print("=== Shape Check ===")
print("X_tr:", X_tr.shape)
print("X_val:", X_val.shape)
# print("X_test:", X_test.shape)
print("y_gender   train/val:", yg_tr.shape, yg_val.shape)
print("y_hand     train/val:", yh_tr.shape, yh_val.shape)
print("y_years    train/val:", yy_tr.shape, yy_val.shape)
print("y_level    train/val:", yl_tr.shape, yl_val.shape)
print("unique_ids count:", len(unique_ids))
print("train+val == total?", len(train_idx)+len(val_idx), "vs", len(unique_ids))
print("==========================================")

# —— 5. 训练/验证标签分布 —— #
print("=== Label Distribution (train vs val) ===")
for name, y_tr, y_v in [
    ("gender", yg_tr, yg_val),
    ("hand",   yh_tr, yh_val),
    ("years",  yy_tr, yy_val),
    ("level",  yl_tr, yl_val),
]:
    tr_counts = pd.Series(np.argmax(y_tr,1)).value_counts().sort_index()
    v_counts  = pd.Series(np.argmax(y_v,1)).value_counts().sort_index()
    print(f"{name}: train {tr_counts.to_dict()},  val {v_counts.to_dict()}")
print("==========================================")
# —— 2. GroupSplit 互斥性检查 —— #
train_uids = unique_ids[train_idx]
val_uids   = unique_ids[val_idx]
inter = set(train_uids) & set(val_uids)
print("train/val 重叠的 unique_id 个数:", len(inter))
assert len(inter)==0, "❌ 有球手同时出现在 train 和 val！"

# feats_per_seg = X.shape[2]
scaler = StandardScaler()
X_tr = scaler.fit_transform(X_tr.reshape(-1, feats_per_seg)).reshape(X_tr.shape)
X_val = scaler.transform(   X_val.reshape(-1, feats_per_seg)).reshape(X_val.shape)

# ------------------ 4. RNN 模型定義 ------------------
inputs = Input(shape=(num_segments, feats_per_seg))
x = LSTM(64, return_sequences=False)(inputs)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)
out_g = Dense(2, activation='softmax', name='gender')(x)
out_h = Dense(2, activation='softmax', name='hand')(x)
out_y = Dense(3, activation='softmax', name='years')(x)
out_l = Dense(4, activation='softmax', name='level')(x)
model = Model(inputs, [out_g, out_h, out_y, out_l])
model.compile(
    optimizer='adam',
    loss={
        'gender':'categorical_crossentropy',
        'hand':'categorical_crossentropy',
        'years':'categorical_crossentropy',
        'level':'categorical_crossentropy'
    },
    metrics={
        'gender':'accuracy',
        'hand':'accuracy',
        'years':'accuracy',
        'level':'accuracy'
    }
)

# ------------------ 5. 模型訓練與最佳化 ------------------
checkpoint = ModelCheckpoint(
    'best_model.keras', monitor='val_sub_auc', mode='max', save_best_only=True
)
early_stop = EarlyStopping(
    monitor='val_sub_auc', mode='max', patience=3, restore_best_weights=True
)
model.fit(
    X_tr,
    {'gender':yg_tr, 'hand':yh_tr, 'years':yy_tr, 'level':yl_tr},
    validation_data=(
        X_val, 
        {'gender':yg_val, 'hand':yh_val, 'years':yy_val, 'level':yl_val}
    ),
    epochs=20,
    batch_size=32,
    callbacks=[SubmissionAUC((X_val, (yg_val, yh_val, yy_val, yl_val))), 
               checkpoint, early_stop
    ]
)


num_segments= 29  feats_per_seg= 3
=== Shape Check ===
X_tr: (1564, 29, 96)
X_val: (391, 29, 96)
y_gender   train/val: (1564, 2) (391, 2)
y_hand     train/val: (1564, 2) (391, 2)
y_years    train/val: (1564, 3) (391, 3)
y_level    train/val: (1564, 4) (391, 4)
unique_ids count: 1955
train+val == total? 1955 vs 1955
=== Label Distribution (train vs val) ===
gender: train {0: 1287, 1: 277},  val {0: 340, 1: 51}
hand: train {0: 1263, 1: 301},  val {0: 326, 1: 65}
years: train {0: 307, 1: 693, 2: 564},  val {0: 80, 1: 175, 2: 136}
level: train {0: 575, 1: 157, 2: 107, 3: 725},  val {0: 140, 1: 44, 2: 29, 3: 178}
train/val 重叠的 unique_id 个数: 0
Epoch 1/20


ValueError: Input 0 of layer "functional_3" is incompatible with the layer: expected shape=(None, 29, 3), found shape=(None, 29, 96)

In [None]:
import numpy as np
import pandas as pd

# 1. 读原始分段特征
test_feat = pd.read_csv('./features_by_segment/test/features.csv', dtype={'unique_id':str})

# 2. 得到按球手顺序的 unique_id 列表
unique_ids = test_feat['unique_id'].unique()

# 3. 把分段特征 stack 成 (n_players, n_segments, feats_per_seg)
X_test = np.stack([
    test_feat[test_feat['unique_id']==uid]
      .sort_values('segment_id')[feature_cols]
      .values
    for uid in unique_ids
])  # (1431, num_segments, feats_per_seg)

# 4. 标准化
X_test = scaler.transform(X_test.reshape(-1, feats_per_seg)) \
               .reshape(X_test.shape)

# 5. 预测
# —— 4. 预测概率合法性 —— #
p_g, p_h, p_y, p_l = model.predict(X_test, verbose=0)
print("=== Prob Sums (first 10) ===")
print("gender sum:", np.round(p_g[:10].sum(axis=1), 6))
print("hand   sum:", np.round(p_h[:10].sum(axis=1), 6))
print("years  sum:", np.round(p_y[:10].sum(axis=1), 6))
print("level  sum:", np.round(p_l[:10].sum(axis=1), 6))

# 6. 用 unique_ids 构造 submission
sub = pd.DataFrame({'unique_id': unique_ids})
# gender: 索引0是「male」的概率
sub['gender'] = p_g[:, 0]
# hold racket handed: 索引0是「right」的概率
sub['hold racket handed'] = p_h[:, 0]
# play years 的三类
for i in range(p_y.shape[1]):
    sub[f'play years_{i}'] = p_y[:, i]
# level_2、3、4、5
for idx, lvl in enumerate([2,3,4,5]):
    sub[f'level_{lvl}'] = p_l[:, idx]

sub.to_csv('RNN_submission.csv', index=False, float_format='%.6f')
print('RNN_submission.csv 已生成。')


=== Prob Sums (first 10) ===
gender sum: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
hand   sum: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
years  sum: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
level  sum: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
RNN_submission.csv 已生成。
