In [1]:
import tomli
import numpy as np
import pandas as pd
import glob
import math
import csv
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import roc_auc_score

In [2]:
with open("config.toml", "rb") as f:
    config = tomli.load(f)

In [3]:
train_info = pd.read_csv(config['Train']['InfoFile'])
train_info.head()
# train_info['level'].unique()

Unnamed: 0,unique_id,player_id,mode,gender,hold racket handed,play years,level,cut_point
0,1,41,1,1,1,1,5,[ 0 61 122 183 244 305 366 428 489 ...
1,2,41,2,1,1,1,5,[ 0 74 149 224 299 374 449 524 599 ...
2,3,41,3,1,1,1,5,[ 0 103 207 311 415 519 623 727 831 ...
3,4,41,4,1,1,1,5,[ 0 101 203 304 406 507 609 710 812 ...
4,5,41,5,1,1,1,5,[ 0 105 211 317 423 529 635 740 846 ...


In [4]:
unique_players = train_info['player_id'].unique()
train_players, test_players = train_test_split(unique_players, test_size=0.2, random_state=42)

# 讀取特徵 CSV 檔（位於 "./tabular_data_train"）
datapath = './tabular_data_train'
datalist = glob.glob(os.path.join(config['Train']['FeaturePath'], "*.csv"))
target_mask = ['gender', 'hold racket handed', 'play years', 'level']

In [5]:
sample_file = pd.read_csv(datalist[0])
sample_file.head()

Unnamed: 0,ax_mean,ay_mean,az_mean,gx_mean,gy_mean,gz_mean,ax_var,ay_var,az_var,gx_var,...,a_fft,g_fft,a_psd,g_psd,a_kurt,g_kurt,a_skewn,g_skewn,a_entropy,g_entropy
0,-3210.325581,-1472.639535,769.174419,3888.139535,15000.069767,-1508.011628,2625.734377,2141.530712,2579.672441,22376.099388,...,34235.187587,34235.187587,8524912000.0,8524912000.0,7.929079,2.71536,193.153135,63.072307,-0.055732,-0.055732
1,-4266.151163,-1758.790698,433.104651,4868.604651,14022.290698,3694.767442,2716.650412,3104.92144,2033.799252,23827.776754,...,4200.628815,4200.628815,285508700.0,285508700.0,4.805886,3.387055,116.344046,86.140297,-0.055453,-0.055453
2,-3777.05814,-2363.44186,496.430233,3838.127907,17846.848837,2016.662791,2646.756001,2342.910897,2150.277547,25096.946277,...,8077.753674,8077.753674,206315200.0,206315200.0,3.82819,2.898664,113.018318,54.248883,-0.056366,-0.056366
3,-3545.05814,-1156.709302,618.046512,3811.674419,14340.662791,3557.930233,2507.900301,2258.403467,1908.36597,22559.762748,...,5265.867665,5265.867665,155785500.0,155785500.0,2.039432,3.110605,30.070979,67.902479,-0.056986,-0.056986
4,-3078.068966,-793.758621,137.551724,10518.264368,9810.137931,-9929.16092,2518.215706,2126.238452,1811.472957,19047.955661,...,1315.209915,1315.209915,61764660.0,61764660.0,8.145404,5.094156,205.116213,137.770788,-0.057376,-0.057376


In [6]:
# 根據 test_players 分組資料
x_train = pd.DataFrame()
y_train = pd.DataFrame(columns=target_mask)
x_test = pd.DataFrame()
y_test = pd.DataFrame(columns=target_mask)

for file in datalist:
    unique_id = int(Path(file).stem)
    row = train_info[train_info['unique_id'] == unique_id]
    if row.empty:
        continue
    player_id = row['player_id'].iloc[0]
    data = pd.read_csv(file)
    target = row[target_mask]
    target_repeated = pd.concat([target] * len(data))
    if player_id in train_players:
        x_train = pd.concat([x_train, data], ignore_index=True)
        y_train = pd.concat([y_train, target_repeated], ignore_index=True)
    elif player_id in test_players:
        x_test = pd.concat([x_test, data], ignore_index=True)
        y_test = pd.concat([y_test, target_repeated], ignore_index=True)

In [7]:
# 標準化特徵
scaler = MinMaxScaler()
le = LabelEncoder()
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

group_size = 27

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import roc_auc_score
import numpy as np

# Define a simple neural network for binary classification
class BinaryClassifier(nn.Module):
    def __init__(self, input_size):
        super(BinaryClassifier, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        return self.model(x)

# Define a neural network for multi-class classification
class MultiClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(MultiClassifier, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, num_classes)
        )
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x):
        logits = self.model(x)
        return logits
    
    def predict_proba(self, x):
        logits = self.model(x)
        return self.softmax(logits)

# Function to train binary classification model
def train_binary_model(X_train, y_train, X_test, y_test, batch_size=64, epochs=10, lr=0.001):
    # Convert data to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_train)
    y_train_tensor = torch.FloatTensor(y_train.reshape(-1, 1))
    X_test_tensor = torch.FloatTensor(X_test)
    
    # Create dataset and dataloader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    # Initialize model
    input_size = X_train.shape[1]
    model = BinaryClassifier(input_size)
    
    # Loss and optimizer
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    # Training loop
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        
        print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}')
    
    # Make predictions
    model.eval()
    with torch.no_grad():
        predicted = model(X_test_tensor).numpy().flatten()
    
    # Group predictions
    num_groups = len(predicted) // group_size
    y_pred = []
    for i in range(num_groups):
        group_preds = predicted[i*group_size: (i+1)*group_size]
        # Use the same logic as your original code
        if sum(group_preds[:group_size]) / group_size > 0.5:
            y_pred.append(max(group_preds))
        else:
            y_pred.append(min(group_preds))
    
    y_test_agg = [y_test[i*group_size] for i in range(num_groups)]
    
    auc_score = roc_auc_score(y_test_agg, y_pred, average='micro')
    print(f'Binary AUC: {auc_score}')
    return model, auc_score

# Function to train multi-class classification model
def train_multi_model(X_train, y_train, X_test, y_test, batch_size=64, epochs=10, lr=0.001):
    # Get number of classes
    num_classes = len(np.unique(y_train))
    
    # Convert data to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_train)
    y_train_tensor = torch.LongTensor(y_train)
    X_test_tensor = torch.FloatTensor(X_test)
    
    # Create dataset and dataloader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    # Initialize model
    input_size = X_train.shape[1]
    model = MultiClassifier(input_size, num_classes)
    
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    # Training loop
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        
        print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}')
    
    # Make predictions
    model.eval()
    with torch.no_grad():
        logits = model(X_test_tensor)
        predicted = torch.softmax(logits, dim=1).numpy()
    
    # Group predictions as in your original code
    num_groups = len(predicted) // group_size
    y_pred = []
    for i in range(num_groups):
        group_pred = predicted[i*group_size: (i+1)*group_size]
        # Sum probabilities for each class in the group
        class_sums = [sum([group_pred[k][j] for k in range(group_size)]) for j in range(num_classes)]
        chosen_class = np.argmax(class_sums)
        # Get probabilities for the chosen class
        candidate_probs = [group_pred[k][chosen_class] for k in range(group_size)]
        best_instance = np.argmax(candidate_probs)
        y_pred.append(group_pred[best_instance])
    
    y_test_agg = [y_test[i*group_size] for i in range(num_groups)]
    
    auc_score = roc_auc_score(y_test_agg, y_pred, average='micro', multi_class='ovr')
    print(f'Multi-class AUC: {auc_score}')
    return model, auc_score

In [10]:
# For binary classification (gender)
y_train_le_gender = le.fit_transform(y_train['gender'])
y_test_le_gender = le.transform(y_test['gender'])
binary_model, gender_auc = train_binary_model(X_train_scaled, y_train_le_gender, X_test_scaled, y_test_le_gender)

# For multi-class classification (level)
y_train_le_level = le.fit_transform(y_train['level'])
y_test_le_level = le.transform(y_test['level'])
level_model, level_auc = train_multi_model(X_train_scaled, y_train_le_level, X_test_scaled, y_test_le_level)

# For multi-class classification (play years)
y_train_le_years = le.fit_transform(y_train['play years'])
y_test_le_years = le.transform(y_test['play years'])
years_model, years_auc = train_multi_model(X_train_scaled, y_train_le_years, X_test_scaled, y_test_le_years)

# For binary classification (hold racket handed)
y_train_le_hold = le.fit_transform(y_train['hold racket handed'])
y_test_le_hold = le.transform(y_test['hold racket handed'])
hold_model, hold_auc = train_binary_model(X_train_scaled, y_train_le_hold, X_test_scaled, y_test_le_hold)

Epoch 1/10, Loss: 0.41651737363371133
Epoch 2/10, Loss: 0.35574488310055463
Epoch 3/10, Loss: 0.3026491910718166
Epoch 4/10, Loss: 0.2643374332327458
Epoch 5/10, Loss: 0.23999136304156785
Epoch 6/10, Loss: 0.22434403215459855
Epoch 7/10, Loss: 0.2104383984657183
Epoch 8/10, Loss: 0.20147640064154584
Epoch 9/10, Loss: 0.19488754668350997
Epoch 10/10, Loss: 0.1897100304030846
Binary AUC: 0.9126506024096386
Epoch 1/10, Loss: 0.8333555248592905
Epoch 2/10, Loss: 0.6395526257551969
Epoch 3/10, Loss: 0.5623715902300186
Epoch 4/10, Loss: 0.5132840724386943
Epoch 5/10, Loss: 0.4808901316136167
Epoch 6/10, Loss: 0.4545907273185489
Epoch 7/10, Loss: 0.4253676595843909
Epoch 8/10, Loss: 0.41232867811096313
Epoch 9/10, Loss: 0.3957994635382744
Epoch 10/10, Loss: 0.38115763491872967
Multi-class AUC: 0.8323800745772414
Epoch 1/10, Loss: 0.830948534077161
Epoch 2/10, Loss: 0.7247194632366186
Epoch 3/10, Loss: 0.6576969746948196
Epoch 4/10, Loss: 0.590321555075943
Epoch 5/10, Loss: 0.5444560262801259


In [53]:
def model_binary(X_train, y_train, X_test, y_test):
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)
    
    predicted = clf.predict_proba(X_test)
    print(predicted.shape) 
    # 取出正類（index 0）的概率
    predicted = [predicted[i][0] for i in range(len(predicted))]

    
    num_groups = len(predicted) // group_size 
    if sum(predicted[:group_size]) / group_size > 0.5:
        y_pred = [max(predicted[i*group_size: (i+1)*group_size]) for i in range(num_groups)]
    else:
        y_pred = [min(predicted[i*group_size: (i+1)*group_size]) for i in range(num_groups)]
    
    y_pred  = [1 - x for x in y_pred]
    y_test_agg = [y_test[i*group_size] for i in range(num_groups)]
    
    auc_score = roc_auc_score(y_test_agg, y_pred, average='micro')
    print(auc_score)

def model_multiary(X_train, y_train, X_test, y_test):
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)
    predicted = clf.predict_proba(X_test)
    num_groups = len(predicted) // group_size
    y_pred = []
    for i in range(num_groups):
        group_pred = predicted[i*group_size: (i+1)*group_size]
        num_classes = len(np.unique(y_train))
        # 對每個類別計算該組內的總機率
        class_sums = [sum([group_pred[k][j] for k in range(group_size)]) for j in range(num_classes)]
        chosen_class = np.argmax(class_sums)
        candidate_probs = [group_pred[k][chosen_class] for k in range(group_size)]
        best_instance = np.argmax(candidate_probs)
        y_pred.append(group_pred[best_instance])
    
    y_test_agg = [y_test[i*group_size] for i in range(num_groups)]
    auc_score = roc_auc_score(y_test_agg, y_pred, average='micro', multi_class='ovr')
    print('Multiary AUC:', auc_score)

In [54]:
# 評分：針對各目標進行模型訓練與評分
y_train_le_gender = le.fit_transform(y_train['gender'])
y_test_le_gender = le.transform(y_test['gender'])
model_binary(X_train_scaled, y_train_le_gender, X_test_scaled, y_test_le_gender)

# y_train_le_hold = le.fit_transform(y_train['hold racket handed'])
# y_test_le_hold = le.transform(y_test['hold racket handed'])
# model_binary(X_train_scaled, y_train_le_hold, X_test_scaled, y_test_le_hold)

# y_train_le_years = le.fit_transform(y_train['play years'])
# y_test_le_years = le.transform(y_test['play years'])
# model_multiary(X_train_scaled, y_train_le_years, X_test_scaled, y_test_le_years)

# y_train_le_level = le.fit_transform(y_train['level'])
# y_test_le_level = le.transform(y_test['level'])
# model_multiary(X_train_scaled, y_train_le_level, X_test_scaled, y_test_le_level)


(10746, 2)
0.7907995618838993


In [66]:
y_test['gender'].unique()

array([1, 2], dtype=object)

In [73]:
np.unique(le.fit_transform(y_test['level']))

array([0, 1, 2, 3])