In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import label_binarize

In [None]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

X_train= train_data.drop(columns=['data_ID', 'player_ID', 'gender', 'play years', 'hold racket handed', 'level'])
X_test = test_data.drop(columns=['data_ID'])

y_play_years = train_data['play years']
y_level = train_data['level']
y_gender = train_data['gender']
y_hold= train_data['hold racket handed']

# 合併資料進行正規化
X_combined = pd.concat([X_train, X_test], axis=0)
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# 將正規化後的數據拆分回訓練集和測試集
X_scaled = X_combined_scaled[:len(X_train)]
X_test_scaled = X_combined_scaled[len(X_train):]


X_train_play_years, X_val_play_years, y_train_play_years, y_val_play_years = train_test_split(X_scaled, y_play_years, test_size=0.2, random_state=21)
X_train_level, X_val_level, y_train_level, y_val_level = train_test_split(X_scaled, y_level, test_size=0.2, random_state=21)
X_train_gender, X_val_gender, y_train_gender, y_val_gender = train_test_split(X_scaled, y_gender, test_size=0.2, random_state=21)
X_train_hold, X_val_hold, y_train_hold, y_val_hold = train_test_split(X_scaled, y_hold, test_size=0.2, random_state=21)

In [20]:
play_years_model = XGBClassifier(objective='multi:softmax', num_class=3, seed = 42, n_jobs=-1) #objective='multi:softmax', num_class=3,
play_years_model.fit(X_train_play_years, y_train_play_years, verbose=True)

y_pred_years = play_years_model.predict_proba(X_val_play_years)
y_val_play_years_binarized = label_binarize(y_val_play_years, classes=[0, 1, 2])

# 計算 Micro One-vs-Rest ROC AUC
roc_auc_years = roc_auc_score(y_val_play_years_binarized, y_pred_years, multi_class="ovr", average="micro")
print(f"Micro One-vs-Rest ROC AUC for play_years_model: {roc_auc_years}")


Micro One-vs-Rest ROC AUC for play_years_model: 0.9921353981826516


In [21]:
level_model = XGBClassifier(objective='multi:softmax', num_class=3, seed = 42, n_jobs=-1) #objective='multi:softmax', num_class=3, 
level_model.fit(X_train_level, y_train_level, verbose=True)

y_pred_level = level_model.predict_proba(X_val_level)
y_val_level_binarized = label_binarize(y_val_level, classes=[0, 1, 2])

roc_auc_level = roc_auc_score(y_val_level_binarized, y_pred_level, multi_class="ovr", average="micro")
print(f"Micro One-vs-Rest ROC AUC for level_model: {roc_auc_level}")

Micro One-vs-Rest ROC AUC for level_model: 0.9950129238811198


In [None]:
selected_features_gender_1 = ['ay_mean', 'ay_rms', 'az_mean', 'a_mean', 'gz_rms', 'g_mean', 'ax_rms']
selected_features_gender_2 = ['g_entropy', 'a_kurt', 'gz_var', 'g_kurt', 'gy_var', 'az_var']

X_train_gender_selected = X_train[selected_features_gender_1 + selected_features_gender_2]
X_test_gender_selected = X_test[selected_features_gender_1 + selected_features_gender_2]

# 創建人工特徵
X_train_gender_selected['ay_az_mean'] = X_train_gender_selected['ay_mean'] * X_train_gender_selected['az_mean']
X_train_gender_selected['gz_rms_g_mean'] = X_train_gender_selected['gz_rms'] * X_train_gender_selected['g_mean']
X_train_gender_selected['ay_rms_squared'] = X_train_gender_selected['ay_rms'] ** 2
X_train_gender_selected['log_g_entropy'] = np.log1p(X_train_gender_selected['g_entropy'])
X_train_gender_selected['az_mean_g_kurt_ratio'] = X_train_gender_selected['az_mean'] / (X_train_gender_selected['g_kurt'] + 1e-5)

X_test_gender_selected['ay_az_mean'] = X_test_gender_selected['ay_mean'] * X_test_gender_selected['az_mean']
X_test_gender_selected['gz_rms_g_mean'] = X_test_gender_selected['gz_rms'] * X_test_gender_selected['g_mean']
X_test_gender_selected['ay_rms_squared'] = X_test_gender_selected['ay_rms'] ** 2
X_test_gender_selected['log_g_entropy'] = np.log1p(X_test_gender_selected['g_entropy'])
X_test_gender_selected['az_mean_g_kurt_ratio'] = X_test_gender_selected['az_mean'] / (X_test_gender_selected['g_kurt'] + 1e-5)

# 將人工特徵進行標準化
X_train_selected_scaled = scaler.fit_transform(X_train_gender_selected)
X_test_selected_scaled = scaler.transform(X_test_gender_selected)

# 合併人工特徵回到原始的正規化資料集中
X_train_gender_enhanced = np.hstack((X_scaled, X_train_selected_scaled))
X_test_gender_enhanced = np.hstack((X_test_scaled, X_test_selected_scaled))

# 打印人工特徵的名稱
print("新增的人工特徵:", ['ay_az_mean', 'gz_rms_g_mean', 'ay_rms_squared', 'log_g_entropy', 'az_mean_g_kurt_ratio'])


新增的人工特徵: ['ay_az_mean', 'gz_rms_g_mean', 'ay_rms_squared', 'log_g_entropy', 'az_mean_g_kurt_ratio']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_selected['ay_az_mean'] = X_train_selected['ay_mean'] * X_train_selected['az_mean']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_selected['gz_rms_g_mean'] = X_train_selected['gz_rms'] * X_train_selected['g_mean']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_selected['ay_r

In [None]:
gender_model = XGBClassifier(objective='multi:softmax', num_class=2, seed = 42, n_jobs=-1)#objective='multi:softmax', num_class=2, 
gender_model.fit(X_train_gender_enhanced, y_gender, verbose=True)

'\ny_pred_gender = gender_model.predict_proba(X_val_gender)[:, 1]\nroc_auc_gender = roc_auc_score(y_val_gender, y_pred_gender)\nprint(f"ROC AUC for gender_model: {roc_auc_gender}")\n'

In [None]:
y_pred_years = play_years_model.predict_proba(X_test_scaled)
y_pred_level = level_model.predict_proba(X_test_scaled)
y_pred_gender = gender_model.predict_proba(X_test_selected_scaled)
y_pred_hold = hold_model.predict_proba(X_test_enhanced)

In [36]:
submission = pd.DataFrame({
    'data_ID': test_data['data_ID'],
    'gender': y_pred_gender[:, 1],
    'hold racket handed': y_pred_hold[:, 1],
    'play years_0': y_pred_years[:, 0],
    'play years_1': y_pred_years[:, 1],
    'play years_2': y_pred_years[:, 2],
    'level_0': y_pred_level[:, 0],
    'level_1': y_pred_level[:, 1],
    'level_2': y_pred_level[:, 2]
})

# 儲存提交檔案
submission.to_csv('submission.csv', index=False)