In [1]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
from xgboost import XGBClassifier

In [2]:
# Load the raw training and test tables.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Feature matrices: drop the identifier columns and, for the training table,
# the four target columns.
X_train = train_data.drop(
    columns=['data_ID', 'player_ID', 'gender', 'play years', 'hold racket handed', 'level']
)
X_test = test_data.drop(columns=['data_ID'])

# One label series per prediction target.
y_play_years = train_data['play years']
y_level = train_data['level']
y_gender = train_data['gender']
y_hold = train_data['hold racket handed']

# Fit the scaler on the training features only, then apply the *same* mean and
# variance to the test features. Re-fitting on the test set would standardise
# the two sets with different statistics, so models trained on X_scaled would
# see shifted inputs at prediction time.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [3]:

# This cell uses X_scaled (the standardized training features) together with the four
# target label series y_play_years, y_level, y_gender and y_hold defined in the previous cell.

# Feature-selection helper
def feature_selection(model, X_train, y_train, threshold):
    """Fit ``model`` and return the columns whose importance exceeds ``threshold``.

    Parameters
    ----------
    model : estimator providing ``fit(X, y)`` and, once fitted, a
        ``feature_importances_`` array with one entry per column of ``X_train``.
    X_train : object with a ``columns`` attribute (e.g. a pandas DataFrame).
    y_train : training labels passed straight through to ``model.fit``.
    threshold : importance cut-off; the comparison is strict (``>``).

    Returns
    -------
    The subset of ``X_train.columns`` selected by the importance mask.

    Note: ``model`` is fitted in place as a side effect.
    """
    model.fit(X_train, y_train)
    keep_mask = model.feature_importances_ > threshold
    return X_train.columns[keep_mask]

# Wrap the scaled arrays in labelled DataFrames once, instead of rebuilding the
# same frames for every target.
# NOTE(review): the test frame is labelled with the *training* column names,
# which assumes test.csv lists its feature columns in the same order as
# train.csv - confirm.
X_scaled_df = pd.DataFrame(X_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_train.columns)

# Minimum feature importance a column needs in order to be kept.
IMPORTANCE_THRESHOLD = 0.01


def _make_selector_model(num_class):
    """Build the XGBoost classifier used only to rank features for one target."""
    return XGBClassifier(
        objective='multi:softmax', num_class=num_class, n_jobs=-1, random_state=42
    )


# Feature selection - play years (num_class=3)
years_model = _make_selector_model(3)
selected_features_years = feature_selection(
    years_model, X_scaled_df, y_play_years, IMPORTANCE_THRESHOLD
)
X_train_selected_years = X_scaled_df[selected_features_years]
X_test_selected_years = X_test_scaled_df[selected_features_years]

# Feature selection - level (num_class=3)
level_model = _make_selector_model(3)
selected_features_level = feature_selection(
    level_model, X_scaled_df, y_level, IMPORTANCE_THRESHOLD
)
X_train_selected_level = X_scaled_df[selected_features_level]
X_test_selected_level = X_test_scaled_df[selected_features_level]

# Feature selection - gender (num_class=2)
gender_model = _make_selector_model(2)
selected_features_gender = feature_selection(
    gender_model, X_scaled_df, y_gender, IMPORTANCE_THRESHOLD
)
X_train_selected_gender = X_scaled_df[selected_features_gender]
X_test_selected_gender = X_test_scaled_df[selected_features_gender]

# Feature selection - hold racket handed (num_class=2)
hold_model = _make_selector_model(2)
selected_features_hold = feature_selection(
    hold_model, X_scaled_df, y_hold, IMPORTANCE_THRESHOLD
)
X_train_selected_hold = X_scaled_df[selected_features_hold]
X_test_selected_hold = X_test_scaled_df[selected_features_hold]

# Show the features kept for each target. The gender selection is not printed
# here; inspect selected_features_gender directly if needed.
print("Selected features for years:", selected_features_years)
print("Selected features for level:", selected_features_level)
print("Selected features for hold:", selected_features_hold)

Selected features for years: Index(['ax_mean', 'ay_mean', 'az_mean', 'gx_mean', 'gy_mean', 'gz_mean',
       'ax_var', 'ay_var', 'az_var', 'gx_var', 'gy_var', 'gz_var', 'ax_rms',
       'ay_rms', 'az_rms', 'gx_rms', 'gy_rms', 'gz_rms', 'a_max', 'a_mean',
       'g_max', 'g_mean', 'a_kurt', 'g_kurt', 'a_skewn', 'g_skewn',
       'a_entropy'],
      dtype='object')
Selected features for level: Index(['ax_mean', 'ay_mean', 'az_mean', 'gx_mean', 'gy_mean', 'gz_mean',
       'ax_var', 'ay_var', 'az_var', 'gx_var', 'gy_var', 'gz_var', 'ax_rms',
       'ay_rms', 'az_rms', 'gx_rms', 'gy_rms', 'gz_rms', 'a_max', 'a_mean',
       'g_max', 'g_mean', 'a_kurt', 'g_kurt', 'a_skewn', 'g_skewn',
       'a_entropy'],
      dtype='object')
Selected features for hold: Index(['ax_mean', 'az_mean', 'gx_mean', 'gy_mean', 'az_var', 'gx_var',
       'gy_var', 'ax_rms', 'az_rms', 'gy_rms', 'gz_rms', 'a_max', 'a_mean',
       'g_max', 'g_mean'],
      dtype='object')


In [5]:
# Hyper-parameters shared by all four final classifiers.
param = dict(
    n_estimators=150,
    max_depth=5,
    learning_rate=0.06,
    eval_metric='logloss',
    subsample=0.7,
    colsample_bytree=0.8,
    min_child_weight=2,
    gamma=0.2,
    random_state=42,
)


def _fit_and_predict_proba(features_train, labels, features_test, verbose):
    """Train one XGBClassifier with the shared ``param`` and score the test rows.

    Returns the fitted classifier together with its ``predict_proba`` output
    for ``features_test``.
    """
    # NOTE(review): no eval_set is passed to fit, so `verbose` presumably
    # produces no output - confirm before relying on it.
    classifier = XGBClassifier(**param, n_jobs=-1)
    classifier.fit(features_train, labels, verbose=verbose)
    return classifier, classifier.predict_proba(features_test)


# Each target is trained on its own selected feature subset. Note that
# level_model, gender_model and hold_model rebind names that the
# feature-selection cell also assigns.
play_years_model, y_pred_years = _fit_and_predict_proba(
    X_train_selected_years, y_play_years, X_test_selected_years, 100
)

level_model, y_pred_level = _fit_and_predict_proba(
    X_train_selected_level, y_level, X_test_selected_level, 100
)

gender_model, y_pred_gender = _fit_and_predict_proba(
    X_train_selected_gender, y_gender, X_test_selected_gender, True
)

hold_model, y_pred_hold = _fit_and_predict_proba(
    X_train_selected_hold, y_hold, X_test_selected_hold, True
)

In [7]:
# Load a previously saved submission file (presumably produced by a separate
# CatBoost run - confirm) and preview its first five rows.
cat_prediction = pd.read_csv('cat_submission.csv')
cat_preview = cat_prediction.head(5)
print(cat_preview)



   data_ID    gender  hold racket handed  play years_0  play years_1  \
0    26520  0.018615            0.014309      0.929576      0.027087   
1    26521  0.004553            0.000179      0.998111      0.001347   
2    26522  0.050045            0.000987      0.978436      0.012515   
3    26523  0.015051            0.000400      0.995398      0.001728   
4    26524  0.038363            0.003167      0.977561      0.011633   

   play years_2   level_0   level_1   level_2  
0      0.043337  0.498139  0.009842  0.492019  
1      0.000542  0.018471  0.041104  0.940425  
2      0.009048  0.143976  0.000597  0.855427  
3      0.002874  0.027350  0.000147  0.972503  
4      0.010807  0.027057  0.000183  0.972760  


In [8]:
# The level probabilities are taken from cat_prediction (the loaded
# cat_submission.csv), not from y_pred_level. Align them to the test rows by
# data_ID rather than by row position, so a differently ordered
# cat_submission.csv cannot silently attach levels to the wrong sample.
level_columns = ['level_0', 'level_1', 'level_2']

missing_ids = ~test_data['data_ID'].isin(cat_prediction['data_ID'])
if missing_ids.any():
    raise ValueError(
        f"cat_submission.csv has no row for {int(missing_ids.sum())} data_ID value(s) in test.csv"
    )

# reindex raises if cat_prediction contains duplicate data_ID values, which
# would make the alignment ambiguous.
cat_levels = (
    cat_prediction
    .set_index('data_ID')[level_columns]
    .reindex(test_data['data_ID'])
)

# NOTE(review): column 1 of predict_proba is used as the probability for
# 'gender' and 'hold racket handed'; this assumes the second class in
# classes_ is the one the submission format expects - confirm.
submission = pd.DataFrame({
    'data_ID': test_data['data_ID'],
    'gender': y_pred_gender[:, 1],
    'hold racket handed': y_pred_hold[:, 1],
    'play years_0': y_pred_years[:, 0],
    'play years_1': y_pred_years[:, 1],
    'play years_2': y_pred_years[:, 2],
    'level_0': cat_levels['level_0'].to_numpy(),
    'level_1': cat_levels['level_1'].to_numpy(),
    'level_2': cat_levels['level_2'].to_numpy(),
})

# Save the submission file.
submission.to_csv('final_submission.csv', index=False)