# 用 XGBoost 建模

對每個預測目標(target)分別用 `xgboost` 建模

- 訓練集
    - 特徵：993 (= 1+990+2)
        - mode (1)
        - tsfel 抽取 'Ax', 'Ay', 'Az', 'Gx', 'Gy', 'Gz': 990(= 165*6)
            - **去頭去尾**後再抽取特徵
        - 區間長度、總長度 (2)
    - 樣本數：1955
    - target: gender (G), hold racket handed (H), play years (P), level (L)

+ 測試集
    - 樣本數：1430

In [1]:
import numpy as np
import pandas as pd
import tsfel
import time, sys
from tqdm import tqdm

import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Microsoft JhengHei'] 
plt.rcParams['axes.unicode_minus'] = False

import warnings
warnings.filterwarnings("ignore")

In [2]:
TRAIN_PATH = r"./data/39_Training_Dataset/"
TEST_PATH = r"./data/39_Test_Dataset/"

MODE = list(range(1, 11))
TARGET = {'G':'gender', 'H':'hold racket handed', 'P':'play years', 'L':'level'}

# 建立數據集

In [5]:
file = TRAIN_PATH + "train_info.csv"
df_train_info = pd.read_csv(file)
print(df_train_info.shape)
df_train_info.head()

(1955, 8)


Unnamed: 0,unique_id,player_id,mode,gender,hold racket handed,play years,level,cut_point
0,1,41,1,1,1,1,5,[ 0 61 122 183 244 305 366 428 489 ...
1,2,41,2,1,1,1,5,[ 0 74 149 224 299 374 449 524 599 ...
2,3,41,3,1,1,1,5,[ 0 103 207 311 415 519 623 727 831 ...
3,4,41,4,1,1,1,5,[ 0 101 203 304 406 507 609 710 812 ...
4,5,41,5,1,1,1,5,[ 0 105 211 317 423 529 635 740 846 ...


In [10]:
df = pd.DataFrame()
df['unique_id'] = df_train_info['unique_id']
df['mode'] = df_train_info['mode']

# 處理 "區間長度、總長度"
interval = []
length = []
for idx in df_train_info.index:
    try:
        # 取出每次揮拍的連續數據，共 27 次 (mode 9,10 有部分數據不到 27 次揮拍)
        data = df_train_info.loc[idx, 'cut_point']
        data = data.replace('[', '')
        data = data.replace('    ', '_')
        data = data.replace('   ', '_')
        data = data.replace('  ', '_')
        data = data.replace(' ', '_')
        data = [eval(x.strip()) for x in data[1:-1].split('_')]

        diff = [data[n]-data[n-1] for n in range(1, len(data))]
        interval.append(sum(diff)/len(diff))
        length.append(data[-1])
    except Exception as e:
        print(e)
        print(f"Error in index {idx}")
df['interval'] = interval
df['length'] = length

df.head()

Unnamed: 0,unique_id,mode,interval,length
0,1,1,61.148148,1651
1,2,2,74.925926,2023
2,3,3,103.925926,2806
3,4,4,101.518519,2741
4,5,5,105.851852,2858


## TSFEL 抽取特徵 -> 訓練集

In [21]:
# dataset sampling frequency
fs = 85

# Feature Extraction
cfg_file = tsfel.get_features_by_domain()  # All features
# cfg_file = tsfel.get_features_by_domain('statistical') # Only statistical features
# cfg_file = tsfel.get_features_by_domain('temporal')    # Only temporal features
# cfg_file = tsfel.get_features_by_domain('spectral')    # Only spectral features exclude 'Spectrogram mean coefficient'

start_time = time.time()
df_fea = pd.DataFrame()
for idx in tqdm(df.index, desc="抽取特徵"):
# for idx in tqdm([0, 1], desc="抽取特徵"):
    try:        
        file = TRAIN_PATH + r"train_data/" + str(df.iloc[idx, 0]) + ".txt"
        # print(file)
        df_data = pd.read_csv(file, sep=' ', names=['Ax', 'Ay', 'Az', 'Gx', 'Gy', 'Gz'])
        # print(df_data.shape)
    
        # Get features
        start_idx = int(df.loc[idx, 'interval'])     # 去頭
        end_idx = df.loc[idx, 'length'] - start_idx  # 去尾
        df_tsfel = tsfel.time_series_features_extractor(cfg_file, 
                                                        df_data.loc[start_idx:end_idx ,:], 
                                                        fs=fs, 
                                                        verbose=0)
        df_fea = pd.concat([df_fea, df_tsfel])
    except Exception as e:
        print(e)
        print(f"Error in index {idx}")
        sys.exit()

# df_fea = pd.concat([df, df_fea], axis=1)
df_fea.to_csv("df_tsfel_train.csv", index=False)

end_time = time.time()
print("程式執行時間：", (end_time - start_time)/60, "mins")

print("df_fea.shape:", df_fea.shape)
df_fea.head()

抽取特徵: 100%|████████████████████████████████████████████████████████████████████| 1955/1955 [02:37<00:00, 12.40it/s]


程式執行時間： 2.6486942609151205 mins
df_fea.shape: (1955, 744)


Unnamed: 0,Ax_Absolute energy,Ax_Area under the curve,Ax_Autocorrelation,Ax_Average power,Ax_Centroid,Ax_ECDF Percentile Count_0,Ax_ECDF Percentile Count_1,Ax_ECDF Percentile_0,Ax_ECDF Percentile_1,Ax_ECDF_0,...,Gz_Wavelet variance_10.62Hz,Gz_Wavelet variance_2.36Hz,Gz_Wavelet variance_2.66Hz,Gz_Wavelet variance_21.25Hz,Gz_Wavelet variance_3.04Hz,Gz_Wavelet variance_3.54Hz,Gz_Wavelet variance_4.25Hz,Gz_Wavelet variance_5.31Hz,Gz_Wavelet variance_7.08Hz,Gz_Zero crossing rate
0,27026360000.0,66913.370588,8.0,1502446000.0,8.87124,306.0,1224.0,1660.0,5253.0,0.000654,...,35710320.0,6811895000.0,4860586000.0,3560986.0,3228471000.0,1933227000.0,1017604000.0,441718200.0,148445700.0,52.0
0,52478760000.0,97901.588235,9.0,2379037000.0,11.203396,375.0,1500.0,1899.0,6825.0,0.000533,...,39225720.0,6865518000.0,4744879000.0,3270496.0,3059768000.0,1793228000.0,951886900.0,443915400.0,166332800.0,60.0
0,56842810000.0,110514.547059,9.0,1858323000.0,14.760668,520.0,2080.0,1843.0,4434.0,0.000384,...,50111200.0,2952263000.0,2278526000.0,5866378.0,1652347000.0,1096373000.0,657278500.0,346899300.0,155345400.0,98.0
0,67595830000.0,116260.241176,10.0,2262956000.0,15.057424,508.0,2032.0,1691.0,5878.0,0.000394,...,49249140.0,3337741000.0,2533584000.0,5613627.0,1784595000.0,1130761000.0,638378600.0,320690000.0,145555000.0,64.0
0,64078700000.0,114691.076471,10.0,2056907000.0,15.283514,529.0,2119.0,1553.0,4840.0,0.000378,...,45099670.0,2880120000.0,2172258000.0,5788660.0,1520555000.0,956556300.0,536036300.0,268588900.0,124001900.0,77.0


In [24]:
# 訓練特徵
# df_output = pd.concat([df.reset_index(drop=True), 
#                        df_fea.reset_index(drop=True), 
#                        df_train_info[['gender', 'hold racket handed', 'play years', 'level']].reset_index(drop=True)], axis=1)
# df_output.to_csv("df_tsfel_train.csv", index=False)

## TSFEL 抽取特徵 -> 測試集

In [4]:
file = TEST_PATH + "test_info.csv"
df_test_info = pd.read_csv(file)
print(df_test_info.shape)
df_test_info.head(2)

(1430, 3)


Unnamed: 0,unique_id,mode,cut_point
0,1968,9,[ 0 95 190 285 380 475 571 666 761 ...
1,1969,9,[ 0 99 198 297 396 495 594 693 792 ...


In [6]:
df = pd.DataFrame()
df['unique_id'] = df_test_info['unique_id']
df['mode'] = df_test_info['mode']

# 處理 "區間長度、總長度"
interval = []
length = []
for idx in df_test_info.index:
    try:
        # 取出每次揮拍的連續數據，共 27 次 (mode 9,10 有部分數據不到 27 次揮拍)
        data = df_test_info.loc[idx, 'cut_point']
        data = data.replace('[', '')
        data = data.replace('    ', '_')
        data = data.replace('   ', '_')
        data = data.replace('  ', '_')
        data = data.replace(' ', '_')
        data = [eval(x.strip()) for x in data[1:-1].split('_')]

        diff = [data[n]-data[n-1] for n in range(1, len(data))]
        interval.append(sum(diff)/len(diff))
        length.append(data[-1])
    except Exception as e:
        print(e)
        print(f"Error in index {idx}")
df['interval'] = interval
df['length'] = length

df.head(2)

Unnamed: 0,unique_id,mode,interval,length
0,1968,9,95.185185,2570
1,1969,9,99.0,2673


In [8]:
# dataset sampling frequency
fs = 85
# Feature Extraction
cfg_file = tsfel.get_features_by_domain()  # All features

start_time = time.time()
df_fea = pd.DataFrame()

for idx in tqdm(df.index, desc="抽取特徵"):
# for idx in tqdm([0, 1], desc="抽取特徵"):
    try:        
        file = TEST_PATH + r"test_data/" + str(df.iloc[idx, 0]) + ".txt"
        df_data = pd.read_csv(file, sep=' ', names=['Ax', 'Ay', 'Az', 'Gx', 'Gy', 'Gz'])
    
        # Get features
        start_idx = int(df.loc[idx, 'interval'])     # 去頭
        end_idx = df.loc[idx, 'length'] - start_idx  # 去尾
        df_tsfel = tsfel.time_series_features_extractor(cfg_file, 
                                                        df_data.loc[start_idx:end_idx ,:], 
                                                        fs=fs, 
                                                        verbose=0)
        df_fea = pd.concat([df_fea, df_tsfel])
    except Exception as e:
        print(e)
        print(f"Error in index {idx}")
        sys.exit()

df_fea.to_csv("df_tsfel_test.csv", index=False)

end_time = time.time()
print("程式執行時間：", (end_time - start_time)/60, "mins")

print("df_fea.shape:", df_fea.shape)
df_fea.head()

抽取特徵: 100%|████████████████████████████████████████████████████████████████████| 1430/1430 [01:54<00:00, 12.48it/s]


程式執行時間： 1.9249319473902384 mins
df_fea.shape: (1430, 744)


Unnamed: 0,Ax_Absolute energy,Ax_Area under the curve,Ax_Autocorrelation,Ax_Average power,Ax_Centroid,Ax_ECDF Percentile Count_0,Ax_ECDF Percentile Count_1,Ax_ECDF Percentile_0,Ax_ECDF Percentile_1,Ax_ECDF_0,...,Gz_Wavelet variance_10.62Hz,Gz_Wavelet variance_2.36Hz,Gz_Wavelet variance_2.66Hz,Gz_Wavelet variance_21.25Hz,Gz_Wavelet variance_3.04Hz,Gz_Wavelet variance_3.54Hz,Gz_Wavelet variance_4.25Hz,Gz_Wavelet variance_5.31Hz,Gz_Wavelet variance_7.08Hz,Gz_Zero crossing rate
0,40469050000.0,75361.788235,9.0,1445323000.0,12.482273,476.0,1904.0,615.0,3661.0,0.00042,...,82334340.0,4066683000.0,3199152000.0,11069770.0,2391215000.0,1655729000.0,1046657000.0,580298700.0,263204400.0,62.0
0,34045680000.0,89805.464706,11.0,1169245000.0,13.845084,495.0,1980.0,1247.0,4935.0,0.000404,...,987930.0,93219980.0,64486510.0,160090.8,42085970.0,25360560.0,14088210.0,7064404.0,3082090.0,51.0
0,47636830000.0,89513.129412,11.0,1999571000.0,12.056372,405.0,1620.0,993.0,6479.0,0.000494,...,2726515.0,175177400.0,126945800.0,338730.3,86551940.0,54428850.0,31857520.0,17323210.0,8275360.0,56.0
0,31477990000.0,73015.041176,12.0,1510801000.0,9.921759,354.0,1417.0,1369.0,5437.0,0.000564,...,1372603.0,107365500.0,71662210.0,173421.3,45158380.0,26482610.0,14678060.0,7705085.0,3746123.0,44.0
0,57184850000.0,98086.629412,12.0,2205405000.0,13.631828,441.0,1764.0,1159.0,6543.0,0.000454,...,2528018.0,170395300.0,118704700.0,271825.7,80047070.0,51557180.0,31500400.0,17393660.0,7990295.0,53.0


In [10]:
# 預測目標
df_output = pd.concat([df.reset_index(drop=True), 
                       df_fea.reset_index(drop=True)], axis=1)
df_output.to_csv("df_tsfel_test.csv", index=False)

# XGBoost 建模

In [24]:
# 訓練集
file = TRAIN_PATH + "df_tsfel_train.csv"
df = pd.read_csv(file)
print(df.shape)
df.head(2)

(1955, 752)


Unnamed: 0,unique_id,mode,interval,length,Ax_Absolute energy,Ax_Area under the curve,Ax_Autocorrelation,Ax_Average power,Ax_Centroid,Ax_ECDF Percentile Count_0,...,Gz_Wavelet variance_3.04Hz,Gz_Wavelet variance_3.54Hz,Gz_Wavelet variance_4.25Hz,Gz_Wavelet variance_5.31Hz,Gz_Wavelet variance_7.08Hz,Gz_Zero crossing rate,gender,hold racket handed,play years,level
0,1,1,61.148148,1651,27026360000.0,66913.37059,8,1502446000.0,8.87124,306,...,3228471000.0,1933227000.0,1017604000.0,441718172.4,148445713.9,52,1,1,1,5
1,2,2,74.925926,2023,52478760000.0,97901.58824,9,2379037000.0,11.203396,375,...,3059768000.0,1793228000.0,951886900.0,443915418.3,166332832.6,60,1,1,1,5


In [25]:
# 預測目標
file = TEST_PATH + "df_tsfel_test.csv"
df_pred = pd.read_csv(file)
print(df_pred.shape)
df_pred.head(2)

(1430, 750)


Unnamed: 0,unique_id,mode,interval,length,Ax_Absolute energy,Ax_Area under the curve,Ax_Autocorrelation,Ax_Average power,Ax_Centroid,Ax_ECDF Percentile Count_0,...,Gz_Wavelet variance_2.66Hz,Gz_Wavelet variance_21.25Hz,Gz_Wavelet variance_3.04Hz,Gz_Wavelet variance_3.54Hz,Gz_Wavelet variance_4.25Hz,Gz_Wavelet variance_5.31Hz,Gz_Wavelet variance_7.08Hz,Gz_Zero crossing rate,gender,hold racket handed
0,1968,9,95.185185,2570,40469050000.0,75361.78824,9,1445323000.0,12.482273,476,...,3199152000.0,11069770.0,2391215000.0,1655729000.0,1046657000.0,580298700.0,263204400.0,62,1,1
1,1969,9,99.0,2673,34045680000.0,89805.46471,11,1169245000.0,13.845084,495,...,64486510.0,160090.8,42085970.0,25360560.0,14088210.0,7064404.0,3082090.0,51,1,1


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import ParameterGrid
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC # for numerical and categorical features

## gender
gender : 1:男,2:女

訓練集：測試集 = 7：1

| mode | 數據集筆數 | 測試集筆數 |
|:-----:|:----:|:----:|
| 1 ~ 6 | 42 | 5 |
| 7 | 44 | 5 |
| 8 | 45 | 5 |
| 9 | 785 | 98 |
| 10 | 829 | 103 |
|總數| 1955 | 241 |

In [27]:
target = 'G'

X = df.loc[:, 'mode':'Gz_Zero crossing rate']
y = df['gender']
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.125, 
                                                    stratify=df[['mode','gender']])
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

X_train: (1710, 747)
X_test: (245, 747)


In [28]:
# SMOTE resample
# https://xgboosting.com/xgboost-for-imbalanced-classification-with-smote/
sm = SMOTENC(categorical_features=['mode'])
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print("X_train_sm =", X_train_sm.shape)
print(f"Resampled class distribution: {Counter(y_train_sm)}")

X_train_sm = (2848, 747)
Resampled class distribution: Counter({1: 1424, 2: 1424})


In [30]:
def store_and_print(file, output_str):
    file.write(output_str)
    print(output_str)

start = time.time()

# Define parameter grid
param_grid = {
    'n_estimators': [200, 300],
    'max_depth': [7, 10],
    'min_child_weight': [1],
    # 'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6],
    'colsample_bytree': [0.6],
    'learning_rate': [0.1, 0.3]
}
# param_grid = {
#     'max_depth': [3, 7],
#     'colsample_bytree': [0.6],
# }

# Create XGBoost classifier
xgb = XGBClassifier(objective='binary:logistic', 
                    eval_metric='auc',
                    device="cuda", tree_method="hist",
                    # strict_shape=True, # https://xgboost.readthedocs.io/en/stable/prediction.html
                    #silent=True,
                    verbosity=1    # default=1
)

# for param in tqdm(ParameterGrid(param_grid), desc="參數搜尋"):
print("===== target", target)
grid_s = GridSearchCV(estimator=xgb, 
                      param_grid=param_grid, 
                      cv=max(3, int(X_train_sm.shape[0]/1430)),  # 預測樣本 1430 筆
                      scoring='roc_auc_ovr',
                      n_jobs=-1, 
                      verbose=1)
dct_to_xgb = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}
y_train_sm_xgb = y_train_sm.map(dct_to_xgb)  # Xgboost expected: [0 1], not [1 2]
grid_s.fit(X_train_sm, y_train_sm_xgb)

# store tuned result
file_name = TRAIN_PATH + "params_" + target + ".txt"
file = open(file_name, 'w')
store_and_print(file, f"Best model: {str(grid_s.best_estimator_)}\n")
store_and_print(file, f"Best params: {str(grid_s.best_params_)}\n")
store_and_print(file, f"Best score: {str(grid_s.best_score_)}\n")

y_pred_xgb = grid_s.predict(X_test)
dct_to_contest = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}
y_pred = np.array(list(map(dct_to_contest.get, y_pred_xgb)))

store_and_print(file, f"Confusion Matrix:\n")
store_and_print(file, f"{confusion_matrix(y_test, y_pred)}")
store_and_print(file, f"Classification Report:\n")
store_and_print(file, f"{classification_report(y_test, y_pred)}")

# roc_auc score
y_pred_xgb = grid_s.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_pred_xgb[:, 1],  # 感覺應該是 y_pred_xgb[:, 0]
                        average='micro',
                        multi_class='ovr', 
                        labels=[1,2])
store_and_print(file, f"\nROC_AUC score: {str(roc_auc)}")

end = time.time()
store_and_print(file, f"\n\nTime elapsed: {round((end-start)/60):,.2f} mins\n")
file.close()

# fit all train data and predict test
print("===== Fit all train data and predict test")
sm = SMOTENC(categorical_features=['mode'])
X_sm, y_sm = sm.fit_resample(X, y)

y_sm_xgb = y_sm.map(dct_to_xgb)
# grid_s.fit(X_sm, y_sm_xgb)
final_xgb = XGBClassifier(objective='binary:logistic',
                          eval_metric='auc',
                          device="cuda", tree_method="hist",
                          n_estimators=grid_s.best_params_['n_estimators'],
                          max_depth=grid_s.best_params_['max_depth'], 
                          min_child_weight=grid_s.best_params_['min_child_weight'], 
                          subsample=grid_s.best_params_['subsample'], 
                          colsample_bytree=grid_s.best_params_['colsample_bytree'], 
                          learning_rate=grid_s.best_params_['learning_rate'],
                          verbosity=1
)
final_xgb.fit(X_sm, y_sm_xgb)

y_pred_xgb = final_xgb.predict(df_pred.loc[:, 'mode':'Gz_Zero crossing rate'])
y_pred = np.array(list(map(dct_to_contest.get, y_pred_xgb)))
df_pred[TARGET[target]] = y_pred
df_pred.to_csv(TEST_PATH + f"df_tsfel_test_{target}.csv", index=False)

# store prediction probability
df_output = pd.DataFrame()
df_output['unique_id'] = df_pred['unique_id']
df_output[TARGET[target]] = grid_s.predict_proba(df_pred.loc[:, 'mode':'Gz_Zero crossing rate'])[:, 0]
df_output.to_csv(TEST_PATH + f"df_tsfel_test_{target}_prob.csv", index=False)

print("===== Finish target", target)

===== target G
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.6, device='cuda', early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.3, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=300, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)

Best params: {'colsample_bytree': 0.6, 'learning_rate': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 300, 'subsample': 0.6}

Best score: 0.999883

## hold racket handed
hold racket handed : 1:右 2:左

In [31]:
target = 'H'

X = df.loc[:, 'mode':'Gz_Zero crossing rate']
y = df['hold racket handed']
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.125, 
                                                    stratify=df[['mode','hold racket handed']])
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

sm = SMOTENC(categorical_features=['mode'])
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print("X_train_sm =", X_train_sm.shape)
print(f"Resampled class distribution: {Counter(y_train_sm)}")

X_train: (1710, 747)
X_test: (245, 747)
X_train_sm = (2782, 747)
Resampled class distribution: Counter({1: 1391, 2: 1391})


In [33]:
def store_and_print(file, output_str):
    file.write(output_str)
    print(output_str)
    
start = time.time()

# Define parameter grid
param_grid = {
    'n_estimators': [200, 300],
    'max_depth': [5, 7],
    'min_child_weight': [1],
    # 'gamma': [0, 0.5, 1, 2, 5],
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.6],
    'learning_rate': [0.1, 0.3]
}
# param_grid = {
#     'max_depth': [3, 7],
#     'colsample_bytree': [0.6],
# }

# Create XGBoost classifier
xgb = XGBClassifier(objective='binary:logistic', 
                    eval_metric='auc',
                    device="cuda", tree_method="hist",
                    # strict_shape=True, # https://xgboost.readthedocs.io/en/stable/prediction.html
                    #silent=True,
                    verbosity=0    # default=1
)

# for param in tqdm(ParameterGrid(param_grid), desc="參數搜尋"):
print("===== target", target)
grid_s = GridSearchCV(estimator=xgb, 
                      param_grid=param_grid, 
                      cv=max(3, int(X_train_sm.shape[0]/1430)),  # 預測樣本 1430 筆
                      scoring='roc_auc_ovr',
                      n_jobs=-1, 
                      verbose=1)
dct_to_xgb = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}
y_train_sm_xgb = y_train_sm.map(dct_to_xgb)  # Xgboost expected: [0 1], not [1 2]
grid_s.fit(X_train_sm, y_train_sm_xgb)

# store tuned result
file_name = TRAIN_PATH + "params_" + target + ".txt"
file = open(file_name, 'w')
store_and_print(file, f"Best model: {str(grid_s.best_estimator_)}\n")
store_and_print(file, f"Best params: {str(grid_s.best_params_)}\n")
store_and_print(file, f"Best score: {str(grid_s.best_score_)}\n")

y_pred_xgb = grid_s.predict(X_test)
dct_to_contest = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}
y_pred = np.array(list(map(dct_to_contest.get, y_pred_xgb)))

store_and_print(file, f"Confusion Matrix:\n")
store_and_print(file, f"{confusion_matrix(y_test, y_pred)}")
store_and_print(file, f"Classification Report:\n")
store_and_print(file, f"{classification_report(y_test, y_pred)}")

# roc_auc score
y_pred_xgb = grid_s.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_pred_xgb[:, 1],  # 感覺應該是 y_pred_xgb[:, 0]
                        average='micro',
                        multi_class='ovr', 
                        labels=[1,2])
store_and_print(file, f"\nROC_AUC score: {str(roc_auc)}")

end = time.time()
store_and_print(file, f"\n\nTime elapsed: {round((end-start)/60):,.2f} mins\n")
file.close()

# fit all train data and predict test
print("===== Fit all train data and predict test")
sm = SMOTENC(categorical_features=['mode'])
X_sm, y_sm = sm.fit_resample(X, y)

y_sm_xgb = y_sm.map(dct_to_xgb)
# grid_s.fit(X_sm, y_sm_xgb)
final_xgb = XGBClassifier(objective='binary:logistic',
                          eval_metric='auc',
                          device="cuda", tree_method="hist",
                          n_estimators=grid_s.best_params_['n_estimators'],
                          max_depth=grid_s.best_params_['max_depth'], 
                          min_child_weight=grid_s.best_params_['min_child_weight'], 
                          subsample=grid_s.best_params_['subsample'], 
                          colsample_bytree=grid_s.best_params_['colsample_bytree'], 
                          learning_rate=grid_s.best_params_['learning_rate'],
                          verbosity=1
)
final_xgb.fit(X_sm, y_sm_xgb)

y_pred_xgb = final_xgb.predict(df_pred.loc[:, 'mode':'Gz_Zero crossing rate'])
y_pred = np.array(list(map(dct_to_contest.get, y_pred_xgb)))
df_pred[TARGET[target]] = y_pred
df_pred.to_csv(TEST_PATH + f"df_tsfel_test_{target}.csv", index=False)

# store prediction probability
df_output = pd.DataFrame()
df_output['unique_id'] = df_pred['unique_id']
df_output[TARGET[target]] = grid_s.predict_proba(df_pred.loc[:, 'mode':'Gz_Zero crossing rate'])[:, 0]
df_output.to_csv(TEST_PATH + f"df_tsfel_test_{target}_prob.csv", index=False)

print("===== Finish target", target)

===== target H
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.6, device='cuda', early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)

Best params: {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}

Best score: 0.99998

## play years
play years : 共3個球齡層(根據所有選手的球齡分布，分為 0:低、1:中、2:高)

In [34]:
# 預測目標
file = TEST_PATH + "df_tsfel_test.csv"
df_pred = pd.read_csv(file)
print(df_pred.shape)
df_pred.head(2)

(1430, 750)


Unnamed: 0,unique_id,mode,interval,length,Ax_Absolute energy,Ax_Area under the curve,Ax_Autocorrelation,Ax_Average power,Ax_Centroid,Ax_ECDF Percentile Count_0,...,Gz_Wavelet variance_2.66Hz,Gz_Wavelet variance_21.25Hz,Gz_Wavelet variance_3.04Hz,Gz_Wavelet variance_3.54Hz,Gz_Wavelet variance_4.25Hz,Gz_Wavelet variance_5.31Hz,Gz_Wavelet variance_7.08Hz,Gz_Zero crossing rate,gender,hold racket handed
0,1968,9,95.185185,2570,40469050000.0,75361.78824,9,1445323000.0,12.482273,476,...,3199152000.0,11069770.0,2391215000.0,1655729000.0,1046657000.0,580298700.0,263204400.0,62,0.978152,0.999033
1,1969,9,99.0,2673,34045680000.0,89805.46471,11,1169245000.0,13.845084,495,...,64486510.0,160090.8,42085970.0,25360560.0,14088210.0,7064404.0,3082090.0,51,0.994599,0.999794


In [35]:
target = 'P'

X = df.loc[:, 'mode':'hold racket handed']
y = df['play years']
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.125, 
                                                    stratify=df[['mode','play years']])
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

# sm = SMOTENC(categorical_features=['mode', 'gender', 'hold racket handed'])
sm = SMOTENC(categorical_features=['mode'])
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print("X_train_sm =", X_train_sm.shape)
print(f"Resampled class distribution: {Counter(y_train_sm)}")

X_train: (1710, 749)
X_test: (245, 749)
X_train_sm = (2280, 749)
Resampled class distribution: Counter({1: 760, 2: 760, 0: 760})


In [None]:
def store_and_print(file, output_str):
    file.write(output_str)
    print(output_str)

start = time.time()

# Define parameter grid
param_grid = {
    'n_estimators': [200, 300, 400],
    'max_depth': [7, 10],
    'min_child_weight': [1],
    # 'gamma': [0, 0.5, 1, 2, 5],
    'subsample': [0.6],
    'colsample_bytree': [0.6, 0.8],
    'learning_rate': [0.1, 0.3]
}
# param_grid = {
#     'max_depth': [3, 7],
#     'colsample_bytree': [0.6, 0.8],
#     'learning_rate': [0.01, 0.1]
# }

# Create XGBoost classifier
xgb = XGBClassifier(objective='multi:softprob', 
                    eval_metric='auc',
                    device="cuda", tree_method="hist",
                    # strict_shape=True, # https://xgboost.readthedocs.io/en/stable/prediction.html
                    # silent=True,
                    verbosity=1    # default=1
)

# for param in tqdm(ParameterGrid(param_grid), desc="參數搜尋"):
print("===== target", target)
grid_s = GridSearchCV(estimator=xgb, 
                      param_grid=param_grid, 
                      cv=max(3, int(X_train_sm.shape[0]/1430)),  # 預測樣本 1430 筆
                      scoring='roc_auc_ovr',
                      n_jobs=-1, 
                      verbose=1)
# dct_to_xgb = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}
# y_train_sm_xgb = y_train_sm.map(dct_to_xgb)  # Xgboost expected: [0 1], not [1 2]
grid_s.fit(X_train_sm, y_train_sm)

# store tuned result
file_name = TRAIN_PATH + "params_" + target + ".txt"
file = open(file_name, 'w')
store_and_print(file, f"Best model: {str(grid_s.best_estimator_)}\n")
store_and_print(file, f"Best params: {str(grid_s.best_params_)}\n")
store_and_print(file, f"Best score: {str(grid_s.best_score_)}\n")

# y_pred_xgb = grid_s.predict(X_test)
# dct_to_contest = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}
# y_pred = np.array(list(map(dct_to_contest.get, y_pred_xgb)))
y_pred = grid_s.predict(X_test)

store_and_print(file, f"Confusion Matrix:\n")
store_and_print(file, f"{confusion_matrix(y_test, y_pred)}")
store_and_print(file, f"Classification Report:\n")
store_and_print(file, f"{classification_report(y_test, y_pred)}")

# roc_auc score
y_pred_xgb = grid_s.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_pred_xgb,
                        average='micro',
                        multi_class='ovr', 
                        labels=[0,1,2])
store_and_print(file, f"\nROC_AUC score: {str(roc_auc)}")

end = time.time()
store_and_print(file, f"\n\nTime elapsed: {round((end-start)/60):,.2f} mins\n")
file.close()

# fit all train data and predict test
print("===== Fit all train data and predict test")
sm = SMOTENC(categorical_features=['mode'])
X_sm, y_sm = sm.fit_resample(X, y)
# y_sm_xgb = y_sm.map(dct_to_xgb)
# grid_s.fit(X_sm, y_sm)
final_xgb = XGBClassifier(objective='multi:softprob',
                          eval_metric='auc',
                          device="cuda", tree_method="hist",
                          n_estimators=grid_s.best_params_['n_estimators'],
                          max_depth=grid_s.best_params_['max_depth'], 
                          min_child_weight=grid_s.best_params_['min_child_weight'], 
                          subsample=grid_s.best_params_['subsample'], 
                          colsample_bytree=grid_s.best_params_['colsample_bytree'], 
                          learning_rate=grid_s.best_params_['learning_rate'],
                          verbosity=1
)
final_xgb.fit(X_sm, y_sm)

# y_pred_xgb = grid_s.predict(df_pred.loc[:, 'mode':'hold racket handed'])
# y_pred = np.array(list(map(dct_to_contest.get, y_pred_xgb)))
y_pred = final_xgb.predict(df_pred.loc[:, 'mode':'hold racket handed'])
df_pred[TARGET[target]] = y_pred
df_pred.to_csv(TEST_PATH + f"df_tsfel_test_{target}.csv", index=False)

# store prediction probability
df_output = pd.DataFrame()
df_output['unique_id'] = df_pred['unique_id']
pred_prob = grid_s.predict_proba(df_pred.loc[:, 'mode':'hold racket handed'])
df_output[f"{TARGET[target]}_0"] = pred_prob[:, 0]
df_output[f"{TARGET[target]}_1"] = pred_prob[:, 1]
df_output[f"{TARGET[target]}_2"] = pred_prob[:, 2]
df_output.to_csv(TEST_PATH + f"df_tsfel_test_{target}_prob.csv", index=False)

print("===== Finish target", target)

===== target P
Fitting 3 folds for each of 24 candidates, totalling 72 fits


## level
level : 共4個等級(2:大專甲組選手、3:大專乙組選手、4:青少年國手、5:青少年選手)

In [10]:
# 預測目標
file = TEST_PATH + "df_tsfel_test.csv"
df_pred = pd.read_csv(file)
print(df_pred.shape)
df_pred.head(2)

(1430, 750)


Unnamed: 0,unique_id,mode,interval,length,Ax_Absolute energy,Ax_Area under the curve,Ax_Autocorrelation,Ax_Average power,Ax_Centroid,Ax_ECDF Percentile Count_0,...,Gz_Wavelet variance_2.66Hz,Gz_Wavelet variance_21.25Hz,Gz_Wavelet variance_3.04Hz,Gz_Wavelet variance_3.54Hz,Gz_Wavelet variance_4.25Hz,Gz_Wavelet variance_5.31Hz,Gz_Wavelet variance_7.08Hz,Gz_Zero crossing rate,gender,hold racket handed
0,1968,9,95.185185,2570,40469050000.0,75361.78824,9,1445323000.0,12.482273,476,...,3199152000.0,11069770.0,2391215000.0,1655729000.0,1046657000.0,580298700.0,263204400.0,62,1,1
1,1969,9,99.0,2673,34045680000.0,89805.46471,11,1169245000.0,13.845084,495,...,64486510.0,160090.8,42085970.0,25360560.0,14088210.0,7064404.0,3082090.0,51,1,1


In [11]:
target = 'L'

X = df.loc[:, 'mode':'hold racket handed']
y = df['level']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.125,
                                                    stratify=df[['mode','level']])
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

# sm = SMOTENC(categorical_features=['mode', 'gender', 'hold racket handed'])
sm = SMOTENC(categorical_features=['mode'])
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print("X_train_sm =", X_train_sm.shape)
print(f"Resampled class distribution: {Counter(y_train_sm)}")

X_train: (1710, 749)
X_test: (245, 749)
X_train_sm = (3160, 749)
Resampled class distribution: Counter({2: 790, 4: 790, 5: 790, 3: 790})


In [12]:
def store_and_print(file, output_str):
    file.write(output_str)
    print(output_str)

start = time.time()

# Define parameter grid
param_grid = {
    'n_estimators': [200, 300, 400],
    'max_depth': [7, 10],
    'min_child_weight': [1],
    # 'gamma': [0, 0.5, 1, 2, 5],
    'subsample': [0.6],
    'colsample_bytree': [0.6],
    'learning_rate': [0.1, 0.3]
}
# param_grid = {
#     'max_depth': [3, 7],
#     'colsample_bytree': [0.6],
# }

# Create XGBoost classifier
xgb = XGBClassifier(objective='multi:softprob', 
                    eval_metric='auc',
                    device="cuda", tree_method="hist",
                    # strict_shape=True, # https://xgboost.readthedocs.io/en/stable/prediction.html
                    # silent=True,
                    verbosity=1    # default=1
)

# for param in tqdm(ParameterGrid(param_grid), desc="參數搜尋"):
print("===== target", target)
grid_s = GridSearchCV(estimator=xgb, 
                      param_grid=param_grid, 
                      cv=max(3, int(X_train_sm.shape[0]/1430)),  # 預測樣本 1430 筆
                      scoring='roc_auc_ovr',
                      n_jobs=-1, 
                      verbose=1)
dct_to_xgb = {2:0, 3:1, 4:2, 5:3}
y_train_sm_xgb = y_train_sm.map(dct_to_xgb)  # Xgboost expected: [0 1], not [1 2]
grid_s.fit(X_train_sm, y_train_sm_xgb)

# store tuned result
file_name = TRAIN_PATH + "params_" + target + ".txt"
file = open(file_name, 'w')
store_and_print(file, f"Best model: {str(grid_s.best_estimator_)}\n")
store_and_print(file, f"Best params: {str(grid_s.best_params_)}\n")
store_and_print(file, f"Best score: {str(grid_s.best_score_)}\n")

y_pred_xgb = grid_s.predict(X_test)
dct_to_contest = {0:2, 1:3, 2:4, 3:5}
y_pred = np.array(list(map(dct_to_contest.get, y_pred_xgb)))
# y_pred = grid_s.predict(X_test)

store_and_print(file, f"Confusion Matrix:\n")
store_and_print(file, f"{confusion_matrix(y_test, y_pred)}")
store_and_print(file, f"Classification Report:\n")
store_and_print(file, f"{classification_report(y_test, y_pred)}")

# roc_auc score
y_pred_xgb = grid_s.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_pred_xgb,
                        average='micro',
                        multi_class='ovr', 
                        labels=[2,3,4,5])
store_and_print(file, f"\nROC_AUC score: {str(roc_auc)}")

end = time.time()
store_and_print(file, f"\n\nTime elapsed: {round((end-start)/60):,.2f} mins\n")
file.close()

# fit all train data and predict test
print("===== Fit all train data and predict test")
sm = SMOTENC(categorical_features=['mode'])
X_sm, y_sm = sm.fit_resample(X, y)

y_sm_xgb = y_sm.map(dct_to_xgb)
# grid_s.fit(X_sm, y_sm_xgb)
final_xgb = XGBClassifier(objective='multi:softprob',
                          eval_metric='auc',
                          device="cuda", tree_method="hist",
                          n_estimators=grid_s.best_params_['n_estimators'],
                          max_depth=grid_s.best_params_['max_depth'], 
                          min_child_weight=grid_s.best_params_['min_child_weight'], 
                          subsample=grid_s.best_params_['subsample'], 
                          colsample_bytree=grid_s.best_params_['colsample_bytree'], 
                          learning_rate=grid_s.best_params_['learning_rate'],
                          verbosity=1
)
final_xgb.fit(X_sm, y_sm_xgb)

y_pred_xgb = final_xgb.predict(df_pred.loc[:, 'mode':'hold racket handed'])
y_pred = np.array(list(map(dct_to_contest.get, y_pred_xgb)))
# y_pred = grid_s.predict(df_pred.loc[:, 'mode':'hold racket handed'])
df_pred[TARGET[target]] = y_pred
df_pred.to_csv(TEST_PATH + f"df_tsfel_test_{target}.csv", index=False)

# store prediction probability
df_output = pd.DataFrame()
df_output['unique_id'] = df_pred['unique_id']
pred_prob = grid_s.predict_proba(df_pred.loc[:, 'mode':'hold racket handed'])
df_output[f"{TARGET[target]}_2"] = pred_prob[:, 0]
df_output[f"{TARGET[target]}_3"] = pred_prob[:, 1]
df_output[f"{TARGET[target]}_4"] = pred_prob[:, 2]
df_output[f"{TARGET[target]}_5"] = pred_prob[:, 3]
df_output.to_csv(TEST_PATH + f"df_tsfel_test_{target}_prob.csv", index=False)

print("===== Finish target", target)

===== target L
Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.6, device='cuda', early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)

Best params: {'colsample_bytree': 0.6, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.6}

Best score: 0.9998930710340437

Confusion Matrix:


# 調整精確度

Excel 會自動把極小 or 極大數字用科學記號表示
- 用 `round()` 取到小數點第 4 位
- 修正機率總和 < 1

In [4]:
df = pd.read_csv("sample_submission_0530v0.csv")
print(df.shape)
df.take([95,96,97])
# df.head()

(1430, 10)


Unnamed: 0,unique_id,gender,hold racket handed,play years_0,play years_1,play years_2,level_2,level_3,level_4,level_5
95,2063,0.996721,0.998021,0.023608,0.839221,0.137171,0.002295,0.444776,0.003074,0.549855
96,2064,0.997686,0.000925,0.000231,0.997099,0.002671,0.000184,0.001569,0.000355,0.997892
97,2065,0.986931,0.999398,0.008396,0.222721,0.768883,0.990532,0.000658,0.000796,0.008013


In [5]:
df1 = df.round(4)
df1.take([95,96,97])

Unnamed: 0,unique_id,gender,hold racket handed,play years_0,play years_1,play years_2,level_2,level_3,level_4,level_5
95,2063,0.9967,0.998,0.0236,0.8392,0.1372,0.0023,0.4448,0.0031,0.5499
96,2064,0.9977,0.0009,0.0002,0.9971,0.0027,0.0002,0.0016,0.0004,0.9979
97,2065,0.9869,0.9994,0.0084,0.2227,0.7689,0.9905,0.0007,0.0008,0.008


In [6]:
# 檢查 play years, level
for idx in df1.index:
# for idx in [95,96,97]:
    arr = np.array(df1.loc[idx, 'play years_0':'play years_2'])
    if sum(arr) < 1:
        i_sort = np.argsort(arr)
        arr[i_sort[-1]] = 1 - sum(arr[i_sort[:-1]])
        df1.loc[idx, 'play years_0':'play years_2'] = arr
        # print(idx, arr)
        
    arr = np.array(df1.loc[idx, 'level_2':'level_5'])
    if sum(arr) < 1:
        i_sort = np.argsort(arr)
        arr[i_sort[-1]] = 1- sum(arr[i_sort[:-1]])
        df1.loc[idx, 'level_2':'level_5'] = arr

In [None]:
df1.to_csv("sample_submission_0530v1.csv", index=False)

In [3]:
# p0 vs l2 調整成 p0 -> p2 

(1430, 10)


Unnamed: 0,unique_id,gender,hold racket handed,play years_0,play years_1,play years_2,level_2,level_3,level_4,level_5
95,2063,0.9993,0.9994,0.006,0.9518,0.0422,0.0,0.6296,0.0,0.3704
96,2064,0.9997,0.0003,0.0,1.9983,0.0008,0.0,0.0,0.0,1.0
97,2065,0.9711,0.9994,0.0012,0.0354,0.9634,1.8519,0.037,0.0,0.037


In [13]:
df = pd.read_csv("sample_submission_0527v1.csv")
print(df.shape)
df.take([146,147,148])

(1430, 10)


Unnamed: 0,unique_id,gender,hold racket handed,play years_0,play years_1,play years_2,level_2,level_3,level_4,level_5
146,2114,0.998,0.9997,0.2756,0.7116,0.0128,0.0,0.037,0.0741,0.8889
147,2115,0.8132,0.9993,0.0123,0.9868,0.0009,0.0,0.2593,0.0,0.7407
148,2116,0.9999,0.9999,0.0086,0.1108,0.8806,0.6667,0.2593,0.0,0.0741


In [14]:
for idx in df1.index:
    arr = np.array(df.loc[idx, 'level_2':'level_5'])
    if arr[2] > 0:
        i_sort = np.argsort(arr)
        arr[2], arr[i_sort[-1]] = arr[i_sort[-1]], arr[2]
        df.loc[idx, 'level_2':'level_5'] = arr
df.take([146,147,148])

Unnamed: 0,unique_id,gender,hold racket handed,play years_0,play years_1,play years_2,level_2,level_3,level_4,level_5
146,2114,0.998,0.9997,0.2756,0.7116,0.0128,0.0,0.037,0.8889,0.0741
147,2115,0.8132,0.9993,0.0123,0.9868,0.0009,0.0,0.2593,0.0,0.7407
148,2116,0.9999,0.9999,0.0086,0.1108,0.8806,0.6667,0.2593,0.0,0.0741


In [15]:
df.to_csv("sample_submission_0527v2.csv", index=False)

In [19]:
# 檢查比例
pd.concat([X_train['mode'].reset_index(drop=True),
          y_train.reset_index(drop=True)], axis=1).value_counts()

mode  gender
10    1         645
9     1         537
      2         149
10    2          80
8     1          32
5     1          30
7     1          30
1     1          30
4     1          30
3     1          30
2     1          30
6     1          30
7     2           8
4     2           7
5     2           7
1     2           7
6     2           7
3     2           7
8     2           7
2     2           7
dtype: int64

In [5]:
# SMOTE resampling
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC # for numerical and categorical features

sm = SMOTENC(categorical_features=['mode'])
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print("X_train_sm =", X_train_sm.shape)
print(f"Resampled class distribution: {Counter(y_train_sm)}")

X_train_sm = (2848, 747)
Resampled class distribution: Counter({1: 1424, 2: 1424})
