In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

import os
from pylab import mpl
# Use the SimHei font so Chinese text in figure titles/labels renders correctly.
mpl.rcParams["font.sans-serif"] = ["SimHei"]
# CJK fonts such as SimHei commonly lack the Unicode minus glyph (U+2212);
# fall back to the ASCII hyphen so negative tick labels are not drawn as boxes.
mpl.rcParams["axes.unicode_minus"] = False
# Do not truncate columns or rows when pandas objects are displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [10]:
# Load the processed match table used for modelling.
# The relative path is assembled with os.path.join so it resolves on any OS;
# a backslash-separated literal only works on Windows.
match_df = pd.read_csv(
    os.path.join("..", "dataset", "processed_data", "match_df_20_24_symmetry.csv"),
    low_memory=False,
)

In [11]:
# Numeric features.
continuous_features = [
    'sym_ht',
    'sym_age',
    'sym_elo_before',
    'sym_ace_hist',
    'sym_df_hist',
    'sym_svpt_hist',
    'sym_fstIn_hist',
    'sym_fstWon_hist',
    'sym_sndWon_hist',
    'sym_SvGms_hist',
    'sym_bpSaved_hist',
    'sym_bpFaced_hist',
    'sym_baseline_rally_hist',
    'sym_intensity_hist',
    'sym_ace_rate_hist',
    'sym_df_rate_hist',
    'sym_serve_win_rate_hist',
    'sym_serve_efficiency_hist',
    'sym_clutch_ability_hist',
]

# Remaining model inputs that are not in the numeric list above.
other_features = [
    'round_code',
    'best_of',
    'surface_Clay',
    'surface_Grass',
    'surface_Hard',
    'sym_seed_bucket',
    'sym_entry',
    'sym_host',
    'sym_hand',
]

# All features: the non-numeric columns first, followed by the numeric ones.
# The concatenation builds a new list, so `continuous_features` is not aliased.
features = other_features + continuous_features

# Feature matrix and target variable.
X = match_df[features]
y = match_df['result']

In [12]:
# Split into training and test sets: 20% is held out, stratified on y,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Create and train the decision tree model.
tree_params = {
    "max_depth": 5,  # cap the tree depth to limit overfitting
    "min_samples_split": 10,
    "random_state": 42,
}
dt_clf = DecisionTreeClassifier(**tree_params)
dt_clf.fit(X_train, y_train)

# Predict on the held-out set and report the metrics.
y_pred = dt_clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance analysis: label each importance with its column name,
# then order from most to least important.
importances = pd.Series(dt_clf.feature_importances_, index=X.columns)
feature_importance = importances.sort_values(ascending=False)
print("\nFeature Importance:")
print(feature_importance)

Accuracy: 0.6182

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.62      0.62      1222
           1       0.62      0.62      0.62      1224

    accuracy                           0.62      2446
   macro avg       0.62      0.62      0.62      2446
weighted avg       0.62      0.62      0.62      2446


Feature Importance:
sym_seed_bucket              0.577534
sym_serve_win_rate_hist      0.138623
sym_age                      0.082706
sym_serve_efficiency_hist    0.082593
sym_elo_before               0.039921
sym_clutch_ability_hist      0.023476
best_of                      0.020787
sym_intensity_hist           0.016936
sym_baseline_rally_hist      0.006606
sym_svpt_hist                0.005841
sym_sndWon_hist              0.004976
surface_Clay                 0.000000
surface_Grass                0.000000
surface_Hard                 0.000000
sym_ht                       0.000000
sym_entry                    0.000000
sym

In [13]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates: 4 depths x 3 split sizes x 3 leaf sizes.
param_grid = dict(
    max_depth=[5, 10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 5, 10],
)

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation on the training split.
grid_tree = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_tree.fit(X_train, y_train)

# Keep the estimator for the best-scoring combination; the printed value is
# the search's best cross-validated score, not a test-set score.
best_tree = grid_tree.best_estimator_
print(f'Best Decision Tree Accuracy: {grid_tree.best_score_:.4f}')


Best Decision Tree Accuracy: 0.6208


In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameter candidates: 4 depths x 3 split sizes x 3 leaf sizes.
param_grid = dict(
    max_depth=[5, 10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 5, 10],
)

# Same 5-fold accuracy-scored grid search as a plain GridSearchCV run, but
# executed in parallel and with progress messages enabled.
search_options = dict(
    cv=5,
    scoring='accuracy',
    n_jobs=-1,   # parallel computation
    verbose=1,   # print progress
)
grid_tree = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    **search_options,
)
grid_tree.fit(X_train, y_train)

# Keep the estimator for the best-scoring combination.
best_tree = grid_tree.best_estimator_

# Print the best parameters and the best cross-validated score.
print(f'Best Params: {grid_tree.best_params_}')
print(f'Best Decision Tree Accuracy: {grid_tree.best_score_:.4f}')


Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    3.5s


Best Params: {'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2}
Best Decision Tree Accuracy: 0.6208


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    5.4s finished
