In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
# Lift pandas' display truncation so frames are rendered with every column and every row.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [2]:
# Load the per-match dataset from a path relative to the notebook's working directory.
# Forward slashes are used because they resolve on Windows as well as macOS/Linux, whereas
# a backslash-separated path is treated as one literal file name on POSIX systems.
# The directory name "tennis_predicton" is reproduced exactly from the original path.
# low_memory=False makes pandas infer column dtypes from the whole file instead of
# chunk by chunk, which avoids mixed-type columns.
match_df = pd.read_csv(
    "../dataset/tennis_predicton/processed_data_1/match_df_20_24.csv",
    low_memory=False,
)

In [3]:
# Keep only fully populated rows: a row is discarded if ANY column holds a missing value.
# Note that this looks at every column of the frame, not just the features selected below.
match_df = match_df.dropna(axis=0, how='any')

In [4]:
def _columns_with_suffix(suffix):
    """Return the names of the match_df columns ending with `suffix`, in column order."""
    return [name for name in match_df.columns if name.endswith(suffix)]


# Feature groups picked up from the frame by column-name suffix.
hist_features = _columns_with_suffix('_hist')
hist_e_features = _columns_with_suffix('_hist_e')
histo_features = _columns_with_suffix('_histo')

# Hand-listed feature groups.
supplement_features = [
    'round_code', 'best_of',
    'player1_seed_bucket', 'player2_seed_bucket',
    'player1_entry', 'player1_host', 'player1_hand', 'player1_ht', 'player1_age',
    'player2_entry', 'player2_host', 'player2_hand', 'player2_ht', 'player2_age',
]
elo_features = ['player1_elo', 'player2_elo']
rank_features = [
    'player1_rank', 'player1_rank_points',
    'player2_rank', 'player2_rank_points',
]

# Active feature set: ranking features only. The full combination is kept for reference:
#features =hist_features + hist_e_features + supplement_features + elo_features +  rank_features + histo_features
features = rank_features

# Numeric (continuous) features. Full variant kept for reference:
#continuous_features = hist_features+hist_e_features+rank_features+histo_features+elo_features+['player1_ht','player1_age','player2_ht', 'player2_age',]
continuous_features = rank_features

# Model inputs and target variable.
X = match_df[features]
y = match_df['result']

In [5]:
# One-hot encoding of the categorical columns is currently disabled:
#X = pd.get_dummies(X, columns=['sym_entry', 'sym_hand', 'sym_host','round_code', 'best_of','sym_seed_bucket'], drop_first=True)

# Chronological split by tournament year: 2020-2023 for training, 2024 held out for testing.
# NOTE(review): when the notebook is run top to bottom, the following cell re-assigns
# X_train / X_test / y_train / y_test with a random train_test_split, so the split built
# here is not the one the models are fitted on. Confirm which split is intended.
match_df['year'] = pd.to_datetime(match_df['tourney_date'], format='%Y-%m-%d').dt.year
train_mask = (match_df['year'] >= 2020) & (match_df['year'] <= 2023)
test_mask = match_df['year'].eq(2024)

y = match_df['result']
X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_test, y_test = X.loc[test_mask], y.loc[test_mask]

In [6]:
# Random 80/20 hold-out split with a fixed seed.
# NOTE(review): this re-assigns the X_train / X_test / y_train / y_test produced by the
# year-based split in the previous cell, mixing matches from all years in both sets.
# Confirm that the random split (and not the chronological one) is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [7]:
# Baseline decision tree with fixed hyper-parameters, fitted on the training split.
tree_model = DecisionTreeClassifier(
    random_state=42,
    max_depth=10,
    min_samples_split=10,
).fit(X_train, y_train)

# Score the baseline on the held-out split.
y_pred_tree = tree_model.predict(X_test)
accuracy_tree = accuracy_score(y_test, y_pred_tree)

print(f'Decision Tree Accuracy: {accuracy_tree:.4f}')
print(classification_report(y_test, y_pred_tree))

Decision Tree Accuracy: 0.6074
              precision    recall  f1-score   support

           0       0.61      0.70      0.65      1269
           1       0.61      0.51      0.55      1166

    accuracy                           0.61      2435
   macro avg       0.61      0.60      0.60      2435
weighted avg       0.61      0.61      0.60      2435



In [8]:
# Exhaustive hyper-parameter search for the decision tree, using 5-fold cross-validation
# on the training split and accuracy as the selection metric.
# (GridSearchCV is imported in the notebook's import cell at the top.)
param_grid = {
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

grid_tree = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_tree.fit(X_train, y_train)

# With the default refit=True, best_estimator_ is the winning configuration refitted on
# the whole training split.
best_tree = grid_tree.best_estimator_

# best_score_ is the MEAN CROSS-VALIDATED accuracy over the training folds. It is not a
# held-out test accuracy and must not be compared directly with the baseline tree's test score.
print(f'Best Decision Tree Accuracy: {grid_tree.best_score_:.4f}')
print(f'Best Decision Tree Params: {grid_tree.best_params_}')

# Score the tuned tree on the same held-out split as the baseline so the two are comparable.
y_pred_best_tree = best_tree.predict(X_test)
accuracy_best_tree = accuracy_score(y_test, y_pred_best_tree)
print(f'Best Decision Tree Test Accuracy: {accuracy_best_tree:.4f}')


Best Decision Tree Accuracy: 0.6195


In [10]:
# Impurity-based importances of the fitted baseline tree, labelled by feature name and
# listed from most to least important.
# NOTE(review): this reads `tree_model` (the fixed-parameter tree), not the grid-search
# `best_tree` — confirm that is the model whose importances should be reported.
feature_importance = pd.Series(data=tree_model.feature_importances_, index=X.columns)
feature_importance = feature_importance.sort_values(ascending=False)

print("\nFeature Importance:")
print(feature_importance)


Feature Importance:
player2_rank_points    0.392490
player1_rank           0.294639
player1_rank_points    0.194426
player2_rank           0.118446
dtype: float64
