In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import matplotlib.pyplot as plt


In [2]:
# Load the processed match table from a path relative to the notebook's working directory.
# Forward slashes are used because they resolve on both Windows and POSIX systems,
# whereas a backslash-separated relative path is only understood on Windows.
match_df = pd.read_csv("../dataset/processed_data/match_df_20_24_symmetry.csv", low_memory=False)

In [3]:
def _columns_ending_with(suffix):
    """Return the names of the match_df columns that end with `suffix`, in column order."""
    return [name for name in match_df.columns if name.endswith(suffix)]


# Feature groups selected by column-name suffix.
hist_features = _columns_ending_with('_hist')
hist_e_features = _columns_ending_with('_hist_e')
histo_features = _columns_ending_with('_histo')

# Feature groups listed by explicit column name.
elo_features = ['sym_elo']
rank_features = ['sym_rank', 'sym_rank_points']
supplement_features = [
    'round_code', 'best_of',
    'surface_Clay', 'surface_Grass', 'surface_Hard',
    'sym_seed_bucket',
    'sym_entry', 'sym_host', 'sym_hand', 'sym_ht', 'sym_age',
]

# Columns treated as continuous: the suffix, elo and rank groups plus two supplement columns.
continuous_features = [
    *hist_features, *hist_e_features, *histo_features,
    *elo_features, *rank_features,
    'sym_ht', 'sym_age',
]

# Full model input: the same five groups followed by every supplement column.
features = [
    *hist_features, *hist_e_features, *histo_features,
    *elo_features, *rank_features,
    *supplement_features,
]

# Feature matrix and target column.
X = match_df[features]
y = match_df['result']

In [4]:
# Take an explicit copy of the feature columns so that the assignment below
# does not trigger a SettingWithCopyWarning.
X = match_df[features].copy()

# Standardise the continuous columns; the scaler is fitted on every row of X.
# NOTE(review): the fit happens before any train/test split is made, so rows that
# later end up in the test set contribute to the fitted statistics — confirm this
# is acceptable for the evaluation.
scaler = StandardScaler()
scaled_continuous = scaler.fit_transform(X[continuous_features])
X[continuous_features] = scaled_continuous

In [34]:
# Disabled one-hot encoding step, kept for reference:
#X = pd.get_dummies(X, columns=['sym_entry', 'sym_hand', 'sym_host','round_code', 'best_of','sym_seed_bucket'], drop_first=True)

# Year-based split: rows whose tourney_date falls in 2020-2023 (inclusive) are used
# for training and rows from 2024 for testing.
# NOTE(review): in file order the following cell rebinds X_train, X_test, y_train and
# y_test, so this split is replaced when the notebook is run top to bottom.
match_dates = pd.to_datetime(match_df['tourney_date'], format='%Y-%m-%d')
match_df['year'] = match_dates.dt.year
train_mask = match_df['year'].between(2020, 2023)
test_mask = match_df['year'].eq(2024)

y = match_df['result']
X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

In [5]:
# Random hold-out split with 20% of the rows in the test set and a fixed seed.
# This rebinds X_train, X_test, y_train and y_test.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [6]:
# Baseline decision tree with a fixed depth limit, minimum split size and seed.
tree_params = {'max_depth': 10, 'min_samples_split': 10, 'random_state': 42}
tree_model = DecisionTreeClassifier(**tree_params).fit(X_train, y_train)

# Predict on the test rows and report accuracy plus the per-class report.
y_pred_tree = tree_model.predict(X_test)
accuracy_tree = accuracy_score(y_test, y_pred_tree)

print(f'Decision Tree Accuracy: {accuracy_tree:.4f}')
print(classification_report(y_test, y_pred_tree))

Decision Tree Accuracy: 0.5953
              precision    recall  f1-score   support

           0       0.61      0.63      0.62      1275
           1       0.58      0.56      0.57      1171

    accuracy                           0.60      2446
   macro avg       0.59      0.59      0.59      2446
weighted avg       0.59      0.60      0.59      2446



In [8]:
# Hyper-parameter search for the decision tree.
# GridSearchCV is imported in the notebook's top import cell, so other cells that
# use it do not depend on this cell having been executed first.
param_grid = {
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

# 5-fold cross-validation over the training rows, selecting by accuracy.
grid_tree = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_tree.fit(X_train, y_train)

best_tree = grid_tree.best_estimator_
# best_score_ is the mean cross-validated accuracy of the best parameter combination,
# computed on folds of X_train; it is not an accuracy measured on X_test.
print(f'Best Decision Tree Accuracy: {grid_tree.best_score_:.4f}')


Best Decision Tree Accuracy: 0.6208


In [9]:
# Baseline multi-layer perceptron: two hidden layers (64 and 32 units), ReLU
# activation, Adam solver, fixed regularisation strength and seed.
ann_params = dict(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    alpha=0.01,
    max_iter=500,
    random_state=42,
)
ann_model = MLPClassifier(**ann_params).fit(X_train, y_train)

# Predict on the test rows and report accuracy plus the per-class report.
y_pred_ann = ann_model.predict(X_test)
accuracy_ann = accuracy_score(y_test, y_pred_ann)

print(f'Artificial Neural Network Accuracy: {accuracy_ann:.4f}')
print(classification_report(y_test, y_pred_ann))

Artificial Neural Network Accuracy: 0.5879
              precision    recall  f1-score   support

           0       0.61      0.57      0.59      1275
           1       0.57      0.60      0.58      1171

    accuracy                           0.59      2446
   macro avg       0.59      0.59      0.59      2446
weighted avg       0.59      0.59      0.59      2446



In [None]:
# Hyper-parameter search for the MLP over layer layout, regularisation strength and solver.
param_grid_ann = dict(
    hidden_layer_sizes=[(64,), (64, 32), (128, 64)],
    alpha=[0.01, 0.001, 0.0001],
    solver=['adam', 'sgd'],
)

# 5-fold cross-validation over the training rows, selecting by accuracy.
ann_base = MLPClassifier(max_iter=500, random_state=42)
grid_ann = GridSearchCV(ann_base, param_grid_ann, cv=5, scoring='accuracy').fit(X_train, y_train)

# Keep the refitted best model and print the best score found by the search.
best_ann = grid_ann.best_estimator_
print(f'Best ANN Accuracy: {grid_ann.best_score_:.4f}')


In [None]:
# Side-by-side recap of the two baseline test accuracies computed in earlier cells.
tree_summary = f'Decision Tree Accuracy: {accuracy_tree:.4f}'
ann_summary = f'Artificial Neural Network Accuracy: {accuracy_ann:.4f}'
print(tree_summary)
print(ann_summary)