In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pylab import mpl
# Use the SimHei font so Chinese text renders in matplotlib figures.
mpl.rcParams["font.sans-serif"] = ["SimHei"]
# CJK fonts such as SimHei typically lack the Unicode minus glyph (U+2212), which makes
# negative tick labels render as empty boxes; fall back to the ASCII hyphen-minus.
mpl.rcParams["axes.unicode_minus"] = False
# Let pandas display up to 200 columns and 200 rows before truncating.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
from sklearn.base import clone  # 新增导入解决NameError
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [2]:
# Build the path from its components so it resolves on every OS; a literal
# "..\dataset\..." string is treated as a single file name on POSIX systems.
MATCH_DATA_PATH = os.path.join("..", "dataset", "processed_data_1", "match_df_20_24.csv")
# low_memory=False makes pandas read the file in one pass instead of in chunks.
df = pd.read_csv(MATCH_DATA_PATH, low_memory=False)

In [3]:
# Print every column name of the loaded frame as a plain Python list.
print(df.columns.to_list())

['tourney_id', 'tourney_date', 'round_code', 'best_of', 'surface_Clay', 'surface_Grass', 'surface_Hard', 'player1_id', 'player2_id', 'player1_seed_bucket', 'player1_entry', 'player1_host', 'player1_hand', 'player1_ht', 'player1_age', 'player1_rank', 'player1_rank_points', 'player1_ace', 'player1_df', 'player1_svpt', 'player1_fstIn', 'player1_fstWon', 'player1_sndWon', 'player1_SvGms', 'player1_bpSaved', 'player1_bpFaced', 'baseline_rally', 'intensity', 'player1_ace_rate', 'player1_df_rate', 'player1_serve_win_rate', 'player1_serve_efficiency', 'player1_clutch_ability', 'player1_sets', 'player1_games', 'player1_ret', 'player1_elo_before_hard', 'player1_elo_before_clay', 'player1_elo_before_grass', 'result', 'player1_ace_hist', 'player1_df_hist', 'player1_svpt_hist', 'player1_fstIn_hist', 'player1_fstWon_hist', 'player1_sndWon_hist', 'player1_SvGms_hist', 'player1_bpSaved_hist', 'player1_bpFaced_hist', 'player1_baseline_rally_hist', 'player1_intensity_hist', 'player1_ace_rate_hist', 'pla

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

# Assumes `df` is the final dataset containing every feature column, with `result` as the label.

# Group feature columns by naming convention (adjust to the dataset's actual column names).
hist_features = [col for col in df.columns if col.endswith('_hist')]
hist_e_features = [col for col in df.columns if col.endswith('_hist_e')]
# NOTE(review): '_histo' does not follow the '_hist' / '_hist_e' pattern and this list is
# not used in this cell -- confirm whether '_hist_o' was intended.
hist_o_features = [col for col in df.columns if col.endswith('_histo')]

# Every column that is neither a `_hist` / `_hist_e` feature nor one of the explicitly
# excluded columns below. The exclusion set is built once instead of re-concatenating
# the lists for every column.
_excluded_columns = set(hist_features) | set(hist_e_features) | {
    'result', "tourney_id", "tourney_date", "player1_id", "player2_id",
    "player1_ret", "player2_ret",
    'player1_sets', 'player1_games', 'player2_sets', 'player2_games',
}
other_features = [col for col in df.columns if col not in _excluded_columns]

elo_features = ['player1_elo_before_grass', 'player1_elo_before_clay', 'player1_elo_before_hard',
                'player2_elo_before_grass', 'player2_elo_before_clay', 'player2_elo_before_hard']
rank_features = ['player1_rank', 'player1_rank_points', 'player2_rank', 'player2_rank_points']
context_features = ['round_code', 'best_of', 'surface_Clay', 'surface_Grass', 'surface_Hard',
                    'player1_seed_bucket', 'player1_entry', 'player1_host', 'player1_hand',
                    'player1_ht', 'player1_age',
                    'player2_seed_bucket', 'player2_entry', 'player2_host', 'player2_hand',
                    'player2_ht', 'player2_age']
# Composed from the lists above (same column order as before) so the elo and rank
# column names are spelled out in exactly one place.
supplement_features = context_features + elo_features + rank_features

# Feature combinations to compare.
X_A = df[hist_features + supplement_features]
X_B = df[hist_e_features + supplement_features]
X_C = df[hist_features + hist_e_features + supplement_features]
X_D = df[hist_features + rank_features]
X_F = df[hist_features + elo_features]
y = df['result']

# No train/test split happens here: the next cell splits by tournament year.
# The random train_test_split calls that used to live here were overwritten by that
# chronological split, and shuffling matches across years would leak later seasons
# into the training data.

In [5]:
# Extract the calendar year from the tournament date string (expects 'YYYY-MM-DD').
df['year'] = pd.to_datetime(df['tourney_date'], format='%Y-%m-%d').dt.year

# Chronological split: years 2020-2023 (inclusive) train, year 2024 tests.
test_mask = df['year'].eq(2024)
train_mask = df['year'].between(2020, 2023)

# Labels are split once and shared by every feature combination.
y_train, y_test = y.loc[train_mask], y.loc[test_mask]

# Apply the same row masks to each feature combination.
X_A_train, X_A_test = X_A.loc[train_mask], X_A.loc[test_mask]
X_B_train, X_B_test = X_B.loc[train_mask], X_B.loc[test_mask]
X_C_train, X_C_test = X_C.loc[train_mask], X_C.loc[test_mask]
X_D_train, X_D_test = X_D.loc[train_mask], X_D.loc[test_mask]
X_F_train, X_F_test = X_F.loc[train_mask], X_F.loc[test_mask]

In [6]:
# ==================================================================
# Three candidate models (logistic regression / ANN / decision tree),
# each paired with the scaler it is trained with (None = no scaling)
# ==================================================================
models = [
    dict(
        name="Logistic Regression",
        clf=LogisticRegression(max_iter=2000, random_state=42),
        scaler=StandardScaler(),  # logistic regression is trained on standardized inputs
    ),
    dict(
        name="ANN",
        clf=MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42),
        scaler=StandardScaler(),  # the neural network is trained on standardized inputs
    ),
    dict(
        name="Decision Tree",
        clf=DecisionTreeClassifier(random_state=42),
        scaler=None,  # the decision tree is trained on the raw features
    ),
]

# Feature combinations to evaluate: (label, training frame, test frame).
feature_sets = (
    ('Hist+s', X_A_train, X_A_test),
    ('Hist_e+s', X_B_train, X_B_test),
    ('Hist+Hist_e+s', X_C_train, X_C_test),
    ('hist+r', X_D_train, X_D_test),
    ('hist+e', X_F_train, X_F_test),
)

# ==================================================================
# Train and evaluate every model on every feature combination
# ==================================================================
for model in models:
    print(f"\n=== 正在评估模型: {model['name']} ===")
    for feature_set_name, X_train, X_test in feature_sets:
        # Work on a fresh, unfitted copy so runs never share fitted state.
        clf = clone(model["clf"])

        if model["scaler"] is None:
            # No scaler configured for this model: use the features as they are.
            X_train_scaled, X_test_scaled = X_train, X_test
        else:
            # Fit the scaler on the training rows only, then reuse it for the test rows.
            scaler = clone(model["scaler"])
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

        clf.fit(X_train_scaled, y_train)

        # Hard class predictions plus the predicted probability of the positive class.
        y_pred = clf.predict(X_test_scaled)
        y_score = clf.predict_proba(X_test_scaled)[:, 1]

        acc = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_score)
        f1 = f1_score(y_test, y_pred)

        print(f"特征组合 [{feature_set_name}] - Accuracy: {acc:.4f}, AUC: {auc:.4f}, F1: {f1:.4f}")


=== 正在评估模型: Logistic Regression ===
特征组合 [Hist+s] - Accuracy: 0.6461, AUC: 0.7131, F1: 0.6429
特征组合 [Hist_e+s] - Accuracy: 0.6441, AUC: 0.7043, F1: 0.6413
特征组合 [Hist+Hist_e+s] - Accuracy: 0.6455, AUC: 0.7085, F1: 0.6435
特征组合 [hist+r] - Accuracy: 0.6403, AUC: 0.7087, F1: 0.6374
特征组合 [hist+e] - Accuracy: 0.6192, AUC: 0.6775, F1: 0.6072

=== 正在评估模型: ANN ===
特征组合 [Hist+s] - Accuracy: 0.5787, AUC: 0.6186, F1: 0.5888
特征组合 [Hist_e+s] - Accuracy: 0.6205, AUC: 0.6671, F1: 0.6213
特征组合 [Hist+Hist_e+s] - Accuracy: 0.6205, AUC: 0.6688, F1: 0.6195
特征组合 [hist+r] - Accuracy: 0.6064, AUC: 0.6492, F1: 0.6043
特征组合 [hist+e] - Accuracy: 0.6088, AUC: 0.6512, F1: 0.5892

=== 正在评估模型: Decision Tree ===
特征组合 [Hist+s] - Accuracy: 0.5752, AUC: 0.5753, F1: 0.5766
特征组合 [Hist_e+s] - Accuracy: 0.5534, AUC: 0.5534, F1: 0.5478
特征组合 [Hist+Hist_e+s] - Accuracy: 0.5700, AUC: 0.5701, F1: 0.5712
特征组合 [hist+r] - Accuracy: 0.5655, AUC: 0.5656, F1: 0.5687
特征组合 [hist+e] - Accuracy: 0.5458, AUC: 0.5459, F1: 0.5483
