In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pylab import mpl
# 设置显示中文字体
mpl.rcParams["font.sans-serif"] = ["SimHei"]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
match_df = pd.read_csv(r"..\dataset\processed_data\match_df_20_24.csv",low_memory=False)

In [3]:
# 保留不变的列
static_columns = [
    "tourney_id", "tourney_date", "round_code", "best_of", "surface_Clay", "surface_Grass", "surface_Hard",
    "player1_id", "player2_id", "result"
]

# 获取所有 player_1 和 player_2 相关列
player1_columns = [col for col in match_df.columns if col.startswith("player1_") and col != "player1_id"]
player2_columns = [col.replace("player1_", "player2_") for col in player1_columns]

# 计算差值并创建新DataFrame
new_data = match_df[static_columns].copy()
for p1_col, p2_col in zip(player1_columns, player2_columns):
    if p2_col in match_df.columns:
        new_col_name = p1_col.replace("player1_", "sym_")
        new_data[new_col_name] = match_df[p1_col] - match_df[p2_col]


In [4]:
# 将新数据保存到新的CSV文件
new_data.to_csv(r"..\dataset\processed_data\match_df_20_24_symmetry.csv", index=False)
print("数据处理完成，已保存至 match_df_20_24_symmetry.csv")

数据处理完成，已保存至 match_df_20_24_symmetry.csv


In [5]:
new_data.head(1)

Unnamed: 0,tourney_id,tourney_date,round_code,best_of,surface_Clay,surface_Grass,surface_Hard,player1_id,player2_id,result,sym_seed_bucket,sym_entry,sym_host,sym_hand,sym_ht,sym_age,sym_ace,sym_df,sym_svpt,sym_fstIn,sym_fstWon,sym_sndWon,sym_SvGms,sym_bpSaved,sym_bpFaced,sym_ace_rate,sym_df_rate,sym_serve_win_rate,sym_serve_efficiency,sym_clutch_ability,sym_sets,sym_games,sym_ret,sym_elo_before,sym_elo_after,sym_ace_hist,sym_df_hist,sym_svpt_hist,sym_fstIn_hist,sym_fstWon_hist,sym_sndWon_hist,sym_SvGms_hist,sym_bpSaved_hist,sym_bpFaced_hist,sym_baseline_rally_hist,sym_intensity_hist,sym_ace_rate_hist,sym_df_rate_hist,sym_serve_win_rate_hist,sym_serve_efficiency_hist,sym_clutch_ability_hist
0,2020-8888,2020-01-06,2.0,3,0,0,1,126774,100644,1,0.0,0,0,0,-5.0,-1.3,0.0,-9.0,-10.0,4.0,8.0,-1.0,1.0,-3.0,-6.0,0.008081,-0.433333,0.264646,-1.875,0.0,2,7,0,-1105.450446,-1105.450446,-1.005891,-1.842208,0.836917,-1.687512,-0.006192,2.321719,0.186348,-0.223873,-0.661657,0.01068,-0.070355,-0.015836,-0.088189,0.018441,-0.015817,-0.0101


In [6]:
new_data.columns

Index(['tourney_id', 'tourney_date', 'round_code', 'best_of', 'surface_Clay',
       'surface_Grass', 'surface_Hard', 'player1_id', 'player2_id', 'result',
       'sym_seed_bucket', 'sym_entry', 'sym_host', 'sym_hand', 'sym_ht',
       'sym_age', 'sym_ace', 'sym_df', 'sym_svpt', 'sym_fstIn', 'sym_fstWon',
       'sym_sndWon', 'sym_SvGms', 'sym_bpSaved', 'sym_bpFaced', 'sym_ace_rate',
       'sym_df_rate', 'sym_serve_win_rate', 'sym_serve_efficiency',
       'sym_clutch_ability', 'sym_sets', 'sym_games', 'sym_ret',
       'sym_elo_before', 'sym_elo_after', 'sym_ace_hist', 'sym_df_hist',
       'sym_svpt_hist', 'sym_fstIn_hist', 'sym_fstWon_hist', 'sym_sndWon_hist',
       'sym_SvGms_hist', 'sym_bpSaved_hist', 'sym_bpFaced_hist',
       'sym_baseline_rally_hist', 'sym_intensity_hist', 'sym_ace_rate_hist',
       'sym_df_rate_hist', 'sym_serve_win_rate_hist',
       'sym_serve_efficiency_hist', 'sym_clutch_ability_hist'],
      dtype='object')

In [7]:
from sklearn.base import clone  # 新增导入解决NameError
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [8]:
df = new_data.copy()

In [9]:
df.columns

Index(['tourney_id', 'tourney_date', 'round_code', 'best_of', 'surface_Clay',
       'surface_Grass', 'surface_Hard', 'player1_id', 'player2_id', 'result',
       'sym_seed_bucket', 'sym_entry', 'sym_host', 'sym_hand', 'sym_ht',
       'sym_age', 'sym_ace', 'sym_df', 'sym_svpt', 'sym_fstIn', 'sym_fstWon',
       'sym_sndWon', 'sym_SvGms', 'sym_bpSaved', 'sym_bpFaced', 'sym_ace_rate',
       'sym_df_rate', 'sym_serve_win_rate', 'sym_serve_efficiency',
       'sym_clutch_ability', 'sym_sets', 'sym_games', 'sym_ret',
       'sym_elo_before', 'sym_elo_after', 'sym_ace_hist', 'sym_df_hist',
       'sym_svpt_hist', 'sym_fstIn_hist', 'sym_fstWon_hist', 'sym_sndWon_hist',
       'sym_SvGms_hist', 'sym_bpSaved_hist', 'sym_bpFaced_hist',
       'sym_baseline_rally_hist', 'sym_intensity_hist', 'sym_ace_rate_hist',
       'sym_df_rate_hist', 'sym_serve_win_rate_hist',
       'sym_serve_efficiency_hist', 'sym_clutch_ability_hist'],
      dtype='object')

In [14]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

# 假设 df 是最终包含所有特征的数据集，result 为标签

# 划分特征集，举例：选取 _hist 与其他特征（这里需要根据实际特征列名来指定）
hist_features = [col for col in df.columns if col.endswith('_hist')]
inmatch_features = [col for col in df.columns if col not in hist_features  + ['result',"tourney_id",
                                                        "tourney_date","player1_id","player2_id", "sym_ret",
                                                        'sym_sets', 'sym_games','sym_sets', 'sym_games']]
supplement_features =['round_code','best_of', 'surface_Clay', 'surface_Grass', 'surface_Hard','sym_seed_bucket',
                     'sym_entry', 'sym_host', 'sym_hand', 'sym_ht','sym_age','sym_elo_before']
important_features =['sym_serve_win_rate','sym_serve_efficiency','sym_clutch_ability']

# 构造不同特征组合
#X_A = df[hist_features]
X_B = df[hist_features + inmatch_features]
X_C = df[hist_features + supplement_features]
X_D = df[['sym_elo_before',] + supplement_features]
X_F = df.drop(["result", "tourney_id", "tourney_date","player1_id","player2_id",
                   "sym_ret",'sym_sets', 'sym_games','sym_serve_win_rate','sym_serve_efficiency','sym_clutch_ability'], axis=1)
y = df['result']

# 分割训练集和测试集
X_A_train, X_A_test, y_train, y_test = train_test_split(X_A, y, test_size=0.3, random_state=42)
X_B_train, X_B_test, _, _ = train_test_split(X_B, y, test_size=0.3, random_state=42)
X_C_train, X_C_test, _, _ = train_test_split(X_C, y, test_size=0.3, random_state=42)
X_D_train, X_D_test, _, _ = train_test_split(X_D, y, test_size=0.3, random_state=42)
X_F_train, X_F_test, _, _ = train_test_split(X_F, y, test_size=0.3, random_state=42)

In [15]:
# ==================================================================
# 定义三个模型（逻辑回归/ANN/决策树）和对应的标准化器
# ==================================================================
models = [
    {
        "name": "Logistic Regression",
        "clf": LogisticRegression(max_iter=1000, random_state=42),
        "scaler": StandardScaler()  # 逻辑回归需要标准化
    },
    {
        "name": "ANN",
        "clf": MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42),
        "scaler": StandardScaler()  # 神经网络需要标准化
    },
    {
        "name": "Decision Tree",
        "clf": DecisionTreeClassifier(random_state=42),
        "scaler": None  # 决策树不需要标准化
    }
]

# ==================================================================
# 遍历每个模型进行训练和评估
# ==================================================================
for model in models:
    print(f"\n=== 正在评估模型: {model['name']} ===")
    # 遍历不同特征组合
    for feature_set_name, X_train, X_test in [
#        ('A', X_A_train, X_A_test),
#        ('Hist+inmatch', X_B_train, X_B_test),
        ('C', X_C_train, X_C_test),
        ('D', X_D_train, X_D_test),
#        ('inmatch', X_F_train, X_F_test),
    ]:
        # 克隆一个新模型确保每次训练独立
        clf = clone(model["clf"])  # 现在可以正常使用了
        
        # 数据标准化处理（如果该模型需要）
        if model["scaler"] is not None:
            # 训练集标准化
            scaler = clone(model["scaler"])
            X_train_scaled = scaler.fit_transform(X_train)
            # 测试集使用相同的scaler转换
            X_test_scaled = scaler.transform(X_test)
        else:
            X_train_scaled = X_train
            X_test_scaled = X_test
        
        # 模型训练
        clf.fit(X_train_scaled, y_train)
        
        # 预测结果
        y_pred = clf.predict(X_test_scaled)
        
        # 计算评估指标
        acc = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, clf.predict_proba(X_test_scaled)[:, 1])
        f1 = f1_score(y_test, y_pred)
        
        # 打印结果
        print(f"特征组合 [{feature_set_name}] - Accuracy: {acc:.4f}, AUC: {auc:.4f}, F1: {f1:.4f}")


=== 正在评估模型: Logistic Regression ===
特征组合 [C] - Accuracy: 0.6405, AUC: 0.7006, F1: 0.6415
特征组合 [D] - Accuracy: 0.6307, AUC: 0.6821, F1: 0.6331

=== 正在评估模型: ANN ===
特征组合 [C] - Accuracy: 0.5953, AUC: 0.6394, F1: 0.5869
特征组合 [D] - Accuracy: 0.5950, AUC: 0.6510, F1: 0.6096

=== 正在评估模型: Decision Tree ===
特征组合 [C] - Accuracy: 0.5533, AUC: 0.5533, F1: 0.5486
特征组合 [D] - Accuracy: 0.5459, AUC: 0.5465, F1: 0.5448
