In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pylab import mpl
# 设置显示中文字体
mpl.rcParams["font.sans-serif"] = ["SimHei"]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve

In [2]:
df = pd.read_csv(r"..\dataset\processed_data_1\match_df_20_24.csv",low_memory=False)

In [3]:
def generate_diff_features(df):
    """
    生成差值特征：
    1. 常规规则：对以 '_hist' 结尾的字段，生成 `当前值 - 历史值` 的差值。
    2. 特殊规则：对 `player1_baseline_rally` 和 `player2_baseline_rally`，生成 `baseline_rally - 当前值` 的差值。
    """
    new_df = df.copy()
    
    # --------------------------------------------
    # 1. 常规规则：处理以 _hist 结尾的字段
    # --------------------------------------------
    # 提取所有以 '_hist' 结尾的字段（排除特殊字段的 _hist 版本）
    hist_columns = [
        col for col in df.columns 
        if col.endswith('_hist_e') 
        and not col.startswith(('player1_baseline_rally_hist', 'player2_baseline_rally_hist','player1_intensity_hist', 'player2_intensity_hist'))
    ]
    
    # 生成对应的当前值字段名（去掉 '_hist' 后缀）
    current_columns = [col.replace('_hist_e', '') for col in hist_columns]
    
    # 筛选有效字段对
    valid_pairs = []
    for hist_col, current_col in zip(hist_columns, current_columns):
        if current_col in df.columns:
            valid_pairs.append((current_col, hist_col))
        else:
            print(f"警告：当前值字段 {current_col} 不存在，跳过生成差值特征")
    
    # 计算常规差值特征
    for current_col, hist_col in valid_pairs:
        diff_col = f"{current_col}_diff"
        new_df[diff_col] = new_df[current_col] - new_df[hist_col]
    
  
    # 定义特殊字段列表
    special_columns = ['baseline_rally', 'intensity']
    
    for col in special_columns:
        if col in df.columns:
            diff_col1 = f"player1_{col}_diff"
            diff_col2 = f"player2_{col}_diff"
            hist_col1 = f"player1_{col}_hist"
            hist_col2 = f"player2_{col}_hist"
            new_df[diff_col1] = new_df['baseline_rally'] - new_df[hist_col1]
            new_df[diff_col1] = new_df['baseline_rally'] - new_df[hist_col2]
        else:
            print(f"警告：特殊字段 {col} 不存在，跳过生成差值特征")
    
    return new_df

# 示例调用
# 假设 df 是原始 DataFrame
df_processed = generate_diff_features(df)

In [4]:
df_processed.head(1)

Unnamed: 0,player1_index,tourney_id,tourney_date,round_code,best_of,surface_Clay,surface_Grass,surface_Hard,player1_id,player2_id,player1_seed_bucket,player1_entry,player1_host,player1_hand,player1_ht,player1_age,player1_rank,player1_rank_points,player1_elo,player1_ace,player1_df,player1_svpt,player1_fstIn,player1_fstWon,player1_sndWon,player1_SvGms,player1_bpSaved,player1_bpFaced,baseline_rally,intensity,player1_ace_rate,player1_df_rate,player1_serve_win_rate,player1_serve_efficiency,player1_clutch_ability,player1_o_seed_bucket,player1_o_entry,player1_o_host,player1_o_hand,player1_o_ht,player1_o_age,player1_o_rank,player1_o_rank_points,player1_o_ace,player1_o_df,player1_o_svpt,player1_o_fstIn,player1_o_fstWon,player1_o_sndWon,player1_o_SvGms,player1_o_bpSaved,player1_o_bpFaced,player1_o_ace_rate,player1_o_df_rate,player1_o_serve_win_rate,player1_o_serve_efficiency,player1_o_clutch_ability,player1_o_sets,player1_o_games,player1_o_elo,result,player1_ace_hist,player1_df_hist,player1_svpt_hist,player1_fstIn_hist,player1_fstWon_hist,player1_sndWon_hist,player1_SvGms_hist,player1_bpSaved_hist,player1_bpFaced_hist,player1_baseline_rally_hist,player1_intensity_hist,player1_ace_rate_hist,player1_df_rate_hist,player1_serve_win_rate_hist,player1_serve_efficiency_hist,player1_clutch_ability_hist,player1_ace_hist_e,player1_df_hist_e,player1_svpt_hist_e,player1_fstIn_hist_e,player1_fstWon_hist_e,player1_sndWon_hist_e,player1_SvGms_hist_e,player1_bpSaved_hist_e,player1_bpFaced_hist_e,player1_baseline_rally_hist_e,player1_intensity_hist_e,player1_ace_rate_hist_e,player1_df_rate_hist_e,player1_serve_win_rate_hist_e,player1_serve_efficiency_hist_e,player1_clutch_ability_hist_e,player1_o_seed_bucket_histo,player1_o_entry_histo,player1_o_host_histo,player1_o_hand_histo,player1_o_ht_histo,player1_o_age_histo,player1_o_rank_histo,player1_o_rank_points_histo,player1_o_ace_histo,player1_o_df_histo,player1_o_svpt_histo,player1_o_fstIn_histo,player1_o_fstWon_histo,player1_o_sndWon_histo,player1_o_SvGms_histo,player1_o_bpSaved_histo,player1_o_bpFaced_histo,player1_o_ace_rate_histo,player1_o_df_rate_histo,player1_o_serve_win_rate_histo,player1_o_serve_efficiency_histo,player1_o_clutch_ability_histo,player1_o_elo_histo,player1_baseline_rally_histo,player1_intensity_histo,player2_index,player2_seed_bucket,player2_entry,player2_host,player2_hand,player2_ht,player2_age,player2_rank,player2_rank_points,player2_elo,player2_ace,player2_df,player2_svpt,player2_fstIn,player2_fstWon,player2_sndWon,player2_SvGms,player2_bpSaved,player2_bpFaced,player2_ace_rate,player2_df_rate,player2_serve_win_rate,player2_serve_efficiency,player2_clutch_ability,player2_o_seed_bucket,player2_o_entry,player2_o_host,player2_o_hand,player2_o_ht,player2_o_age,player2_o_rank,player2_o_rank_points,player2_o_ace,player2_o_df,player2_o_svpt,player2_o_fstIn,player2_o_fstWon,player2_o_sndWon,player2_o_SvGms,player2_o_bpSaved,player2_o_bpFaced,player2_o_ace_rate,player2_o_df_rate,player2_o_serve_win_rate,player2_o_serve_efficiency,player2_o_clutch_ability,player2_o_sets,player2_o_games,player2_o_elo,player2_ace_hist,player2_df_hist,player2_svpt_hist,player2_fstIn_hist,player2_fstWon_hist,player2_sndWon_hist,player2_SvGms_hist,player2_bpSaved_hist,player2_bpFaced_hist,player2_baseline_rally_hist,player2_intensity_hist,player2_ace_rate_hist,player2_df_rate_hist,player2_serve_win_rate_hist,player2_serve_efficiency_hist,player2_clutch_ability_hist,player2_ace_hist_e,player2_df_hist_e,player2_svpt_hist_e,player2_fstIn_hist_e,player2_fstWon_hist_e,player2_sndWon_hist_e,player2_SvGms_hist_e,player2_bpSaved_hist_e,player2_bpFaced_hist_e,player2_baseline_rally_hist_e,player2_intensity_hist_e,player2_ace_rate_hist_e,player2_df_rate_hist_e,player2_serve_win_rate_hist_e,player2_serve_efficiency_hist_e,player2_clutch_ability_hist_e,player2_o_seed_bucket_histo,player2_o_entry_histo,player2_o_host_histo,player2_o_hand_histo,player2_o_ht_histo,player2_o_age_histo,player2_o_rank_histo,player2_o_rank_points_histo,player2_o_ace_histo,player2_o_df_histo,player2_o_svpt_histo,player2_o_fstIn_histo,player2_o_fstWon_histo,player2_o_sndWon_histo,player2_o_SvGms_histo,player2_o_bpSaved_histo,player2_o_bpFaced_histo,player2_o_ace_rate_histo,player2_o_df_rate_histo,player2_o_serve_win_rate_histo,player2_o_serve_efficiency_histo,player2_o_clutch_ability_histo,player2_o_elo_histo,player2_baseline_rally_histo,player2_intensity_histo,player1_ace_diff,player1_df_diff,player1_svpt_diff,player1_fstIn_diff,player1_fstWon_diff,player1_sndWon_diff,player1_SvGms_diff,player1_bpSaved_diff,player1_bpFaced_diff,player1_ace_rate_diff,player1_df_rate_diff,player1_serve_win_rate_diff,player1_serve_efficiency_diff,player1_clutch_ability_diff,player2_ace_diff,player2_df_diff,player2_svpt_diff,player2_fstIn_diff,player2_fstWon_diff,player2_sndWon_diff,player2_SvGms_diff,player2_bpSaved_diff,player2_bpFaced_diff,player2_ace_rate_diff,player2_df_rate_diff,player2_serve_win_rate_diff,player2_serve_efficiency_diff,player2_clutch_ability_diff,player1_baseline_rally_diff,player1_intensity_diff
0,107713,2020-8888,2020-01-06,2.0,3,0,0,1,100644,126774,33,1,0,1,198.0,22.7,7.0,3345.0,1928.907389,2.0,10.0,55.0,25.0,16.0,11.0,8.0,3.0,7.0,0.73,0.470588,0.036364,0.5,0.490909,6.875,0.5,33,1,0,1,193.0,21.4,6.0,5300.0,2.0,1.0,45.0,29.0,24.0,10.0,9.0,0.0,1.0,0.044444,0.066667,0.755556,5.0,0.5,2,12,1749.813724,0,8.018237,4.224924,80.632219,51.24924,37.598784,14.647416,12.87538,3.693009,6.12462,0.681496,0.533083,0.102603,0.173369,0.655457,6.21916,0.520716,5.395045,7.791595,67.576728,42.16625,30.417462,8.257673,10.133944,3.233374,6.717528,0.70898,0.505496,0.075055,0.420287,0.561978,6.594296,0.308349,30.003049,0.817073,0.094512,0.887195,187.780488,27.977439,65.727896,1717.91311,4.728659,2.966463,82.155488,50.167683,35.109756,15.77439,12.841463,4.371951,7.512195,0.057891,0.109637,0.617805,6.366569,0.477965,1845.590544,0.681183,0.532763,107627,33,1,0,1,193.0,21.4,6.0,5300.0,1749.813724,2.0,1.0,45.0,29.0,24.0,10.0,9.0,0.0,1.0,0.044444,0.066667,0.755556,5.0,0.5,33,1,0,1,198.0,22.7,7.0,3345.0,2.0,10.0,55.0,25.0,16.0,11.0,8.0,3.0,7.0,0.036364,0.5,0.490909,6.875,0.5,0,5,1928.907389,7.012346,2.382716,81.469136,49.561728,37.592593,16.969136,13.061728,3.469136,5.462963,0.692176,0.462728,0.086767,0.08518,0.673898,6.203344,0.510616,8.516698,1.995442,97.082971,62.343472,48.079308,17.246844,14.058209,5.193605,6.457917,0.754438,0.339324,0.087322,0.061175,0.675262,6.94618,0.620426,30.524691,0.820988,0.098765,0.901235,187.419753,27.618519,54.975309,2075.598765,5.104938,2.617284,80.91358,51.049383,36.796296,15.54321,12.765432,4.006173,6.444444,0.063838,0.100113,0.646038,6.325173,0.49007,1838.96638,0.692176,0.462728,-3.395045,2.208405,-12.576728,-17.16625,-14.417462,2.742327,-2.133944,-0.233374,0.282472,-0.038691,0.079713,-0.071069,0.280704,0.191651,-6.516698,-0.995442,-52.082971,-33.343472,-24.079308,-7.246844,-5.058209,-5.193605,-5.457917,-0.042877,0.005491,0.080293,-1.94618,-0.120426,0.037824,0.267272


In [5]:
supplement_features =['round_code','best_of',
                      'player1_seed_bucket','player2_seed_bucket',
                     'player1_entry', 'player1_host', 'player1_hand', 'player1_ht','player1_age',
                      'player2_entry', 'player2_host', 'player2_hand', 'player2_ht', 'player2_age',
                      ]
elo_features =['player1_elo','player2_elo']
rank_features =['player1_rank', 'player1_rank_points','player2_rank', 'player2_rank_points']
diff_features = [col for col in df_processed.columns if col.endswith('_diff')]

#features = supplement_features + elo_features +  rank_features + diff_features
features = diff_features
# 数值型特征
continuous_features = diff_features#+['player1_ht','player1_age','player2_ht', 'player2_age',]
# 目标变量
X = df_processed[features]
y = df_processed['result']

In [6]:
diff_features

['player1_ace_diff',
 'player1_df_diff',
 'player1_svpt_diff',
 'player1_fstIn_diff',
 'player1_fstWon_diff',
 'player1_sndWon_diff',
 'player1_SvGms_diff',
 'player1_bpSaved_diff',
 'player1_bpFaced_diff',
 'player1_ace_rate_diff',
 'player1_df_rate_diff',
 'player1_serve_win_rate_diff',
 'player1_serve_efficiency_diff',
 'player1_clutch_ability_diff',
 'player2_ace_diff',
 'player2_df_diff',
 'player2_svpt_diff',
 'player2_fstIn_diff',
 'player2_fstWon_diff',
 'player2_sndWon_diff',
 'player2_SvGms_diff',
 'player2_bpSaved_diff',
 'player2_bpFaced_diff',
 'player2_ace_rate_diff',
 'player2_df_rate_diff',
 'player2_serve_win_rate_diff',
 'player2_serve_efficiency_diff',
 'player2_clutch_ability_diff',
 'player1_baseline_rally_diff',
 'player1_intensity_diff']

In [7]:
# 显式创建 X 的副本，防止 SettingWithCopyWarning
X = df_processed[features].copy().fillna(0)

# 归一化数值特征
scaler = StandardScaler()
X[continuous_features] = scaler.fit_transform(X[continuous_features])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
model = LogisticRegression(solver='liblinear', penalty='l2', C=0.01)
model.fit(X_train, y_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]  # 概率预测

accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(f'Accuracy: {accuracy:.4f}')
print(f'AUC: {auc:.4f}')
print(classification_report(y_test, y_pred))


Accuracy: 0.8598
AUC: 0.9344
              precision    recall  f1-score   support

           0       0.86      0.86      0.86      1245
           1       0.86      0.86      0.86      1201

    accuracy                           0.86      2446
   macro avg       0.86      0.86      0.86      2446
weighted avg       0.86      0.86      0.86      2446



In [11]:
from sklearn.inspection import permutation_importance
import numpy as np
import statsmodels.api as sm
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# 方法1：系数绝对值
coef_importance = np.abs(model.coef_[0])

# 方法2：置换重要性
perm_result = permutation_importance(
    model, X_test, y_test, n_repeats=30, random_state=42
)
perm_importance = perm_result.importances_mean

In [12]:
importance_df = pd.DataFrame({
    "feature": X.columns,
    "coef_importance": coef_importance,
    "perm_importance": perm_importance,
})

In [13]:
# 按 coef_importance 降序排序（从高到低）
importance_df_sorted = importance_df.sort_values(
    by="coef_importance",     # 指定排序依据的列
    ascending=False,           # 降序排列（值越大越靠前）
)

# 输出排序后的 DataFrame
print(importance_df_sorted)

                          feature  coef_importance  perm_importance
11    player1_serve_win_rate_diff         1.052562         0.083156
25    player2_serve_win_rate_diff         1.019095         0.076996
8            player1_bpFaced_diff         0.761678         0.046866
22           player2_bpFaced_diff         0.731825         0.041592
27    player2_clutch_ability_diff         0.485223         0.020769
29         player1_intensity_diff         0.440798         0.015004
4             player1_fstWon_diff         0.431021         0.021532
13    player1_clutch_ability_diff         0.419550         0.014554
18            player2_fstWon_diff         0.401986         0.014432
5             player1_sndWon_diff         0.280618         0.012156
19            player2_sndWon_diff         0.280303         0.004960
28    player1_baseline_rally_diff         0.226341         0.006064
2               player1_svpt_diff         0.176924         0.002657
6              player1_SvGms_diff         0.1592