In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the match table from the processed-data folder, relative to the
# notebook's working directory. Forward slashes are used because they are
# accepted as directory separators on Windows as well as on Linux/macOS,
# whereas a backslash is an ordinary filename character outside Windows.
match_df = pd.read_csv("../dataset/processed_data_1/match_df_20_24.csv", low_memory=False)

In [3]:
# Dump every column label as a plain Python list to see which fields exist.
print(list(match_df.columns))

['player1_index', 'tourney_id', 'tourney_date', 'round_code', 'best_of', 'surface_Clay', 'surface_Grass', 'surface_Hard', 'player1_id', 'player2_id', 'player1_seed_bucket', 'player1_entry', 'player1_host', 'player1_hand', 'player1_ht', 'player1_age', 'player1_rank', 'player1_rank_points', 'player1_elo', 'player1_ace', 'player1_df', 'player1_svpt', 'player1_fstIn', 'player1_fstWon', 'player1_sndWon', 'player1_SvGms', 'player1_bpSaved', 'player1_bpFaced', 'baseline_rally', 'intensity', 'player1_ace_rate', 'player1_df_rate', 'player1_serve_win_rate', 'player1_serve_efficiency', 'player1_clutch_ability', 'player1_o_seed_bucket', 'player1_o_entry', 'player1_o_host', 'player1_o_hand', 'player1_o_ht', 'player1_o_age', 'player1_o_rank', 'player1_o_rank_points', 'player1_o_ace', 'player1_o_df', 'player1_o_svpt', 'player1_o_fstIn', 'player1_o_fstWon', 'player1_o_sndWon', 'player1_o_SvGms', 'player1_o_bpSaved', 'player1_o_bpFaced', 'player1_o_ace_rate', 'player1_o_df_rate', 'player1_o_serve_win_r

In [4]:
# Feature groups picked by column-name suffix. The three suffixes are mutually
# exclusive under `endswith`, so no column lands in more than one group.
# NOTE(review): '_hist' / '_hist_e' / '_histo' presumably mark different
# historical aggregates — confirm against the preprocessing code.
hist_features = list(filter(lambda col: col.endswith('_hist'), match_df.columns))
hist_e_features = list(filter(lambda col: col.endswith('_hist_e'), match_df.columns))
histo_features = list(filter(lambda col: col.endswith('_histo'), match_df.columns))

# Hand-picked match-level and per-player descriptors.
supplement_features = [
    'round_code', 'best_of',
    'player1_seed_bucket', 'player2_seed_bucket',
    'player1_entry', 'player1_host', 'player1_hand', 'player1_ht', 'player1_age',
    'player2_entry', 'player2_host', 'player2_hand', 'player2_ht', 'player2_age',
]
elo_features = ['player1_elo', 'player2_elo']
rank_features = ['player1_rank', 'player1_rank_points', 'player2_rank', 'player2_rank_points']

# Full model input; the order here fixes the column order of X.
features = [
    *hist_features,
    *hist_e_features,
    *supplement_features,
    *elo_features,
    *rank_features,
    *histo_features,
]

# Numeric features (the subset that is standardised later).
continuous_features = [
    *hist_features,
    *hist_e_features,
    *rank_features,
    *histo_features,
    *elo_features,
    'player1_ht', 'player1_age', 'player2_ht', 'player2_age',
]

# Target variable and design matrix.
X = match_df[features]
y = match_df['result']

In [5]:
# Work on an explicit copy of the feature columns so later column assignment
# does not trigger SettingWithCopyWarning; missing values are replaced by 0.
X = match_df.loc[:, features].copy().fillna(0)

# Standardise the numeric features.
# NOTE(review): the scaler is fitted on every row, before any train/test split,
# so held-out rows contribute to the mean/variance — confirm this is acceptable.
scaler = StandardScaler()
standardised_values = scaler.fit_transform(X[continuous_features])
X[continuous_features] = standardised_values

In [6]:
# One-hot encoding of the categorical columns is currently disabled.
#X = pd.get_dummies(X, columns=['round_code', 'best_of','player1_seed_bucket', 'player1_entry', 'player1_host', 'player1_hand', 'player2_seed_bucket', 'player2_entry',
#       'player2_host', 'player2_hand',], drop_first=True)

# Chronological hold-out: the TEST_YEAR season is the test set and every
# earlier season is used for training.
# `Series.between` requires both a left and a right bound, so the open-ended
# training range is expressed as a plain comparison instead.
# NOTE(review): training on all seasons before TEST_YEAR is the assumed intent —
# confirm the first season that should be included.
TEST_YEAR = 2024
match_df['year'] = pd.to_datetime(match_df['tourney_date'], format='%Y-%m-%d').dt.year
train_mask = match_df['year'] < TEST_YEAR
test_mask = match_df['year'] == TEST_YEAR

# X is built from match_df's rows, so the boolean masks align on the index.
X_train, X_test = X.loc[train_mask], X.loc[test_mask]
y = match_df['result']
y_train = y.loc[train_mask]
y_test = y.loc[test_mask]

In [8]:
# Random 80/20 hold-out with a fixed seed. This rebinds X_train / X_test /
# y_train / y_test, replacing whatever an earlier cell assigned to them.
# NOTE(review): rows from all seasons are mixed between the two sets — confirm
# that a random split is intended rather than a chronological one.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# L2-penalised logistic regression solved with liblinear; C is the inverse
# regularisation strength, so 0.01 is a strong penalty.
model = LogisticRegression(
    C=0.01,
    penalty='l2',
    solver='liblinear',
)
# Kept as the cell's last expression so the fitted estimator is echoed.
model.fit(X_train, y_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
# Score the fitted model on the held-out rows.
y_prob = model.predict_proba(X_test)[:, 1]  # probability column at index 1
y_pred = model.predict(X_test)

auc = roc_auc_score(y_test, y_prob)
accuracy = accuracy_score(y_test, y_pred)

# One print call, newline-separated: same text as three consecutive prints.
print(
    f'Accuracy: {accuracy:.4f}',
    f'AUC: {auc:.4f}',
    classification_report(y_test, y_pred),
    sep='\n',
)


Accuracy: 0.6529
AUC: 0.7187
              precision    recall  f1-score   support

           0       0.66      0.66      0.66      1245
           1       0.65      0.65      0.65      1201

    accuracy                           0.65      2446
   macro avg       0.65      0.65      0.65      2446
weighted avg       0.65      0.65      0.65      2446



In [11]:
# Tune the inverse regularisation strength C with 5-fold cross-validation on
# the training rows, selecting the value with the best mean accuracy.
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(
    LogisticRegression(solver='liblinear'),
    param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# Estimator carrying the winning hyper-parameters.
best_model = grid_search.best_estimator_
print(f'Best C: {grid_search.best_params_["C"]}')


Best C: 0.01


In [12]:
# Lift pandas' display limits so complete tables are shown when printed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Method 1: absolute value of the fitted coefficients (row 0 of coef_).
# NOTE(review): only the columns in continuous_features were standardised, so
# magnitudes of the remaining columns may not be directly comparable.
coef_importance = np.abs(model.coef_[0])

# Method 2: permutation importance on the held-out rows, averaged over
# 30 shuffles per feature with a fixed seed.
perm_result = permutation_importance(
    model, X_test, y_test, n_repeats=30, random_state=42
)
perm_importance = perm_result.importances_mean

In [13]:
# One row per feature, pairing each column of X with its two importance scores.
importance_df = pd.DataFrame({"feature": X.columns}).assign(
    coef_importance=coef_importance,
    perm_importance=perm_importance,
)

In [14]:
# Rank the features by absolute coefficient, largest first.
importance_df_sorted = importance_df.sort_values("coef_importance", ascending=False)

# Print the ranked table.
print(importance_df_sorted)

                              feature  coef_importance  perm_importance
83                player2_rank_points         0.264689     4.129191e-03
81                player1_rank_points         0.255771     1.015263e-02
82                       player2_rank         0.202171     2.003271e-03
72                        player1_age         0.193864     1.310984e-02
80                       player1_rank         0.187238     1.049332e-03
77                        player2_age         0.186379     1.512674e-03
116       player2_o_rank_points_histo         0.162233     3.284274e-03
91        player1_o_rank_points_histo         0.122854     2.330335e-03
128    player2_o_serve_win_rate_histo         0.118332    -4.946852e-03
57      player2_baseline_rally_hist_e         0.110727    -3.652221e-03
45      player1_serve_win_rate_hist_e         0.107203     1.226492e-03
49                  player2_df_hist_e         0.105752    -4.183701e-03
106               player1_o_elo_histo         0.104249    -1.226