In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pylab import mpl
# 设置显示中文字体
mpl.rcParams["font.sans-serif"] = ["SimHei"]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
#match_df = pd.read_csv(r"F:\大四\tennis_predicton\processed_data\match_df_20_24.csv",low_memory=False)
match_df = pd.read_csv(r"..\dataset\processed_data\match_df_20_24_symmetry.csv",low_memory=False)

In [3]:
match_df.columns

Index(['tourney_id', 'tourney_date', 'round_code', 'best_of', 'surface_Clay',
       'surface_Grass', 'surface_Hard', 'player1_id', 'player2_id', 'result',
       'sym_seed_bucket', 'sym_entry', 'sym_host', 'sym_hand', 'sym_ht',
       'sym_age', 'sym_elo_before', 'sym_ace', 'sym_df', 'sym_svpt',
       'sym_fstIn', 'sym_fstWon', 'sym_sndWon', 'sym_SvGms', 'sym_bpSaved',
       'sym_bpFaced', 'sym_ace_rate', 'sym_df_rate', 'sym_serve_win_rate',
       'sym_serve_efficiency', 'sym_clutch_ability', 'sym_elo_after',
       'sym_sets', 'sym_games', 'sym_ret', 'sym_ace_hist', 'sym_df_hist',
       'sym_svpt_hist', 'sym_fstIn_hist', 'sym_fstWon_hist', 'sym_sndWon_hist',
       'sym_SvGms_hist', 'sym_bpSaved_hist', 'sym_bpFaced_hist',
       'sym_baseline_rally_hist', 'sym_intensity_hist', 'sym_ace_rate_hist',
       'sym_df_rate_hist', 'sym_serve_win_rate_hist',
       'sym_serve_efficiency_hist', 'sym_clutch_ability_hist'],
      dtype='object')

In [4]:
from sklearn.preprocessing import StandardScaler

# 加载数据（假设数据已准备好）
y = match_df["result"]  # 目标变量
X = match_df.drop("result", axis=1)

# 排除无关特征（如ID、日期等）
excluded_cols = ["tourney_id", "tourney_date","player1_id","player2_id",
#                 "player1_ret","player2_ret",'player1_sets', 'player1_games','player2_sets', 'player2_games']
                 "sym_ret","sym_ret",'sym_sets', 'sym_games','sym_sets', 'sym_games']
X_filtered = X.drop(excluded_cols, axis=1, errors="ignore")


# 标准化数据（对ReliefF非常重要）
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_filtered), columns=X_filtered.columns)

In [5]:
from sklearn.feature_selection import SelectKBest, f_classif

# 计算每个特征的F-value（Fisher Score）
selector = SelectKBest(score_func=f_classif, k="all")
selector.fit(X_scaled, y)

# 获取特征得分并排序
fisher_scores = pd.DataFrame({
    "feature": X_filtered.columns,
    "fisher_score": selector.scores_
}).sort_values("fisher_score", ascending=False)

print("Top 50 Features by Fisher Score:")
print(fisher_scores.head(50))

Top 50 Features by Fisher Score:
                      feature  fisher_score
23         sym_serve_win_rate  18858.513772
20                sym_bpFaced   7073.525999
16                 sym_fstWon   3690.221918
21               sym_ace_rate   1674.407762
24       sym_serve_efficiency   1631.932514
18                  sym_SvGms   1501.941196
19                sym_bpSaved   1285.494001
25         sym_clutch_ability   1071.041951
12                    sym_ace    974.474921
5             sym_seed_bucket    966.916573
17                 sym_sndWon    875.914035
40    sym_serve_win_rate_hist    705.558347
14                   sym_svpt    572.303751
13                     sym_df    526.252293
35           sym_bpFaced_hist    471.957077
26              sym_elo_after    464.861208
11             sym_elo_before    464.844293
41  sym_serve_efficiency_hist    359.814720
22                sym_df_rate    320.939567
6                   sym_entry    259.180494
34           sym_bpSaved_hist    241.245667

In [6]:
from skrebate import ReliefF

# 初始化ReliefF（n_neighbors根据数据规模调整）
relieff = ReliefF(n_neighbors=100, n_jobs=-1)  # 并行加速
relieff.fit(X_scaled.values, y.values)

# 获取特征权重并排序
relieff_scores = pd.DataFrame({
    "feature": X_filtered.columns,
    "relieff_score": relieff.feature_importances_
}).sort_values("relieff_score", ascending=False)

print("\nTop 50 Features by ReliefF:")
print(relieff_scores.head(50))


Top 50 Features by ReliefF:
                      feature  relieff_score
23         sym_serve_win_rate       0.455694
20                sym_bpFaced       0.279843
16                 sym_fstWon       0.157699
19                sym_bpSaved       0.078749
18                  sym_SvGms       0.076540
24       sym_serve_efficiency       0.057559
17                 sym_sndWon       0.052287
25         sym_clutch_ability       0.050945
14                   sym_svpt       0.049825
21               sym_ace_rate       0.043707
12                    sym_ace       0.043392
5             sym_seed_bucket       0.030408
13                     sym_df       0.029878
15                  sym_fstIn       0.021983
26              sym_elo_after       0.018505
11             sym_elo_before       0.018501
40    sym_serve_win_rate_hist       0.016684
35           sym_bpFaced_hist       0.015134
41  sym_serve_efficiency_hist       0.011926
34           sym_bpSaved_hist       0.010034
6                   sym_en

In [7]:
# 合并两个得分表
combined_scores = pd.merge(
    fisher_scores, 
    relieff_scores, 
    on="feature", 
    suffixes=('_fisher', '_relieff')
)

# 标准化得分并计算综合权重
combined_scores["combined_score"] = (
    combined_scores["fisher_score"].rank() + 
    combined_scores["relieff_score"].rank()
)

# 按综合得分排序
final_features = combined_scores.sort_values(
    "combined_score", ascending=False
).head(20)["feature"].tolist()

print("\nTop Combined Features:")
print(final_features)


Top Combined Features:
['sym_serve_win_rate', 'sym_bpFaced', 'sym_fstWon', 'sym_serve_efficiency', 'sym_SvGms', 'sym_bpSaved', 'sym_ace_rate', 'sym_clutch_ability', 'sym_sndWon', 'sym_ace', 'sym_seed_bucket', 'sym_svpt', 'sym_df', 'sym_serve_win_rate_hist', 'sym_elo_after', 'sym_bpFaced_hist', 'sym_elo_before', 'sym_serve_efficiency_hist', 'sym_df_rate', 'sym_entry']
