In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pylab import mpl
# 设置显示中文字体
mpl.rcParams["font.sans-serif"] = ["SimHei"]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
match_df = pd.read_csv(r"..\dataset\processed_data_1\match_df_20_24.csv",low_memory=False)
#match_df = pd.read_csv(r"..\dataset\processed_data\match_df_20_24_symmetry.csv",low_memory=False)

In [3]:
print(match_df.columns.tolist())

['tourney_id', 'tourney_date', 'round_code', 'best_of', 'player1_id', 'player2_id', 'player1_seed_bucket', 'player1_entry', 'player1_host', 'player1_hand', 'player1_ht', 'player1_age', 'player1_rank', 'player1_rank_points', 'player1_elo', 'player1_match_counts', 'player1_ace', 'player1_df', 'player1_svpt', 'player1_fstIn', 'player1_fstWon', 'player1_sndWon', 'player1_SvGms', 'player1_bpSaved', 'player1_bpFaced', 'baseline_rally', 'intensity', 'player1_ace_rate', 'player1_df_rate', 'player1_serve_win_rate', 'player1_serve_efficiency', 'player1_clutch_ability', 'player1_o_seed_bucket', 'player1_o_entry', 'player1_o_host', 'player1_o_hand', 'player1_o_ht', 'player1_o_age', 'player1_o_rank', 'player1_o_rank_points', 'player1_o_ace', 'player1_o_df', 'player1_o_svpt', 'player1_o_fstIn', 'player1_o_fstWon', 'player1_o_sndWon', 'player1_o_SvGms', 'player1_o_bpSaved', 'player1_o_bpFaced', 'player1_o_ace_rate', 'player1_o_df_rate', 'player1_o_serve_win_rate', 'player1_o_serve_efficiency', 'playe

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

y = match_df["result"]  # 目标变量
#X = match_df.drop(["result", "tourney_id", "tourney_date","player1_id","player2_id",'sym_sets', 'sym_games','sym_sets', 'sym_games'], axis=1)  # 移除无关特征
X = match_df.drop(["result", "tourney_id", "tourney_date","player1_id","player2_id"], axis=1)
# 处理缺失值（示例：用均值填充）
X = X.fillna(X.mean())

# 标准化数据（对线性模型重要）
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [7]:
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

# 初始化模型（以逻辑回归为例）
model = LogisticRegression(max_iter=500)

# 前向选择：选择30个最优特征（k_features可调整）
forward_selector = SequentialFeatureSelector(
    model,
    k_features=15,  # 目标特征数量
    forward=True,    # 前向模式
    scoring="accuracy",
    cv=5             # 交叉验证折数
)
forward_selector.fit(X_train, y_train)

# 输出选中的特征
selected_forward = list(forward_selector.k_feature_names_)
print("Forward Selected Features:", selected_forward)

Forward Selected Features: ['player1_host', 'player1_bpFaced', 'player1_serve_win_rate', 'player1_serve_efficiency', 'player1_clutch_ability', 'player1_o_rank_points', 'player1_o_bpFaced', 'player1_o_serve_win_rate', 'player1_o_serve_efficiency', 'player1_o_seed_bucket_histo', 'player1_o_elo_histo', 'player2_o_serve_win_rate', 'player2_o_rank_histo', 'player2_o_clutch_ability_histo', 'player2_o_elo_histo']


In [8]:
from sklearn.metrics import accuracy_score

def backward_elimination(X_train, X_val, y_train, y_val, model, threshold=0.001):
    features = list(X_train.columns)
    while len(features) > 1:
        model.fit(X_train[features], y_train)
        y_pred = model.predict(X_val[features])
        baseline_acc = accuracy_score(y_val, y_pred)
        
        # 尝试移除每个特征并评估性能变化
        worst_feature = None
        for feature in features:
            temp_features = [f for f in features if f != feature]
            model.fit(X_train[temp_features], y_train)
            y_pred_temp = model.predict(X_val[temp_features])
            temp_acc = accuracy_score(y_val, y_pred_temp)
            # 如果移除后性能下降不超过阈值，则标记为可删除
            if (temp_acc - baseline_acc) >= -threshold:
                worst_feature = feature
                break
        
        if worst_feature:
            features.remove(worst_feature)
        else:
            break  # 没有可移除的特征时停止
    return features

# 初始化模型 以逻辑回归为例
model = LogisticRegression(max_iter=1000)

# 执行后向消除  
selected_backward = backward_elimination(X_train, X_val, y_train, y_val, model)
print("Backward Selected Features:", selected_backward)

Backward Selected Features: ['player2_fstWon', 'player2_sndWon', 'player2_bpSaved', 'player2_bpFaced', 'player2_serve_efficiency', 'player2_o_SvGms', 'player2_o_bpSaved', 'player2_o_bpFaced', 'player2_o_serve_win_rate', 'player2_o_serve_efficiency_histo', 'player2_o_elo_histo']


In [9]:
# 合并两种方法选出的特征
final_features = list(set(selected_forward + selected_backward))

# 训练最终模型
model_final = LogisticRegression(max_iter=1000)
model_final.fit(X_train[final_features], y_train)

# 验证集性能
val_acc = model_final.score(X_val[final_features], y_val)
print(f"Validation Accuracy with Selected Features: {val_acc:.4f}")

# 对比全特征模型
model_full = LogisticRegression(max_iter=1000)
model_full.fit(X_train, y_train)
full_acc = model_full.score(X_val, y_val)
print(f"Validation Accuracy with All Features: {full_acc:.4f}")

Validation Accuracy with Selected Features: 0.9473
Validation Accuracy with All Features: 0.9477
