In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pylab import mpl
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# 设置显示中文字体
mpl.rcParams["font.sans-serif"] = ["SimHei"]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
match_df = pd.read_csv(r"F:\大四\tennis_predicton\processed_data\match_df_20_24.csv",low_memory=False)

In [3]:
y = match_df["result"]  # 目标变量
X = match_df.drop(["result", "tourney_id", "tourney_date","player1_id","player2_id",
                   "player1_ret","player2_ret",'player1_sets', 'player1_games','player2_sets', 'player2_games'], axis=1)  # 移除无关特征

# 处理缺失值（示例：用均值填充）
X = X.fillna(X.mean())

# 标准化数据（对L1正则化等线性模型重要）
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [4]:
from sklearn.linear_model import LogisticRegression

# 使用L1正则化的逻辑回归
lasso_model = LogisticRegression(
    penalty="l1",         # L1正则化
    C=0.1,                # 正则化强度（C越小，惩罚越强）
    solver="saga",        # 支持L1的优化器
    max_iter=2000,
    random_state=42
)
lasso_model.fit(X_train, y_train)

# 提取非零系数对应的特征
selected_features_l1 = X.columns[np.abs(lasso_model.coef_[0]) > 0].tolist()
print("L1 Selected Features:", selected_features_l1)
print("Number of Selected Features:", len(selected_features_l1))

L1 Selected Features: ['surface_Hard', 'player1_entry', 'player1_host', 'player1_hand', 'player1_ht', 'player1_elo_before', 'player1_df', 'player1_fstIn', 'player1_fstWon', 'player1_sndWon', 'player1_bpFaced', 'player1_serve_win_rate', 'player1_serve_efficiency', 'player1_clutch_ability', 'player1_elo_after', 'player1_df_hist', 'player1_fstIn_hist', 'player1_bpFaced_hist', 'player1_serve_efficiency_hist', 'player2_ht', 'player2_age', 'player2_df', 'player2_fstIn', 'player2_fstWon', 'player2_sndWon', 'player2_bpSaved', 'player2_bpFaced', 'player2_serve_win_rate', 'player2_serve_efficiency', 'player2_clutch_ability', 'player2_sndWon_hist', 'player2_bpSaved_hist', 'player2_serve_efficiency_hist', 'player2_clutch_ability_hist']
Number of Selected Features: 34


In [5]:
from sklearn.ensemble import RandomForestClassifier

# 训练随机森林模型
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# 提取特征重要性
feature_importances = pd.DataFrame({
    "feature": X.columns,
    "importance": rf_model.feature_importances_
}).sort_values("importance", ascending=False)

# 选择重要性高于阈值的特征（示例：取Top 20）
threshold = np.percentile(feature_importances["importance"], 80)  # 取前20%
selected_features_rf = feature_importances[feature_importances["importance"] > threshold]["feature"].tolist()
print("\nRandomForest Selected Features:", selected_features_rf)


RandomForest Selected Features: ['player1_serve_win_rate', 'player2_serve_win_rate', 'player2_bpFaced', 'player1_bpFaced', 'player2_clutch_ability', 'player1_clutch_ability', 'player2_fstWon', 'intensity', 'player2_bpSaved', 'player1_fstWon', 'player1_serve_efficiency', 'player2_serve_efficiency', 'player1_bpSaved', 'player2_ace_rate', 'player1_ace_rate', 'player2_sndWon', 'player1_svpt']


In [6]:
import xgboost as xgb

# 训练XGBoost模型
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# 获取特征重要性（'weight'表示特征被使用的次数）
feature_importances_xgb = pd.DataFrame({
    "feature": X.columns,
    "importance": xgb_model.feature_importances_
}).sort_values("importance", ascending=False)

# 选择Top N特征（例如Top 15）
selected_features_xgb = feature_importances_xgb.head(15)["feature"].tolist()
print("\nXGBoost Selected Features:", selected_features_xgb)


XGBoost Selected Features: ['player2_serve_win_rate', 'player1_serve_win_rate', 'best_of', 'player1_clutch_ability', 'player2_clutch_ability', 'player2_serve_efficiency', 'player1_fstWon', 'player1_serve_efficiency', 'player2_fstWon', 'player1_svpt', 'player2_svpt', 'surface_Hard', 'player1_bpFaced', 'player2_bpFaced', 'player2_df']


In [7]:
# 合并不同方法选出的特征
final_features = list(
    set(selected_features_l1 + selected_features_rf + selected_features_xgb)
)

# 训练最终模型（以XGBoost为例）
final_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
final_model.fit(X_train[final_features], y_train)

# 验证集性能
val_acc = final_model.score(X_val[final_features], y_val)
print(f"\nValidation Accuracy with Selected Features: {val_acc:.4f}")

# 对比全特征模型
full_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
full_model.fit(X_train, y_train)
full_acc = full_model.score(X_val, y_val)
print(f"Validation Accuracy with All Features: {full_acc:.4f}")


Validation Accuracy with Selected Features: 0.9407
Validation Accuracy with All Features: 0.9448
