In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pylab import mpl
# 设置显示中文字体
mpl.rcParams["font.sans-serif"] = ["SimHei"]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# 定义文件路径
file_paths = [
    f"../dataset/processed_data_1/df_{start:02d}_{end:02d}_eloed.csv"
    for start, end in zip(range(0, 25, 5), range(4, 29, 5))
]

# 读取数据并存入字典
dfs = {f"df_{i}": pd.read_csv(file,low_memory=False) for i, file in zip(["00_04", "05_09", "10_14", "15_19", "20_24"], file_paths)}

# 访问
print(dfs["df_00_04"].columns)  # 查看 2000-2004 年数据的前几行

Index(['tourney_id', 'tourney_name', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
       'tourney_ioc', 'tourney_year', 'w_host', 'l_host', 'surface_Clay',
       'surface_Grass', 'surface_Hard', 'round_code', 'winner_seed_bucket',
       'loser_seed_bucket', 'win_sets', 'lose_sets', 'total_win_games',
       'total_lose_games', 'game_diff', 'ret', 'winner_elo_before_hard',
       'winner_elo_be

In [3]:
import pandas as pd
import numpy as np

def safe_divide(numerator, denominator):
    """分母为0时返回0的安全除法"""
    return np.divide(numerator, denominator, out=np.zeros_like(numerator), where=(denominator != 0))

for key in dfs:
    df = dfs[key].copy()
    
    # 预处理基础字段
    base_cols = ['minutes', 'w_svpt', 'l_svpt', 'w_1stIn', 'l_1stIn', 
                'w_SvGms', 'l_SvGms', 'w_bpSaved', 'l_bpSaved']
    df[base_cols] = df[base_cols].fillna(0)
    
    # 计算核心指标
    # 1. 底线相持程度
    df['baseline_rally'] = safe_divide(df['minutes'], df['w_svpt'] + df['l_svpt'])
    
    # 2. Ace率 & DF率
    for side in ['w', 'l']:
        # Ace率
        df[f'{side}_ace_rate'] = safe_divide(df[f'{side}_ace'], df[f'{side}_svpt'])
        
        # DF率（二发次数 = 总发球 - 一发成功 - 双误）
        second_serve = df[f'{side}_svpt'] - df[f'{side}_1stIn'] - df[f'{side}_df']
        second_serve = second_serve.clip(lower=0)  # 避免负数
        df[f'{side}_df_rate'] = safe_divide(df[f'{side}_df'], second_serve)
        
        # 发球得分率
        second_won = safe_divide(df[f'{side}_2ndWon'], second_serve) * second_serve
        df[f'{side}_serve_win_rate'] = safe_divide(
            df[f'{side}_1stWon'] + second_won, 
            df[f'{side}_svpt']
        )
        
        # 发球效率
        df[f'{side}_serve_efficiency'] = safe_divide(
            df[f'{side}_svpt'], 
            df[f'{side}_SvGms']
        )
    
    # 3. 关键分能力
    total_bp = df['w_bpFaced'] + df['l_bpFaced']
    df['w_clutch_ability'] = safe_divide(
        df['w_bpSaved'] + df['l_bpFaced'] - df['l_bpSaved'], 
        total_bp
    )
    df['l_clutch_ability'] = safe_divide(
        df['l_bpSaved'] + df['w_bpFaced'] - df['w_bpSaved'], 
        total_bp
    )
    #4 激烈程度
    df['intensity'] = safe_divide(
        df['w_bpFaced'] + df['l_bpFaced'], 
        df['w_SvGms'] + df['l_SvGms']
        
    )
    
    dfs[key] = df

In [4]:
# 定义需要检查的列（根据计算逻辑生成的新列）
calculated_columns = [
    'baseline_rally','intensity',
    'w_ace_rate', 'l_ace_rate',
    'w_df_rate', 'l_df_rate',
    'w_serve_win_rate', 'l_serve_win_rate',
    'w_serve_efficiency', 'l_serve_efficiency',
    'w_clutch_ability', 'l_clutch_ability'
]

# 遍历每个数据集
for dataset_name, df in dfs.items():
    print(f"\n=== {dataset_name} 空值分析 ===")
    
    # 统计每个特征的空值情况
    null_report = df[calculated_columns].isnull().sum().to_frame(name='空值数量')
    
    # 显示空值统计
    print(null_report)
    
    # 显示分母可能为0的列情况
    print("\n分母列零值统计:")
    denominator_cols = ['w_svpt', 'l_svpt', 'w_SvGms', 'l_SvGms','w_bpFaced','l_bpFaced']
    zero_counts = (df[denominator_cols] == 0).sum()
    print(zero_counts)

    print("\n" + "="*60 + "\n")


=== df_00_04 空值分析 ===
                    空值数量
baseline_rally         0
intensity              0
w_ace_rate             0
l_ace_rate             0
w_df_rate              0
l_df_rate              0
w_serve_win_rate       0
l_serve_win_rate       0
w_serve_efficiency     0
l_serve_efficiency     0
w_clutch_ability       0
l_clutch_ability       0

分母列零值统计:
w_svpt          0
l_svpt          0
w_SvGms         0
l_SvGms         0
w_bpFaced    1129
l_bpFaced      14
dtype: int64



=== df_05_09 空值分析 ===
                    空值数量
baseline_rally         0
intensity              0
w_ace_rate             0
l_ace_rate             0
w_df_rate              0
l_df_rate              0
w_serve_win_rate       0
l_serve_win_rate       0
w_serve_efficiency     0
l_serve_efficiency     0
w_clutch_ability       0
l_clutch_ability       0

分母列零值统计:
w_svpt          0
l_svpt          0
w_SvGms         2
l_SvGms         2
w_bpFaced    1257
l_bpFaced      19
dtype: int64



=== df_10_14 空值分析 ===
               

In [5]:
# 计算指标的列
metrics = ['baseline_rally','intensity', 'w_ace_rate', 'w_df_rate', 'w_serve_win_rate', 
           'w_serve_efficiency', 'w_clutch_ability','l_clutch_ability']

# 存储所有处理后的数据
player_stats = []

for key, df in dfs.items():
    df = df.copy()  # 避免修改原数据
    
    # 重新整理数据，使 winner_id 和 loser_id 作为 player_id
    winner_df = df[['tourney_year', 'winner_id'] + metrics].rename(columns={'winner_id': 'player_id'})
    loser_df = df[['tourney_year', 'loser_id'] + metrics].rename(columns={'loser_id': 'player_id'})
    
    # 合并 winner 和 loser 数据
    player_df = pd.concat([winner_df, loser_df], ignore_index=True)

    # 计算每个 player_id 在每个 tourney_year 下的均值和方差
    agg_funcs = {col: ['mean', 'var'] for col in metrics}
    yearly_stats = player_df.groupby(['tourney_year', 'player_id']).agg(agg_funcs)

    # 重命名列，使其更易读
    yearly_stats.columns = [f"{col}_{stat}" for col, stat in yearly_stats.columns]

    # 重置索引
    yearly_stats = yearly_stats.reset_index()
    
    # 存储结果
    player_stats.append(yearly_stats)

# 合并所有年份的数据
final_player_stats = pd.concat(player_stats, ignore_index=True)

# 显示前几行数据
print(final_player_stats.head(5))


   tourney_year  player_id  baseline_rally_mean  baseline_rally_var  \
0          2000     101086             0.636083            0.001991   
1          2000     101150             0.666488            0.004232   
2          2000     101167             0.641509                 NaN   
3          2000     101185             0.636742            0.002580   
4          2000     101191             0.562961            0.003850   

   intensity_mean  intensity_var  w_ace_rate_mean  w_ace_rate_var  \
0        0.705618       0.095776         0.061910        0.001649   
1        0.610268       0.049910         0.075316        0.002218   
2        0.666667            NaN         0.090909             NaN   
3        0.832784       0.072028         0.053993        0.001525   
4        0.684028       0.033010         0.115196        0.002030   

   w_df_rate_mean  w_df_rate_var  w_serve_win_rate_mean  w_serve_win_rate_var  \
0        0.119658       0.013084               0.654798              0.004037

In [6]:
for key, df in dfs.items():
    df['winner_elo'] = (
        df['winner_elo_before_hard'] * df['surface_Hard'] +
        df['winner_elo_before_clay'] * df['surface_Clay'] +
        df['winner_elo_before_grass'] * df['surface_Grass']
    )
    
    # 计算 loser_elo
    df['loser_elo'] = (
        df['loser_elo_before_hard'] * df['surface_Hard'] +
        df['loser_elo_before_clay'] * df['surface_Clay'] +
        df['loser_elo_before_grass'] * df['surface_Grass']
    )
    dfs[key] = df

In [7]:
for key, df in dfs.items():
    print(df.isna().sum())

tourney_id                 0
tourney_name               0
draw_size                  0
tourney_level              0
tourney_date               0
match_num                  0
winner_id                  0
winner_seed                0
winner_entry               0
winner_name                0
winner_hand                0
winner_ht                  0
winner_ioc                 0
winner_age                 0
loser_id                   0
loser_seed                 0
loser_entry                0
loser_name                 0
loser_hand                 0
loser_ht                   0
loser_ioc                  0
loser_age                  0
score                      0
best_of                    0
round                      0
minutes                    0
w_ace                      0
w_df                       0
w_svpt                     0
w_1stIn                    0
w_1stWon                   0
w_2ndWon                   0
w_SvGms                    0
w_bpSaved                  0
w_bpFaced     

### 数据重组

    winner_df = df[[
        'tourney_id', 'tourney_date', 'round_code', 'best_of',
        'surface_Clay', 'surface_Grass', 'surface_Hard',
        'winner_id', 'loser_id', 'winner_seed_bucket', 'winner_entry', 'w_host',
        'winner_hand', 'winner_ht', 'winner_age', 'winner_rank', 'winner_rank_points',
        'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
        'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'baseline_rally','intensity',
        'w_ace_rate', 'w_df_rate', 'w_serve_win_rate', 'w_serve_efficiency',
        'w_clutch_ability',  'win_sets', 'total_win_games', 'ret',
        'winner_elo_before_hard','winner_elo_before_clay','winner_elo_before_grass',
        'loser_seed_bucket', 'loser_entry', 'l_host',
        'loser_hand', 'loser_ht', 'loser_age', 'loser_rank', 'loser_rank_points',
        'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon',
        'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'baseline_rally','intensity',
        'l_ace_rate', 'l_df_rate', 'l_serve_win_rate', 'l_serve_efficiency',
        'l_clutch_ability', 'lose_sets', 'total_lose_games', 'ret',
        'loser_elo_before_hard','loser_elo_before_clay', 'loser_elo_before_grass'    
    ]].copy()
    
    # 生成输家视角
    loser_df = df[[
        'tourney_id', 'tourney_date', 'round_code', 'best_of',
         'surface_Clay', 'surface_Grass', 'surface_Hard',
        'loser_id', 'winner_id', 'loser_seed_bucket', 'loser_entry', 'l_host',
        'loser_hand', 'loser_ht', 'loser_age', 'loser_rank', 'loser_rank_points',
        'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon',
        'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'baseline_rally','intensity',
        'l_ace_rate', 'l_df_rate', 'l_serve_win_rate', 'l_serve_efficiency',
        'l_clutch_ability', 'lose_sets', 'total_lose_games', 'ret',
        'loser_elo_before_hard','loser_elo_before_clay', 'loser_elo_before_grass',
        'winner_seed_bucket', 'winner_entry', 'w_host',
        'winner_hand', 'winner_ht', 'winner_age', 'winner_rank', 'winner_rank_points',
        'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
        'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'baseline_rally','intensity',
        'w_ace_rate', 'w_df_rate', 'w_serve_win_rate', 'w_serve_efficiency',
        'w_clutch_ability',  'win_sets', 'total_win_games', 'ret',
        'winner_elo_before_hard','winner_elo_before_clay','winner_elo_before_grass',
    ]].copy()
    
    # 统一列名
    common_columns = [
        'tourney_id', 'tourney_date', 'round_code', 'best_of',
        'surface_Clay', 'surface_Grass', 'surface_Hard',
        'player_id', 'opponent_id', 'seed_bucket', 'entry', 'host',
        'hand', 'ht', 'age', 'rank', 'rank_points',
        'ace', 'df', 'svpt', 'fstIn', 'fstWon', 'sndWon',
        'SvGms', 'bpSaved', 'bpFaced', 'baseline_rally','intensity',
        'ace_rate', 'df_rate', 'serve_win_rate', 'serve_efficiency',
        'clutch_ability',  'sets', 'games', 'ret',
        'elo_before_hard','elo_before_clay','elo_before_grass',
        'o_seed_bucket', 'o_entry', 'o_host',
        'o_hand', 'o_ht', 'o_age', 'o_rank', 'o_rank_points',
        'o_ace', 'o_df', 'o_svpt', 'o_fstIn', 'o_fstWon', 'o_sndWon',
        'o_SvGms', 'o_bpSaved', 'o_bpFaced', 'o_baseline_rally','o_intensity',
        'o_ace_rate', 'o_df_rate', 'o_serve_win_rate', 'o_serve_efficiency',
        'o_clutch_ability',  'o_sets', 'o_games', 'o_ret',
        'o_elo_before_hard','o_elo_before_clay','o_elo_before_grass',
    ]

In [8]:
def process_single_df(df):
    """处理单个原始比赛数据表"""
    # 生成赢家视角
    winner_df = df[[
        'tourney_id', 'tourney_date', 'round_code', 'best_of',
        'winner_id', 'loser_id', 'winner_seed_bucket', 'winner_entry', 'w_host',
        'winner_hand', 'winner_ht', 'winner_age', 'winner_rank', 'winner_rank_points',
        'winner_elo','winner_match_counts',
        'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
        'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'baseline_rally','intensity',
        'w_ace_rate', 'w_df_rate', 'w_serve_win_rate', 'w_serve_efficiency',
        'w_clutch_ability', 
        'loser_seed_bucket', 'loser_entry', 'l_host',
        'loser_hand', 'loser_ht', 'loser_age', 'loser_rank', 'loser_rank_points',
        'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon',
        'l_SvGms', 'l_bpSaved', 'l_bpFaced', 
        'l_ace_rate', 'l_df_rate', 'l_serve_win_rate', 'l_serve_efficiency',
        'l_clutch_ability', 'loser_elo'    
    ]].copy()
    
    # 生成输家视角
    loser_df = df[[
        'tourney_id', 'tourney_date', 'round_code', 'best_of',
        'loser_id', 'winner_id', 'loser_seed_bucket', 'loser_entry', 'l_host',
        'loser_hand', 'loser_ht', 'loser_age', 'loser_rank', 'loser_rank_points',
        'loser_elo','loser_match_counts',
        'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon',
        'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'baseline_rally','intensity',
        'l_ace_rate', 'l_df_rate', 'l_serve_win_rate', 'l_serve_efficiency',
        'l_clutch_ability',
        'winner_seed_bucket', 'winner_entry', 'w_host',
        'winner_hand', 'winner_ht', 'winner_age', 'winner_rank', 'winner_rank_points',
        'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
        'w_SvGms', 'w_bpSaved', 'w_bpFaced',
        'w_ace_rate', 'w_df_rate', 'w_serve_win_rate', 'w_serve_efficiency',
        'w_clutch_ability',  'winner_elo',
    ]].copy()
    
    # 统一列名
    common_columns = [
        'tourney_id', 'tourney_date', 'round_code', 'best_of',
        'player_id', 'opponent_id', 'seed_bucket', 'entry', 'host',
        'hand', 'ht', 'age', 'rank', 'rank_points',
        'elo','match_counts',
        'ace', 'df', 'svpt', 'fstIn', 'fstWon', 'sndWon',
        'SvGms', 'bpSaved', 'bpFaced', 'baseline_rally','intensity',
        'ace_rate', 'df_rate', 'serve_win_rate', 'serve_efficiency',
        'clutch_ability',
        'o_seed_bucket', 'o_entry', 'o_host',
        'o_hand', 'o_ht', 'o_age', 'o_rank', 'o_rank_points',
        'o_ace', 'o_df', 'o_svpt', 'o_fstIn', 'o_fstWon', 'o_sndWon',
        'o_SvGms', 'o_bpSaved', 'o_bpFaced', 
        'o_ace_rate', 'o_df_rate', 'o_serve_win_rate', 'o_serve_efficiency',
        'o_clutch_ability',  'o_elo',
    ]
    
    winner_df.columns = common_columns
    loser_df.columns = common_columns
    
    # 标记结果
    winner_df['result'] = 1
    loser_df['result'] = 0
    
    return pd.concat([winner_df, loser_df], ignore_index=True)

processed_dfs = {}
for name, df in dfs.items():  # 遍历每个数据表
    processed_dfs[name] = process_single_df(df)

# 合并所有数据（如果需要）
player_df = pd.concat(processed_dfs.values(), ignore_index=True)

# 按时间排序 + 重置索引
player_df = player_df.sort_values(['tourney_date', 'tourney_id']).reset_index(drop=True)

###  使用两种方式估计历史表现 取平均值

In [9]:
# 确保 tourney_date 为 datetime 类型
player_df['tourney_date'] = pd.to_datetime(player_df['tourney_date'])

# 指定需要计算历史平均值的列
cols_to_average = [
    'ace', 'df', 'svpt', 'fstIn', 'fstWon', 'sndWon',
    'SvGms', 'bpSaved', 'bpFaced', 'baseline_rally','intensity',
    'ace_rate', 'df_rate', 'serve_win_rate', 'serve_efficiency', 'clutch_ability'
]

# 按 player_id 和 tourney_date 排序，确保时间顺序正确
player_df = player_df.sort_values(by=['player_id', 'tourney_date'])

# 利用 transform 保证输出 Series 的索引和原始 DataFrame 一致
# 计算历史平均值并填充 NaN
for col in cols_to_average:
    hist_col = col + '_hist'
    player_df[hist_col] = player_df.groupby('player_id')[col].transform(
        lambda x: x.expanding().mean().shift(1)
    )
    # 如果是第一场比赛（NaN），则填充当前值
    player_df[hist_col] = player_df[hist_col].fillna(player_df[col])

# 查看结果
print(player_df.head(1))


      tourney_id tourney_date  round_code  best_of  player_id  opponent_id  \
74029   2013-414   2013-07-15         2.0        3     100644       105138   

       seed_bucket  entry  host  hand     ht   age   rank  rank_points  \
74029           33      0     1     1  198.0  16.2  798.0         20.0   

          elo  match_counts  ace   df  svpt  fstIn  fstWon  sndWon  SvGms  \
74029  1500.0             0  1.0  7.0  39.0   14.0     7.0     6.0    8.0   

       bpSaved  bpFaced  baseline_rally  intensity  ace_rate   df_rate  \
74029      2.0      8.0        0.705263        1.0  0.025641  0.388889   

       serve_win_rate  serve_efficiency  clutch_ability  o_seed_bucket  \
74029        0.333333             4.875        0.294118             33   

       o_entry  o_host  o_hand   o_ht  o_age  o_rank  o_rank_points  o_ace  \
74029        1       0       1  183.0   25.2    49.0          872.0    2.0   

       o_df  o_svpt  o_fstIn  o_fstWon  o_sndWon  o_SvGms  o_bpSaved  \
74029   0.0 

In [10]:
# 确保 tourney_date 为 datetime 类型
player_df['tourney_date'] = pd.to_datetime(player_df['tourney_date'])

# 指定需要计算历史平均值的列
cols_to_c = [
    'ace', 'df', 'svpt', 'fstIn', 'fstWon', 'sndWon',
    'SvGms', 'bpSaved', 'bpFaced', 'baseline_rally','intensity',
    'ace_rate', 'df_rate', 'serve_win_rate', 'serve_efficiency', 'clutch_ability'
]

# 按 player_id 和 tourney_date 排序，确保时间顺序正确
player_df = player_df.sort_values(by=['player_id', 'tourney_date'])

# 指定指数衰减因子（α = 2 / (span + 1)，span 可调整）
span = 3  # 你可以根据需要调整这个参数
alpha = 2 / (span + 1)

# 计算指数加权移动平均，并填充 NaN
for col in cols_to_c:
    hist_col_e = col + '_hist_e'
    player_df[hist_col_e] = player_df.groupby('player_id')[col].transform(
        lambda x: x.shift(1).ewm(alpha=alpha, adjust=False).mean()
    )
    # 如果是第一场比赛（NaN），则填充当前值
    player_df[hist_col_e] = player_df[hist_col_e].fillna(player_df[col])

# 查看结果
print(player_df.head(1))


      tourney_id tourney_date  round_code  best_of  player_id  opponent_id  \
74029   2013-414   2013-07-15         2.0        3     100644       105138   

       seed_bucket  entry  host  hand     ht   age   rank  rank_points  \
74029           33      0     1     1  198.0  16.2  798.0         20.0   

          elo  match_counts  ace   df  svpt  fstIn  fstWon  sndWon  SvGms  \
74029  1500.0             0  1.0  7.0  39.0   14.0     7.0     6.0    8.0   

       bpSaved  bpFaced  baseline_rally  intensity  ace_rate   df_rate  \
74029      2.0      8.0        0.705263        1.0  0.025641  0.388889   

       serve_win_rate  serve_efficiency  clutch_ability  o_seed_bucket  \
74029        0.333333             4.875        0.294118             33   

       o_entry  o_host  o_hand   o_ht  o_age  o_rank  o_rank_points  o_ace  \
74029        1       0       1  183.0   25.2    49.0          872.0    2.0   

       o_df  o_svpt  o_fstIn  o_fstWon  o_sndWon  o_SvGms  o_bpSaved  \
74029   0.0 

In [11]:
# 确保 tourney_date 为 datetime 类型并按日期排序
player_df['tourney_date'] = pd.to_datetime(player_df['tourney_date'])
player_df = player_df.sort_values(['player_id', 'tourney_date'])

# 定义列（保持不变）
cols_p1 = [
        'seed_bucket', 'entry', 'host',
        'hand', 'ht', 'age', 'rank', 'rank_points',
        'ace', 'df', 'svpt', 'fstIn', 'fstWon', 'sndWon',
        'SvGms', 'bpSaved', 'bpFaced', 
        'ace_rate', 'df_rate', 'serve_win_rate', 'serve_efficiency',
        'clutch_ability',  'elo',
]

cols_o = [
        'o_seed_bucket', 'o_entry', 'o_host',
        'o_hand', 'o_ht', 'o_age', 'o_rank', 'o_rank_points',
        'o_ace', 'o_df', 'o_svpt', 'o_fstIn', 'o_fstWon', 'o_sndWon',
        'o_SvGms', 'o_bpSaved', 'o_bpFaced', 
        'o_ace_rate', 'o_df_rate', 'o_serve_win_rate', 'o_serve_efficiency',
        'o_clutch_ability', 'o_elo',
]

def calculate_hist(group):
    # 确保组内按日期排序
    group = group.sort_values('tourney_date')
    
    # 标记首场比赛
    group['is_first'] = False
    first_idx = group.index[0]
    group.loc[first_idx, 'is_first'] = True
    first_match_mask = group['is_first']
    
    # 初始化历史列
    histo_cols = [col + '_histo' for col in cols_o]
    histo_df = pd.DataFrame(index=group.index, columns=histo_cols, dtype=float)
    
    # 处理首场比赛
    for col_p1, col_o in zip(cols_p1, cols_o):
        histo_col = col_o + '_histo'
        mask_0 = first_match_mask & (group['result'] == 0)
        mask_1 = first_match_mask & (group['result'] == 1)
        histo_df.loc[mask_0, histo_col] = group.loc[mask_0, col_p1]
        histo_df.loc[mask_1, histo_col] = group.loc[mask_1, col_o]
    
    # 处理非首场比赛
    non_first_mask = ~first_match_mask
    valid_subset = group[non_first_mask]
    
    if not valid_subset.empty:
        for histo_col in histo_cols:
            col_o = histo_col.replace('_histo', '')
            # 获取首场比赛的历史值
            first_val = histo_df.loc[first_match_mask, histo_col].values[0]
            # 后续比赛的原始col_o值
            subsequent_vals = valid_subset[col_o]
            # 合并首场值和后续值，计算扩展均值
            combined = pd.Series([first_val]).append(subsequent_vals)
            expanding_means = combined.expanding().mean().shift(1).iloc[1:]
            # 填充到历史列
            histo_df.loc[valid_subset.index, histo_col] = expanding_means.values
    
    # 更新并返回结果
    group[histo_cols] = histo_df
    group.drop(columns=['is_first'], inplace=True)
    return group

# 应用函数
player_df = player_df.groupby('player_id', group_keys=False).apply(calculate_hist)

In [12]:
player_df.isna().sum()

tourney_id                  0
tourney_date                0
round_code                  0
best_of                     0
player_id                   0
opponent_id                 0
seed_bucket                 0
entry                       0
host                        0
hand                        0
ht                          0
age                         0
rank                        0
rank_points                 0
elo                         0
match_counts                0
ace                         0
df                          0
svpt                        0
fstIn                       0
fstWon                      0
sndWon                      0
SvGms                       0
bpSaved                     0
bpFaced                     0
baseline_rally              0
intensity                   0
ace_rate                    0
df_rate                     0
serve_win_rate              0
serve_efficiency            0
clutch_ability              0
o_seed_bucket               0
o_entry   

In [13]:
player_df.columns

Index(['tourney_id', 'tourney_date', 'round_code', 'best_of', 'player_id',
       'opponent_id', 'seed_bucket', 'entry', 'host', 'hand',
       ...
       'o_sndWon_histo', 'o_SvGms_histo', 'o_bpSaved_histo', 'o_bpFaced_histo',
       'o_ace_rate_histo', 'o_df_rate_histo', 'o_serve_win_rate_histo',
       'o_serve_efficiency_histo', 'o_clutch_ability_histo', 'o_elo_histo'],
      dtype='object', length=111)

### 构建包含历史数据的比赛数据

In [15]:
# 复制 player_df，分别作为 player1 和 player2
player1_df = player_df.copy()
player2_df = player_df.copy()

# 给 player1 相关列加前缀
player1_df = player1_df.rename(columns={
    col: f'player1_{col}' for col in player_df.columns if col not in ['tourney_id', 'tourney_date', 'round_code', 'best_of', 'player_id', 'opponent_id','baseline_rally', 'intensity']
})
player1_df = player1_df.rename(columns={'player_id': 'player1_id', 'opponent_id': 'player2_id'})

# 给 player2 相关列加前缀
player2_df = player2_df.rename(columns={
    col: f'player2_{col}' for col in player_df.columns if col not in ['tourney_id', 'tourney_date', 'round_code', 'best_of','player_id', 'opponent_id','baseline_rally', 'intensity']
})
player2_df = player2_df.rename(columns={'player_id': 'player2_id', 'opponent_id': 'player1_id'})

# 通过 'tourney_id', 'tourney_date', 'player1_id', 'player2_id' 进行合并
match_df = player1_df.merge(player2_df, on=['tourney_id', 'tourney_date', 'round_code', 'best_of', 'player1_id', 'player2_id','baseline_rally', 'intensity'], suffixes=('_p1', '_p2'))

# 取 player1 视角的 result
match_df = match_df.rename(columns={'player1_result': 'result'})
# 删除 player2_result 列，避免冗余
match_df = match_df.drop(columns=['player2_result'])

# 查看结果
print(match_df.head(1))


  tourney_id tourney_date  round_code  best_of  player1_id  player2_id  \
0   2013-414   2013-07-15         2.0        3      100644      105138   

   player1_seed_bucket  player1_entry  player1_host  player1_hand  player1_ht  \
0                   33              0             1             1       198.0   

   player1_age  player1_rank  player1_rank_points  player1_elo  \
0         16.2         798.0                 20.0       1500.0   

   player1_match_counts  player1_ace  player1_df  player1_svpt  player1_fstIn  \
0                     0          1.0         7.0          39.0           14.0   

   player1_fstWon  player1_sndWon  player1_SvGms  player1_bpSaved  \
0             7.0             6.0            8.0              2.0   

   player1_bpFaced  baseline_rally  intensity  player1_ace_rate  \
0              8.0        0.705263        1.0          0.025641   

   player1_df_rate  player1_serve_win_rate  player1_serve_efficiency  \
0         0.388889                0.333333    

In [16]:
# 创建一个唯一标识匹配的列，确保 (player1, player2) 和 (player2, player1) 视为相同比赛
match_df['match_key'] = match_df.apply(lambda row: '-'.join(sorted([str(row['player1_id']), str(row['player2_id'])])), axis=1)

# 按照 `tourney_date` 和 `match_key` 进行分组，并随机删除一条重复记录
match_df = match_df.groupby(['tourney_date', 'match_key'], group_keys=False).apply(lambda x: x.sample(1)).reset_index(drop=True)

# 删除临时列 `match_key`
match_df = match_df.drop(columns=['match_key'])

# 查看结果
print(match_df.head(1))


  tourney_id tourney_date  round_code  best_of  player1_id  player2_id  \
0   2000-891   2000-01-03         3.0        3      101086      104156   

   player1_seed_bucket  player1_entry  player1_host  player1_hand  player1_ht  \
0                   26              1             0             1       180.0   

   player1_age  player1_rank  player1_rank_points  player1_elo  \
0         35.1          96.0                492.0       1509.6   

   player1_match_counts  player1_ace  player1_df  player1_svpt  player1_fstIn  \
0                     0          3.0         0.0          64.0           35.0   

   player1_fstWon  player1_sndWon  player1_SvGms  player1_bpSaved  \
0            26.0            19.0           10.0              2.0   

   player1_bpFaced  baseline_rally  intensity  player1_ace_rate  \
0              3.0        0.650794       0.35          0.046875   

   player1_df_rate  player1_serve_win_rate  player1_serve_efficiency  \
0              0.0                0.703125    

In [17]:
match_df.groupby("result").size()

result
0    33197
1    32801
dtype: int64

In [18]:
match_df.isna().sum()

tourney_id                          0
tourney_date                        0
round_code                          0
best_of                             0
player1_id                          0
player2_id                          0
player1_seed_bucket                 0
player1_entry                       0
player1_host                        0
player1_hand                        0
player1_ht                          0
player1_age                         0
player1_rank                        0
player1_rank_points                 0
player1_elo                         0
player1_match_counts                0
player1_ace                         0
player1_df                          0
player1_svpt                        0
player1_fstIn                       0
player1_fstWon                      0
player1_sndWon                      0
player1_SvGms                       0
player1_bpSaved                     0
player1_bpFaced                     0
baseline_rally                      0
intensity   

In [19]:
# 确保 tourney_date 是 datetime 类型
match_df['tourney_date'] = pd.to_datetime(match_df['tourney_date'])

# 根据年份范围划分数据
match_df_00_04 = match_df[(match_df['tourney_date'].dt.year >= 2000) & (match_df['tourney_date'].dt.year <= 2004)]
match_df_05_09 = match_df[(match_df['tourney_date'].dt.year >= 2005) & (match_df['tourney_date'].dt.year <= 2009)]
match_df_10_14 = match_df[(match_df['tourney_date'].dt.year >= 2010) & (match_df['tourney_date'].dt.year <= 2014)]
match_df_15_19 = match_df[(match_df['tourney_date'].dt.year >= 2015) & (match_df['tourney_date'].dt.year <= 2019)]
match_df_20_24 = match_df[(match_df['tourney_date'].dt.year >= 2020) & (match_df['tourney_date'].dt.year <= 2024)]


In [20]:
dfs = {
    "match_df_00_04": match_df_00_04,
    "match_df_05_09": match_df_05_09,
    "match_df_10_14": match_df_10_14,
    "match_df_15_19": match_df_15_19,
    "match_df_20_24": match_df_20_24
}

output_directory = os.path.join("..", "dataset", "processed_data_1")
os.makedirs(output_directory, exist_ok=True)  # 创建目录（如果不存在）
for key, df in dfs.items():
    print(f"{key}: {len(df)} rows")
    output_file = os.path.join(output_directory, f"{key}.csv")
    df.to_csv(output_file, index=False, encoding="utf-8-sig")
    print(f"已保存: {output_file}")

match_df_00_04: 13970 rows
已保存: F:\大四\tennis_predicton\processed_data_1\match_df_00_04.csv
match_df_05_09: 13616 rows
已保存: F:\大四\tennis_predicton\processed_data_1\match_df_05_09.csv
match_df_10_14: 12770 rows
已保存: F:\大四\tennis_predicton\processed_data_1\match_df_10_14.csv
match_df_15_19: 13414 rows
已保存: F:\大四\tennis_predicton\processed_data_1\match_df_15_19.csv
match_df_20_24: 12228 rows
已保存: F:\大四\tennis_predicton\processed_data_1\match_df_20_24.csv
