In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pylab import mpl
# 设置显示中文字体
mpl.rcParams["font.sans-serif"] = ["SimHei"]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# 定义文件路径
file_paths = [
    f"../dataset/processed_data_1/df_{start:02d}_{end:02d}_cleaned.csv"
    for start, end in zip(range(0, 25, 5), range(4, 29, 5))
]

# 读取数据并存入字典
dfs = {f"df_{i}": pd.read_csv(file,low_memory=False) for i, file in zip(["00_04", "05_09", "10_14", "15_19", "20_24"], file_paths)}


In [3]:
# 访问
print(dfs["df_05_09"].columns)  # 查看 2000-2004 年数据的前几行

Index(['tourney_id', 'tourney_name', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
       'tourney_ioc', 'tourney_year', 'w_host', 'l_host', 'surface_Clay',
       'surface_Grass', 'surface_Hard', 'round_code', 'winner_seed_bucket',
       'loser_seed_bucket'],
      dtype='object')


In [4]:
import re

# 解析 score 计算 胜负盘数、总局数、局差
def parse_score_extended(score):
    if pd.isna(score):
        return None, None, None, None, None, None  # 处理缺失值
    
    # 处理退赛情况
    ret_flag = 1 if "RET" in score else 0
    score = score.replace("RET", "").strip()  # 移除 RET，防止干扰解析
    
    # 拆分比分
    score_parts = str(score).split(" ")

    win_sets, lose_sets = 0, 0  # 统计胜负盘数
    total_win_games, total_lose_games = 0, 0  # 统计胜负总局数

    for part in score_parts:
        match = re.match(r"(\d+)-(\d+)(?:\((\d+)\))?", part)
        if match:
            p1, p2 = int(match.group(1)), int(match.group(2))
            tiebreak = int(match.group(3)) if match.group(3) else None
            
            # 处理抢七局，如果抢七分数 > 4，视为 8-6
            if tiebreak and tiebreak > 4:
                p1, p2 = 8, 6

            # 统计胜负盘数
            if p1 > p2:
                win_sets += 1
            else:
                lose_sets += 1

            # 统计总局数
            total_win_games += p1
            total_lose_games += p2

    # 计算局差
    game_difference = total_win_games - total_lose_games

    return win_sets, lose_sets, total_win_games, total_lose_games, game_difference, ret_flag


In [5]:
# 遍历所有 DataFrame 并应用解析函数
for name, df in dfs.items():
    df[["win_sets", "lose_sets", "total_win_games", "total_lose_games", "game_diff", "ret"]] = df["score"].apply(
        lambda s: pd.Series(parse_score_extended(s))
    )


In [6]:
# 输出结果
for name, df in dfs.items():
    print(f"==== {name} ====")
    print(df.sample(5))


==== df_00_04 ====
      tourney_id   tourney_name  draw_size  tourney_level tourney_date  \
1522    2000-423    Los Angeles         32              1   2000-07-24   
1049    2000-408         London         32              1   2000-02-21   
10413   2003-451           Doha         32              1   2002-12-30   
613     2000-352  Paris Masters         48              2   2000-11-13   
7010    2002-416   Rome Masters         64              2   2002-05-06   

       match_num  winner_id  winner_seed  winner_entry        winner_name  \
1522          17     102201          0.0             0        Lionel Roux   
1049          24     103819          0.0             1      Roger Federer   
10413         23     102905          0.0             1      Stefan Koubek   
613            5     101320          0.0             0  Magnus Gustafsson   
7010          32     102856          2.0             1    Gustavo Kuerten   

       winner_hand  winner_ht winner_ioc  winner_age  loser_id  loser_see

In [7]:
print(dfs["df_00_04"].columns)

Index(['tourney_id', 'tourney_name', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
       'tourney_ioc', 'tourney_year', 'w_host', 'l_host', 'surface_Clay',
       'surface_Grass', 'surface_Hard', 'round_code', 'winner_seed_bucket',
       'loser_seed_bucket', 'win_sets', 'lose_sets', 'total_win_games',
       'total_lose_games', 'game_diff', 'ret'],
      dtype='object')


In [8]:
 for key, df in dfs.items():
        #print(df.loc[df['total_win_games']==0])
        df = df[df['ret'] != 1]
        dfs[key] = df

In [9]:
import importlib
import welo_surface_g
from welo_surface_g import WEloRatingSystem
importlib.reload(welo_surface_g)

<module 'welo_surface_g' from 'F:\\大四\\tennis_predicton\\welo_surface_g.py'>

In [10]:
import numpy as np
# 假设 elo_system 已经实例化
elo_system = WEloRatingSystem()

for key, df in dfs.items():
    # 按比赛日期排序并重置索引
    df = df.sort_values(by="tourney_date").reset_index(drop=True)
    
    # 为预存的 ELO 评分列预先创建空列
    df["winner_elo_before_hard"] = np.nan
    df["winner_elo_before_clay"] = np.nan
    df["winner_elo_before_grass"] = np.nan
    df["loser_elo_before_hard"] = np.nan
    df["loser_elo_before_clay"] = np.nan
    df["loser_elo_before_grass"] = np.nan
    df["winner_match_counts"] = 0
    df["loser_match_counts"] = 0

    for idx, row in df.iterrows():
        # 从数据行中提取信息
        winner_id = row["winner_id"]
        loser_id = row["loser_id"]
        winner_games = row["total_win_games"]
        loser_games = row["total_lose_games"]

        # 通过 check_player 保证选手已初始化
        flag = elo_system.check_player(winner_id)
        elo_system.check_player(loser_id)

        if flag:
            # 将比赛前的 ELO 评分保存到 DataFrame 对应列中
            df.loc[idx, "winner_elo_before_hard"]  = elo_system.ratings["hard"][winner_id]
            df.loc[idx, "winner_elo_before_clay"]  = elo_system.ratings["clay"][winner_id]
            df.loc[idx, "winner_elo_before_grass"] = elo_system.ratings["grass"][winner_id]
    
            df.loc[idx, "loser_elo_before_hard"]  = elo_system.ratings["hard"][loser_id]
            df.loc[idx, "loser_elo_before_clay"]  = elo_system.ratings["clay"][loser_id]
            df.loc[idx, "loser_elo_before_grass"] = elo_system.ratings["grass"][loser_id]

            df.loc[idx, "winner_match_counts"] = elo_system.match_counts[winner_id]
            df.loc[idx, "loser_match_counts"] = elo_system.match_counts[loser_id]

        # 根据布尔标志判断比赛场地（假设只有一个为 True）
        if row.get("surface_Hard", False):
            surface = "hard"
        elif row.get("surface_Grass", False):
            surface = "grass"
        elif row.get("surface_Clay", False):
            surface = "clay"
        else:
            # 如果均未设置，默认使用硬地
            surface = "hard"
        
        # 对于本场比赛，胜者实际得分为 1，调用评分更新函数
        elo_system.update_rating(
            winner_id=winner_id,
            loser_id=loser_id,
            outcome=1,  # 胜者得分 1
            winner_game=winner_games,
            loser_game=loser_games,
            surface=surface
        )
        if not flag:
            df.loc[idx, "winner_elo_before_hard"]  = elo_system.ratings["hard"][winner_id]
            df.loc[idx, "winner_elo_before_clay"]  = elo_system.ratings["clay"][winner_id]
            df.loc[idx, "winner_elo_before_grass"] = elo_system.ratings["grass"][winner_id]
    
            df.loc[idx, "loser_elo_before_hard"]  = elo_system.ratings["hard"][loser_id]
            df.loc[idx, "loser_elo_before_clay"]  = elo_system.ratings["clay"][loser_id]
            df.loc[idx, "loser_elo_before_grass"] = elo_system.ratings["grass"][loser_id]
            

    dfs[key] = df


In [11]:
for key, df in dfs.items():
    print(df.isna().sum())

tourney_id                 0
tourney_name               0
draw_size                  0
tourney_level              0
tourney_date               0
match_num                  0
winner_id                  0
winner_seed                0
winner_entry               0
winner_name                0
winner_hand                0
winner_ht                  0
winner_ioc                 0
winner_age                 0
loser_id                   0
loser_seed                 0
loser_entry                0
loser_name                 0
loser_hand                 0
loser_ht                   0
loser_ioc                  0
loser_age                  0
score                      0
best_of                    0
round                      0
minutes                    0
w_ace                      0
w_df                       0
w_svpt                     0
w_1stIn                    0
w_1stWon                   0
w_2ndWon                   0
w_SvGms                    0
w_bpSaved                  0
w_bpFaced     

#elo与积分关系
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 假设 elo_system 已经实例化
elo_system = WEloRatingSystem()

elo_data = []  # 用于存储ELO随比赛场次的变化

for key, df in dfs.items():
    df = df.sort_values(by="tourney_date").reset_index(drop=True)

    # 预创建列
    df["winner_elo_before_hard"] = np.nan
    df["winner_match_count"] = np.nan  # 记录比赛场次

    for idx, row in df.iterrows():
        winner_id = row["winner_id"]
        loser_id = row["loser_id"]
        winner_games = row["total_win_games"]
        loser_games = row["total_lose_games"]
        winner_rank_points = row["winner_rank_points"]

        # 确保选手已初始化
        flag = elo_system.check_player(winner_id)
        elo_system.check_player(loser_id)

        # 记录比赛前的ELO和比赛场次
        if flag:
            df.loc[idx, "winner_elo_before_hard"] = elo_system.ratings["hard"][winner_id]
            df.loc[idx, "winner_match_count"] = elo_system.match_counts[winner_id]

            # 存储ELO数据以供分析
            elo_data.append((elo_system.match_counts[winner_id], elo_system.ratings["hard"][winner_id],winner_rank_points))

        # 确定比赛场地
        if row.get("surface_Hard", False):
            surface = "hard"
        elif row.get("surface_Grass", False):
            surface = "grass"
        elif row.get("surface_Clay", False):
            surface = "clay"
        else:
            surface = "hard"

        # 更新ELO评分
        elo_system.update_rating(
            winner_id=winner_id,
            loser_id=loser_id,
            outcome=1,
            winner_game=winner_games,
            loser_game=loser_games,
            surface=surface
        )

    dfs[key] = df


elo_rank_points=pd.DataFrame(elo_data,columns=['match_counts', 'elo_hard', 'rank_points'])

#elo与积分关系，用于初始化
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# 筛选match_counts>10的数据
filtered_df = elo_rank_points[elo_rank_points['match_counts'] > 20]

# 基础分析
print("基础统计:")
print(filtered_df[['elo_hard', 'rank_points']].describe())

# 可视化分析
plt.figure(figsize=(12, 6))

# 散点图
plt.subplot(1, 2, 1)
sns.scatterplot(x='elo_hard', y='rank_points', data=filtered_df, alpha=0.7)
plt.title('ELO评分 vs 排名积分 (match_counts>10)')
plt.xlabel('ELO Hard Score')
plt.ylabel('Rank Points')

# 添加回归线
sns.regplot(x='elo_hard', y='rank_points', data=filtered_df, 
           scatter=False, color='red', line_kws={'linewidth':2})

# 箱线图
plt.subplot(1, 2, 2)
sns.boxplot(x=pd.qcut(filtered_df['elo_hard'], 5), y='rank_points', data=filtered_df)
plt.title('不同ELO区间的积分分布')
plt.xlabel('ELO评分分组')
plt.ylabel('Rank Points')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# 相关性分析
corr_pearson = filtered_df[['elo_hard', 'rank_points']].corr(method='pearson')
corr_spearman = filtered_df[['elo_hard', 'rank_points']].corr(method='spearman')

print("\n皮尔逊相关系数:")
print(corr_pearson)
print("\n斯皮尔曼相关系数:")
print(corr_spearman)

# 回归分析
X = filtered_df['elo_hard']
y = filtered_df['rank_points']
slope, intercept, r_value, p_value, std_err = stats.linregress(X, y)

print(f"\n回归方程: rank_points = {slope:.2f} * elo_hard + {intercept:.2f}")
print(f"R平方值: {r_value**2:.3f}")
print(f"P值: {p_value:.4f}")

# 异常值分析
z_scores = np.abs(stats.zscore(filtered_df[['elo_hard', 'rank_points']]))
filtered_df_no_outliers = filtered_df[(z_scores < 3).all(axis=1)]

print(f"\n原始数据量: {len(filtered_df)}")
print(f"去除异常值后数据量: {len(filtered_df_no_outliers)}")

# 分组分析
bins = [20, 30, 40, np.inf]
labels = ['20-30场', '30-40场', '40+场']
filtered_df['group'] = pd.cut(filtered_df['match_counts'], bins=bins, labels=labels)

group_corr = filtered_df.groupby('group')[['elo_hard', 'rank_points']].corr(method='pearson').unstack()['rank_points']['elo_hard']
print("\n不同比赛场次分组的相关系数:")
print(group_corr)

In [12]:
for key, df in dfs.items():
    # 定义目标列列表
    target_cols = ['winner_elo_before_hard', 'winner_elo_before_clay','winner_elo_before_grass', 'loser_elo_before_hard','loser_elo_before_clay', 'loser_elo_before_grass']
    
    print(df.groupby(['surface_Clay', 'surface_Grass', 'surface_Hard']).size())
    
    for col in target_cols:
        print(df[target_cols].describe())
        #df[target_cols] = df[target_cols].fillna(1500)
        
    
    dfs[key] = df

surface_Clay  surface_Grass  surface_Hard
0             0              1               7716
              1              0               1459
1             0              0               4799
dtype: int64
       winner_elo_before_hard  winner_elo_before_clay  \
count            13974.000000            13974.000000   
mean              1652.826173             1619.867324   
std                106.912272               83.409590   
min               1500.232515             1500.161163   
25%               1562.826204             1550.107913   
50%               1635.164934             1606.604926   
75%               1727.689236             1680.053141   
max               1954.306624             1877.675024   

       winner_elo_before_grass  loser_elo_before_hard  loser_elo_before_clay  \
count             13974.000000           13974.000000           13974.000000   
mean               1599.227479            1623.461561            1599.044672   
std                  70.438550           

In [13]:
#print(dfs['df_00_04'].loc[dfs['df_00_04']['elo_prediction_error_hard'] > 0.9].head(5))

In [14]:
output_directory = os.path.join("..", "dataset", "processed_data_1")
os.makedirs(output_directory, exist_ok=True)  # 创建目录（如果不存在）
for key, df in dfs.items():
    output_file = os.path.join(output_directory, f"{key}_eloed.csv")
    df.to_csv(output_file, index=False, encoding="utf-8-sig")
    print(f"已保存: {output_file}")

已保存: F:\大四\tennis_predicton\processed_data_1\df_00_04_eloed.csv
已保存: F:\大四\tennis_predicton\processed_data_1\df_05_09_eloed.csv
已保存: F:\大四\tennis_predicton\processed_data_1\df_10_14_eloed.csv
已保存: F:\大四\tennis_predicton\processed_data_1\df_15_19_eloed.csv
已保存: F:\大四\tennis_predicton\processed_data_1\df_20_24_eloed.csv
