In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pylab import mpl
# 设置显示中文字体
mpl.rcParams["font.sans-serif"] = ["SimHei"]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# 定义文件路径
file_paths = [
    f"../dataset/processed_data/df_{start:02d}_{end:02d}_merged.csv"
    for start, end in zip(range(0, 25, 5), range(4, 29, 5))
]

# 读取数据并存入字典
dfs = {f"df_{i}": pd.read_csv(file,low_memory=False) for i, file in zip(["00_04", "05_09", "10_14", "15_19", "20_24"], file_paths)}

# 访问
#print(dfs["df_00_04"].head())  # 查看 2000-2004 年数据的前几行

In [3]:
###缺失值分表统计
for key, df in dfs.items():
    print( "------------" + key + "------------")
    print(df.isna().sum())

------------df_00_04------------
tourney_id                0
tourney_name              0
surface                   0
draw_size                 0
tourney_level             0
tourney_date              0
match_num                 0
winner_id                 0
winner_seed           10185
winner_entry          14467
winner_name               0
winner_hand               0
winner_ht               504
winner_ioc                0
winner_age                2
loser_id                  0
loser_seed            12908
loser_entry           13216
loser_name                0
loser_hand                0
loser_ht                931
loser_ioc                 0
loser_age                 0
score                     0
best_of                   0
round                     0
minutes                1979
w_ace                  1965
w_df                   1965
w_svpt                 1965
w_1stIn                1965
w_1stWon               1965
w_2ndWon               1965
w_SvGms                1965
w_bpSaved      

# 合并数据集 

##### 数据查看

In [4]:
df_concated = pd.concat(dfs, ignore_index=True) 

In [5]:
df_concated.groupby(["tourney_level"]).size()

tourney_level
A    40847
D     6987
F      437
G    12573
M    13998
O       64
dtype: int64

In [6]:
df_concated.groupby(["round"]).size()

round
BR          9
ER         32
F        1664
QF       6400
R128     7936
R16     12736
R32     23472
R64     11325
RR       8056
SF       3276
dtype: int64

In [7]:
#可能的异常值查看
df_concated.loc[df_concated["minutes"] < 60].head(1)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,tourney_ioc,tourney_year,w_host,l_host
2,2000-301,Auckland,Hard,32,A,2000-01-10,3,103252,,,Alberto Martin,R,175.0,ESP,21.3,102238,,,Alberto Berasategui,R,173.0,ESP,26.5,6-3 6-1,3,R32,56.0,0.0,0.0,55.0,35.0,25.0,12.0,8.0,1.0,1.0,0.0,6.0,56.0,33.0,20.0,7.0,8.0,7.0,11.0,48.0,726.0,59.0,649.0,NZL,2000,0,0


In [8]:
df_concated.loc[df_concated["round"] == "RR"].sample(1)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,tourney_ioc,tourney_year,w_host,l_host
19350,2005-D005,Davis Cup WG R1: RUS vs CHI,Carpet,4,D,2005-03-04,2,103498,,,Marat Safin,R,193.0,RUS,25.0,103201,,,Adrian Garcia,R,175.0,CHI,26.7,6-1 3-6 6-3 7-6(4),5,RR,,,,,,,,,,,,,,,,,,,,4.0,3360.0,117.0,378.0,,2005,0,0


In [9]:
df_concated.loc[df_concated["round"] == "RR"].groupby("tourney_name", group_keys=False).size()

tourney_name
Adelaide                            24
Atp Cup                            144
Buenos Aires                        24
Davis Cup Finals F: AUS vs ITA       2
Davis Cup Finals F: CAN vs AUS       2
Davis Cup Finals F: CAN vs ESP       2
Davis Cup Finals F: ITA vs NED       2
Davis Cup Finals F: RTF vs CRO       2
Davis Cup Finals QF: ARG vs ESP      2
Davis Cup Finals QF: AUS vs CAN      2
Davis Cup Finals QF: AUS vs NED      2
Davis Cup Finals QF: CAN vs FIN      2
Davis Cup Finals QF: CRO vs ESP      2
Davis Cup Finals QF: CZE vs AUS      2
Davis Cup Finals QF: GBR vs GER      4
Davis Cup Finals QF: GER vs CAN      4
Davis Cup Finals QF: ITA vs ARG      2
Davis Cup Finals QF: ITA vs CRO      2
Davis Cup Finals QF: ITA vs NED      2
Davis Cup Finals QF: ITA vs USA      2
Davis Cup Finals QF: NED vs ESP      2
Davis Cup Finals QF: RTF vs SWE      2
Davis Cup Finals QF: SRB vs GBR      2
Davis Cup Finals QF: SRB vs KAZ      2
Davis Cup Finals QF: SRB vs RUS      2
Davis Cup Fi

##### 总体缺失值统计

In [10]:
df_concated.isnull().sum()

tourney_id                0
tourney_name              0
surface                  53
draw_size                 0
tourney_level             0
tourney_date              0
match_num                 0
winner_id                 0
winner_seed           43786
winner_entry          65400
winner_name               0
winner_hand               0
winner_ht              1425
winner_ioc                0
winner_age                5
loser_id                  0
loser_seed            57668
loser_entry           59484
loser_name                0
loser_hand                4
loser_ht               2909
loser_ioc                 0
loser_age                 3
score                     0
best_of                   0
round                     0
minutes                8174
w_ace                  6520
w_df                   6520
w_svpt                 6520
w_1stIn                6520
w_1stWon               6520
w_2ndWon               6520
w_SvGms                6520
w_bpSaved              6520
w_bpFaced           

In [11]:
df_concated.describe()

Unnamed: 0,draw_size,match_num,winner_id,winner_seed,winner_ht,winner_age,loser_id,loser_seed,loser_ht,loser_age,best_of,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,tourney_year,w_host,l_host
count,74906.0,74906.0,74906.0,31120.0,73481.0,74901.0,74906.0,17238.0,71997.0,74903.0,74906.0,66732.0,68386.0,68386.0,68386.0,68386.0,68386.0,68386.0,68386.0,68386.0,68386.0,68386.0,68386.0,68386.0,68386.0,68386.0,68386.0,68386.0,68386.0,68386.0,74333.0,74333.0,73438.0,73438.0,74906.0,74906.0,74906.0
mean,55.345126,102.269164,111056.333818,7.402635,186.273363,26.289788,111055.887299,8.920931,185.67157,26.401226,3.449417,107.223356,6.930644,2.634984,77.97925,48.044731,36.366098,16.598661,12.517796,3.45774,5.023367,5.133565,3.351168,81.013351,48.635481,32.47479,14.929576,12.308104,4.771898,8.601468,79.64425,1608.435755,117.699965,977.409039,2011.493552,0.087149,0.092196
std,40.31519,133.184752,22766.835939,6.815177,7.028887,3.980862,22671.363557,7.366649,6.979333,4.091029,0.834786,41.324962,5.516213,2.286877,29.164329,18.92696,13.559045,6.964207,4.220051,3.072644,4.02494,4.878886,2.527369,29.139202,19.198909,14.358641,7.185611,4.220819,3.271866,4.14555,138.795266,2006.339266,185.938285,1125.591169,7.240265,0.282055,0.289304
min,2.0,1.0,100644.0,1.0,3.0,14.9,100644.0,1.0,3.0,14.5,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,2000.0,0.0,0.0
25%,32.0,12.0,103565.0,3.0,183.0,23.4,103490.0,4.0,181.0,23.4,3.0,77.0,3.0,1.0,57.0,34.0,27.0,12.0,10.0,1.0,2.0,2.0,2.0,60.0,35.0,22.0,10.0,9.0,2.0,6.0,18.0,577.0,36.0,430.0,2005.0,0.0,0.0
50%,32.0,30.0,104494.0,5.0,185.0,26.1,104471.0,7.0,185.0,26.2,3.0,100.0,6.0,2.0,73.0,45.0,34.0,16.0,11.0,3.0,4.0,4.0,3.0,76.0,45.0,30.0,14.0,11.0,4.0,8.0,45.0,940.0,68.0,710.0,2011.0,0.0,0.0
75%,64.0,220.0,105493.0,9.0,190.0,28.9,105613.0,12.0,190.0,29.2,3.0,130.0,9.0,4.0,94.0,58.0,43.0,20.0,15.0,5.0,7.0,7.0,5.0,97.0,59.0,40.0,19.0,15.0,7.0,11.0,85.0,1732.0,114.0,1105.0,2018.0,0.0,0.0
max,128.0,1701.0,212721.0,35.0,211.0,44.6,212970.0,35.0,211.0,44.0,5.0,1146.0,113.0,26.0,491.0,361.0,292.0,82.0,90.0,24.0,30.0,103.0,26.0,489.0,328.0,284.0,101.0,91.0,27.0,38.0,2101.0,16950.0,2159.0,16950.0,2024.0,1.0,1.0


In [12]:
df_concated.groupby(["round"]).size()

round
BR          9
ER         32
F        1664
QF       6400
R128     7936
R16     12736
R32     23472
R64     11325
RR       8056
SF       3276
dtype: int64

### 处理排名缺失值

In [13]:
import glob
directory = r"F:\大四\tennis_predicton\rank"
file_pattern = os.path.join(directory, "atp_rankings_*")  # 匹配文件名

# 获取所有符合条件的文件
file_list = glob.glob(file_pattern)

# 读取所有文件并拼接
df_rank_list = [pd.read_csv(file) for file in file_list]
df_rank = pd.concat(df_rank_list, ignore_index=True)
df_rank["ranking_date"] = pd.to_datetime(df_rank["ranking_date"], format="%Y%m%d").dt.date

In [14]:
# 筛选出 winner_rank 或 loser_rank 为空的记录
missing_rank = df_concated[df_concated["winner_rank"].isnull() | df_concated["loser_rank"].isnull()]

# 显示前5行缺失数据
print("包含排名缺失值的行示例:")
print(missing_rank.head(1))

包含排名缺失值的行示例:
   tourney_id tourney_name surface  draw_size tourney_level tourney_date  \
75   2000-308       Munich    Clay         32             A   2000-05-01   

    match_num  winner_id  winner_seed winner_entry     winner_name  \
75         14     210013          NaN          NaN  Martin Damm Sr   

   winner_hand  winner_ht winner_ioc  winner_age  loser_id  loser_seed  \
75           R      188.0        CZE        27.7    102563         NaN   

   loser_entry        loser_name loser_hand  loser_ht loser_ioc  loser_age  \
75         NaN  Thomas Johansson          R     180.0       SWE       25.1   

                score  best_of round  minutes  w_ace  w_df  w_svpt  w_1stIn  \
75  6-7(6) 7-6(5) 6-3        3   R32    153.0   16.0   0.0   105.0     69.0   

    w_1stWon  w_2ndWon  w_SvGms  w_bpSaved  w_bpFaced  l_ace  l_df  l_svpt  \
75      57.0      22.0     17.0        1.0        1.0   11.0   2.0   119.0   

    l_1stIn  l_1stWon  l_2ndWon  l_SvGms  l_bpSaved  l_bpFaced  winner_

In [15]:
missing_rank.describe()

Unnamed: 0,draw_size,match_num,winner_id,winner_seed,winner_ht,winner_age,loser_id,loser_seed,loser_ht,loser_age,best_of,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,tourney_year,w_host,l_host
count,1766.0,1766.0,1766.0,69.0,1022.0,1761.0,1766.0,11.0,490.0,1763.0,1766.0,419.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0,1193.0,1193.0,298.0,298.0,1766.0,1766.0,1766.0
mean,10.151755,15.43658,112871.639298,7.144928,183.712329,24.648154,120173.849377,9.0,183.891837,23.805559,3.85957,92.264916,6.121212,2.703963,67.634033,41.759907,32.121212,14.706294,11.286713,2.755245,4.060606,3.694639,3.911422,72.965035,43.797203,27.515152,12.524476,10.941725,4.536131,8.853147,498.975692,336.234702,824.88255,137.442953,2009.514156,0.013024,0.045866
std,19.982759,59.003857,25677.081077,5.788618,10.191671,4.731838,32931.970374,7.615773,13.229241,5.31346,0.990371,41.972524,5.357259,2.624206,27.357086,17.344386,12.589093,6.575761,3.89636,2.748534,3.73116,3.989764,2.76858,27.142772,17.661892,13.567515,7.259701,3.875258,2.944459,3.639011,423.204201,737.566339,543.972695,300.332013,6.877938,0.113408,0.209254
min,4.0,1.0,100644.0,1.0,3.0,14.9,100754.0,3.0,3.0,14.5,3.0,21.0,0.0,0.0,19.0,9.0,5.0,3.0,4.0,0.0,0.0,0.0,0.0,17.0,8.0,4.0,0.0,3.0,0.0,0.0,2.0,1.0,9.0,1.0,2000.0,0.0,0.0
25%,4.0,2.0,103600.0,4.0,180.0,21.1,104262.0,5.5,180.0,19.4,3.0,62.0,2.0,1.0,48.0,29.0,24.0,10.0,8.0,1.0,1.0,1.0,2.0,53.0,31.0,17.0,7.0,8.0,2.0,6.0,137.0,18.0,383.0,4.0,2003.0,0.0,0.0
50%,4.0,4.0,104699.0,6.0,183.0,24.2,108615.5,7.0,185.0,22.8,3.0,84.0,5.0,2.0,61.0,38.0,29.0,14.0,10.0,2.0,3.0,2.0,3.0,68.0,40.0,25.0,11.0,10.0,4.0,8.0,371.0,98.0,811.0,16.5,2009.0,0.0,0.0
75%,4.0,5.0,106406.75,8.0,188.0,27.7,108956.0,7.5,188.0,27.5,5.0,110.0,8.0,4.0,82.0,51.0,38.0,18.0,14.0,4.0,6.0,5.0,5.0,88.0,55.0,36.0,17.0,13.0,6.0,11.0,769.0,363.0,1166.0,84.5,2015.0,0.0,0.0
max,128.0,1125.0,212721.0,28.0,208.0,44.6,212722.0,30.0,203.0,43.9,5.0,356.0,33.0,16.0,185.0,113.0,94.0,41.0,29.0,21.0,26.0,21.0,20.0,180.0,118.0,88.0,39.0,29.0,16.0,23.0,2018.0,8460.0,2119.0,2800.0,2024.0,1.0,1.0


In [16]:
# 确保 ranking_date 和 tourney_date 已经转换为 datetime 类型
df_rank['ranking_date'] = pd.to_datetime(df_rank['ranking_date'], format='%Y-%m-%d')
missing_rank = missing_rank.copy()
missing_rank['tourney_date'] = pd.to_datetime(missing_rank['tourney_date'], format='%Y-%m-%d')


# 遍历 missing_rank 中的每一行进行计算
for idx, row in missing_rank.iterrows():
    winner_id = row['winner_id']
    loser_id = row['loser_id']
    tourney_date = row['tourney_date']
    
    # 筛选 df_rank 中与当前 winner_id 匹配，并计算日期差
    winner_matches = df_rank[(df_rank['player'] == winner_id) & 
                              (df_rank['ranking_date'] <= tourney_date + pd.Timedelta(days=30)) & 
                              (df_rank['ranking_date'] >= tourney_date - pd.Timedelta(days=60))]
    print(str(winner_matches["ranking_date"] - tourney_date))
    # 如果有符合条件的记录，计算 rank 和 points 的平均值
    if not winner_matches.empty:
        avg_winner_rank = winner_matches['rank'].mean()
        avg_winner_points = winner_matches['points'].mean()
        missing_rank.at[idx, 'winner_rank'] = avg_winner_rank
        missing_rank.at[idx, 'winner_rank_points'] = avg_winner_points
    
    # 筛选 df_rank 中与当前 loser_id 匹配，并计算日期差
    loser_matches = df_rank[(df_rank['player'] == loser_id) & 
                             (df_rank['ranking_date'] <= tourney_date + pd.Timedelta(days=30)) & 
                             (df_rank['ranking_date'] >= tourney_date - pd.Timedelta(days=60))]
    
    # 如果有符合条件的记录，计算 rank 和 points 的平均值
    if not loser_matches.empty:
        avg_loser_rank = loser_matches['rank'].mean()
        avg_loser_points = loser_matches['points'].mean()
        missing_rank.at[idx, 'loser_rank'] = avg_loser_rank
        missing_rank.at[idx, 'loser_rank_points'] = avg_loser_points


Series([], Name: ranking_date, dtype: timedelta64[ns])
12613   -56 days
14191   -49 days
15778   -42 days
17372   -35 days
18965   -28 days
20558   -21 days
22157   -14 days
23748    -7 days
25350     0 days
26917     7 days
28515    14 days
30111    21 days
31706    28 days
Name: ranking_date, dtype: timedelta64[ns]
28577   -56 days
30182   -49 days
31766   -42 days
33373   -35 days
34970   -28 days
36559   -21 days
38137   -14 days
39733    -7 days
41336     0 days
42912     7 days
44505    14 days
46085    21 days
47670    28 days
Name: ranking_date, dtype: timedelta64[ns]
16      0 days
1589    7 days
3160   14 days
4738   21 days
6312   28 days
Name: ranking_date, dtype: timedelta64[ns]
94      -56 days
1667    -49 days
3238    -42 days
4812    -35 days
6388    -28 days
7961    -21 days
9533    -14 days
11092    -7 days
12678     0 days
14250     7 days
15834    14 days
17428    21 days
19022    28 days
Name: ranking_date, dtype: timedelta64[ns]
102     -56 days
1683    -49 days
3

6434    -60 days
8008    -53 days
9577    -46 days
11145   -39 days
12707   -32 days
14300   -25 days
15885   -18 days
17479   -11 days
19068    -4 days
20666     3 days
22265    10 days
23860    17 days
25452    24 days
Name: ranking_date, dtype: timedelta64[ns]
6499    -60 days
8072    -53 days
9643    -46 days
11207   -39 days
12786   -32 days
14381   -25 days
15968   -18 days
17562   -11 days
19179    -4 days
20769     3 days
22363    10 days
23958    17 days
25547    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
48007   -60 days
49472   -53 days
51056   -46 days
52622   -39 days
54201   -32 days
55756   -25 days
57352   -18 days
58947   -11 days
60544    -4 days
62176     3 days
63818    10 days
65416    17 days
67044    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
7454    -60 days
9025    -53 days
10597   -46 days
12168   -39 days
13741   -32 days
15325   -25 d

820     -25 days
2391    -18 days
3962    -11 days
5468     -4 days
7040      3 days
8614     10 days
10187    17 days
11755    24 days
Name: ranking_date, dtype: timedelta64[ns]
145    -18 days
1716   -11 days
3287    -4 days
4860     3 days
6432    10 days
8007    17 days
9575    24 days
Name: ranking_date, dtype: timedelta64[ns]
6       -25 days
1578    -18 days
3149    -11 days
4721     -4 days
6295      3 days
7869     10 days
9439     17 days
11008    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
248     -25 days
1817    -18 days
3388    -11 days
4963     -4 days
6535      3 days
8109     10 days
9682     17 days
11254    24 days
Name: ranking_date, dtype: timedelta64[ns]
234     -25 days
1802    -18 days
3373    -11 days
4942     -4 days
6517      3 days
8102     10 days
9673     17 days
11208    24 days
Name: ranking_date, dtype: timedelta64[ns]
558     -25 days
2130    -18 days
3701    -11 days
5274     -4 days
6848  

Series([], Name: ranking_date, dtype: timedelta64[ns])
76562   -60 days
78168   -53 days
79775   -46 days
81381   -39 days
82995   -32 days
84601   -25 days
86207   -18 days
87847   -11 days
89450    -4 days
91059     3 days
92642    10 days
94218    17 days
95873    24 days
Name: ranking_date, dtype: timedelta64[ns]
94149    -60 days
95746    -53 days
97349    -46 days
98946    -39 days
100540   -32 days
102129   -25 days
103722   -18 days
105302   -11 days
106897    -4 days
108493     3 days
110084    10 days
111674    17 days
113275    24 days
Name: ranking_date, dtype: timedelta64[ns]
94400    -60 days
95998    -53 days
97619    -46 days
99215    -39 days
100809   -32 days
102412   -25 days
103992   -18 days
105592   -11 days
107175    -4 days
108768     3 days
110361    10 days
111953    17 days
113549    24 days
Name: ranking_date, dtype: timedelta64[ns]
94567    -60 days
96161    -53 days
97763    -46 days
99355    -39 days
100949   -32 days
102487   -25 days
104080   -18 days
1

90465    -60 days
92061    -53 days
93648    -46 days
95245    -39 days
96844    -32 days
98445    -25 days
100042   -18 days
101635   -11 days
103234    -4 days
104831     3 days
106428    10 days
108019    17 days
109614    24 days
Name: ranking_date, dtype: timedelta64[ns]
89447    -60 days
91048    -53 days
92643    -46 days
94240    -39 days
95820    -32 days
97425    -25 days
99021    -18 days
100615   -11 days
102209    -4 days
103800     3 days
105395    10 days
106991    17 days
108582    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
108175   17 days
109768   24 days
Name: ranking_date, dtype: timedelta64[ns]
108175   17 days
109768   24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
89506    -60 days
91108    -53 days
92747    -46 days
94330    -39 days


Series([], Name: ranking_date, dtype: timedelta64[ns])
181353   -60 days
182987   -53 days
184659   -46 days
186329   -39 days
188141   -32 days
189812   -25 days
191483   -18 days
193276   -11 days
194956    -4 days
196548     3 days
198235    10 days
199920    17 days
201616    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
180647   -60 days
182300   -53 days
183968   -46 days
185638   -39 days
187312   -32 days
188984   -25 days
190652   -18 days
192324   -11 days
193998    -4 days
195583     3 days
197263    10 days
198949    17 days
200635    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
181294   -60 days
182961   -53 days
184631   -46 days
186301   -39 days
187969   -32 days
189646   -25 days
191268   -18 days
192942   -11 days
194622    -4 days
196217   

161839   -60 days
163509   -53 days
165178   -46 days
166845   -39 days
168512   -32 days
170180   -25 days
171849   -18 days
173512   -11 days
175182    -4 days
176853     3 days
178530    10 days
180199    17 days
181862    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
175075   -60 days
176745   -53 days
178421   -46 days
180088   -39 days
181754   -32 days
183420   -25 days
185086   -18 days
186756   -11 days
188417    -4 days
190084     3 days
191754    10 days
193434    17 days
Name: ranking_date, dtype: timedelta64[ns]
174651   -60 days
176322   -53 days
177994   -46 days
179660   -39 days
181327   -32 days
182994   -25 days
184667   -18 days
186337   -11 days
188003    -4 days
189675     3 days
191351    10 days
193025    17 days
194705    24 days
Name: ranking_date, dtype: timedelta64[ns]
175011   -60 days
176679   -53 days
178354   -46 days
180019   -39 days
181686   -32 days
183351   -25 days
185017   -18 days
186687

213570   3 days
Name: ranking_date, dtype: timedelta64[ns]
198016   -60 days
199700   -53 days
201402   -46 days
203097   -39 days
204783   -32 days
206472   -25 days
208165   -18 days
209849   -11 days
211536    -4 days
212946     3 days
214648    10 days
216049    17 days
217453    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
213996   -60 days
215384   -53 days
216778   -46 days
218178   -39 days
219585   -32 days
220992   -25 days
222405   -18 days
223815   -11 days
225235    -4 days
226653     3 days
228361    10 days
230062    17 days
231752    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
213996   -60 days
215384   -53 days
216778   -46 days
218178   -39 days
219585   -32 days
220992   -25 days
222405   -18 days
223815   -11 days
225235    -4 days
226653     3 days
228361    10 days
230062    17 days
231752

247028   -60 days
248690   -53 days
250352   -46 days
252011   -39 days
253675   -32 days
255340   -25 days
257002   -18 days
258670   -11 days
260302    -4 days
261973     3 days
263646    10 days
265318    17 days
266984    24 days
Name: ranking_date, dtype: timedelta64[ns]
258666   -60 days
260329   -53 days
261993   -46 days
263663   -39 days
265330   -32 days
266998   -25 days
268662   -18 days
270330   -11 days
271989    -4 days
273649     3 days
275285    10 days
276953    17 days
278620    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
258739   -60 days
260398   -53 days
262067   -46 days
263742   -39 days
265412   -32 days
267082   -25 days
268745   -18 days
270413   -11 days
272080    -4 days
273745     3 days
275411    10 days
277082    17 days
278748    24 days
Name: ranking_date, dtype: timedelta64[ns]
259247   -60 days
260908   -53 days
262578   -46 days
264216   -39 days
265809   -32 days
267477   -25 days
269143

258462   -60 days
260128   -53 days
261793   -46 days
263460   -39 days
265166   -32 days
266869   -25 days
268584   -18 days
270252   -11 days
271915    -4 days
273576     3 days
275243    10 days
276909    17 days
278578    24 days
Name: ranking_date, dtype: timedelta64[ns]
258554   -60 days
260220   -53 days
261890   -46 days
263566   -39 days
265223   -32 days
266907   -25 days
268577   -18 days
270245   -11 days
271908    -4 days
273572     3 days
275240    10 days
276906    17 days
278574    24 days
Name: ranking_date, dtype: timedelta64[ns]
258462   -60 days
260128   -53 days
261793   -46 days
263460   -39 days
265166   -32 days
266869   -25 days
268584   -18 days
270252   -11 days
271915    -4 days
273576     3 days
275243    10 days
276909    17 days
278578    24 days
Name: ranking_date, dtype: timedelta64[ns]
298927   -60 days
300631   -53 days
302338   -46 days
304042   -39 days
305733   -32 days
307433   -25 days
309135   -18 days
310843   -11 days
312563    -4 days
314292 

333191   -60 days
334925   -53 days
336668   -46 days
338411   -39 days
340154   -32 days
341893   -25 days
343640   -18 days
345386   -11 days
347128    -4 days
348869     3 days
350608    10 days
352343    17 days
354070    24 days
Name: ranking_date, dtype: timedelta64[ns]
333778   -60 days
335512   -53 days
337255   -46 days
338998   -39 days
340741   -32 days
342486   -25 days
344233   -18 days
345979   -11 days
347722    -4 days
349462     3 days
351196    10 days
352926    17 days
354597    24 days
Name: ranking_date, dtype: timedelta64[ns]
334290   -60 days
336024   -53 days
337767   -46 days
339510   -39 days
341253   -32 days
342998   -25 days
344744   -18 days
346490   -11 days
348228    -4 days
349965     3 days
351704    10 days
353438    17 days
354801    24 days
Name: ranking_date, dtype: timedelta64[ns]
334057   -60 days
335795   -53 days
337538   -46 days
339281   -39 days
341024   -32 days
Name: ranking_date, dtype: timedelta64[ns]
334057   -60 days
335795   -53 days


334031   -60 days
335600   -53 days
337343   -46 days
339086   -39 days
340829   -32 days
342572   -25 days
344319   -18 days
346065   -11 days
347804    -4 days
349539     3 days
351278    10 days
353013    17 days
354742    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
334062   -60 days
335800   -53 days
337543   -46 days
339286   -39 days
341029   -32 days
342731   -25 days
344477   -18 days
346223   -11 days
347960    -4 days
349697     3 days
351435    10 days
353169    17 days
354900    24 days
Name: ranking_date, dtype: timedelta64[ns]
334498   -60 days
336239   -53 days
337982   -46 days
339725   -39 days
341468   -32 days
343213   -25 days
344959   -18 days
346705   -11 days
348446    -4 days
350183     3 days
351919    10 days
353649    17 days
355384    24 days
Name: ranking_date, dtype: timedelta64[ns]
334018   -60 days
335677   -53 days
337420   -46 days
339163   -39 days
340906   -32 days
342651   -25 days
344398

Series([], Name: ranking_date, dtype: timedelta64[ns])
445803   -60 days
447642   -53 days
449486   -46 days
451323   -39 days
453168   -32 days
455007   -25 days
456849   -18 days
458691   -11 days
460533    -4 days
462375     3 days
464221    10 days
466059    17 days
467907    24 days
Name: ranking_date, dtype: timedelta64[ns]
446304   -60 days
448150   -53 days
449994   -46 days
451824   -39 days
453669   -32 days
455589   -25 days
457435   -18 days
459273   -11 days
461078    -4 days
462923     3 days
464768    10 days
466604    17 days
468470    24 days
Name: ranking_date, dtype: timedelta64[ns]
446445   -60 days
448290   -53 days
450134   -46 days
451967   -39 days
453812   -32 days
455649   -25 days
457498   -18 days
459337   -11 days
461177    -4 days
463021     3 days
464863    10 days
466803    17 days
468680    24 days
Name: ranking_date, dtype: timedelta64[ns]
446304   -60 days
448150   -53 days
449994   -46 days
451824   -39 days
453669   -32 days
455589   -25 days
457435

465998   -60 days
467850   -53 days
469708   -46 days
471561   -39 days
473432   -32 days
475315   -25 days
477197   -18 days
479072   -11 days
480938    -4 days
482795     3 days
484654    10 days
486514    17 days
488388    24 days
Name: ranking_date, dtype: timedelta64[ns]
466042   -60 days
467891   -53 days
469749   -46 days
471721   -39 days
473593   -32 days
475476   -25 days
477358   -18 days
479234   -11 days
481096    -4 days
482952     3 days
484806    10 days
486751    17 days
488673    24 days
Name: ranking_date, dtype: timedelta64[ns]
465998   -60 days
467850   -53 days
469708   -46 days
471561   -39 days
473432   -32 days
475315   -25 days
477197   -18 days
479072   -11 days
480938    -4 days
482795     3 days
484654    10 days
486514    17 days
488388    24 days
Name: ranking_date, dtype: timedelta64[ns]
466042   -60 days
467891   -53 days
469749   -46 days
471721   -39 days
473593   -32 days
475476   -25 days
477358   -18 days
479234   -11 days
481096    -4 days
482952 

563445   -60 days
565419   -53 days
567395   -46 days
569367   -39 days
571337   -32 days
573309   -25 days
575286   -18 days
577266   -11 days
579246    -4 days
581240     3 days
583246    10 days
585259    17 days
587272    24 days
Name: ranking_date, dtype: timedelta64[ns]
562456   -60 days
564410   -53 days
566387   -46 days
568363   -39 days
570326   -32 days
572292   -25 days
574270   -18 days
576238   -11 days
578219    -4 days
580208     3 days
582208    10 days
584215    17 days
586237    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
579857   -60 days
581862   -53 days
583878   -46 days
585897   -39 days
587911   -32 days
590062   -25 days
592090   -18 days
594115   -11 days
596214    -4 days
598227     3 days
600254    10 days
602162    17 days
604166    24 days
Name: ranking_date, dtype: timedelta64[ns]
5800

533443   -60 days
535356   -53 days
537265   -46 days
539172   -39 days
541084   -32 days
542996   -25 days
544895   -18 days
546818   -11 days
548745    -4 days
550688     3 days
552635    10 days
554583    17 days
556528    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
533443   -60 days
535356   -53 days
537265   -46 days
539172   -39 days
541084   -32 days
542996   -25 days
544895   -18 days
546818   -11 days
548745    -4 days
550688     3 days
552635    10 days
554583    17 days
556528    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
533969   -60 days
535880   -53 days
537786   -46 days
539695   -39 days
541539   -32 days
543451   -25 days
545220   -18 days
547143   -11 days
549070    -4 days
551013     3 days
552966    10 days
554910    17 days
556877    24 days
Name: ranking_date, dtype: timedelta64[ns]
5332

619554   -60 days
621526   -53 days
623504   -46 days
625482   -39 days
627458   -32 days
629433   -25 days
631408   -18 days
633387   -11 days
635343    -4 days
636880     3 days
638786    10 days
640747    17 days
642734    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
664232   -60 days
666148   -53 days
668057   -46 days
669972   -39 days
671864   -32 days
673763   -25 days
675665   -18 days
677595   -11 days
679473    -4 days
681391     3 days
683283    10 days
685195    17 days
687076    24 days
Name: ranking_date, dtype: timedelta64[ns]
681077   -60 days
682967   -53 days
684844   -46 days
686725   -39 days
688596   -32 days
690458   -25 days
692316   -18 days
694189   -11 days
696053    -4 days
697920     3 days
699775    10 days
701610    17 days
703467    24 days
Name: ranking_date, dtype: timedelta64[ns]
681238   -60 days
683130   -53 days
685009   -46 days
686848   -39 days
688723   -32 days
690586   -25 days
692444

636453   -60 days
637990   -53 days
639896   -46 days
641858   -39 days
643809   -32 days
645758   -25 days
647893   -18 days
649823   -11 days
651874    -4 days
653806     3 days
655730    10 days
657651    17 days
659568    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
619678   -60 days
621650   -53 days
623628   -46 days
625606   -39 days
627586   -32 days
629560   -25 days
631535   -18 days
633514   -11 days
635482    -4 days
637021     3 days
638927    10 days
640892    17 days
642852    24 days
Name: ranking_date, dtype: timedelta64[ns]
620276   -60 days
622248   -53 days
624226   -46 days
626205   -39 days
628183   -32 days
630157   -25 days
632132   -18 days
634105   -11 days
636080    -4 days
637617     3 days
639522    10 days
641486    17 days
643439    24 days
Name: ranking_date, dtype: timedelta64[ns]
619959   -60 days
621920   -53 days
623898   -46 days
625877   -39 days
627853   -32 days
629849   -25 days
631824

718277   -60 days
720110   -53 days
721943   -46 days
723777   -39 days
725618   -32 days
727458   -25 days
729298   -18 days
731125   -11 days
732973    -4 days
734817     3 days
736665    10 days
738520    17 days
740375    24 days
Name: ranking_date, dtype: timedelta64[ns]
719073   -60 days
720910   -53 days
722744   -46 days
724577   -39 days
726413   -32 days
728254   -25 days
730094   -18 days
731940   -11 days
733789    -4 days
735635     3 days
737483    10 days
739340    17 days
741066    24 days
Name: ranking_date, dtype: timedelta64[ns]
718519   -60 days
720354   -53 days
722188   -46 days
724023   -39 days
725855   -32 days
727695   -25 days
729535   -18 days
731371   -11 days
733220    -4 days
735064     3 days
736911    10 days
738769    17 days
740614    24 days
Name: ranking_date, dtype: timedelta64[ns]
718571   -60 days
720406   -53 days
722241   -46 days
724076   -39 days
725905   -32 days
727745   -25 days
729585   -18 days
731425   -11 days
733270    -4 days
735113 

760668   -60 days
762523   -53 days
764368   -46 days
766194   -39 days
768034   -32 days
769886   -25 days
771742   -18 days
773591   -11 days
775465    -4 days
777354     3 days
779245    10 days
781140    17 days
783035    24 days
Name: ranking_date, dtype: timedelta64[ns]
761116   -60 days
762974   -53 days
764819   -46 days
766652   -39 days
768498   -32 days
770367   -25 days
772223   -18 days
774081   -11 days
775959    -4 days
777859     3 days
779753    10 days
781649    17 days
783550    24 days
Name: ranking_date, dtype: timedelta64[ns]
734803   -60 days
736651   -53 days
738495   -46 days
740345   -39 days
742212   -32 days
744075   -25 days
745922   -18 days
747784   -11 days
749646    -4 days
751492     3 days
753332    10 days
755167    17 days
757006    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
777402   -60 days
779301   -53 days
781205   -46 days
783098   -39 days
784988   -32 days
786879   -25 days
788760

823666   -60 days
825594   -53 days
827526   -46 days
829458   -39 days
831389   -32 days
833317   -25 days
835242   -18 days
837162   -11 days
839080    -4 days
841002     3 days
842924    10 days
844839    17 days
846754    24 days
Name: ranking_date, dtype: timedelta64[ns]
823911   -60 days
825843   -53 days
827776   -46 days
829708   -39 days
831634   -32 days
833556   -25 days
835521   -18 days
837442   -11 days
839357    -4 days
841275     3 days
843197    10 days
845122    17 days
847037    24 days
Name: ranking_date, dtype: timedelta64[ns]
823823   -60 days
825747   -53 days
827677   -46 days
829609   -39 days
831540   -32 days
833453   -25 days
835367   -18 days
837289   -11 days
839206    -4 days
841125     3 days
843047    10 days
844965    17 days
846880    24 days
Name: ranking_date, dtype: timedelta64[ns]
823734   -60 days
825668   -53 days
827600   -46 days
829532   -39 days
831443   -32 days
833369   -25 days
835289   -18 days
837213   -11 days
839132    -4 days
841053 

858176   -60 days
860090   -53 days
862002   -46 days
863916   -39 days
865834   -32 days
867749   -25 days
869704   -18 days
871603   -11 days
873478    -4 days
875346     3 days
877219    10 days
879088    17 days
880956    24 days
Name: ranking_date, dtype: timedelta64[ns]
859375   -60 days
861284   -53 days
863192   -46 days
865106   -39 days
866981   -32 days
868890   -25 days
870785   -18 days
872684   -11 days
874339    -4 days
876204     3 days
878073    10 days
879936    17 days
881562    24 days
Name: ranking_date, dtype: timedelta64[ns]
858628   -60 days
860546   -53 days
862457   -46 days
864371   -39 days
866290   -32 days
868200   -25 days
870108   -18 days
872007   -11 days
873982    -4 days
875848     3 days
877712    10 days
879573    17 days
881440    24 days
Name: ranking_date, dtype: timedelta64[ns]
858481   -60 days
860393   -53 days
862303   -46 days
864217   -39 days
866117   -32 days
868024   -25 days
869928   -18 days
871827   -11 days
873735    -4 days
875601 

922262   -60 days
924075   -53 days
925888   -46 days
927701   -39 days
929509   -32 days
931307   -25 days
933049   -18 days
934847   -11 days
936637    -4 days
938432     3 days
940229    10 days
942026    17 days
943820    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
921244   -60 days
923054   -53 days
924883   -46 days
926696   -39 days
928500   -32 days
930312   -25 days
932052   -18 days
933843   -11 days
935641    -4 days
937438     3 days
939235    10 days
941118    17 days
942912    24 days
Name: ranking_date, dtype: timedelta64[ns]
922121   -60 days
923933   -53 days
925746   -46 days
927559   -39 days
929365   -32 days
931171   -25 days
932908   -18 days
934705   -11 days
936495    -4 days
938290     3 days
940087    10 days
941881    17 days
943675    24 days
Name: ranking_date, dtype: timedelta64[ns]
921237   -60 days
923050   -53 days
924826   -46 days
926639   -39 days
928454   -32 days
930267   -25 days
932006

1013731   -60 days
1015487   -53 days
1017238   -46 days
1018997   -32 days
1020752   -25 days
1022519   -18 days
1024284   -11 days
1026046    -4 days
1027806     3 days
1029571    17 days
Name: ranking_date, dtype: timedelta64[ns]
1013731   -60 days
1015487   -53 days
1017238   -46 days
1018997   -32 days
1020752   -25 days
1022519   -18 days
1024284   -11 days
1026046    -4 days
1027806     3 days
1029571    17 days
Name: ranking_date, dtype: timedelta64[ns]
1014997   -60 days
1016749   -53 days
1018503   -46 days
1019719   -32 days
1021478   -25 days
1023239   -18 days
1025002   -11 days
1026765    -4 days
1028527     3 days
1030286    17 days
Name: ranking_date, dtype: timedelta64[ns]
1014717   -60 days
1016469   -53 days
1018224   -46 days
1019658   -32 days
1021417   -25 days
1023179   -18 days
1024944   -11 days
1026707    -4 days
1028467     3 days
1030233    17 days
Name: ranking_date, dtype: timedelta64[ns]
1013984   -60 days
1015737   -53 days
1017488   -46 days
1019235   -

Series([], Name: ranking_date, dtype: timedelta64[ns])
1041718   -60 days
1043463   -53 days
1045253   -46 days
1047084   -32 days
1048940   -25 days
Name: ranking_date, dtype: timedelta64[ns]
1041720   -60 days
1043500   -53 days
1045295   -46 days
1047135   -32 days
1048633   -25 days
1050157   -18 days
1052015    -4 days
1053881     3 days
1055751    10 days
1057628    17 days
1059510    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
1040265   -60 days
1042043   -53 days
1043826   -46 days
1045615   -32 days
1047455   -25 days
1049312   -18 days
1051164    -4 days
1053041     3 days
1054902    10 days
1056775    17 days
1058661    24 days
Name: ranking_date, dtype: timedelta64[ns]
1040399   -60 days
1042185  

1093294   -60 days
1095246   -53 days
1097194   -46 days
1099137   -39 days
1101083   -32 days
1103028   -25 days
1104960   -11 days
1106869    -4 days
1108805     3 days
1110734    10 days
1112665    17 days
1114603    24 days
Name: ranking_date, dtype: timedelta64[ns]
1094373   -60 days
1096325   -53 days
1098271   -46 days
1100215   -39 days
1102159   -32 days
1104105   -25 days
1105958   -11 days
1107886    -4 days
1109819     3 days
1111748    10 days
1113681    17 days
1115618    24 days
Name: ranking_date, dtype: timedelta64[ns]
1094373   -60 days
1096325   -53 days
1098271   -46 days
1100215   -39 days
1102159   -32 days
1104105   -25 days
1105958   -11 days
1107886    -4 days
1109819     3 days
1111748    10 days
1113681    17 days
1115618    24 days
Name: ranking_date, dtype: timedelta64[ns]
1093566   -60 days
1095518   -53 days
1097466   -46 days
1099409   -39 days
1101351   -32 days
1103297   -25 days
1105229   -11 days
1107155    -4 days
1109089     3 days
1111017    10 da

Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
1106811   -60 days
1108750   -53 days
1110679   -46 days
1112611   -39 days
1114553   -32 days
1116484   -18 days
1118417    -4 days
1120357     3 days
1122303    10 days
1124250    17 days
1126197    24 days
Name: ranking_date, dtype: timedelta64[ns]
1108385   -60 days
1110316   -53 days
1112240   -46 days
1114174   -39 days
1116119   -32 days
1118058   -18 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
1107848   -60 days
1109784   -53 days
1111712   -46 days
1113645   -39 days
1115581   -32 days
1117530   -18 days
1119468    -4 days
1121416     3 days
1123360    10 days
1125310    17 days
1127264    24 days
Name: ranking_date, dtype: timedelta64[ns]
1107070   -60 days
1109006   -53 days
1110936   -46 days
1112870   -39 days
1114810   -32 days
1116750   -18 days
1118682    -4 days
1120616     3 days
1122535    10 days
112

1180565   -60 days
1182528   -53 days
1184499   -46 days
1186479   -39 days
1188463   -32 days
1190450   -25 days
1192436   -18 days
1194442    -4 days
1196422     3 days
1198397    10 days
1200372    17 days
1202349    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
1180710   -60 days
1182663   -53 days
1184637   -46 days
1186620   -39 days
1188606   -32 days
1190590   -25 days
1192573   -18 days
1194556    -4 days
1196536     3 days
1198514    10 days
1200463    17 days
1202465    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
1180692   -60 days
1182644   -53 days
1184596   -46 days
1186578   -39 days
1188563   -32 days
1190548   -25 days
1192534   -18 days
1194516    -4 days
1196473     3 days
1198451    10 days
1200427    17 days
1202406    24 days
Name: ranking_date, dtype: timedelta64[ns]
1180929   -60 days
1182879   -53 days
1184854   -46 days
1186835   -39 days
1

1197192   -60 days
1199223   -53 days
1201202   -46 days
1203181   -39 days
1205159   -32 days
1207134   -18 days
1209114    -4 days
1211089     3 days
1213059    10 days
1215031    17 days
1216998    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
1197192   -60 days
1199223   -53 days
1201202   -46 days
1203181   -39 days
1205159   -32 days
1207134   -18 days
1209114    -4 days
1211089     3 days
1213059    10 days
1215031    17 days
1216998    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
1196283   -60 days
1198261   -53 days
1200224   -46 days
1202196   -39 days
1204131   -32 days
1206096   -18 days
1208077    -4 days
1210053     3 days
1212021    10 days
1213988    17 days
1215952    24 days
Name: ranking_date, dtype: timedelta64[ns]
1196477   -60 days
11984

1274740   -60 days
1276919   -53 days
1279103   -46 days
1281276   -39 days
1283446   -32 days
1285616   -25 days
1287776   -18 days
1289932    -4 days
1292096     3 days
1294258    10 days
1296480    17 days
1298648    24 days
Name: ranking_date, dtype: timedelta64[ns]
1275989   -60 days
1278175   -53 days
1280346   -46 days
1282520   -39 days
1284688   -32 days
1286851   -25 days
1289012   -18 days
1291179    -4 days
1293342     3 days
1295510    10 days
1297683    17 days
1299854    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
1275407   -60 days
1277595   -53 days
1279778   -46 days
1281948   -39 days
1284120   -32 days
1286287   -25 days
1288447   -18 days
1290607    -4 days
1292768     3 days
1294930    10 days
1297101    17 days
1299275    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranki

1387220   -60 days
1389401   -53 days
1391584   -46 days
1393778   -32 days
1395943   -25 days
1398135   -18 days
1400319   -11 days
1402501    -4 days
1404684     3 days
1406868    17 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
1386624   -60 days
1388806   -53 days
1390988   -46 days
1393168   -32 days
1395359   -25 days
1397548   -18 days
1399734   -11 days
1401922    -4 days
1404104     3 days
1406292    17 days
Name: ranking_date, dtype: timedelta64[ns]
1386304   -60 days
1388485   -53 days
1390664   -46 days
1392842   -32 days
1395036   -25 days
1397224   -18 days
1399410   -11 days
1401590    -4 days
1403773     3 days
1405959    17 days
Name: ranking_date, dtype: timedelta64[ns]
1387862   -60 days
1390044   -53 days
1392229   -46 days
1394413   -32 days
1396603   -25 days
1398795   -18 days
1400980   -11 days
1403165    -4 days
1405348     3 days
1407538    17 days
N

1425787   -49 days
1428036   -42 days
1430264   -35 days
1432506   -28 days
1434804   -14 days
1437197    -7 days
1439365     0 days
1441567     7 days
1443809    14 days
1446062    21 days
1448298    28 days
Name: ranking_date, dtype: timedelta64[ns]
1437159   -60 days
1439422   -53 days
1441664   -46 days
1443893   -39 days
1446139   -32 days
1448379   -25 days
1450593   -18 days
1452870    -4 days
1455088     3 days
1457309    10 days
1459510    17 days
1461732    24 days
Name: ranking_date, dtype: timedelta64[ns]
1437126   -60 days
1439374   -53 days
1441604   -46 days
1443822   -39 days
1446084   -32 days
1448369   -25 days
1450614   -18 days
1452851    -4 days
1455097     3 days
1457318    10 days
1459506    17 days
1461741    24 days
Name: ranking_date, dtype: timedelta64[ns]
1437070   -60 days
1439332   -53 days
1441572   -46 days
1443795   -39 days
1446041   -32 days
1448271   -25 days
1450490   -18 days
1452753    -4 days
1454970     3 days
1457169    10 days
1459392    17 da

1488822   -60 days
1491079   -53 days
1493334   -46 days
1495608   -32 days
1497862   -25 days
1500120   -18 days
1502381   -11 days
1504643    -4 days
1506899     3 days
1509164    17 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
1497758   -56 days
1500017   -49 days
1502289   -42 days
1504550   -35 days
1506749   -28 days
1509010   -14 days
1511275     0 days
1513506     7 days
1515755    14 days
1518030    21 days
1520272    28 days
Name: ranking_date, dtype: timedelta64[ns]
1513288   -56 days
1515534   -49 days
1517781   -42 days
1520022   -35 days
1522254   -28 days
1524477   -21 days
1526688   -14 days
1528903     0 days
1531122     7 days
1532731    14 days
1534908    21 days
Name: ranking_date, dtype: timedelta64[ns]
1539773    7 days
1541925   14 days
1544081   21 days
1546237   28 days
Name: ranking_date, dtype: timedelta64[ns]
1524560   -56 days
1526773   -49 days
1528994   -35 days
1531214   -28 days
1532824   -21 day

1579737   -60 days
1581828   -53 days
1583916   -46 days
1585998   -39 days
1588085   -32 days
1590178   -25 days
1592271   -18 days
1594363    -4 days
1596467     3 days
1598571    10 days
1600674    17 days
1602777    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
1580496   -60 days
1582562   -53 days
1585266   -46 days
1587326   -39 days
1589421   -32 days
1591520   -25 days
1593573   -18 days
1595677    -4 days
1597781     3 days
1599886    10 days
1601992    17 days
1604097    24 days
Name: ranking_date, dtype: timedelta64[ns]
1579570   -60 days
1581662   -53 days
1583753   -46 days
1585834   -39 days
1587924   -32 days
1590021   -25 days
1592116   -18 days
1594213    -4 days
1596309     3 days
1598414    1

1680580   -54 days
1682765   -47 days
Name: ranking_date, dtype: timedelta64[ns]
1679295   -54 days
1681280   -47 days
1683275   -40 days
1685260   -33 days
1687249   -26 days
1689235   -19 days
1691221    -5 days
1693199     2 days
1695166     9 days
1697137    16 days
1699092    23 days
1701062    30 days
Name: ranking_date, dtype: timedelta64[ns]
1680086   -54 days
1682070   -47 days
1684088   -40 days
1686110   -33 days
1688095   -26 days
1690081   -19 days
1692065    -5 days
1694040     2 days
1696013     9 days
1697986    16 days
1699956    23 days
1701924    30 days
Name: ranking_date, dtype: timedelta64[ns]
1680285   -54 days
1682271   -47 days
1684267   -40 days
1686317   -33 days
1688305   -26 days
1690288   -19 days
1692271    -5 days
1694242     2 days
1696214     9 days
1698182    16 days
1700152    23 days
1702120    30 days
Name: ranking_date, dtype: timedelta64[ns]
1680594   -54 days
1682582   -47 days
1684574   -40 days
1686559   -33 days
1702047    30 days
Name: ranki

1784263   -56 days
1784945   -49 days
1785630   -42 days
1786317   -35 days
1787004   -28 days
1787690   -21 days
1788371   -14 days
1789050    -7 days
1789729     0 days
1790405    14 days
1791086    21 days
1791768    28 days
Name: ranking_date, dtype: timedelta64[ns]
1789156   -56 days
1789835   -49 days
1790511   -35 days
1791191   -28 days
1791870   -21 days
1792553   -14 days
1793231     0 days
1793918     7 days
1794602    14 days
1795286    21 days
1797275    28 days
Name: ranking_date, dtype: timedelta64[ns]
1789779   -56 days
1790460   -42 days
1791141   -35 days
1791823   -28 days
1792511   -21 days
1793177    -7 days
1793862     0 days
1794552     7 days
1795245    14 days
1797241    21 days
1799227    28 days
Name: ranking_date, dtype: timedelta64[ns]
1791894   -56 days
1792578   -49 days
1793252   -35 days
1793939   -28 days
1794626   -21 days
1795307   -14 days
1797300    -7 days
1799284     0 days
1801272     7 days
1803287    21 days
1805274    28 days
Name: ranking_da

1821652   -56 days
1823621   -49 days
1825584   -42 days
1827528   -35 days
1829472   -28 days
1831407   -21 days
1833342   -14 days
1835276    -7 days
1838229     0 days
1840035     7 days
1841968    14 days
1843899    28 days
Name: ranking_date, dtype: timedelta64[ns]
1823008   -56 days
1824972   -49 days
1826914   -42 days
1828859   -35 days
1830799   -28 days
1832732   -21 days
1834665   -14 days
1836592    -7 days
1839579     0 days
1840478     7 days
1842364    21 days
1844296    28 days
Name: ranking_date, dtype: timedelta64[ns]
1824914   -56 days
1826856   -49 days
1828801   -42 days
1830741   -35 days
1832674   -28 days
1834607   -21 days
1836533   -14 days
1838461    -7 days
1840388     0 days
1842317    14 days
1844252    21 days
1846186    28 days
Name: ranking_date, dtype: timedelta64[ns]
1830784   -56 days
1832717   -49 days
1834650   -42 days
1836576   -35 days
1838513   -28 days
1840440   -21 days
1842369    -7 days
1844302     0 days
1846237     7 days
1848163    14 da

1889098   -56 days
1891103   -49 days
1893110   -42 days
1895128   -35 days
1897149   -28 days
1899172   -21 days
1901187   -14 days
1903219     0 days
1905218     7 days
1907252    14 days
1909222    21 days
1911267    28 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
1941213   -54 days
1943359   -47 days
1946779   -40 days
1947670   -33 days
1949815   -26 days
1951962   -19 days
1954115    -5 days
1956257     2 days
1958395     9 days
1960526    16 days
1962670    30 days
Name: ranking_date, dtype: timedelta64[ns]
1941201   -54 days
1943337   -47 days
1945485   -40 days
1947642   -33 days
1949792   -26 days
1951939   -19 days
1954094    -5 days
1956239     2 days
1958378     9 days
1960515    16 days
1962656    30 days
Name: ranking_date, dtype: timedelta64[ns]
1942714   -54 days
1944861   -47 days
1947018   -40 days
1949173   -33 days
1951309   -26 days
1953467   -19 days
1955595    -5 days
1957737     2 days
1959869     9 days

2038080   -60 days
2040095   -53 days
2042071   -46 days
2043997   -39 days
2045874   -32 days
2047777   -25 days
2049712   -18 days
2051654    -4 days
2053620     3 days
2055592    10 days
2057567    17 days
2059522    24 days
Name: ranking_date, dtype: timedelta64[ns]
2036733   -60 days
2038772   -53 days
2040782   -46 days
2042769   -39 days
2044705   -32 days
2046596   -25 days
2048505   -18 days
2051496    -4 days
2053478     3 days
2055453    10 days
2057433    17 days
2059414    24 days
Name: ranking_date, dtype: timedelta64[ns]
2036733   -60 days
2038772   -53 days
2040782   -46 days
2042769   -39 days
2044705   -32 days
2046596   -25 days
2048505   -18 days
2051496    -4 days
2053478     3 days
2055453    10 days
2057433    17 days
2059414    24 days
Name: ranking_date, dtype: timedelta64[ns]
2037046   -60 days
2039074   -53 days
2041072   -46 days
2043053   -39 days
2044982   -32 days
2046883   -25 days
2048792   -18 days
2050457    -4 days
2052412     3 days
2054375    10 da

Series([], Name: ranking_date, dtype: timedelta64[ns])
2075907   -60 days
2077905   -53 days
2079897   -46 days
2081887   -39 days
2083880   -32 days
2085870   -25 days
2087856   -18 days
2089835    -4 days
2091821     3 days
2093817    10 days
2095784    17 days
2097780    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
2074946   -60 days
2076952   -53 days
2078942   -46 days
2080940   -39 days
2082911   -32 days
2084905   -25 days
2086885   -18 days
2088881    -4 days
2090878     3 days
2092872    10 days
2094865    17 days
2096870    24 days
Name: ranking_date, dtype: timedelta64[ns]
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
Series([], Name: ranking_date, dtype: timedelta64[ns])
2153115   -56 days
2155172   -49 days
2157217   -42 days
2159264   -35 days
2161306   -28 days
2163348   -21 days
2165393   -14 

Series([], Name: ranking_date, dtype: timedelta64[ns])
2162652   -60 days
2164698   -53 days
2166747   -46 days
2167647   -39 days
2169700   -32 days
2171753   -25 days
2173809   -18 days
2175856    -4 days
2177915     3 days
2179969    10 days
2182023    17 days
2184076    24 days
Name: ranking_date, dtype: timedelta64[ns]
2162813   -60 days
2164858   -53 days
2166932   -46 days
2167836   -39 days
2169890   -32 days
2171945   -25 days
2174001   -18 days
2176053    -4 days
2178111     3 days
2180163    10 days
2182218    17 days
2184272    24 days
Name: ranking_date, dtype: timedelta64[ns]
2163174   -60 days
2165220   -53 days
2167267   -46 days
2168170   -39 days
2170224   -32 days
2172281   -25 days
2174333   -18 days
2176385    -4 days
2178443     3 days
2180497    10 days
2182550    17 days
2184603    24 days
Name: ranking_date, dtype: timedelta64[ns]


In [17]:
missing_rank = missing_rank.dropna(subset=['winner_rank', 'loser_rank'], how='any')

# 输出结果
print(missing_rank.columns)


Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
       'tourney_ioc', 'tourney_year', 'w_host', 'l_host'],
      dtype='object')


In [18]:
# 定义需要更新的目标列
rank_columns = [
    'winner_rank', 'winner_rank_points',
    'loser_rank', 'loser_rank_points'
]
for key, df in dfs.items():

    # Step 1: 创建合并键（确保列名和类型一致）
    # 注意：直接在原始数据副本上操作，避免污染原始数据
    df_merged = df.copy()
    missing_rank_merged = missing_rank.copy()

    df_merged["merge_key"] = (
        df_merged["tourney_id"].astype(str) 
        + df_merged["winner_id"].astype(str) 
        + df_merged["loser_id"].astype(str)  # 注意检查 loser_id 拼写是否正确
    )

    missing_rank_merged["merge_key"] = (
        missing_rank_merged["tourney_id"].astype(str) 
        + missing_rank_merged["winner_id"].astype(str) 
        + missing_rank_merged["loser_id"].astype(str)
    )

    # Step 2: 左连接合并（保留所有原始行）
    merged = df_merged.merge(
        missing_rank_merged[["merge_key"] + rank_columns],
        on="merge_key",
        how="left",
        suffixes=('', '_new')
    )

    # Step 3: 更新目标列
    for col in rank_columns:
        merged[col] = merged[f"{col}_new"].combine_first(merged[col])
        merged.drop(f"{col}_new", axis=1, inplace=True)

    # Step 4: 将更新后的数据写回原df（不删除merge_key用于验证）
    df = merged  # 保留 merge_key 用于后续验证

    # 验证更新结果（使用保留的 merge_key）
    print("更新统计:")
    print(f"成功匹配的行数: {len(missing_rank_merged[missing_rank_merged['merge_key'].isin(df['merge_key'])])}")
    # 最终清理 merge_key（确认不再需要后）
    df = df.drop("merge_key", axis=1)
    dfs[key] = df

更新统计:
成功匹配的行数: 57
更新统计:
成功匹配的行数: 54
更新统计:
成功匹配的行数: 38
更新统计:
成功匹配的行数: 36
更新统计:
成功匹配的行数: 31


### 删除缺失值

In [19]:
#删除空值
for key, df in dfs.items():
    dfs[key] = df.dropna(
        subset=['winner_age', 'loser_age', 'w_ace', 'l_ace',"winner_rank","loser_rank","w_SvGms","winner_hand","loser_hand"],
        how='any'
    )

In [20]:
# 用平均值填充缺失值winner_ht、loser_ht、minutes
# VAR 填充tourney_ioc 0填充seed
# 对entry非空赋为0 空赋为1
# seed 空值赋为0
for key, df in dfs.items():
    print(f"Processing {key}...")

    # 计算平均值
    avg_winner_ht = df['winner_ht'].mean(skipna=True)
    avg_loser_ht = df['loser_ht'].mean(skipna=True)
    avg_minutes = df['minutes'].mean(skipna=True)

    # 用 .loc 进行赋值，避免警告
    df.loc[:, 'winner_ht'] = df['winner_ht'].fillna(avg_winner_ht)
    df.loc[:, 'loser_ht'] = df['loser_ht'].fillna(avg_loser_ht)
    df.loc[:, 'minutes'] = df['minutes'].fillna(avg_minutes)
    df.loc[:, 'winner_seed'] = df['winner_seed'].fillna(0)
    df.loc[:, 'loser_seed'] = df['loser_seed'].fillna(0)
    df.loc[:, 'tourney_ioc'] = df['tourney_ioc'].fillna("VAR")
    df.loc[:, 'winner_entry'] = df['winner_entry'].isna().astype(int)
    df.loc[:, 'loser_entry'] = df['loser_entry'].isna().astype(int)


Processing df_00_04...
Processing df_05_09...
Processing df_10_14...
Processing df_15_19...
Processing df_20_24...


In [21]:
for key, df in dfs.items():
    print(f"Checking DataFrame: {key}")
    print(df.columns)  # 打印列名

Checking DataFrame: df_00_04
Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
       'tourney_ioc', 'tourney_year', 'w_host', 'l_host'],
      dtype='object')
Checking DataFrame: df_05_09
Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       '

# Transform

In [22]:
for key, df in dfs.items():
    print(f"Dataset: {key}")
    
    # 将 Carpet 归入 Hard 类
    df["surface"] = df["surface"].replace("Carpet", "Hard")
    
    # 查看所有唯一的 surface 值（验证替换效果）
    unique_surfaces = df['surface'].dropna().unique()
    print(f"Unique surface values after replacement: {unique_surfaces}")

    # 进行 One-Hot 编码
    df = pd.get_dummies(df, columns=['surface'], prefix='surface', dtype=int)

    # 更新 dfs
    dfs[key] = df  
    print("-" * 50)

Dataset: df_00_04
Unique surface values after replacement: ['Hard' 'Clay' 'Grass']
--------------------------------------------------
Dataset: df_05_09
Unique surface values after replacement: ['Hard' 'Clay' 'Grass']
--------------------------------------------------
Dataset: df_10_14
Unique surface values after replacement: ['Hard' 'Clay' 'Grass']
--------------------------------------------------
Dataset: df_15_19
Unique surface values after replacement: ['Hard' 'Clay' 'Grass']
--------------------------------------------------
Dataset: df_20_24
Unique surface values after replacement: ['Hard' 'Clay' 'Grass']
--------------------------------------------------


In [23]:
for key, df in dfs.items():
    df['tourney_level'] = df['tourney_level'].map(lambda x: 
        1 if x in ['A', 'D'] else 
        2 if x in ['M', 'F', 'O'] else 
        3 if x == 'G' else x
    )


### 轮次处理

In [24]:
for key, df in dfs.items():
    # 删除 round 值为 'ER' 和 'BR' 的行
    df = df[~df['round'].isin(['ER', 'BR'])]

    # 更新 dfs 字典中的 DataFrame
    dfs[key] = df


In [25]:
for key, df in dfs.items():
    # 筛选 'round' 列为 'RR' 的行
    rr_df = df[df['round'] == 'RR']
    
    # 根据 'tourney_name' 和 'tourney_year' 分组，并查看每组是否有多个 'RR' 行
    grouped_rr = rr_df.groupby(['tourney_name', 'tourney_year']).filter(lambda group: len(group) > 1)
    
    # 打印结果
    print(f"Rows with 'RR' round and the same 'tourney_name' and 'tourney_year' for {key}:")
    print(grouped_rr)
    print("-" * 50)  # 分隔符


Rows with 'RR' round and the same 'tourney_name' and 'tourney_year' for df_00_04:
      tourney_id tourney_name  draw_size  tourney_level tourney_date  \
2700    2000-605  Masters Cup          8              2   2000-11-27   
2701    2000-605  Masters Cup          8              2   2000-11-27   
2702    2000-605  Masters Cup          8              2   2000-11-27   
2704    2000-605  Masters Cup          8              2   2000-11-27   
2705    2000-605  Masters Cup          8              2   2000-11-27   
2706    2000-605  Masters Cup          8              2   2000-11-27   
2708    2000-605  Masters Cup          8              2   2000-11-27   
2709    2000-605  Masters Cup          8              2   2000-11-27   
2710    2000-605  Masters Cup          8              2   2000-11-27   
2715    2000-615   Dusseldorf         64              1   2000-05-21   
2716    2000-615   Dusseldorf         64              1   2000-05-21   
2717    2000-615   Dusseldorf         64              

      tourney_id  tourney_name  draw_size  tourney_level tourney_date  \
2677    2005-605   Masters Cup          8              2   2005-11-14   
2678    2005-605   Masters Cup          8              2   2005-11-14   
2679    2005-605   Masters Cup          8              2   2005-11-14   
2682    2005-605   Masters Cup          8              2   2005-11-14   
2683    2005-605   Masters Cup          8              2   2005-11-14   
2685    2005-605   Masters Cup          8              2   2005-11-14   
2686    2005-605   Masters Cup          8              2   2005-11-14   
2687    2005-605   Masters Cup          8              2   2005-11-14   
2688    2005-605   Masters Cup          8              2   2005-11-14   
2689    2005-605   Masters Cup          8              2   2005-11-14   
2690    2005-605   Masters Cup          8              2   2005-11-14   
2691    2005-605   Masters Cup          8              2   2005-11-14   
2692    2005-615    Dusseldorf         32          

Rows with 'RR' round and the same 'tourney_name' and 'tourney_year' for df_10_14:
      tourney_id tourney_name  draw_size  tourney_level tourney_date  \
1350    2010-615   Dusseldorf         32              1   2010-05-16   
1351    2010-615   Dusseldorf         32              1   2010-05-16   
1352    2010-615   Dusseldorf         32              1   2010-05-16   
1353    2010-615   Dusseldorf         32              1   2010-05-16   
1354    2010-615   Dusseldorf         32              1   2010-05-16   
1355    2010-615   Dusseldorf         32              1   2010-05-16   
1357    2010-615   Dusseldorf         32              1   2010-05-16   
1358    2010-615   Dusseldorf         32              1   2010-05-16   
1359    2010-615   Dusseldorf         32              1   2010-05-16   
1360    2010-615   Dusseldorf         32              1   2010-05-16   
1361    2010-615   Dusseldorf         32              1   2010-05-16   
1363    2010-615   Dusseldorf         32              

                               tourney_id                     tourney_name  \
2925                             2015-605                      Tour Finals   
2926                             2015-605                      Tour Finals   
2927                             2015-605                      Tour Finals   
2928                             2015-605                      Tour Finals   
2929                             2015-605                      Tour Finals   
2930                             2015-605                      Tour Finals   
2931                             2015-605                      Tour Finals   
2932                             2015-605                      Tour Finals   
2933                             2015-605                      Tour Finals   
2934                             2015-605                      Tour Finals   
2935                             2015-605                      Tour Finals   
2936                             2015-605                      T

                                tourney_id                     tourney_name  \
14                               2020-8888                          Atp Cup   
15                               2020-8888                          Atp Cup   
16                               2020-8888                          Atp Cup   
17                               2020-8888                          Atp Cup   
18                               2020-8888                          Atp Cup   
19                               2020-8888                          Atp Cup   
20                               2020-8888                          Atp Cup   
21                               2020-8888                          Atp Cup   
22                               2020-8888                          Atp Cup   
23                               2020-8888                          Atp Cup   
24                               2020-8888                          Atp Cup   
25                               2020-8888          

In [26]:
import re

for key in dfs.keys():
    df = dfs[key].copy()  # 复制 df，避免修改时的 Pandas 视图问题
    # 初始化 'round_extracted' 列为 'Not Found'
    df['round_extracted'] = 'Not Found'
    # 筛选包含 "Davis Cup" 的赛事名称
    davis_mask = df['tourney_name'].str.contains(r"^Davis Cup", case=False, na=False)
    
    if davis_mask.any():
        print(f"处理 {key} 中的 Davis Cup 赛事...")

        # 提取冒号 `:` 后的轮次信息
        pattern = r':\s*([\w\s-]+)$'

        # 赋值新列
        df.loc[davis_mask, 'round_extracted'] = df.loc[davis_mask, 'tourney_name'].str.extract(pattern, expand=False)

        # 处理空值
        df['round_extracted'].fillna('Not Found', inplace=True)

        # 验证提取结果
        sample_data = df.loc[davis_mask, ['tourney_name', 'round_extracted']].head(3)
        print("提取示例：")
        print(sample_data)
        print("-" * 50)

    # **确保数据保存回 dfs**
    dfs[key] = df


处理 df_15_19 中的 Davis Cup 赛事...
提取示例：
                     tourney_name round_extracted
3554  Davis Cup G1 R1: BAR vs ECU      BAR vs ECU
3557  Davis Cup G1 R1: CHI vs DOM      CHI vs DOM
3558  Davis Cup G1 R1: CHI vs DOM      CHI vs DOM
--------------------------------------------------
处理 df_20_24 中的 Davis Cup 赛事...
提取示例：
                      tourney_name round_extracted
1326  Davis Cup QLS R1: ARG vs COL      ARG vs COL
1327  Davis Cup QLS R1: ARG vs COL      ARG vs COL
1328  Davis Cup QLS R1: ARG vs COL      ARG vs COL
--------------------------------------------------


In [27]:
# 定义映射字典
round_mapping_d = {
    'F': 7,
    'PO': 3,
    'QF': 5,
    'R1': 2,
    'R2': 3,
    'R3': 3,
    'RR': 3,
    'SF': 6
}

for key, df in dfs.items():
    print(f"处理 {key} 的 round_extracted -> round_code 映射...")

    # 应用映射，并用 -1 作为未匹配值的默认值
    df['round_code'] = df['round_extracted'].map(round_mapping_d)#.fillna(-1).astype(int)

    # 可选：打印前几行进行验证
    #print(df[['round_extracted', 'round_code']].drop_duplicates().head())

    # 更新字典中的 DataFrame
    dfs[key] = df


处理 df_00_04 的 round_extracted -> round_code 映射...
处理 df_05_09 的 round_extracted -> round_code 映射...
处理 df_10_14 的 round_extracted -> round_code 映射...
处理 df_15_19 的 round_extracted -> round_code 映射...
处理 df_20_24 的 round_extracted -> round_code 映射...


In [28]:
for key, df in dfs.items():
    print(df.columns)

Index(['tourney_id', 'tourney_name', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
       'tourney_ioc', 'tourney_year', 'w_host', 'l_host', 'surface_Clay',
       'surface_Grass', 'surface_Hard', 'round_extracted', 'round_code'],
      dtype='object')
Index(['tourney_id', 'tourney_name', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',

In [29]:
# 轮次编码
round_mapping = {
    'R128': 1,  # 128强
    'R64': 2,   # 64强
    'R32': 3,   # 32强
    'R16': 4,   # 16强
    'QF': 5,    # Quarter-Finals (8强)
    'SF': 6,    # Semi-Finals (4强)
    'F': 7      # Final (决赛)
}

for key, df in dfs.items():
    # 仅填充 round_code 为空的行
    df['round_code'] = df['round_code'].fillna(df['round'].map(round_mapping))


In [30]:
#davis赛事重命名id
for key, df in dfs.items():
    df.drop(columns=['round_extracted'], inplace=True)
    davis_mask = df['tourney_name'].str.startswith('Davis Cup', na=False)

    if davis_mask.any():
        print(f"处理 {key} 中的 Davis Cup 赛事...")

        # 先将 tourney_year 转换为字符串，再拼接 '_davis'
        df.loc[davis_mask, 'tourney_id'] = df.loc[davis_mask, 'tourney_year'].astype(str) + '_davis'

        # 更新 dfs
        dfs[key] = df

        # 打印部分数据检查
        print(df.loc[davis_mask, ['tourney_year', 'tourney_id']].head())
        print("-" * 50)


处理 df_15_19 中的 Davis Cup 赛事...
      tourney_year  tourney_id
3554          2016  2016_davis
3557          2016  2016_davis
3558          2016  2016_davis
3560          2016  2016_davis
3561          2016  2016_davis
--------------------------------------------------
处理 df_20_24 中的 Davis Cup 赛事...
      tourney_year  tourney_id
1326          2020  2020_davis
1327          2020  2020_davis
1328          2020  2020_davis
1329          2020  2020_davis
1330          2020  2020_davis
--------------------------------------------------


In [31]:
#使用同一赛事最小值-3代替空缺值，否则默认填充为 3
for key, df in dfs.items():
    # 查找 round_code 为空的行
    missing_mask = df['round_code'].isna()

    if missing_mask.sum() > 0:
        print(f"处理 {key} 中 round_code 为空的情况...")

        # 找到相同 tourney_id 和 tourney_year 下的最小 round_code 值
        min_rounds = (
            df.dropna(subset=['round_code'])  # 仅考虑非空 round_code
            .groupby(['tourney_id', 'tourney_year'])['round_code']
            .min()
            .reset_index()
        )

        # 计算新值（最小值 - 3）
        min_rounds['new_round_code'] = min_rounds['round_code'] - 3

        # 填充 round_code 为空的行，优先使用计算的值，否则默认填充为 3
        df.loc[missing_mask, 'round_code'] = df.loc[missing_mask].merge(
            min_rounds, on=['tourney_id', 'tourney_year'], how='left'
        )['new_round_code'].fillna(3).values

        # 确保数据结构不变
        dfs[key] = df

        # 打印部分数据检查
        print(df.loc[missing_mask, ['tourney_id', 'tourney_year', 'round_code']].head())
        print("-" * 50)


处理 df_00_04 中 round_code 为空的情况...
     tourney_id  tourney_year  round_code
2700   2000-605          2000         3.0
2701   2000-605          2000         3.0
2702   2000-605          2000         3.0
2704   2000-605          2000         3.0
2705   2000-605          2000         3.0
--------------------------------------------------
处理 df_05_09 中 round_code 为空的情况...
     tourney_id  tourney_year  round_code
2677   2005-605          2005         3.0
2678   2005-605          2005         3.0
2679   2005-605          2005         3.0
2682   2005-605          2005         3.0
2683   2005-605          2005         3.0
--------------------------------------------------
处理 df_10_14 中 round_code 为空的情况...
     tourney_id  tourney_year  round_code
1350   2010-615          2010         4.0
1351   2010-615          2010         4.0
1352   2010-615          2010         4.0
1353   2010-615          2010         4.0
1354   2010-615          2010         4.0
----------------------------------------

In [32]:
for key, df in dfs.items():
    # 统计每个 DataFrame 中 'round_code' 列的空值数量
    missing_count = df['round_code'].isna().sum()
    
    # 输出结果
    print(f"{key} 中 round_code 列的空值数量: {missing_count}")


df_00_04 中 round_code 列的空值数量: 0
df_05_09 中 round_code 列的空值数量: 0
df_10_14 中 round_code 列的空值数量: 0
df_15_19 中 round_code 列的空值数量: 0
df_20_24 中 round_code 列的空值数量: 0


import matplotlib.pyplot as plt
import seaborn as sns

### 获取所有 round_code 数据（非空）
all_round_codes = [code for df in dfs.values() for code in df['round_code'].dropna().astype(int)]

#### 确保 bins 只包含整数，并覆盖所有值
max_value = max(all_round_codes) if all_round_codes else 1
bins = range(1, max_value + 2)  # 例如 1,2,3,...,max+1

plt.figure(figsize=(10, 6))
sns.histplot(all_round_codes, bins=bins, kde=False, color='skyblue', discrete=True)

plt.xticks(range(1, max_value + 1))  # 只显示整数刻度
plt.title("轮次分布", fontsize=14)
plt.xlabel("Round Code", fontsize=12)
plt.ylabel("Frequency", fontsize=12)

plt.show()


### 归一化处理 seed（逆序归一化，是否分桶）

In [33]:
# 逆序归一化
def sequence_seed(seed):
    if seed != 0:
        return int(33 - seed)
    else:
        return 33

for key, df in dfs.items():
    df = df.copy()  # 避免 SettingWithCopyWarning

    # 分桶处理
    df['winner_seed_bucket'] = df['winner_seed'].apply(sequence_seed)
    df['loser_seed_bucket'] = df['loser_seed'].apply(sequence_seed)

    dfs[key] = df  # 更新数据集


# 定义分桶函数
def bucket_seed(seed):
    if seed != 0:
        return max(1, (33 - seed) // 4)  # 计算分桶编号
    return 0  # 非种子选手设为 0

for key, df in dfs.items():
    df = df.copy()  # 避免 SettingWithCopyWarning

    print(f"Before transformation ({key}):")
    print(df[['winner_seed', 'loser_seed']].head(10))  # 打印前10行原始数据

    # 分桶处理
    df['winner_seed_bucket'] = df['winner_seed'].apply(bucket_seed)
    df['loser_seed_bucket'] = df['loser_seed'].apply(bucket_seed)

    print(f"After transformation ({key}):")
    print(df[['winner_seed', 'winner_seed_bucket', 'loser_seed', 'loser_seed_bucket']].head(10))  # 打印前10行转换后数据

    dfs[key] = df  # 更新数据集
    print("-" * 50)


### 持拍手标签转化

In [34]:
for key, df in dfs.items():
    print(f"Dataset: {key}")
    
    # 统计 U 的数量
    winner_U_count = (df['winner_hand'] == 'U').sum()
    loser_U_count = (df['loser_hand'] == 'U').sum()

    # 计算占比
    total_winner = len(df['winner_hand'])
    total_loser = len(df['loser_hand'])
    
    winner_U_ratio = winner_U_count / total_winner * 100
    loser_U_ratio = loser_U_count / total_loser * 100

    print(f"  winner_hand 中 'U' 的数量: {winner_U_count} ({winner_U_ratio:.2f}%)")
    print(f"  loser_hand 中 'U' 的数量: {loser_U_count} ({loser_U_ratio:.2f}%)")
    print("-" * 50)


Dataset: df_00_04
  winner_hand 中 'U' 的数量: 3 (0.02%)
  loser_hand 中 'U' 的数量: 14 (0.10%)
--------------------------------------------------
Dataset: df_05_09
  winner_hand 中 'U' 的数量: 0 (0.00%)
  loser_hand 中 'U' 的数量: 11 (0.08%)
--------------------------------------------------
Dataset: df_10_14
  winner_hand 中 'U' 的数量: 3 (0.02%)
  loser_hand 中 'U' 的数量: 15 (0.11%)
--------------------------------------------------
Dataset: df_15_19
  winner_hand 中 'U' 的数量: 0 (0.00%)
  loser_hand 中 'U' 的数量: 17 (0.12%)
--------------------------------------------------
Dataset: df_20_24
  winner_hand 中 'U' 的数量: 4 (0.03%)
  loser_hand 中 'U' 的数量: 13 (0.10%)
--------------------------------------------------


In [35]:
# 定义转换函数
def convert_hand(hand):
    if hand == 'R':
        return 1
    elif hand == 'L':
        return 0
    elif hand == 'U':  # 未知手设为 R（1）
        return 1
    elif hand == 'A':  # A设为 R（1）
        return 1

for key, df in dfs.items():
    df = df.copy()  # 避免 SettingWithCopyWarning

    # 进行转换
    df['winner_hand'] = df['winner_hand'].apply(convert_hand).astype(int)
    df['loser_hand'] = df['loser_hand'].apply(convert_hand).astype(int)

    dfs[key] = df  # 更新数据集


In [36]:
for key, df in dfs.items():
    print(f"Dataset: {key}")
    
    # 统计 winner 和 loser 的 hand 出现次数
    winner_counts = df['winner_hand'].value_counts()
    loser_counts = df['loser_hand'].value_counts()

    # 计算胜率
    total_matches = winner_counts + loser_counts
    win_rates = (winner_counts / total_matches).fillna(0) * 100  # 转换为百分比

    # 显示结果
    print("左右手胜率：")
    for hand in [1,0]:
        total = total_matches.get(hand, 0)
        wins = winner_counts.get(hand, 0)
        rate = win_rates.get(hand, 0)
        print(f"  {hand}: {wins} 胜 / {total} 场 ({rate:.2f}%)")
    
    print("-" * 50)


Dataset: df_00_04
左右手胜率：
  1: 12767 胜 / 25223 场 (50.62%)
  0: 1628 胜 / 3567 场 (45.64%)
--------------------------------------------------
Dataset: df_05_09
左右手胜率：
  1: 12460 胜 / 24898 场 (50.04%)
  0: 1615 胜 / 3252 场 (49.66%)
--------------------------------------------------
Dataset: df_10_14
左右手胜率：
  1: 11546 胜 / 22980 场 (50.24%)
  0: 1682 胜 / 3476 场 (48.39%)
--------------------------------------------------
Dataset: df_15_19
左右手胜率：
  1: 11862 胜 / 23586 场 (50.29%)
  0: 1964 胜 / 4066 场 (48.30%)
--------------------------------------------------
Dataset: df_20_24
左右手胜率：
  1: 10898 胜 / 21656 场 (50.32%)
  0: 1645 胜 / 3430 场 (47.96%)
--------------------------------------------------


In [37]:
for key, df in dfs.items():
    print(df.isnull().sum())

tourney_id            0
tourney_name          0
draw_size             0
tourney_level         0
tourney_date          0
match_num             0
winner_id             0
winner_seed           0
winner_entry          0
winner_name           0
winner_hand           0
winner_ht             0
winner_ioc            0
winner_age            0
loser_id              0
loser_seed            0
loser_entry           0
loser_name            0
loser_hand            0
loser_ht              0
loser_ioc             0
loser_age             0
score                 0
best_of               0
round                 0
minutes               0
w_ace                 0
w_df                  0
w_svpt                0
w_1stIn               0
w_1stWon              0
w_2ndWon              0
w_SvGms               0
w_bpSaved             0
w_bpFaced             0
l_ace                 0
l_df                  0
l_svpt                0
l_1stIn               0
l_1stWon              0
l_2ndWon              0
l_SvGms         

In [38]:
#查看编码后列名
print(dfs["df_00_04"].columns)

Index(['tourney_id', 'tourney_name', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
       'tourney_ioc', 'tourney_year', 'w_host', 'l_host', 'surface_Clay',
       'surface_Grass', 'surface_Hard', 'round_code', 'winner_seed_bucket',
       'loser_seed_bucket'],
      dtype='object')


In [39]:
for key, df in dfs.items():
    print(df.describe())

          draw_size  tourney_level     match_num      winner_id   winner_seed  \
count  14395.000000   14395.000000  14395.000000   14395.000000  14395.000000   
mean      58.567558       1.549983     29.155818  102969.611323      2.988538   
std       35.698410       0.773806     26.473617     697.223323      5.375035   
min        8.000000       1.000000      1.000000  101086.000000      0.000000   
25%       32.000000       1.000000     11.000000  102434.000000      0.000000   
50%       48.000000       1.000000     22.000000  102950.000000      0.000000   
75%       64.000000       2.000000     36.000000  103498.000000      4.000000   
max      128.000000       3.000000    127.000000  104925.000000     35.000000   

       winner_entry   winner_hand     winner_ht    winner_age       loser_id  \
count  14395.000000  14395.000000  14395.000000  14395.000000   14395.000000   
mean       0.867801      0.886905    184.608502     25.149212  102932.107885   
std        0.338718      0.316

          draw_size  tourney_level     match_num      winner_id   winner_seed  \
count  13228.000000   13228.000000  13228.000000   13228.000000  13228.000000   
mean      58.667070       1.599108     31.754309  104542.176217      3.529936   
std       38.209936       0.788855     43.839390     661.601066      5.996818   
min        8.000000       1.000000      1.000000  100644.000000      0.000000   
25%       28.000000       1.000000     11.000000  103997.000000      0.000000   
50%       32.000000       1.000000     21.000000  104571.000000      0.000000   
75%       96.000000       2.000000     37.250000  104926.000000      5.000000   
max      128.000000       3.000000    526.000000  111575.000000     33.000000   

        winner_hand     winner_ht    winner_age       loser_id    loser_seed  \
count  13228.000000  13228.000000  13228.000000   13228.000000  13228.000000   
mean       0.872845    186.855273     27.034986  104569.052011      2.331569   
std        0.333159      7.054

# 保存

In [None]:
output_directory = os.path.join("..", "dataset", "processed_data_1")
os.makedirs(output_directory, exist_ok=True)  # 创建目录（如果不存在）
for key, df in dfs.items():
    output_file = os.path.join(output_directory, f"{key}_cleaned.csv")
    df.to_csv(output_file, index=False, encoding="utf-8-sig")
    print(f"已保存: {output_file}")