In [4]:
# 由于要实现的模型涉及到比较复杂的数据处理和分析，我们将从简单的数据探索开始，
# 然后尝试定义并计算动量指标，最后进行简单的可视化展示。
# 本例将重点放在如何根据比赛分数计算动量，并不会构建一个完整的状态空间模型。

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')


In [5]:
# Momentum components for player 1: points, serve advantage, break points,
# unforced errors, winners, plus sets and games gained inside the window.
def calculate_momentum_improved_1(df, index, window_size=3, serve_advantage_weight=0.1):
    """
    Compute player 1's momentum components in a window of rows around `index`.

    The window covers rows ``index - window_size`` .. ``index + window_size``
    (inclusive), clipped to the bounds of `df`, so it also includes rows
    that come after `index`.

    :param df: DataFrame with the columns 'p1_sets', 'p1_games', 'server',
               'point_victor', 'p1_break_pt_won', 'p1_unf_err' and 'p1_winner'
               (presumably one row per point of a single match, in playing
               order -- confirm against the caller)
    :param index: positional row index the window is centred on
    :param window_size: number of rows taken on each side of `index`
    :param serve_advantage_weight: weight applied to each point player 1 wins
                                   while serving (default 0.1)
    :return: tuple (points_advantage, serve_advantage, break_points_won,
             unforced_errors, winners, sets_won, games_won)
    :raises IndexError: if the window is empty (e.g. `df` has no rows)
    """
    start_index = max(index - window_size, 0)
    end_index = min(index + window_size + 1, len(df))
    window = df.iloc[start_index:end_index]

    p1_won_point = window['point_victor'] == 1
    p2_won_point = window['point_victor'] == 2

    # Sets / games gained: last value in the window minus the first one.
    # NOTE(review): if 'p1_games' restarts from 0 in a new set this difference
    # can be negative -- confirm that is intended.
    p1_sets_won = window['p1_sets'].iloc[-1] - window['p1_sets'].iloc[0]
    p1_games_won = window['p1_games'].iloc[-1] - window['p1_games'].iloc[0]

    # Points won by player 1 while serving, scaled by the serve weight.
    p1_serve_advantage = ((window['server'] == 1) & p1_won_point).sum() * serve_advantage_weight

    # Net points: points won by player 1 minus points won by player 2.
    p1_points_advantage = p1_won_point.sum() - p2_won_point.sum()
    p1_break_points_won = window['p1_break_pt_won'].sum()
    # Unforced errors are negated so that more errors lower the component.
    p1_unforced_errors = -window['p1_unf_err'].sum()
    p1_winners = window['p1_winner'].sum()

    return (p1_points_advantage, p1_serve_advantage, p1_break_points_won,
            p1_unforced_errors, p1_winners, p1_sets_won, p1_games_won)








# Momentum components for player 2: points, serve advantage, break points,
# unforced errors, winners, plus sets and games gained inside the window.
def calculate_momentum_improved_2(df, index, window_size=3, serve_advantage_weight=0.1):
    """
    Compute player 2's momentum components in a window of rows around `index`.

    The window covers rows ``index - window_size`` .. ``index + window_size``
    (inclusive), clipped to the bounds of `df`, so it also includes rows
    that come after `index`.

    :param df: DataFrame with the columns 'p2_sets', 'p2_games', 'server',
               'point_victor', 'p2_break_pt_won', 'p2_unf_err' and 'p2_winner'
               (presumably one row per point of a single match, in playing
               order -- confirm against the caller)
    :param index: positional row index the window is centred on
    :param window_size: number of rows taken on each side of `index`
    :param serve_advantage_weight: weight applied to each point player 2 wins
                                   while serving (default 0.1)
    :return: tuple (points_advantage, serve_advantage, break_points_won,
             unforced_errors, winners, sets_won, games_won)
    :raises IndexError: if the window is empty (e.g. `df` has no rows)
    """
    start_index = max(index - window_size, 0)
    end_index = min(index + window_size + 1, len(df))
    window = df.iloc[start_index:end_index]

    p1_won_point = window['point_victor'] == 1
    p2_won_point = window['point_victor'] == 2

    # Sets / games gained: last value in the window minus the first one.
    # NOTE(review): if 'p2_games' restarts from 0 in a new set this difference
    # can be negative -- confirm that is intended.
    p2_sets_won = window['p2_sets'].iloc[-1] - window['p2_sets'].iloc[0]
    p2_games_won = window['p2_games'].iloc[-1] - window['p2_games'].iloc[0]

    # Points won by player 2 while serving, scaled by the serve weight.
    p2_serve_advantage = ((window['server'] == 2) & p2_won_point).sum() * serve_advantage_weight

    # Net points: points won by player 2 minus points won by player 1.
    p2_points_advantage = p2_won_point.sum() - p1_won_point.sum()
    p2_break_points_won = window['p2_break_pt_won'].sum()
    # Unforced errors are negated so that more errors lower the component.
    p2_unforced_errors = -window['p2_unf_err'].sum()
    p2_winners = window['p2_winner'].sum()

    return (p2_points_advantage, p2_serve_advantage, p2_break_points_won,
            p2_unforced_errors, p2_winners, p2_sets_won, p2_games_won)






def cumsum_detection(series):
    """
    Detect turning points as sign changes of the cumulative sum of differences.

    The first difference of `series` is taken (the leading NaN and any other
    NaN differences are replaced by 0) and accumulated. For a series without
    missing values this accumulated value equals ``series - series.iloc[0]``,
    so a turning point is reported whenever the series moves from one side of
    its first value to the other.

    Positions are compared positionally, so the result does not depend on the
    index labels of `series`.

    :param series: one-dimensional data (pandas Series)
    :return: list of integer positions (0-based) where the cumulative sum has
             the opposite sign to the previous position
    """
    # Work on plain Python floats so that lookups are positional; indexing
    # the Series with [] would be label-based and fail (or pick the wrong
    # rows) for a non-default index.
    cumulative = series.diff().fillna(0).cumsum().tolist()

    # A strictly negative product means the two neighbours have opposite
    # signs; touching zero exactly is not counted as a crossing.
    return [
        position
        for position, (previous, current) in enumerate(zip(cumulative, cumulative[1:]), start=1)
        if previous * current < 0
    ]



from scipy.stats import norm

def runs_test(sequence):
    """
    Runs test for randomness of a sequence.

    Values are split into two classes (strictly above the median vs. not),
    the number of runs of identical consecutive classes is counted and
    compared with its expectation under randomness using a normal
    approximation.

    :param sequence: input sequence (list, np.array or pandas Series)
    :return: tuple (Z statistic, two-sided p-value). Both are NaN when the
             test is undefined: empty input, all values in one class, or
             zero variance of the run count.
    """
    undefined = (float('nan'), float('nan'))

    values = np.asarray(sequence)
    if values.size == 0:
        return undefined

    # Dichotomise around the median: True for values strictly above it.
    above_median = values > np.median(values)

    n1 = int(np.count_nonzero(above_median))
    n2 = int(above_median.size - n1)
    if n1 == 0 or n2 == 0:
        # Only one class is present, so the run statistic has no distribution.
        return undefined

    # A new run starts wherever neighbouring classes differ.
    runs = 1 + int(np.count_nonzero(above_median[1:] != above_median[:-1]))

    # Expected number of runs and its variance under the null hypothesis.
    total = n1 + n2
    expected_runs = 2 * n1 * n2 / total + 1
    variance = (expected_runs - 1) * (expected_runs - 2) / (total - 1)
    if variance <= 0:
        # Happens for n1 == n2 == 1; Z would be 0/0.
        return undefined

    Z = (runs - expected_runs) / np.sqrt(variance)

    # Two-sided p-value. The survival function keeps precision for large |Z|
    # where 1 - cdf would round to exactly 0.
    p_value = 2 * norm.sf(abs(Z))

    return Z, p_value


def mark_indices_in_list(length, indices):
    """
    Build a 0/1 indicator list of the given length with the listed positions set to 1.

    Indices outside ``0 <= index < length`` are ignored. Negative indices are
    treated as out of range rather than counted from the end of the list.

    :param length: length of the resulting list
    :param indices: iterable of positions to mark with 1
    :return: list of `length` integers, 1 at the marked positions and 0 elsewhere
    """
    marked_list = [0] * length

    for index in indices:
        # Check both bounds: a negative index would otherwise wrap around and
        # mark a position counted from the end.
        if 0 <= index < length:
            marked_list[index] = 1

    return marked_list




def getva(x):
    """
    Turn a value into a 0/1 flag using a 0.05 threshold.

    :param x: value to test (the notebook passes runs-test p-values)
    :return: 1 if x is strictly below 0.05, otherwise 0
    """
    return 1 if x < 0.05 else 0

In [6]:
import numpy as np

def entropy_weight_method(data):
    """
    Compute indicator weights with the entropy weight method.

    Each column is min-max normalised, converted to proportions, and its
    entropy is computed; the weight of a column is proportional to
    ``1 - entropy``.

    A column whose values are all equal carries no information and is given
    entropy 1, i.e. weight 0. (Previously such a column turned into NaN, was
    skipped by the sum and ended up with the largest weight.) If no column
    carries information, or there are fewer than two rows, equal weights are
    returned.

    :param data: DataFrame, one column per indicator
    :return: pandas Series of weights indexed by the column names, summing to 1
    """
    n_rows, n_cols = data.shape
    equal_weights = pd.Series(1.0 / n_cols if n_cols else np.nan,
                              index=data.columns, dtype=float)
    if n_rows < 2:
        # log(n_rows) would be 0 (or undefined) and nothing varies anyway.
        return equal_weights

    # Min-max normalisation; constant columns get a NaN range so they do not
    # trigger a 0/0 division warning and are handled explicitly below.
    column_min = data.min()
    column_range = data.max() - column_min
    is_informative = column_range > 0
    data_normalized = (data - column_min) / column_range.where(is_informative)

    # Entropy of each indicator.
    epsilon = 1e-12  # avoids log(0)
    p_matrix = data_normalized / data_normalized.sum()
    e_matrix = -(p_matrix * np.log(p_matrix + epsilon)).sum(axis=0) / np.log(n_rows)
    # Constant columns: maximal entropy, hence zero weight.
    e_matrix = e_matrix.where(is_informative, 1.0)

    # Degree of divergence of each indicator.
    d_matrix = 1 - e_matrix

    total = d_matrix.sum()
    if not total > 0:
        return equal_weights

    return d_matrix / total


In [8]:
# Load the point-by-point data.
df = pd.read_csv('Wimbledon_featured_matches.csv')

# Every match_id found in the file is processed (not only a single match such
# as "2023-wimbledon-1701"); one row of 0/1 flags per match is collected in `dddd`.
p1_component_columns = ['p1_points_advantage', 'p1_serve_advantage',
                        'p1_break_points_won', 'p1_unforced_errors',
                        'p1_winners', 'p1_sets_won',
                        'p1_games_won']
p2_component_columns = ['p2_points_advantage', 'p2_serve_advantage',
                        'p2_break_points_won', 'p2_unforced_errors',
                        'p2_winners', 'p2_sets_won',
                        'p2_games_won']

dddd = []
for match_id in df['match_id'].unique():
    # Rows of this match, re-indexed from 0 so positions and labels coincide.
    match_data = df.loc[df['match_id'] == match_id].copy().reset_index(drop=True)
    list_length = len(match_data)

    # Momentum components around every row, for each player.
    momentum_values1 = [calculate_momentum_improved_1(match_data, row) for row in range(list_length)]
    momentum_values2 = [calculate_momentum_improved_2(match_data, row) for row in range(list_length)]
    player1_data = pd.DataFrame(momentum_values1, columns=p1_component_columns)
    player2_data = pd.DataFrame(momentum_values2, columns=p2_component_columns)

    # Entropy-based weight of each component, per player.
    weights_p1 = entropy_weight_method(player1_data)
    weights_p2 = entropy_weight_method(player2_data)

    # Weighted sum of the components gives the momentum score of each row.
    player1_score = player1_data.mul(weights_p1).sum(axis=1)
    player2_score = player2_data.mul(weights_p2).sum(axis=1)

    match_data['p1_momentum'] = player1_score.to_frame()
    match_data['p2_momentum'] = player2_score.to_frame()

    # Turning points of each momentum series, as 0/1 indicator lists.
    p1_turning_points = mark_indices_in_list(list_length, cumsum_detection(match_data['p1_momentum']))
    p2_turning_points = mark_indices_in_list(list_length, cumsum_detection(match_data['p2_momentum']))

    # Runs test on both momentum scores and both turning-point indicators;
    # only the p-values are kept (Z_statistic holds the last test's value).
    p_values = []
    for tested_sequence in (player1_score, player2_score, p1_turning_points, p2_turning_points):
        Z_statistic, p_value = runs_test(tested_sequence)
        p_values.append(p_value)
    p_value1, p_value2, p_value3, p_value4 = p_values

    dddd.append([match_id, *(getva(value) for value in p_values)])

In [9]:
# One row per match. Each flag is getva(p_value) of a runs test, i.e. 1 when p < 0.05.
# NOTE(review): despite the 'isRand' suffix, 1 appears to mean that the
# randomness hypothesis is rejected -- confirm the intended naming.
summary_columns = [
    'match_id',
    'p1_momentumisRand',
    'p2_momentumisRand',
    'p1_turning_pointsisRand',
    'p2_turning_pointsisRand',
]
rrr = pd.DataFrame(data=dddd, columns=summary_columns)
rrr

Unnamed: 0,match_id,p1_momentumisRand,p2_momentumisRand,p1_turning_pointsisRand,p2_turning_pointsisRand
0,2023-wimbledon-1301,1,1,0,0
1,2023-wimbledon-1302,1,1,0,0
2,2023-wimbledon-1303,1,1,0,0
3,2023-wimbledon-1304,1,1,0,1
4,2023-wimbledon-1305,1,1,0,1
5,2023-wimbledon-1306,1,1,0,0
6,2023-wimbledon-1307,1,1,0,0
7,2023-wimbledon-1308,1,1,1,0
8,2023-wimbledon-1309,1,1,1,1
9,2023-wimbledon-1310,1,1,0,1


In [14]:
# Display the per-match summary table again.
rrr

Unnamed: 0,match_id,p1_momentumisRand,p2_momentumisRand,p1_turning_pointsisRand,p2_turning_pointsisRand
0,2023-wimbledon-1301,1,1,0,0
1,2023-wimbledon-1302,1,1,0,0
2,2023-wimbledon-1303,1,1,0,0
3,2023-wimbledon-1304,1,1,0,1
4,2023-wimbledon-1305,1,1,0,1
5,2023-wimbledon-1306,1,1,0,0
6,2023-wimbledon-1307,1,1,0,0
7,2023-wimbledon-1308,1,1,1,0
8,2023-wimbledon-1309,1,1,1,1
9,2023-wimbledon-1310,1,1,0,1


In [12]:
# Summary statistics of the numeric flag columns across all matches.
rrr.describe()

Unnamed: 0,p1_momentumisRand,p2_momentumisRand,p1_turning_pointsisRand,p2_turning_pointsisRand
count,31.0,31.0,31.0,31.0
mean,1.0,1.0,0.387097,0.258065
std,0.0,0.0,0.495138,0.444803
min,1.0,1.0,0.0,0.0
25%,1.0,1.0,0.0,0.0
50%,1.0,1.0,0.0,0.0
75%,1.0,1.0,1.0,0.5
max,1.0,1.0,1.0,1.0


In [13]:
# Save the per-match flags to CSV without writing the row index.
rrr.to_csv('Q2_test.csv', index=False)