In [10]:
import pandas as pd
from getpass import getuser


In [11]:
# Get the current user's name
user = getuser()

# Path to the data file
file_path = f"C:/Users/{user}/Documents/GitHub/davis-cup/data/combined_davis.xlsx"

# Load the dataset
davis_cup_data = pd.read_excel(file_path)


# Mean difference before and after the change

In [12]:
# Grouping the dataset in pairs to represent each match
matches = [davis_cup_data.iloc[i:i+2] for i in range(0, len(davis_cup_data), 2)]

# Function to determine the favorite team for each match
def determine_favorite_for_match(match):
    # Extracting rankings for both teams in the match
    rankings = []
    for _, row in match.iterrows():
        # Convert rankings to numeric, treating non-numeric as NaN
        single_rank = pd.to_numeric(row['Single Ranking_Player1'], errors='coerce') if pd.notna(row['Single Ranking_Player1']) else pd.to_numeric(row['Doubles Ranking_Player1'], errors='coerce')
        rankings.append((row['Team Name'], single_rank))

    # Sort by ranking (lower ranking is better) and handle missing rankings
    rankings = [team for team in sorted(rankings, key=lambda x: (pd.isna(x[1]), x[1]))]

    # Favorite team is the one with the lower ranking
    return rankings[0][0] if not pd.isna(rankings[0][1]) else None

# Identifying the favorite team and the winning team for each match
match_results = []
for match in matches:
    favorite = determine_favorite_for_match(match)
    winner = match.iloc[0]['Team Name'] if match.iloc[0]['Team Name'] == match.iloc[1]['Player 1'] else match.iloc[1]['Team Name']
    year = match.iloc[0]['Year']
    match_results.append({'Year': year, 'Favorite': favorite, 'Winner': winner})

# Converting to DataFrame
match_results_df = pd.DataFrame(match_results)

# Categorizing matches as pre-2019 or post-2019
match_results_df['Period'] = match_results_df['Year'].apply(lambda x: 'Pre-2019' if x < 2019 else 'Post-2019')

# Calculating if the favorite won
match_results_df['Favorite_Win'] = match_results_df['Favorite'] == match_results_df['Winner']

# Calculating win probabilities
win_prob_pre_2019 = match_results_df[match_results_df['Period'] == 'Pre-2019']['Favorite_Win'].mean()
win_prob_post_2019 = match_results_df[match_results_df['Period'] == 'Post-2019']['Favorite_Win'].mean()

print(f"Win Probability Pre-2019: {win_prob_pre_2019}")
print(f"Win Probability Post-2019: {win_prob_post_2019}")


Win Probability Pre-2019: 0.39275766016713093
Win Probability Post-2019: 0.43243243243243246


# statistical significance tests

In [13]:
from scipy.stats import chi2_contingency

# Creating a contingency table for win rates pre- and post-2019
contingency_table = pd.crosstab(match_results_df['Period'], match_results_df['Favorite_Win'])

# Performing the chi-square test
chi2, p, _, _ = chi2_contingency(contingency_table)

chi2, p


(0.25416275440175795, 0.6141590869908642)

**Interpretation:**

*Chi-square Statistic*: This value indicates the difference between the observed and expected frequencies in the contingency table. A higher value suggests a greater discrepancy.

*p-value*: The p-value is used to determine the statistical significance of the result. A common threshold for significance is 0.05.

Since the p-value (*p* ≈ 0.614) is much higher than 0.05, we fail to reject the null hypothesis. This suggests that there is no statistically significant difference in the win rates of favorite teams before and after the 2019 Davis Cup format change, based on the data we have.