In [61]:
import math
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
import os
import glob


def _count_medal_rows(df, athletes):
    """Count rows of ``df`` that belong to ``athletes`` and record an actual medal."""
    medal = df["Medal"]
    # A missing Medal value is not a medal. A bare `!= "No medal"` comparison is
    # True for NaN/None, which would silently count those rows as medals.
    won_medal = medal.notna() & (medal != "No medal")
    return int((df["Athlete"].isin(athletes) & won_medal).sum())


def _clamp_unit(value):
    """Clamp ``value`` into the closed interval [0, 1]; NaN is mapped to 0."""
    if math.isnan(value):
        return 0.0
    return max(0.0, min(float(value), 1.0))


def calculate_coaching_effect(df, year_t, year_t4, historical_data):
    """
    Calculate the coaching effect for a single Olympic cycle (T to T+4).

    Athletes present in both years are "returning"; athletes present only in
    ``year_t4`` are "new". The effect is the change in the returning athletes'
    medal proportion plus the gap between the new athletes' medal proportion
    and the proportion expected from ``historical_data``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Athlete", "Year", "Medal", "Event",
        "Country" and "Gender". A row counts as a medal when "Medal" is present
        and differs from the string "No medal".
    year_t, year_t4 : scalar
        Start and end year of the cycle, compared against ``df["Year"]``.
    historical_data : pandas.DataFrame
        Must contain "Event", "Country", "Gender", "probability" and
        "total athletes". The row matching the first ``year_t4`` row of ``df``
        supplies the expected medal count (probability * total athletes);
        when no row matches, the expectation is 0.

    Returns
    -------
    dict or None
        ``None`` when there are no returning athletes or no new athletes.
        Otherwise a dict with the keys "Year_Start", "Year_End",
        "Returning_Athlete_Effect", "New_Athlete_Effect", "Coaching_Effect",
        "Z_Score", "P_Value" and "Significant" (two-tailed p-value < 0.05).

    Notes
    -----
    * Medals are counted per row, so an athlete with several medal rows can push
      a proportion above 1; proportions are therefore clamped to [0, 1].
    * Only the first ``year_t4`` row is used for the historical lookup, which
      presumably assumes one Event/Country/Gender combination per ``df`` --
      TODO confirm against the data preparation step.
    * NOTE(review): "Coaching_Effect" is scaled by ``n_r + n_n`` and is therefore
      not the sum of the two component effects -- confirm this is intended.
    """
    df_t = df[df["Year"] == year_t]
    df_t4 = df[df["Year"] == year_t4]

    # Split the T+4 roster into athletes seen at T and athletes appearing for the first time.
    athletes_t = set(df_t["Athlete"])
    athletes_t4 = set(df_t4["Athlete"])
    returning_athletes = athletes_t & athletes_t4
    new_athletes = athletes_t4 - athletes_t

    n_r = len(returning_athletes)
    n_n = len(new_athletes)
    if n_r == 0 or n_n == 0:
        return None  # Both groups are needed; this also guarantees the divisions below are safe.

    m_r_t = _count_medal_rows(df_t, returning_athletes)
    m_r_t4 = _count_medal_rows(df_t4, returning_athletes)
    m_n_t4 = _count_medal_rows(df_t4, new_athletes)

    # Expected medals for new athletes, looked up from the historical table.
    historical_match = historical_data[
        (historical_data["Event"] == df_t4["Event"].iloc[0])
        & (historical_data["Country"] == df_t4["Country"].iloc[0])
        & (historical_data["Gender"] == df_t4["Gender"].iloc[0])
    ]
    if historical_match.empty:
        historical_avg = 0.0
        historical_total_athletes = 0.0
    else:
        historical_avg = historical_match["probability"].iloc[0]
        historical_total_athletes = historical_match["total athletes"].iloc[0]
    expected_m_n_t4 = historical_avg * historical_total_athletes

    # Proportions, clamped to [0, 1]; a NaN expectation is treated as 0.
    p_r_t = _clamp_unit(m_r_t / n_r)
    p_r_t4 = _clamp_unit(m_r_t4 / n_r)
    p_n_t4 = _clamp_unit(m_n_t4 / n_n)
    expected_p_n_t4 = _clamp_unit(expected_m_n_t4 / n_n)

    delta_p_r = p_r_t4 - p_r_t
    delta_p_n = p_n_t4 - expected_p_n_t4
    coaching_effect = delta_p_r + delta_p_n

    # Standard errors of the two differences in proportions, combined in quadrature.
    se_r = math.sqrt((p_r_t * (1 - p_r_t)) / n_r + (p_r_t4 * (1 - p_r_t4)) / n_r)
    se_n = math.sqrt(
        (p_n_t4 * (1 - p_n_t4)) / n_n + (expected_p_n_t4 * (1 - expected_p_n_t4)) / n_n
    )
    se_total = math.sqrt(se_r**2 + se_n**2)

    # With zero variance the z-score is defined as 0, which yields a p-value of 1.
    z_score = coaching_effect / se_total if se_total > 0 else 0
    # Survival function keeps precision in the tail, unlike 1 - cdf.
    p_value = float(2 * norm.sf(abs(z_score)))  # Two-tailed p-value

    return {
        "Year_Start": year_t,
        "Year_End": year_t4,
        "Returning_Athlete_Effect": round(delta_p_r * n_r, 2),
        "New_Athlete_Effect": round(delta_p_n * n_n, 2),
        "Coaching_Effect": round(coaching_effect * (n_r + n_n), 2),
        "Z_Score": round(z_score, 3),
        "P_Value": round(p_value, 5),
        "Significant": bool(p_value < 0.05),
    }


Medal Counts CSV loaded successfully.
Programs CSV loaded successfully.
Hosts CSV loaded successfully.
Athlete counts data loaded and concatenated.
  Country Code Gender Event  bronze  silver  gold  total athletes  Year
0          CMR      F   JUD     0.0     0.0   0.0             0.0  1972
1          NOR      F   FEN     0.0     0.0   0.0             0.0  1972
2          BEL      M   MPN     0.0     0.0   0.0             0.0  1972
3          HAI      F   ATH     0.0     0.0   0.0             0.0  1972
4          ESP      F   SAL     0.0     0.0   0.0             0.0  1972
Athlete counts aggregated per Year and Country Code.
   Year Country Code  Number_of_Athletes
0  1896          AFG                 0.0
1  1896          AHO                 0.0
2  1896          ALB                 0.0
3  1896          ALG                 0.0
4  1896          AND                 0.0
Unmapped Country Codes found: ['AFG' 'AHO' 'ALB' 'ALG' 'AND' 'ANG' 'ANT' 'ANZ' 'ARG' 'ARM' 'ARU' 'ASA'
 'AUS' 'AUT' 'AZE'

In [58]:
def simulate_across_all_csvs(input_dir, historical_data_path):
    """
    Compute coaching effects for every CSV in ``input_dir`` and summarise them.

    For each CSV file, consecutive pairs of distinct "Year" values are passed to
    ``calculate_coaching_effect``. All non-empty results are collected, plotted
    per file, and written to "all_coaching_effects_results.csv"; the ten results
    with the largest absolute "Coaching_Effect" are written to
    "top_10_coaching_effects.csv". Both files are created in the current
    working directory.

    Parameters
    ----------
    input_dir : str
        Directory scanned (non-recursively) for ``*.csv`` files. Files lacking
        any of the columns "Athlete", "Year", "Medal", "Event", "Country",
        "Gender" are skipped with a message.
    historical_data_path : str
        CSV containing at least a "probability" column. Non-numeric
        probabilities (and "total athletes" values, when that column exists)
        are coerced to 0.0.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(results_df, top_10)``. When no result could be computed, both are
        empty frames and nothing is plotted or written.
    """
    historical_data = pd.read_csv(historical_data_path)
    # Coerce unparsable entries to 0.0 so the downstream arithmetic never sees strings or NaN.
    historical_data["probability"] = pd.to_numeric(
        historical_data["probability"], errors="coerce"
    ).fillna(0.0)
    if "total athletes" in historical_data.columns:
        historical_data["total athletes"] = pd.to_numeric(
            historical_data["total athletes"], errors="coerce"
        ).fillna(0.0)

    # glob order is filesystem-dependent; sort so results and the plot legend are reproducible.
    csv_files = sorted(glob.glob(os.path.join(input_dir, "*.csv")))
    required_columns = {"Athlete", "Year", "Medal", "Event", "Country", "Gender"}
    all_results = []

    for file_path in csv_files:
        print(f"Processing file: {file_path}")
        df = pd.read_csv(file_path)

        missing_columns = required_columns - set(df.columns)
        if missing_columns:
            print(f"Skipping {file_path}: missing required columns {missing_columns}.")
            continue

        # Pair each year with the next distinct year present in the file; NaN years cannot be ordered.
        years = sorted(df["Year"].dropna().unique())
        for year_t, year_t4 in zip(years, years[1:]):
            result = calculate_coaching_effect(df, year_t, year_t4, historical_data)
            if result is not None:
                result["File"] = os.path.basename(file_path)
                all_results.append(result)

    results_df = pd.DataFrame(all_results)

    # Without any result the frame has no columns, so the ranking and plot below would raise KeyError.
    if results_df.empty:
        print("No coaching effects could be computed; nothing to plot or save.")
        return results_df, results_df.copy()

    # Identify top 10 highest-magnitude coaching effects
    results_df["Abs_Coaching_Effect"] = results_df["Coaching_Effect"].abs()
    top_10 = results_df.nlargest(10, "Abs_Coaching_Effect")

    # One line per input file, in order of first appearance.
    fig, ax = plt.subplots(figsize=(12, 6))
    for file_name, subset in results_df.groupby("File", sort=False):
        ax.plot(subset["Year_End"], subset["Coaching_Effect"], marker="o", label=file_name)
    ax.axhline(0, color="gray", linestyle="--")
    ax.set_title("Coaching Effects Across All Files")
    ax.set_xlabel("Olympic Year")
    ax.set_ylabel("Coaching Effect (Medals)")
    ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", fontsize="small")
    ax.grid(True)
    fig.tight_layout()
    plt.show()

    # Save results
    results_df.to_csv("all_coaching_effects_results.csv", index=False)
    top_10.to_csv("top_10_coaching_effects.csv", index=False)

    print("All results saved to 'all_coaching_effects_results.csv'.")
    print("Top 10 coaching effects saved to 'top_10_coaching_effects.csv'.")
    print("Top 10 highest-magnitude coaching effects:")
    print(top_10)
    return results_df, top_10


In [None]:

# Directory of CSV files to process and the CSV holding the historical probabilities.
input_dir = "Data/processed_olympics_data"
historical_data_path = "Data/moving_averages.csv"

# Run the analysis over every CSV in the directory; keep the full table and the top-10 summary.
all_results, top_10 = simulate_across_all_csvs(
    input_dir=input_dir,
    historical_data_path=historical_data_path,
)
