In [35]:
import math
import pandas as pd
from scipy.stats import norm

def calculate_coaching_effect(df, year_t, year_t4, historical_data):
    """
    Calculates the coaching effect based on returning and new athlete performance.
    
    df: DataFrame with columns ["Athlete", "Year", "Medal"]
    year_t: The earlier Olympics (e.g., 2016)
    year_t4: The current Olympics (e.g., 2020)
    historical_data: DataFrame with columns ["Year", "Event Code", "Moving Avg Medals"]
                     that gives historical moving averages for new athletes.
    
    Returns:
      A dictionary with coaching effect results and significance testing.
    """
    # Filter data for the two years
    df_t = df[df["Year"] == year_t]
    df_t4 = df[df["Year"] == year_t4]
    
    # Identify returning and new athletes
    athletes_t = set(df_t["Athlete"])
    athletes_t4 = set(df_t4["Athlete"])
    returning_athletes = athletes_t.intersection(athletes_t4)
    new_athletes = athletes_t4.difference(athletes_t)
    
    # Number of athletes
    n_r = len(returning_athletes)
    n_n = len(new_athletes)
    
    if n_r == 0 or n_n == 0:
        return {"error": "Insufficient data for returning or new athletes."}
    
    # Medals for returning athletes
    m_r_t = df_t[df_t["Athlete"].isin(returning_athletes) & (df_t["Medal"] != "No medal")].shape[0]
    m_r_t4 = df_t4[df_t4["Athlete"].isin(returning_athletes) & (df_t4["Medal"] != "No medal")].shape[0]
    
    # Medals for new athletes
    m_n_t4 = df_t4[df_t4["Athlete"].isin(new_athletes) & (df_t4["Medal"] != "No medal")].shape[0]
    
    # Get expected medals for new athletes from historical data
    historical_avg = historical_data[historical_data["Year"] == year_t4]["Moving Avg Medals"].mean()
    expected_m_n_t4 = historical_avg * n_n  # Scale by number of new athletes
    
    # Calculate proportions
    p_r_t = m_r_t / n_r
    p_r_t4 = m_r_t4 / n_r
    delta_p_r = p_r_t4 - p_r_t
    
    p_n_t4 = m_n_t4 / n_n
    expected_p_n_t4 = expected_m_n_t4 / n_n
    delta_p_n = p_n_t4 - expected_p_n_t4
    
    # Coaching effect
    coaching_effect = delta_p_r + delta_p_n
    
    # Standard errors
    se_r = math.sqrt((p_r_t * (1 - p_r_t)) / n_r + (p_r_t4 * (1 - p_r_t4)) / n_r)
    se_n = math.sqrt((p_n_t4 * (1 - p_n_t4)) / n_n + (expected_p_n_t4 * (1 - expected_p_n_t4)) / n_n)
    se_total = math.sqrt(se_r**2 + se_n**2)
    
    # Z-score and p-value
    z_score = coaching_effect / se_total
    p_value = 2 * (1 - norm.cdf(abs(z_score)))  # Two-tailed p-value
    
    # Return results
    return {
        "Returning_Athlete_Effect": round(delta_p_r * n_r, 2),  # Medals attributed to returning athletes
        "New_Athlete_Effect": round(delta_p_n * n_n, 2),        # Medals attributed to new athletes
        "Coaching_Effect": round(coaching_effect * (n_r + n_n), 2),  # Total medals attributed
        "Z_Score": round(z_score, 3),
        "P_Value": round(p_value, 5),
        "Significant": p_value < 0.05
    }


# Example usage:
# df = pd.read_csv("olympic_data.csv")
# historical_data = pd.read_csv("Data/athlete_probabilities_by_year/moving_averages.csv")
# result = calculate_coaching_effect(df, 2016, 2020, historical_data)
# print(result)


In [36]:

def main():
    # Directory containing your CSV files
    input_dir = "Data/processed_olympics_data"  # change if your folder name/path is different
    
    # Find all CSV files in that directory
    csv_files = glob.glob(os.path.join(input_dir, "*.csv"))
    
    # List to hold the cumulative results from all files
    all_results = []
    
    for file_path in csv_files:
        # Read each CSV into a DataFrame
        df = pd.read_csv(file_path)
        
        
        # Compute the coaching effect for consecutive Olympics in this file
        results = calculate_coaching_effect(df)
        
        # Annotate the results with file-specific info (e.g., filename)
        for row in results:
            row["File"] = os.path.basename(file_path)
            
            # Optional: If each file pertains to exactly one country/sport, 
            # you can attempt to extract that from the DataFrame or filename.
            # Example:
            # row["Country"] = df["Country"].unique()[0] if "Country" in df.columns else "Unknown"
            # row["Sport"]   = df["Sport"].unique()[0]   if "Sport"   in df.columns else "Unknown"
        
        # Accumulate
        all_results.extend(results)
    
    # Convert all results to a single DataFrame
    results_df = pd.DataFrame(all_results)
    
    # Save to a CSV file for easy review
    output_file = "coaching_effect_results.csv"
    results_df.to_csv(output_file, index=False)
    
    print(f"Coaching effect analysis complete. Results saved to '{output_file}'.")
    

if __name__ == "__main__":
    main()


TypeError: calculate_coaching_effect() missing 3 required positional arguments: 'year_t', 'year_t4', and 'historical_data'