In [9]:
import pandas as pd
import os
import glob

def compute_coaching_effect_percent(df):
    """
    Compute the coaching effect (in percent) between consecutive Olympics
    found in a single DataFrame (one country/sport/gender, etc.).

    The DataFrame must provide the columns "Year", "Athlete" and "Medal".
    Years are paired in sorted order, so each result row compares one Games
    with the next Games *present in the data* (usually, but not necessarily,
    four years apart).

    For a pair of years (T, T_next):

        Coaching_Effect_% = ((Medals_next - Medals_T) / Medals_T) * 100 * Repeat_Rate

    Where:
      - Medals_T    = number of rows in year T whose "Medal" value is present
                      and different from "No medal" (every medal counts as 1)
      - Repeat_Rate = (# distinct athletes in both years)
                      / (# distinct athletes in the later year);
                      0 when the later year has no athletes

    If Medals_T == 0 the relative change is undefined, so the absolute change
    is used instead:

        Coaching_Effect_% = (Medals_next - Medals_T) * Repeat_Rate * 100

    Rows with a missing "Year" are ignored, a missing "Medal" counts as no
    medal, and missing "Athlete" names are left out of the rosters.

    Returns
    -------
    list of dict
        One dict per consecutive pair of years with the keys "Year_Before",
        "Year_After", "Medals_Before", "Medals_After", "Delta_Medals",
        "Repeat_Rate" (rounded to 3 decimals) and "Coaching_Effect_%"
        (rounded to 2 decimals). Empty when fewer than two years are present.
    """
    # A missing medal value is not a medal. Without the notna() guard,
    # NaN != "No medal" evaluates to True and would inflate the counts.
    is_medal = df["Medal"].notna() & (df["Medal"] != "No medal")

    # Chronological list of Games; rows without a year cannot be placed.
    years = sorted(df["Year"].dropna().unique())

    # Summarise every year once instead of re-filtering the frame per pair.
    medals_by_year = {}
    athletes_by_year = {}
    for year in years:
        in_year = df["Year"] == year
        medals_by_year[year] = int(is_medal[in_year].sum())
        # NaN never equals itself, so it must not take part in set logic.
        athletes_by_year[year] = set(df.loc[in_year, "Athlete"].dropna())

    results = []
    for y_current, y_next in zip(years, years[1:]):
        medals_current = medals_by_year[y_current]
        medals_next = medals_by_year[y_next]
        athletes_next = athletes_by_year[y_next]

        # Share of the later roster that also competed in the earlier Games.
        overlap = len(athletes_by_year[y_current] & athletes_next)
        if athletes_next:
            repeat_rate = overlap / len(athletes_next)
        else:
            repeat_rate = 0  # no athletes in the later Games: avoid 0/0

        delta_medals = medals_next - medals_current

        if medals_current == 0:
            # Relative change is undefined from a zero base; use the absolute one.
            coaching_effect_pct = delta_medals * repeat_rate * 100
        else:
            raw_increase_pct = (delta_medals / medals_current) * 100
            coaching_effect_pct = raw_increase_pct * repeat_rate

        results.append({
            "Year_Before": y_current,
            "Year_After": y_next,
            "Medals_Before": medals_current,
            "Medals_After": medals_next,
            "Delta_Medals": delta_medals,
            "Repeat_Rate": round(repeat_rate, 3),  # rounding for readability
            "Coaching_Effect_%": round(coaching_effect_pct, 2),
        })

    return results


In [10]:

def main(input_dir="Data/processed_olympics_data", output_file="coaching_effect_results.csv"):
    """
    Run the coaching-effect analysis on every CSV file in a directory and
    write the combined results to one CSV file.

    Parameters
    ----------
    input_dir : str
        Directory searched (non-recursively) for "*.csv" input files.
        Change it if your folder name/path is different.
    output_file : str
        Path of the CSV file that receives the combined results.

    Each input file is read with pandas and passed to
    compute_coaching_effect_percent; every resulting row is tagged with a
    "File" entry holding the base name of the file it came from. The output
    file is written even when no input files are found.
    """
    # glob returns matches in arbitrary order; sort so that the row order of
    # the output file is reproducible from run to run.
    csv_files = sorted(glob.glob(os.path.join(input_dir, "*.csv")))
    if not csv_files:
        # Most likely a wrong path; make the empty result visible.
        print(f"Warning: no CSV files found in '{input_dir}'.")

    # Cumulative results from all files
    all_results = []

    for file_path in csv_files:
        df = pd.read_csv(file_path)

        # Coaching effect for consecutive Olympics in this file
        results = compute_coaching_effect_percent(df)

        # Annotate the results with file-specific info.
        # Optional: if each file pertains to exactly one country/sport, the
        # same loop could also record e.g. df["Country"].unique()[0].
        file_name = os.path.basename(file_path)
        for row in results:
            row["File"] = file_name

        all_results.extend(results)

    # Build the frame once from the accumulated records and save it for review
    results_df = pd.DataFrame(all_results)
    results_df.to_csv(output_file, index=False)

    print(f"Coaching effect analysis complete. Results saved to '{output_file}'.")


# Run the analysis when this code is executed directly (a notebook cell or a
# script), but not when it is imported as a module.
if __name__ == "__main__":
    main()


Coaching effect analysis complete. Results saved to 'coaching_effect_results.csv'.
