In [13]:
import pandas as pd
import os
import re

def load_and_prepare_data(combined_path=None):
    """
    Loads the combined dataset, regenerating it first when the CSV is missing.

    Parameters
    ----------
    combined_path : str or os.PathLike, optional
        Location of the combined CSV. When omitted, ``data/combined_data.csv``
        (relative to the current working directory) is used.

    Returns
    -------
    pandas.DataFrame
        The CSV contents as parsed by ``pd.read_csv``.

    Raises
    ------
    ModuleNotFoundError
        The CSV is missing and ``data_processing`` (or a module it imports)
        cannot be found, so the file cannot be regenerated.
    FileNotFoundError
        The CSV is still missing after ``data_processing.main()`` has run.
    """
    if combined_path is None:
        combined_path = os.path.join('data', 'combined_data.csv')

    if not os.path.exists(combined_path):
        # The import is deliberately local: data_processing is only needed for
        # this fallback, so the notebook still works without it when the CSV
        # is already on disk.
        try:
            from data_processing import main as data_processing_main
        except ModuleNotFoundError as exc:
            # Same exception type as before, but the message now names the
            # missing file, which is the actual root cause for the user.
            raise ModuleNotFoundError(
                f"'{combined_path}' does not exist and cannot be regenerated: {exc}. "
                "Provide the CSV or make data_processing.py importable.",
                name=exc.name,
            ) from exc

        data_processing_main()

        # Fail with context instead of letting pd.read_csv raise a bare
        # FileNotFoundError when regeneration wrote nothing at this path.
        if not os.path.exists(combined_path):
            raise FileNotFoundError(
                f"data_processing.main() completed but '{combined_path}' was not created."
            )

    return pd.read_csv(combined_path)


In [14]:
def aggregate_medal_counts(data):
    """
    Counts awarded medals per (Country, Sport, Gender, Year) combination.

    Rows whose "Medal" value equals "No medal" are discarded before counting.
    The result has the four key columns plus a "Medal Count" column.
    """
    group_keys = ["Country", "Sport", "Gender", "Year"]

    # Keep only rows that represent an actual medal.
    is_awarded = data["Medal"] != "No medal"
    awarded_rows = data[is_awarded]

    # Tally the medals within each group and flatten the index back to columns.
    per_group = awarded_rows.groupby(group_keys)["Medal"].count()
    return per_group.reset_index(name="Medal Count")

In [15]:
def identify_significant_increases(medal_counts, threshold=50):
    """
    Returns the rows whose "Medal Change %" is strictly above `threshold`.

    Rows with a missing "Medal Change %" never satisfy the comparison and are
    therefore left out.
    """
    exceeds_threshold = medal_counts["Medal Change %"] > threshold
    return medal_counts[exceeds_threshold]

In [16]:
def calculate_percentage_change(medal_counts):
    """
    Adds a "Medal Change %" column: the percentage change in "Medal Count"
    between consecutive rows of each (Country, Sport, Gender) group.

    The returned frame is sorted by Country, Sport, Gender, Year; the first
    row of every group has no predecessor and gets NaN.
    """
    group_keys = ["Country", "Sport", "Gender"]

    # Order rows chronologically inside each group so that "previous row"
    # means "previous year present in the data".
    ordered = medal_counts.sort_values(group_keys + ["Year"])

    fractional_change = ordered.groupby(group_keys)["Medal Count"].pct_change()
    ordered["Medal Change %"] = fractional_change * 100
    return ordered

In [17]:
def merge_medal_athlete_counts(medal_counts, athlete_counts):
    """
    Left-joins athlete counts onto medal counts.

    Rows are matched on Country, Sport, Gender, and Year. Every medal-count
    row is kept; athlete columns are NaN where no match exists.
    """
    join_keys = ["Country", "Sport", "Gender", "Year"]
    return pd.merge(left=medal_counts, right=athlete_counts, how="left", on=join_keys)


In [18]:
def filter_potential_coach_effect(merged_data, medal_increase_threshold=50, athlete_change_threshold=10):
    """
    Selects rows that may indicate a 'great coach' effect.

    A row qualifies when its "Medal Change %" is strictly above
    `medal_increase_threshold` while its "Athlete Change %" is strictly below
    `athlete_change_threshold`. Rows with a missing value in either column
    fail the comparison and are excluded.
    """
    medals_jumped = merged_data["Medal Change %"] > medal_increase_threshold
    roster_stable = merged_data["Athlete Change %"] < athlete_change_threshold
    return merged_data[medals_jumped & roster_stable]


In [19]:
def visualize_medal_trends(data, country, sport, gender):
    """
    Plots medal counts over years for a specific country, sport, and gender,
    and saves the figure as a PNG under ``outputs/figures``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns "Country", "Sport", "Gender", "Year", "Medal".
    country, sport, gender
        Values matched exactly against the corresponding columns.

    Returns
    -------
    None
        The figure is written to disk and its path is printed.

    NOTE(review): a later cell defines a function with this same name; after
    a top-to-bottom run the later definition is the one in effect. Consider
    keeping only one of the two.
    NOTE(review): `plt` and `sns` are not imported in the cells visible here;
    presumably an earlier cell imports matplotlib.pyplot and seaborn - confirm.
    """
    subset = data[
        (data["Country"] == country) &
        (data["Sport"] == sport) &
        (data["Gender"] == gender)
    ]

    # Aggregate medal counts per year, ignoring "No medal" rows.
    medal_trend = (
        subset[subset["Medal"] != "No medal"]
        .groupby("Year")["Medal"]
        .count()
        .reset_index(name="Medal Count")
    )

    # Characters such as "/" would otherwise be interpreted as directory
    # separators (or be rejected by the OS), so neutralise them before
    # building the filename.
    safe_country, safe_sport, safe_gender = (
        re.sub(r'[\\/:*?"<>|]+', "_", str(part)) for part in (country, sport, gender)
    )
    figure_path = os.path.join(
        'outputs', 'figures', f"{safe_country}_{safe_sport}_{safe_gender}_medal_trend.png"
    )
    os.makedirs(os.path.dirname(figure_path), exist_ok=True)

    # Use an explicit figure so it can be closed reliably even if plotting or
    # saving raises; otherwise the figure would stay open in the kernel.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.lineplot(data=medal_trend, x="Year", y="Medal Count", marker="o", ax=ax)
        # Any gender value other than 'M' is labelled 'Women' in the title.
        ax.set_title(f"{country} - {sport} - {'Men' if gender == 'M' else 'Women'} Medal Counts Over Years")
        ax.set_xlabel("Year")
        ax.set_ylabel("Number of Medals")
        ax.grid(True)
        fig.tight_layout()
        fig.savefig(figure_path)
    finally:
        plt.close(fig)

    print(f"Figure saved to {figure_path}")


In [20]:
def visualize_medal_trends(data, country, sport, gender):
    """
    Plots medal counts over years for a specific country, sport, and gender,
    and saves the figure as a PNG under ``outputs/figures``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns "Country", "Sport", "Gender", "Year", "Medal".
    country, sport, gender
        Values matched exactly against the corresponding columns.

    Returns
    -------
    None
        The figure is written to disk and its path is printed.

    NOTE(review): an earlier cell already defines a function with this same
    name; this definition replaces it when the cells run in order. Consider
    keeping only one of the two.
    NOTE(review): `plt` and `sns` are not imported in the cells visible here;
    presumably an earlier cell imports matplotlib.pyplot and seaborn - confirm.
    """
    subset = data[
        (data["Country"] == country) &
        (data["Sport"] == sport) &
        (data["Gender"] == gender)
    ]

    # Aggregate medal counts per year, ignoring "No medal" rows.
    medal_trend = (
        subset[subset["Medal"] != "No medal"]
        .groupby("Year")["Medal"]
        .count()
        .reset_index(name="Medal Count")
    )

    # Characters such as "/" would otherwise be interpreted as directory
    # separators (or be rejected by the OS), so neutralise them before
    # building the filename.
    safe_country, safe_sport, safe_gender = (
        re.sub(r'[\\/:*?"<>|]+', "_", str(part)) for part in (country, sport, gender)
    )
    figure_path = os.path.join(
        'outputs', 'figures', f"{safe_country}_{safe_sport}_{safe_gender}_medal_trend.png"
    )
    os.makedirs(os.path.dirname(figure_path), exist_ok=True)

    # Use an explicit figure so it can be closed reliably even if plotting or
    # saving raises; otherwise the figure would stay open in the kernel.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.lineplot(data=medal_trend, x="Year", y="Medal Count", marker="o", ax=ax)
        # Any gender value other than 'M' is labelled 'Women' in the title.
        ax.set_title(f"{country} - {sport} - {'Men' if gender == 'M' else 'Women'} Medal Counts Over Years")
        ax.set_xlabel("Year")
        ax.set_ylabel("Number of Medals")
        ax.grid(True)
        fig.tight_layout()
        fig.savefig(figure_path)
    finally:
        plt.close(fig)

    print(f"Figure saved to {figure_path}")


In [21]:
# End-to-end pipeline cell: load the data, derive medal and athlete trends,
# flag potential 'great coach' effects, save the report, and plot one example.
#
# NOTE(review): aggregate_athlete_counts, calculate_athlete_percentage_change
# and estimate_coach_impact are called below but are not defined in the cells
# visible here - confirm an earlier cell defines them before this cell runs.

# Load and prepare data
# NOTE(review): the output recorded after this cell is
# "ModuleNotFoundError: No module named 'data_processing'", which appears to
# come from the fallback import inside load_and_prepare_data - confirm.
data = load_and_prepare_data()

# Aggregate medal counts (one row per Country/Sport/Gender/Year)
medal_counts = aggregate_medal_counts(data)

# Calculate percentage change in medals (adds the "Medal Change %" column)
medal_counts = calculate_percentage_change(medal_counts)

# Identify significant increases in medal counts
# NOTE(review): significant_increases is not read again anywhere in this cell.
significant_increases = identify_significant_increases(medal_counts, threshold=50)

# Aggregate athlete counts
athlete_counts = aggregate_athlete_counts(data)

# Calculate percentage change in athlete participation
# (presumably adds the "Athlete Change %" column that the filter below reads - confirm)
athlete_counts = calculate_athlete_percentage_change(athlete_counts)

# Merge medal and athlete counts
merged_data = merge_medal_athlete_counts(medal_counts, athlete_counts)

# Filter for potential 'great coach' effects:
# medal change above 50% while athlete change stays below 10%
potential_coach_effect = filter_potential_coach_effect(
    merged_data,
    medal_increase_threshold=50,
    athlete_change_threshold=10
)

# Estimate coach impact
estimated_impact = estimate_coach_impact(potential_coach_effect)

# Save the results (the target directory is created if it does not exist)
output_path = os.path.join('outputs', 'reports', 'potential_coach_effect.csv')
os.makedirs(os.path.dirname(output_path), exist_ok=True)
estimated_impact.to_csv(output_path, index=False)
print(f"Potential coach effects saved to {output_path}")

# Visualization (Example: Plotting for a specific group)
# Replace with dynamic selection based on actual data
# Currently the first row of estimated_impact is used as the example;
# nothing is plotted when the frame is empty.
if not estimated_impact.empty:
    example = estimated_impact.iloc[0]
    visualize_medal_trends(
        data,
        country=example["Country"],
        sport=example["Sport"],
        gender=example["Gender"]
    )

# Further analysis and recommendations can be added here

ModuleNotFoundError: No module named 'data_processing'