<a href="https://colab.research.google.com/github/Cassini-chris/OlympicGames/blob/main/olympics_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython.display import clear_output
import seaborn as sns

# Load the athlete records from olympics_1896_2024.csv into a DataFrame.
df = pd.read_csv("olympics_1896_2024.csv")

# Distinct values of the 'year' column in ascending order; the animation
# callback indexes into this array by frame number.
years = df['year'].drop_duplicates().sort_values().to_numpy()

# Aggregates keyed by age and by sex that the animation callback maintains.
age_counts_total = dict()
sex_counts_total = dict.fromkeys(("M", "F"), 0)

# Per-medal placeholders, all starting at zero. Nothing in this cell reads
# them back; they are kept because other cells may rely on them.
medal_age_averages = dict.fromkeys(("Gold", "Silver", "Bronze"), 0)
medal_age_percentages = {
    medal: {"M": 0, "F": 0} for medal in ("Gold", "Silver", "Bronze")
}
medal_age_accumulated = dict.fromkeys(("Gold", "Silver", "Bronze"), 0)

# Overall athlete counter, initialised to zero.
total_count_overall = 0


def _mean_known_age(rows):
    """Return the mean of the non-missing ``age`` values in *rows* (0 if none)."""
    known_ages = rows['age'].dropna()
    return known_ages.mean() if not known_ages.empty else 0


def update(frame):
    """Redraw the cumulative age histogram for all rows up to ``years[frame]``.

    The totals are recomputed from ``df`` on every call instead of being added
    to running counters, so the function is idempotent: drawing the same frame
    twice, replaying the animation, or rendering a preview before saving cannot
    inflate the counts. The module-level ``age_counts_total`` and
    ``sex_counts_total`` dictionaries are refreshed in place so they always
    describe the frame drawn most recently.

    Args:
        frame: Index into the module-level ``years`` array.

    Returns:
        The bar container drawn on ``ax``, or ``None`` when the selected year
        has no rows.
    """
    ax.clear()

    current_year = years[frame]
    year_data = df[df['year'] == current_year]

    # Defensive guard; presumably unreachable while `years` is derived from
    # df['year'], but kept so a custom `years` cannot crash the animation.
    if year_data.empty:
        print(f"Skipping year {years[frame]} (no data for Summer Olympics in Athletics)")
        return None

    # Everything up to and including the current year, recomputed from scratch.
    cumulative_data = df[df['year'] <= current_year]

    # value_counts() drops missing values, so each total only covers rows
    # where that particular column is known.
    age_counts_total.clear()
    age_counts_total.update(cumulative_data['age'].value_counts().to_dict())
    sex_counts_total.clear()
    sex_counts_total.update({"M": 0, "F": 0})
    sex_counts_total.update(cumulative_data['sex'].value_counts().to_dict())

    # Sex shares are relative to rows with a known sex. Dividing by the number
    # of known ages instead would push the shares past 100% whenever ages are
    # missing.
    sex_total = sum(sex_counts_total.values())
    percent_male = sex_counts_total.get("M", 0) / sex_total * 100 if sex_total > 0 else 0
    percent_female = sex_counts_total.get("F", 0) / sex_total * 100 if sex_total > 0 else 0

    # Age statistics; every branch tolerates the "no known ages yet" case.
    total_count = sum(age_counts_total.values())
    if age_counts_total:
        youngest_age = min(age_counts_total)
        oldest_age = max(age_counts_total)
        max_age = max(age_counts_total, key=age_counts_total.get)
        highest_count = age_counts_total[max_age]
    else:
        youngest_age = oldest_age = max_age = None
        highest_count = 0
    youngest_count = age_counts_total.get(youngest_age, 0)
    oldest_count = age_counts_total.get(oldest_age, 0)

    total_age = sum(age * count for age, count in age_counts_total.items())
    average_age = total_age / total_count if total_count > 0 else 0

    # Medal averages cover only the current year's medalists (not cumulative)
    # and skip rows whose age is missing.
    gold_average_age = _mean_known_age(year_data[year_data['medal'] == "Gold"])
    silver_average_age = _mean_known_age(year_data[year_data['medal'] == "Silver"])
    bronze_average_age = _mean_known_age(year_data[year_data['medal'] == "Bronze"])

    # Google brand colours: blue, red, yellow, green.
    google_colors = ["#4285F4", "#DB4437", "#F4B400", "#0F9D58"]

    # Leave 10% headroom above the tallest bar.
    if highest_count > 0:
        ax.set_ylim(0, highest_count * 1.1)

    bar_width = 0.8
    bars = ax.bar(
        list(age_counts_total.keys()),
        list(age_counts_total.values()),
        bar_width,
        color=google_colors[0],
    )

    youngest_text = f"Youngest: {youngest_age} (Count: {youngest_count})"
    oldest_text = f"Oldest: {oldest_age} (Count: {oldest_count})"
    highest_text = f"Highest Count: {highest_count} (Age {max_age})"
    average_text = f"Average Age: {average_age:.2f}"
    male_percent_text = f"Male: {percent_male:.2f}%"
    female_percent_text = f"Female: {percent_female:.2f}%"
    gold_average_text = f"Gold Average Age: {gold_average_age:.2f}"
    silver_average_text = f"Silver Average Age: {silver_average_age:.2f}"
    bronze_average_text = f"Bronze Average Age: {bronze_average_age:.2f}"

    # Text overlays, right-aligned in axes coordinates.
    ax.text(0.95, 0.95, youngest_text, transform=ax.transAxes, ha="right", va="top")
    ax.text(0.95, 0.90, oldest_text, transform=ax.transAxes, ha="right", va="top")
    ax.text(0.95, 0.85, highest_text, transform=ax.transAxes, ha="right", va="top")
    ax.text(0.95, 0.80, average_text, transform=ax.transAxes, ha="right", va="top")
    ax.text(0.95, 0.70, gold_average_text, transform=ax.transAxes, ha="right", va="top", color=google_colors[0])
    ax.text(0.95, 0.65, silver_average_text, transform=ax.transAxes, ha="right", va="top", color=google_colors[1])
    ax.text(0.95, 0.60, bronze_average_text, transform=ax.transAxes, ha="right", va="top", color=google_colors[2])
    ax.text(0.95, 0.30, male_percent_text, transform=ax.transAxes, ha="right", va="top")
    ax.text(0.95, 0.25, female_percent_text, transform=ax.transAxes, ha="right", va="top")

    ax.set_title(f"Aggregated Age Distribution (Up to {years[frame]})")
    ax.set_xlabel("Age")
    ax.set_ylabel("Total Count")
    # Fixed window so the x-axis does not jump between frames.
    ax.set_xlim(13, 64)

    return bars

# Build a 10 x 7 inch figure holding the single axes the animation draws on.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 7))

# One frame per entry in `years`, with a 300 ms delay between frames.
anim = FuncAnimation(fig, func=update, frames=len(years), interval=300)

# anim.save('medal_count_animation_8888xf.mp4')
# Animate the visualization within Colab (assuming you have Colab running)
def animate_colab():
    """Render every frame inline, replacing the previous notebook output.

    Calls ``update`` once for each index of the module-level ``years`` array,
    clears the prior cell output and then shows the module-level ``fig``.
    """
    # Imported explicitly: `display` is only available as an injected global
    # inside IPython sessions, so relying on it breaks when run as a script.
    from IPython.display import display

    for frame_index in range(len(years)):
        update(frame_index)
        clear_output(wait=True)  # Wait for the new output before clearing
        display(fig)

# Show the frame-by-frame preview in the notebook, then write the animation
# to olympics.mp4.
animate_colab()
anim.save(filename='olympics.mp4')