In [25]:
import pandas as pd
import numpy as np
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Load the core Olympic tables.
# NOTE(review): this is a machine-specific absolute path; it is hoisted into a
# single constant so the notebook can be pointed elsewhere by editing one line.
DATA_DIR = '/Users/chris/MCM_2025_C/Data'

medal_counts = pd.read_csv(os.path.join(DATA_DIR, 'summerOly_medal_counts.csv'))
programs = pd.read_csv(os.path.join(DATA_DIR, 'summerOly_programs.csv'))
hosts = pd.read_csv(os.path.join(DATA_DIR, 'summerOly_hosts.csv'))

# 2. Load the per-year athlete CSV files.
pattern = os.path.join(DATA_DIR, 'athlete_probabilities_by_year', '*.csv')
# The pattern has no '**' component, so recursive globbing would change nothing;
# sorting makes the concatenation order reproducible across runs.
athlete_files = sorted(glob.glob(pattern))
athlete_dfs = []

for athlete_path in athlete_files:
    try:
        athlete_year_df = pd.read_csv(athlete_path)
        # When the file has no 'Year' column, take the year from the last
        # underscore-separated token of the file name (extension removed).
        if 'Year' not in athlete_year_df.columns:
            year_token = os.path.splitext(os.path.basename(athlete_path))[0].split('_')[-1]
            athlete_year_df['Year'] = int(year_token)
        athlete_dfs.append(athlete_year_df)
    except (OSError, ValueError) as e:
        # Best-effort load: unreadable/unparsable files (pandas parser errors are
        # ValueError subclasses) and non-numeric year tokens are reported and skipped.
        print(f"Error reading {athlete_path}: {e}")

# pd.concat([]) fails with an opaque "No objects to concatenate"; fail with the
# same exception type but say which pattern produced nothing.
if not athlete_dfs:
    raise ValueError(f"No athlete CSV files could be loaded from pattern: {pattern}")

athlete_counts = pd.concat(athlete_dfs, ignore_index=True)

# 3. Aggregate Athlete Counts
def _require_columns(frame, frame_name, required):
    """Raise a KeyError naming every column in `required` that `frame` lacks.

    The message also lists the columns that are present, so a schema mismatch
    can be diagnosed without inspecting the frame separately.
    """
    missing = [col for col in required if col not in frame.columns]
    if missing:
        raise KeyError(
            f"{frame_name} is missing column(s) {missing}; "
            f"available columns: {list(frame.columns)}"
        )


# NOTE(review): the recorded output of this cell is "KeyError: 'Country'", so at
# least one of the two frames does not carry that column under that name.
# Which source column should be mapped to 'Country' is not visible here — confirm
# against the CSV headers and rename before this step.
merge_keys = ['Year', 'Country']
_require_columns(athlete_counts, 'athlete_counts', merge_keys)
_require_columns(medal_counts, 'medal_counts', merge_keys)

# Normalise the keys BEFORE grouping. Normalising afterwards lets spelling
# variants of one country (case / surrounding whitespace) collapse to the same
# (Year, Country) key on several rows, which would duplicate medal rows in the merge.
# Rows with a missing key are dropped first, matching groupby's default behaviour.
athlete_keyed = athlete_counts.dropna(subset=merge_keys)
athlete_keyed = athlete_keyed.assign(
    Country=athlete_keyed['Country'].str.strip().str.title(),
    Year=athlete_keyed['Year'].astype(int),
)

# Count distinct athletes when an 'Athlete_ID' column exists; otherwise count rows.
if 'Athlete_ID' in athlete_keyed.columns:
    athlete_counts_agg = (
        athlete_keyed.groupby(merge_keys)
        .agg({'Athlete_ID': 'nunique'})
        .reset_index()
        .rename(columns={'Athlete_ID': 'Number_of_Athletes'})
    )
else:
    athlete_counts_agg = athlete_keyed.groupby(merge_keys).size().reset_index(name='Number_of_Athletes')

# 4. Merge with Medal Counts
# The same normalisation is applied to the medal table so the keys are comparable.
medal_counts['Country'] = medal_counts['Country'].str.strip().str.title()
medal_counts['Year'] = medal_counts['Year'].astype(int)

merged_df = pd.merge(medal_counts, athlete_counts_agg, on=merge_keys, how='left')
# Medal rows without a matching athlete aggregate get an athlete count of 0.
merged_df['Number_of_Athletes'] = merged_df['Number_of_Athletes'].fillna(0).astype(int)

# 5. Handle Missing Values
# Median-impute 'Sports_Investment' when that column is present.
if 'Sports_Investment' in merged_df.columns:
    merged_df['Sports_Investment'] = merged_df['Sports_Investment'].fillna(merged_df['Sports_Investment'].median())

# 6. Feature Engineering (Optional)
# NOTE(review): no step visible in this cell creates 'Total_Events', 'GDP' or
# 'Population'. These engineered features are optional (their use further down
# is commented out), so each one is built only when its inputs exist — the same
# guard style as 'Sports_Investment' above — and skipped features are reported.
skipped_features = []

if 'Total_Events' in merged_df.columns:
    # A zero event count would yield inf, which StandardScaler rejects; use NaN instead.
    merged_df['Athletes_per_Event'] = (
        merged_df['Number_of_Athletes'] / merged_df['Total_Events'].replace(0, np.nan)
    )
else:
    skipped_features.append('Athletes_per_Event')

if 'GDP' in merged_df.columns:
    merged_df['Log_GDP'] = np.log1p(merged_df['GDP'])  # log1p keeps a GDP of 0 finite
else:
    skipped_features.append('Log_GDP')

if {'GDP', 'Population'} <= set(merged_df.columns):
    # Product of the two columns, as in the original definition.
    merged_df['GDP_Population'] = merged_df['GDP'] * merged_df['Population']
else:
    skipped_features.append('GDP_Population')

if skipped_features:
    print(f"Skipped engineered features (missing input columns): {skipped_features}")

# 7. Modeling variables: the response column and the predictor columns.
dependent_var = 'Medal_Count'
independent_vars = ['Number_of_Athletes', 'Is_Host', 'Number_of_Sports', 'Total_Events']

# Optionally include engineered features
# independent_vars += ['Athletes_per_Event', 'Log_GDP', 'GDP_Population']

# 8. Build the design matrix / response and hold out 20% of the rows.
y = merged_df[dependent_var]
X = merged_df[independent_vars]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 9. Standardise the predictors: the scaler is fitted on the training rows only
# and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 10. Model Building (Example: Poisson Regression)
import statsmodels.api as sm

# has_constant='add' always prepends the intercept column. With the default
# ('skip'), add_constant returns the matrix unchanged when it already contains a
# constant column — e.g. a feature that takes a single value within one split —
# which would give train and test design matrices different widths.
X_train_sm = sm.add_constant(X_train_scaled, has_constant='add')
poisson_model = sm.GLM(y_train, X_train_sm, family=sm.families.Poisson()).fit()
print(poisson_model.summary())

# 11. Predictions on the held-out rows (same scaling and intercept handling as training).
X_test_sm = sm.add_constant(X_test_scaled, has_constant='add')
y_pred = poisson_model.predict(X_test_sm)

# 12. Evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
# Take the square root explicitly: newer scikit-learn releases no longer accept
# the `squared=False` keyword of mean_squared_error.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
print(f"MAE: {mae}, RMSE: {rmse}")


KeyError: 'Country'

Adjust Predictions for Fixed Medal Counts:

In [None]:
import scipy.stats as stats

# Seeded generator so the sampled medal table is reproducible on re-run
# (42 matches the random_state used for the train/test split).
rng = np.random.default_rng(42)

# NOTE(review): `data` is not defined in any visible cell; it is used here as a
# frame with 'Event', 'Medal_Type', 'Country' and the model's feature columns —
# confirm where it is built.
events_list = data['Event'].unique()
medal_types = ['Gold', 'Silver', 'Bronze']

# Collect one record per awarded medal and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and was quadratic when used in a loop.
medal_records = []

for event in events_list:
    event_data = data[data['Event'] == event]
    for medal in medal_types:
        # Candidate rows for this event and medal type
        event_medal_data = event_data[event_data['Medal_Type'] == medal]
        if event_medal_data.empty:
            # Nothing to sample from for this event/medal combination.
            continue

        # The model was fitted on standardised features plus an intercept, so the
        # same scaler must be applied here. has_constant='add' forces the
        # intercept column even when a feature is constant within the event
        # (the default 'skip' would leave it out and change the matrix width).
        X_event = sm.add_constant(
            scaler.transform(event_medal_data[independent_vars]),
            has_constant='add',
        )

        # Expected counts from the Poisson model
        expected_counts = np.asarray(poisson_model.predict(X_event), dtype=float)

        # Convert expected counts to probabilities; check the zero-sum case
        # before dividing so no 0/0 is evaluated.
        total_expected = expected_counts.sum()
        if total_expected == 0:
            probabilities = np.full(len(expected_counts), 1.0 / len(expected_counts))
        else:
            probabilities = expected_counts / total_expected

        # Draw the country that receives this medal
        assigned_country = rng.choice(
            event_medal_data['Country'].to_numpy(), size=1, p=probabilities
        )

        medal_records.extend(
            {'Country': country, 'Medal_Type': medal, 'Medal_Count': 1}
            for country in assigned_country
        )

final_medal_counts = pd.DataFrame(medal_records, columns=['Country', 'Medal_Type', 'Medal_Count'])

# Aggregate medal counts per country: one row per country, one column per medal
# type (always Gold/Silver/Bronze, zero-filled), plus a row total.
aggregated_medal_counts = (
    final_medal_counts.groupby(['Country', 'Medal_Type'])['Medal_Count']
    .sum()
    .unstack(fill_value=0)
    .reindex(columns=medal_types, fill_value=0)
)
aggregated_medal_counts['Total'] = aggregated_medal_counts.sum(axis=1)

# View the final medal counts
print(aggregated_medal_counts.head())


## Predictions and Model Evaluation

In [None]:
import scipy.stats as stats

# NOTE(review): this cell repeats the medal-assignment cell under
# "Adjust Predictions for Fixed Medal Counts" statement for statement; consider
# keeping only one copy or factoring the logic into a function.

# Seeded generator so the sampled medal table is reproducible on re-run
# (42 matches the random_state used for the train/test split).
rng = np.random.default_rng(42)

# NOTE(review): `data` is not defined in any visible cell; it is used here as a
# frame with 'Event', 'Medal_Type', 'Country' and the model's feature columns —
# confirm where it is built.
events_list = data['Event'].unique()
medal_types = ['Gold', 'Silver', 'Bronze']

# Collect one record per awarded medal and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and was quadratic when used in a loop.
medal_records = []

for event in events_list:
    event_data = data[data['Event'] == event]
    for medal in medal_types:
        # Candidate rows for this event and medal type
        event_medal_data = event_data[event_data['Medal_Type'] == medal]
        if event_medal_data.empty:
            # Nothing to sample from for this event/medal combination.
            continue

        # The model was fitted on standardised features plus an intercept, so the
        # same scaler must be applied here. has_constant='add' forces the
        # intercept column even when a feature is constant within the event
        # (the default 'skip' would leave it out and change the matrix width).
        X_event = sm.add_constant(
            scaler.transform(event_medal_data[independent_vars]),
            has_constant='add',
        )

        # Expected counts from the Poisson model
        expected_counts = np.asarray(poisson_model.predict(X_event), dtype=float)

        # Convert expected counts to probabilities; check the zero-sum case
        # before dividing so no 0/0 is evaluated.
        total_expected = expected_counts.sum()
        if total_expected == 0:
            probabilities = np.full(len(expected_counts), 1.0 / len(expected_counts))
        else:
            probabilities = expected_counts / total_expected

        # Draw the country that receives this medal
        assigned_country = rng.choice(
            event_medal_data['Country'].to_numpy(), size=1, p=probabilities
        )

        medal_records.extend(
            {'Country': country, 'Medal_Type': medal, 'Medal_Count': 1}
            for country in assigned_country
        )

final_medal_counts = pd.DataFrame(medal_records, columns=['Country', 'Medal_Type', 'Medal_Count'])

# Aggregate medal counts per country: one row per country, one column per medal
# type (always Gold/Silver/Bronze, zero-filled), plus a row total.
aggregated_medal_counts = (
    final_medal_counts.groupby(['Country', 'Medal_Type'])['Medal_Count']
    .sum()
    .unstack(fill_value=0)
    .reindex(columns=medal_types, fill_value=0)
)
aggregated_medal_counts['Total'] = aggregated_medal_counts.sum(axis=1)

# View the final medal counts
print(aggregated_medal_counts.head())


In [None]:
num_simulations = 1000

# Seeded generator so the simulated intervals are reproducible on re-run
# (42 matches the random_state used for the train/test split).
rng = np.random.default_rng(42)

# The winning probabilities of an event/medal slot do not change between
# simulations, so they are computed once per slot and all `num_simulations`
# winners for that slot are drawn in a single call. Relies on `events_list`,
# `medal_types`, `data`, `scaler`, `poisson_model` and `aggregated_medal_counts`
# from earlier cells.
slot_winners = []  # one array of length num_simulations per event/medal slot

for event in events_list:
    event_data = data[data['Event'] == event]
    for medal in medal_types:
        event_medal_data = event_data[event_data['Medal_Type'] == medal]
        if event_medal_data.empty:
            # Nothing to sample from for this event/medal combination.
            continue

        # The model was fitted on standardised features plus an intercept, so the
        # same scaler is applied here; has_constant='add' forces the intercept
        # column even when a feature is constant within the event.
        X_event = sm.add_constant(
            scaler.transform(event_medal_data[independent_vars]),
            has_constant='add',
        )
        expected_counts = np.asarray(poisson_model.predict(X_event), dtype=float)

        # Normalise to probabilities; fall back to a uniform draw when the
        # expected counts do not sum to a positive value.
        total_expected = expected_counts.sum()
        if total_expected > 0:
            probabilities = expected_counts / total_expected
        else:
            probabilities = np.full(len(expected_counts), 1.0 / len(expected_counts))

        slot_winners.append(
            rng.choice(
                event_medal_data['Country'].to_numpy(),
                size=num_simulations,
                p=probabilities,
            )
        )

# Rows = simulations, columns = countries, values = total medals won.
# Zero-filling matters: a country that wins nothing in a simulation must count
# as 0, otherwise it is treated as missing and the quantiles are biased upward.
if slot_winners:
    winners_long = pd.DataFrame({
        # Each slot array is ordered by simulation index, so tiling lines up.
        'Simulation': np.tile(np.arange(num_simulations), len(slot_winners)),
        'Country': np.concatenate(slot_winners),
    })
    simulation_df = (
        winners_long.groupby(['Simulation', 'Country'])
        .size()
        .unstack(fill_value=0)
        .reindex(range(num_simulations), fill_value=0)
    )
else:
    simulation_df = pd.DataFrame(index=range(num_simulations))

# Calculate prediction intervals (2.5%, 50% and 97.5% quantiles per country)
prediction_intervals = simulation_df.quantile([0.025, 0.5, 0.975]).T
prediction_intervals.columns = ['Lower_95%', 'Median', 'Upper_95%']

# Merge with final counts
final_with_intervals = aggregated_medal_counts.join(prediction_intervals)

print(final_with_intervals.head())
