In [16]:
import pandas as pd
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import joblib  # For saving and loading models

base_data_path = '/Users/chris/MCM_2025_C/Main/'

# 1. Correct File Loading
medal_counts = pd.read_csv('/Users/chris/MCM_2025_C/Data/summerOly_medal_counts_with_codes.csv')
programs = pd.read_csv('/Users/chris/MCM_2025_C/Data/summerOly_programs.csv')
hosts = pd.read_csv('/Users/chris/MCM_2025_C/Data/summerOly_hosts_with_codes.csv')  # Corrected

# 2. Load Athlete Counts Data
pattern = os.path.join('/Users/chris/MCM_2025_C/Data/athlete_probabilities_by_year', '*.csv')
athlete_files = glob.glob(pattern, recursive=True)
athlete_dfs = []

for file in athlete_files:
    try:
        df = pd.read_csv(file)
        # Standardize column names
        df.rename(columns={
            'bronze': 'Bronze',
            'silver': 'Silver',
            'gold': 'Gold',
            'total_athletes': 'Total_Athletes',
            'year': 'Year'
        }, inplace=True)
        # Extract Year from filename if 'Year' column is absent
        if 'Year' not in df.columns:
            year_str = os.path.splitext(os.path.basename(file))[0].split('_')[-1]
            df['Year'] = int(year_str)
        athlete_dfs.append(df)
    except Exception as e:
        print(f"Error reading {file}: {e}")


### Process and Aggregate the athlete_probabilities_by_year DataFrame

In [17]:

# Concatenate all athlete DataFrames
athlete_counts = pd.concat(athlete_dfs, ignore_index=True)

# 3. Aggregate Athlete Counts
if 'Total_Athletes' in athlete_counts.columns:
    athlete_counts_agg = athlete_counts.groupby(['Year', 'Country Code']).agg({
        'Total_Athletes': 'sum'
    }).reset_index()
else:
    # If 'Total_Athletes' is not present, count the number of athletes
    athlete_counts_agg = athlete_counts.groupby(['Year', 'Country Code']).size().reset_index(name='Total_Athletes')


# 4. Standardize Country Codes Across DataFrames
def standardize_CountryCode(df, column='Country Code'):
    df[column] = df[column].str.upper().str.strip()
    return df

medal_counts = standardize_CountryCode(medal_counts, 'Country Code')
hosts = standardize_CountryCode(hosts, 'Country Code')        # Assuming hosts has 'Country Code'
athlete_counts_agg = standardize_CountryCode(athlete_counts_agg, 'Country Code')


### Process and Aggregate the programs DataFrame


In [18]:
# 2. Reshape 'programs' DataFrame from Wide to Long Format
# Identify year columns (assuming they are all numeric)
year_columns = [col for col in programs.columns if col.isdigit()]

# Melt the DataFrame
programs_long = programs.melt(
    id_vars=['Sport', 'Discipline', 'Code', 'Sports Governing Body'],
    value_vars=year_columns,
    var_name='Year',
    value_name='Event_Count'
)

# Convert 'Year' to integer
programs_long['Year'] = programs_long['Year'].astype(int)

# Preview the reshaped DataFrame
print("Reshaped Programs DataFrame (Long Format):")
print(programs_long.head())

Reshaped Programs DataFrame (Long Format):
      Sport         Discipline Code Sports Governing Body  Year  Event_Count
0  Aquatics  Artistic Swimming  SWA        World Aquatics  1896          0.0
1  Aquatics             Diving  DIV        World Aquatics  1896          0.0
2  Aquatics  Marathon Swimming  OWS        World Aquatics  1896          0.0
3  Aquatics           Swimming  SWM        World Aquatics  1896          4.0
4  Aquatics         Water Polo  WPO        World Aquatics  1896          0.0


In [19]:
# 3. Aggregate Event Counts and Number of Sports per Year
event_agg = programs_long.groupby('Year').agg(
    Total_Events=('Event_Count', 'sum'),
    Number_of_Sports=('Sport', 'nunique')
).reset_index()

# Preview the aggregated data
print("Aggregated Event Data per Year:")
print(event_agg.head())

Aggregated Event Data per Year:
   Year  Total_Events  Number_of_Sports
0  1896         107.0                51
1  1900         236.0                51
2  1904         224.0                51
3  1906         176.0                51
4  1908         267.0                51


## Merge All DataFrames into merged_df

In [20]:
if not medal_counts.empty and not athlete_counts_agg.empty:
    merged_df = pd.merge(
        medal_counts,
        athlete_counts_agg,
        on=['Year', 'Country Code'],
        how='left'
    )
    print("\nMerged Medal Counts with Athlete Counts successfully.")
else:
    merged_df = medal_counts.copy()
    merged_df['Total_Athletes'] = 0
    print("\nAthlete Counts Aggregated DataFrame is empty. 'Total_Athletes' set to 0 in merged_df.")



Merged Medal Counts with Athlete Counts successfully.


### Handle Missing Total_Athletes

In [21]:
if 'Total_Athletes' in merged_df.columns:
    merged_df['Total_Athletes'] = merged_df['Total_Athletes'].fillna(0).astype(int)
    print("'Total_Athletes' missing values filled with 0.")
else:
    merged_df['Total_Athletes'] = 0
    print("'Total_Athletes' column not found. Created and set to 0.")


'Total_Athletes' missing values filled with 0.


### Merge with hosts DataFrame to Set Is_Host Indicator

In [22]:
if not hosts.empty:
    # Assuming 'hosts' DataFrame has 'Year' and 'Country Code' indicating the host country each year
    host_info = set(hosts[['Year', 'Country Code']].drop_duplicates().itertuples(index=False, name=None))
    
    # Create 'Is_Host' column
    merged_df['Is_Host'] = merged_df.apply(
        lambda row: 1 if (row['Year'], row['Country Code']) in host_info else 0,
        axis=1
    )
    print("'Is_Host' indicator set successfully.")
else:
    merged_df['Is_Host'] = 0
    print("Hosts DataFrame is empty. 'Is_Host' set to 0.")


'Is_Host' indicator set successfully.


### Merge Aggregated Data

In [23]:
if not event_agg.empty:
    merged_df = pd.merge(
        merged_df,
        event_agg,
        on='Year',
        how='left'
    )
    # Fill missing values with 0
    merged_df['Total_Events'] = merged_df['Total_Events'].fillna(0).astype(int)
    merged_df['Number_of_Sports'] = merged_df['Number_of_Sports'].fillna(0).astype(int)
    print("Merged with Aggregated Event Data successfully.")
else:
    merged_df['Total_Events'] = 0
    merged_df['Number_of_Sports'] = 0
    print("Aggregated Event Data is empty. 'Total_Events' and 'Number_of_Sports' set to 0.")


# merged_df['Athletes_per_Event'] = merged_df['Total_Athletes'] / merged_df['Total_Events'].replace(0, 1)  # Avoid division by zero
# Check for unique Country Code
unique_countries = merged_df['Country Code'].nunique()
total_rows = merged_df.shape[0]
print(f"Unique countries: {unique_countries}")
print(f"Total rows in merged_df: {total_rows}")

if unique_countries != total_rows:
    print("Warning: There are duplicate Country Code entries. Aggregating data to ensure one row per country.")
    # Aggregate data (e.g., sum of features) to have one row per country
    merged_df = merged_df.groupby('Country Code').agg({
        'Total': 'sum',
        'Total_Athletes': 'sum',
        'Is_Host': 'max',  # Assuming Is_Host is binary (0 or 1)
        'Number_of_Sports': 'sum',
        'Total_Events': 'sum',
        # Add other relevant features as needed
    }).reset_index()
    print("Data aggregated to have one row per country.")
else:
    print("All Country Code entries are unique.")


print("\nFinal Merged DataFrame:")
print(merged_df.head())


Merged with Aggregated Event Data successfully.
Unique countries: 152
Total rows in merged_df: 1419
Data aggregated to have one row per country.

Final Merged DataFrame:
  Country Code  Total  Total_Athletes  Is_Host  Number_of_Sports  Total_Events
0          AFG      2              48        0               102          1343
1          AHO      1              32        0                51           528
2          ALB      2              26        0                51           738
3          ALG     20             656        0               408          5099
4          ANZ     12              40        0               102           503


### Define Variables for Modeling


In [24]:
# Define the dependent variable
dependent_var = 'Total'  # This is the total medal count

# Verify if 'Total' exists in merged_df
if dependent_var not in merged_df.columns:
    print(f"Error: Dependent variable '{dependent_var}' not found in merged_df.")
    # Optionally, inspect available columns
    print("Available columns:", merged_df.columns.tolist())
else:
    print(f"\nDependent variable set to '{dependent_var}'.")



Dependent variable set to 'Total'.


In [25]:
# Define the list of independent variables
independent_vars = ['Total_Athletes', 'Is_Host', 'Number_of_Sports', 'Total_Events']

# Verify if all independent variables exist in merged_df
missing_vars = [var for var in independent_vars if var not in merged_df.columns]
if missing_vars:
    print(f"Warning: The following independent variables are missing in merged_df: {missing_vars}")
    # Handle missing variables, e.g., create them with default values
    for var in missing_vars:
        merged_df[var] = 0
    print(f"Missing independent variables {missing_vars} created with default value 0.")
else:
    print("All independent variables are present in merged_df.")


All independent variables are present in merged_df.


In [26]:
# Define the output path
output_path = '/Users/chris/MCM_2025_C/Data/merged_data.csv'

# Save merged_df to CSV
try:
    merged_df.to_csv(output_path, index=False)
    print(f"\nMerged DataFrame saved successfully to {output_path}.")
except Exception as e:
    print(f"Error saving merged DataFrame to CSV: {e}")



Merged DataFrame saved successfully to /Users/chris/MCM_2025_C/Data/merged_data.csv.


## Split the Data into Training and Testing Sets

In [27]:
X = merged_df[independent_vars]
y = merged_df[dependent_var]

# Check for missing values in X and y
print("\nMissing values in X:")
print(X.isnull().sum())

print("\nMissing values in y:")
print(y.isnull().sum())

# Handle missing values if any
if X.isnull().values.any():
    X = X.fillna(X.median())
    print("Missing values in X filled with median.")

if y.isnull().values.any():
    y = y.fillna(0)
    print("Missing values in y filled with 0.")

# Split the data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("\nData split into training and testing sets successfully.")

print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")



Missing values in X:
Total_Athletes      0
Is_Host             0
Number_of_Sports    0
Total_Events        0
dtype: int64

Missing values in y:
0

Data split into training and testing sets successfully.


Feature Scaling

In [28]:
# 10. Feature Scaling (If Needed)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling applied successfully.")


Feature scaling applied successfully.


## Train the Random Forest Regressor

In [29]:
# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on the training data
rf_model.fit(X_train, y_train)

# Save the trained Random Forest model for future use
joblib.dump(rf_model, os.path.join(base_data_path, 'random_forest_model.pkl'))
print("Random Forest Regressor trained and saved successfully.")


Random Forest Regressor trained and saved successfully.


###  Train the Multinomial Logistic Regression

In [32]:
# Create a binary target: 1 if the country won at least one medal, 0 otherwise
y_train_binary = (y_train > 0).astype(int)
print(y_train_binary)
print(y_train)

# Initialize Multinomial Logistic Regression
logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

# Fit the model on the training data
logreg.fit(X_train, y_train_binary)

# Save the trained Logistic Regression model for future use
joblib.dump(logreg, os.path.join(base_data_path, 'logistic_regression_model.pkl'))
print("Logistic Regression model trained and saved successfully.")


29     1
22     1
51     1
75     1
11     1
      ..
71     1
106    1
14     1
92     1
102    1
Name: Total, Length: 121, dtype: int64
29       4
22     354
51     981
75      78
11       1
      ... 
71     663
106     18
14       2
92       2
102     11
Name: Total, Length: 121, dtype: int64




ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

#### Load data for prediction

In [None]:
# Load 2028 Athlete and Event Data
athletes_2028_path = os.path.join(base_data_path, 'athletes_2028.csv')
athletes_2028 = pd.read_csv(athletes_2028_path)

# Standardize 'Country_Code'
athletes_2028 = standardize_country_code(athletes_2028, 'Country_Code')

# Select Features for Prediction
X_2028 = athletes_2028[independent_vars]

# Load New Countries Data
new_countries_2028_path = os.path.join(base_data_path, 'new_countries_2028.csv')
new_countries_2028 = pd.read_csv(new_countries_2028_path)

# Standardize 'Country_Code' in New Countries Data
new_countries_2028 = standardize_country_code(new_countries_2028, 'Country_Code')

# Select Features for New Countries
X_new_2028 = new_countries_2028[independent_vars]


Scale Features and Load Model

In [None]:
# Initialize the scaler (ensure it's the same scaler used during training)
scaler = StandardScaler()
scaler.fit(X_train)  # Fit on training data

# Transform the 2028 data
X_2028_scaled = scaler.transform(X_2028)
X_new_2028_scaled = scaler.transform(X_new_2028)


# Load the trained Random Forest model
rf_model = joblib.load(os.path.join(base_data_path, 'random_forest_model.pkl'))

# Load the trained Logistic Regression model
logreg = joblib.load(os.path.join(base_data_path, 'logistic_regression_model.pkl'))



## Predict Medal Counts for 2028

In [None]:
# Predict total medals for each country in 2028
predicted_medals_2028 = rf_model.predict(X_2028_scaled)

# Assign predictions to the 2028 DataFrame
athletes_2028['Predicted_Medals'] = predicted_medals_2028.round().astype(int)

# Predict probability of winning at least one medal for new countries
new_countries_proba = logreg.predict_proba(X_new_2028_scaled)[:, 1]

# Combine predictions with new countries data
new_countries_2028['Medal_Probability'] = new_countries_proba

### Allocate All Medals

In [None]:
# Define total medals to be awarded in 2028
total_medals_2028 = 1000  # Replace with actual total

# Sum of predicted medals from Random Forest
sum_pred_medals = athletes_2028['Predicted_Medals'].sum()

# Normalize predictions to match the total medals
athletes_2028['Normalized_Medals'] = athletes_2028['Predicted_Medals'] * (total_medals_2028 / sum_pred_medals)

# Round to integer medal counts
athletes_2028['Normalized_Medals'] = athletes_2028['Normalized_Medals'].round().astype(int)

# Assign normalized medals
athletes_2028['Predicted_Medals'] = athletes_2028['Normalized_Medals']
athletes_2028.drop('Normalized_Medals', axis=1, inplace=True)



# Number of new medals to allocate (e.g., 50)
new_medals = 50  # Adjust based on strategy

# Sort new countries by their medal probability in descending order
new_countries_sorted = new_countries_2028.sort_values(by='Medal_Probability', ascending=False)

# Normalize probabilities
total_prob_new = new_countries_sorted['Medal_Probability'].sum()

# Calculate assigned medals based on probabilities
new_countries_sorted['Assigned_Medals'] = (new_countries_sorted['Medal_Probability'] / total_prob_new * new_medals).round().astype(int)

# Assign medals to new countries
for idx, row in new_countries_sorted.iterrows():
    new_entry = {
        'Country_Code': row['Country_Code'],
        'Predicted_Medals': row['Assigned_Medals']
    }
    athletes_2028 = athletes_2028.append(new_entry, ignore_index=True)

print("\nAssigned Medals to New Countries:")
print(new_countries_sorted[['Country_Code', 'Assigned_Medals']])



NameError: name 'athletes_2028' is not defined

In [None]:
# Calculate the total predicted medals
total_predicted_medals = athletes_2028['Predicted_Medals'].sum()

# Check if it matches the total medals
if total_predicted_medals != total_medals_2028:
    difference = total_medals_2028 - total_predicted_medals
    print(f"\nAdjusting medal counts by {difference} to match the total medals.")
    
    # Add/subtract the difference to/from the country with the highest predicted medals
    if difference > 0:
        idx_max = athletes_2028['Predicted_Medals'].idxmax()
        athletes_2028.at[idx_max, 'Predicted_Medals'] += difference
    else:
        idx_max = athletes_2028['Predicted_Medals'].idxmax()
        athletes_2028.at[idx_max, 'Predicted_Medals'] += difference  # difference is negative
    
    print(f"Adjusted Total Predicted Medals: {athletes_2028['Predicted_Medals'].sum()}")
else:
    print("\nAll medals have been successfully distributed.")


In [None]:
# Save the final predictions to a new CSV
final_predictions_path = '/Users/chris/MCM_2025_C/Mainmedal_count_2028_predictions.csv'
athletes_2028[['Country_Code', 'Predicted_Medals']].to_csv(final_predictions_path, index=False)
print(f"\nFinal medal predictions for 2028 saved to {final_predictions_path}.")
