In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the datasets
athlete_events_df = pd.read_csv('../Resources/DataFrames/Model_prep/Medals/athleteEvents_mf_model.csv')
health_data_df = pd.read_csv('../Resources/DataFrames/Model_prep/Perceived_health_model.csv')

In [3]:
athlete_events_df

Unnamed: 0,NOC,Team,Year,Sex,Medal,Count
0,AFG,Afghanistan,2008,M,Bronze,1
1,AFG,Afghanistan,2012,M,Bronze,1
2,ALG,Algeria,1992,F,Gold,1
3,ALG,Algeria,2000,F,Gold,1
4,ALG,Algeria,2008,F,Bronze,1
...,...,...,...,...,...,...
4505,ZIM,Zimbabwe,2004,F,Bronze,1
4506,ZIM,Zimbabwe,2004,F,Gold,1
4507,ZIM,Zimbabwe,2004,F,Silver,1
4508,ZIM,Zimbabwe,2008,F,Gold,1


In [4]:
health_data_df

Unnamed: 0,STRUCTURE_NAME,REF_AREA,Reference area,TIME_PERIOD,Age,% of pop perceived
0,Health status,FIN,Finland,1980,From 25 to 44 years,73.5
1,Health status,FIN,Finland,1981,From 25 to 44 years,75.4
2,Health status,FIN,Finland,1982,From 25 to 44 years,74.9
3,Health status,FIN,Finland,1983,From 25 to 44 years,76.5
4,Health status,FIN,Finland,1984,From 25 to 44 years,74.4
...,...,...,...,...,...,...
1438,Health status,HRV,Croatia,2019,From 15 to 24 years,95.3
1439,Health status,HRV,Croatia,2020,From 15 to 24 years,95.8
1440,Health status,HRV,Croatia,2021,From 15 to 24 years,95.9
1441,Health status,HRV,Croatia,2022,From 15 to 24 years,96.1


In [5]:
merged_df = pd.merge(athlete_events_df, health_data_df, left_on=['NOC', 'Year'], right_on=['REF_AREA', 'TIME_PERIOD'], how='inner')

In [10]:
merged_df

Unnamed: 0,NOC,Team,Year,Sex,Medal,Count,STRUCTURE_NAME,REF_AREA,Reference area,TIME_PERIOD,Age,% of pop perceived
0,AUS,Australia,2004,F,Bronze,12,Health status,AUS,Australia,2004,From 25 to 44 years,90.0
1,AUS,Australia,2004,F,Bronze,12,Health status,AUS,Australia,2004,From 15 to 24 years,93.3
2,AUS,Australia,2004,F,Gold,18,Health status,AUS,Australia,2004,From 25 to 44 years,90.0
3,AUS,Australia,2004,F,Gold,18,Health status,AUS,Australia,2004,From 15 to 24 years,93.3
4,AUS,Australia,2004,F,Silver,32,Health status,AUS,Australia,2004,From 25 to 44 years,90.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1625,USA,United States-2,2016,F,Gold,1,Health status,USA,United States,2016,From 15 to 24 years,96.1
1626,USA,United States-2,2016,M,Bronze,2,Health status,USA,United States,2016,From 25 to 44 years,93.1
1627,USA,United States-2,2016,M,Bronze,2,Health status,USA,United States,2016,From 15 to 24 years,96.1
1628,USA,United States-2,2016,M,Gold,1,Health status,USA,United States,2016,From 25 to 44 years,93.1


In [12]:
merged_df.columns

Index(['NOC', 'Team', 'Year', 'Sex', 'Medal', 'Count', 'STRUCTURE_NAME',
       'REF_AREA', 'Reference area', 'TIME_PERIOD', 'Age',
       '% of pop perceived '],
      dtype='object')

In [13]:
# Preparing features and target
features = merged_df[['% of pop perceived ']]
target = merged_df['Count']

In [14]:
# Continue with splitting the data and training the model
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [15]:
# Initialize the Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)

In [16]:
# Train the model
rf.fit(X_train, y_train)

In [17]:
# Make predictions
y_pred = rf.predict(X_test)

In [18]:
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 7.859775464472833


In [19]:
medal_counts = merged_df.pivot_table(index=['NOC', 'Year'], columns='Medal', values='Count', aggfunc='sum', fill_value=0).reset_index()

In [20]:
merged_df = pd.merge(medal_counts, health_data_df, left_on=['NOC', 'Year'], right_on=['REF_AREA', 'TIME_PERIOD'], how='inner')

In [25]:
# Analyze for each type of medal (Gold, Silver, Bronze) separately
for medal_type in ['Gold', 'Silver', 'Bronze']:
    print(f"\nAnalyzing {medal_type} Medals:")

    # Prepare the features and target
    features = merged_df[['% of pop perceived ']]  # Add other features as needed
    target = merged_df[medal_type]
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    
    # Initialize and train the model
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    
    # Make predictions
    y_pred = rf.predict(X_test)
    
    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    print(f'Mean Absolute Error for {medal_type} Medals: {mae}')


Analyzing Gold Medals:
Mean Absolute Error for Gold Medals: 35.321644332450305

Analyzing Silver Medals:
Mean Absolute Error for Silver Medals: 25.006487228360363

Analyzing Bronze Medals:
Mean Absolute Error for Bronze Medals: 26.959028423897827


In [26]:
# Analyze for each country separately
countries = merged_df['NOC'].unique()

for country in countries:
    print(f"\nAnalyzing {country}:")

    # Filter the data for the specific country
    country_df = merged_df[merged_df['NOC'] == country]
    
    if country_df.shape[0] < 2:
        print(f"Not enough data for {country} to train a model.")
        continue
    
    for medal_type in ['Gold', 'Silver', 'Bronze']:
        print(f"\nAnalyzing {medal_type} Medals for {country}:")

        # Prepare the features and target
        features = country_df[['% of pop perceived ']]  # Add other features as needed
        target = country_df[medal_type]
        
        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
        
        # Initialize and train the model
        rf = RandomForestRegressor(n_estimators=100, random_state=42)
        rf.fit(X_train, y_train)
        
        # Make predictions
        y_pred = rf.predict(X_test)
        
        # Evaluate the model
        mae = mean_absolute_error(y_test, y_pred)
        print(f'Mean Absolute Error for {medal_type} Medals in {country}: {mae}')


Analyzing AUS:

Analyzing Gold Medals for AUS:
Mean Absolute Error for Gold Medals in AUS: 92.12

Analyzing Silver Medals for AUS:
Mean Absolute Error for Silver Medals in AUS: 141.0

Analyzing Bronze Medals for AUS:
Mean Absolute Error for Bronze Medals in AUS: 56.4

Analyzing AUT:

Analyzing Gold Medals for AUT:
Mean Absolute Error for Gold Medals in AUT: 10.993333333333332

Analyzing Silver Medals for AUT:
Mean Absolute Error for Silver Medals in AUT: 10.413333333333334

Analyzing Bronze Medals for AUT:
Mean Absolute Error for Bronze Medals in AUT: 9.666666666666666

Analyzing BEL:

Analyzing Gold Medals for BEL:
Mean Absolute Error for Gold Medals in BEL: 0.97

Analyzing Silver Medals for BEL:
Mean Absolute Error for Silver Medals in BEL: 6.125

Analyzing Bronze Medals for BEL:
Mean Absolute Error for Bronze Medals in BEL: 1.83

Analyzing CAN:

Analyzing Gold Medals for CAN:
Mean Absolute Error for Gold Medals in CAN: 52.46166666666666

Analyzing Silver Medals for CAN:
Mean Absolu