In [7]:
import pandas as pd

# Input dataset and destination of the aggregated summary
file_path = "data/Updated_Medals_Summary_with_Numeric_ID.csv"
new_file_path = "data/Aggregated_Medals_Summary.csv"

# Column groups shared by the helpers below
KEY_COLUMNS = ['noc', 'numeric_id']
MEDAL_COLUMNS = ['Gold', 'Silver', 'Bronze']
SEASON_TOTAL_COLUMNS = ['Season_Total_Medals_Winter', 'Season_Total_Medals_Summer']


def sum_medals(df, keys, prefix, total_name):
    """Sum the medal columns of ``df`` for every combination of ``keys``.

    Args:
        df: Frame holding the columns in ``keys``, 'Gold', 'Silver',
            'Bronze' and 'Total_Medals_Corrected'.
        keys: Column names to group by.
        prefix: Prefix for the summed medal columns, e.g. 'Season' turns
            'Gold' into 'Season_Gold'.
        total_name: Output name for the summed 'Total_Medals_Corrected'.

    Returns:
        A new frame with one row per group: the key columns followed by the
        renamed Gold, Silver, Bronze and total sums.
    """
    renames = {col: prefix + '_' + col for col in MEDAL_COLUMNS}
    renames['Total_Medals_Corrected'] = total_name
    # NOTE(review): groupby drops rows whose key is NaN by default, so medals
    # on rows without a noc/numeric_id/season are not counted - confirm the
    # input has no missing keys.
    return (
        df.groupby(keys, as_index=False)
        .agg({col: 'sum' for col in renames})
        .rename(columns=renames)
    )


def merge_season_totals(winter_totals, summer_totals):
    """Put Winter and Summer medal totals side by side per (noc, numeric_id).

    Args:
        winter_totals: Per-group Winter rows with 'Season_Total_Medals'.
        summer_totals: Per-group Summer rows with 'Season_Total_Medals'.

    Returns:
        A frame with 'noc', 'numeric_id', 'Season_Total_Medals_Winter' and
        'Season_Total_Medals_Summer'. A group missing from one season gets 0
        for that season; both total columns are integers.
    """
    selected = KEY_COLUMNS + ['Season_Total_Medals']
    merged = pd.merge(
        winter_totals[selected],
        summer_totals[selected],
        on=KEY_COLUMNS,
        how='outer',
        suffixes=('_Winter', '_Summer')
    )
    # Fill only the medal columns so the key columns are never rewritten.
    merged[SEASON_TOTAL_COLUMNS] = merged[SEASON_TOTAL_COLUMNS].fillna(0).astype(int)
    return merged


def attach_season_totals(df_aggregated, df_totals):
    """Add the Winter/Summer totals to the overall per-group totals.

    Args:
        df_aggregated: Overall totals, one row per (noc, numeric_id).
        df_totals: Output of ``merge_season_totals``.

    Returns:
        ``df_aggregated`` left-joined with the two season-total columns.
    """
    merged = pd.merge(
        df_aggregated,
        df_totals[KEY_COLUMNS + SEASON_TOTAL_COLUMNS],
        on=KEY_COLUMNS,
        how='left'
    )
    # A group whose rows are all neither 'Winter' nor 'Summer' is absent from
    # df_totals; the left join would leave NaN there and turn both columns
    # into floats, so fill and cast again after the join.
    merged[SEASON_TOTAL_COLUMNS] = merged[SEASON_TOTAL_COLUMNS].fillna(0).astype(int)
    return merged


# The guard is true both for a notebook cell and for a script run, so the
# variables below are still created at module level in those cases.
if __name__ == "__main__":
    # Load the dataset
    df = pd.read_csv(file_path)

    # Correct the total per row (sum of Gold, Silver, Bronze)
    df['Total_Medals_Corrected'] = df[MEDAL_COLUMNS].sum(axis=1)

    # Totals per (noc, numeric_id, season), then split by season
    season_totals = sum_medals(df, KEY_COLUMNS + ['season'], 'Season', 'Season_Total_Medals')
    winter_totals = season_totals[season_totals['season'] == 'Winter']
    summer_totals = season_totals[season_totals['season'] == 'Summer']
    df_totals = merge_season_totals(winter_totals, summer_totals)

    # Overall totals per (noc, numeric_id), with the season totals attached
    df_aggregated = sum_medals(df, KEY_COLUMNS, 'Total', 'Total_Medals')
    df_final = attach_season_totals(df_aggregated, df_totals)

    # Save the new CSV file
    df_final.to_csv(new_file_path, index=False)

new_file_path


'data/Aggregated_Medals_Summary.csv'