In [1]:
import pandas as pd

In [19]:
# Load the GDP table and reshape it into tidy long format:
# one row per (Country, Year) with a single GDP value.
gdp = (
    # the first 4 lines of the file are skipped before the header row
    pd.read_csv('gdp_country.csv', skiprows=4)
    # discard the columns that are not used further down
    .drop(columns=['Indicator Name', 'Indicator Code', 'Country Code', '2023', 'Unnamed: 68'])
    # wide -> long: every remaining year column becomes a row
    .melt(id_vars=["Country Name"], var_name="Year", value_name="GDP")
    .rename(columns={'Country Name': 'Country'})
    # the melted year labels are column names (strings); cast them to integers
    .astype({'Year': int})
    # keep 2010 onwards to match our other dataset
    .loc[lambda frame: frame['Year'] >= 2010]
    # remove rows with missing values
    .dropna()
    # renumber rows 0..n-1 after filtering
    .reset_index(drop=True)
)

# spot-check a single country
iceland = gdp.loc[gdp['Country'].eq('Iceland')]
iceland

Unnamed: 0,Country,Year,GDP
112,Iceland,2010,13751160000.0
373,Iceland,2011,15221620000.0
634,Iceland,2012,14751510000.0
894,Iceland,2013,16125060000.0
1154,Iceland,2014,17867660000.0
1415,Iceland,2015,17517210000.0
1674,Iceland,2016,20793170000.0
1932,Iceland,2017,24728290000.0
2190,Iceland,2018,26260850000.0
2448,Iceland,2019,24681340000.0


In [None]:
# now let's look at the life expectancy by country

# average life expectancy per country (mean over every row life_exp holds for it)
avg_life_exp = (
    life_exp.groupby('Country', as_index=False)['Life Expectancy'].mean()
    .rename(columns={'Life Expectancy': 'Avg. Life Expectancy'})
)

# now let's look at the life expectancy for the top 5 consumers of each food type
# first extract the top 5 consumers for each food type
# NOTE(review): head(5) is only the "top 5" if each frame is already sorted by
# consumption in descending order -- confirm in the cells that build these frames
top_5_animal_calories = animal_calories.head(5)['Country'].tolist()
top_5_animal_protein = animal_protein.head(5)['Country'].tolist()
top_5_animal_fat = animal_fat.head(5)['Country'].tolist()

top_5_plant_calories = plant_calories.head(5)['Country'].tolist()
top_5_plant_protein = plant_protein.head(5)['Country'].tolist()
top_5_plant_fat = plant_fat.head(5)['Country'].tolist()

# making sure we only get unique values
top_countries = list(set(
    top_5_animal_calories + top_5_animal_protein + top_5_animal_fat +
    top_5_plant_calories + top_5_plant_protein + top_5_plant_fat
))

# creating dataframe of top 5 consumers per food
# Category labels are derived from the lists themselves, so a source frame with
# fewer than 5 rows no longer causes a column-length mismatch.
top_5_by_category = {
    'Animal Calories': top_5_animal_calories,
    'Animal Protein': top_5_animal_protein,
    'Animal Fat': top_5_animal_fat,
    'Plant Calories': top_5_plant_calories,
    'Plant Protein': top_5_plant_protein,
    'Plant Fat': top_5_plant_fat,
}
top_consumers_data = pd.DataFrame(
    [
        (category, country)
        for category, countries in top_5_by_category.items()
        for country in countries
    ],
    columns=['Category', 'Country'],
)

# fixing the names of countries to match life_exp dataframe
# This must run after the frame is built and before the merge; otherwise the
# renamed countries find no match in avg_life_exp and come back as NaN.
top_consumers_data['Country'] = top_consumers_data['Country'].replace({
    "United States of America": "United States",
    "China, mainland": "China",
    "China, Hong Kong SAR": "Hong Kong Sar, China",
    # both the correctly decoded spelling and the mis-decoded (mojibake) form
    # are mapped, so the fix works whichever one the food data contains
    "Türkiye": "Turkiye",
    "TÃ¼rkiye": "Turkiye",
    "Egypt": "Egypt, Arab Rep."
})

# Verify that replacements worked
print("Updated countries in top_consumers_data:\n", top_consumers_data['Country'].unique())

# merge with life expectancy data to compare
# how='left' keeps every top consumer even when no life expectancy row matches
top_consumers_data = top_consumers_data.merge(avg_life_exp, on='Country', how='left')
top_consumers_data

# let's look at the 5 countries with Blue Zones
#blue_zones = life_exp[life_exp['Country'].isin(['Italy', 'Greece', 'United States', 'Costa Rica', 'Japan'])]
#blue_zones