In [1]:
# Import our dependencies
import numpy as np
import pandas as pd
import polars as pl

In [2]:
#  Import and read the input csv
dcs_df = pd.read_csv('../../Raw_Data/Daily_Caloric_Supply-vs-life-expectancy.csv')
dcs_df

Unnamed: 0,Entity,Code,Year,Life expectancy - Sex: all - Age: at birth - Variant: estimates,Daily caloric supply (OWID based on UN FAO & historical sources),Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1950,27.7,,7480464.0,
2,Afghanistan,AFG,1951,28.0,,7571542.0,
3,Afghanistan,AFG,1952,28.4,,7667534.0,
4,Afghanistan,AFG,1953,28.9,,7764549.0,
...,...,...,...,...,...,...,...
58281,Zimbabwe,ZWE,1946,,,2526552.0,
58282,Zimbabwe,ZWE,1947,,,2575037.0,
58283,Zimbabwe,ZWE,1948,,,2624453.0,
58284,Zimbabwe,ZWE,1949,,,2681604.0,


In [3]:
# Drop the extra columns
dcs_df = dcs_df.drop(['Code','Continent','Life expectancy - Sex: all - Age: at birth - Variant: estimates','Population (historical estimates)'], axis=1)

In [4]:
# Rename the important columns
dcs_df = dcs_df.rename(columns={'Entity':'Country', 'Daily caloric supply (OWID based on UN FAO & historical sources)':'Daily Caloric Supply'})
dcs_df

Unnamed: 0,Country,Year,Daily Caloric Supply
0,Abkhazia,2015,
1,Afghanistan,1950,
2,Afghanistan,1951,
3,Afghanistan,1952,
4,Afghanistan,1953,
...,...,...,...
58281,Zimbabwe,1946,
58282,Zimbabwe,1947,
58283,Zimbabwe,1948,
58284,Zimbabwe,1949,


In [5]:
dcs_df = dcs_df[dcs_df['Year']>=1990]
dcs_df = dcs_df[dcs_df['Year']<=2023]
dcs_df

Unnamed: 0,Country,Year,Daily Caloric Supply
0,Abkhazia,2015,
41,Afghanistan,1990,2314.0
42,Afghanistan,1991,2044.0
43,Afghanistan,1992,1891.0
44,Afghanistan,1993,1910.0
...,...,...,...
58094,Zimbabwe,2018,1908.0
58095,Zimbabwe,2019,
58096,Zimbabwe,2020,
58097,Zimbabwe,2021,


In [6]:
# Open the list of country name corrections
corrections_df = pd.read_csv('../../Clean_Data/master_country_list/country_name_corrections.csv')

# Convert the corrections dataframe to a dictionary.
correction_dict = dict(zip(corrections_df['wrong'], corrections_df['correct']))

In [7]:
# Apply the correction dictionary to fix the known errors
dcs_df['Country'] = dcs_df['Country'].replace(correction_dict)

In [8]:
# Open the master list of countries
countries_df = pd.read_csv('../../Clean_Data/master_country_list/country_profile_urls.csv')
countries_df = countries_df.drop(['profile_url'], axis=1)

In [9]:
country_list = countries_df['country'].tolist()
country_list

['Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Botswana',
 'Brazil',
 'British Virgin Islands',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Congo',
 'Democratic Republic of the Congo',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 "Côte d'Ivoire",
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Haiti',
 'Honduras',
 'Hong Kong (China)',
 

In [10]:
# Create a list of dcs countries
dcs_countries = dcs_df['Country'].tolist()
dcs_countries

['Abkhazia',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa (UN)',
 'Africa (UN)',
 'Africa (UN)',
 'Africa (UN)',
 'Africa (UN)',
 'Africa (UN)',
 'Africa (UN)',
 'Africa (U

In [11]:
# clean the FS data based on the SS country list
no_match = []

for country in dcs_countries:
    if country in country_list:
        continue
    else:
        no_match.append(country)

In [12]:
# Likely need to do manual country name clean-up for accented characters and abbreviations.
no_match

['Abkhazia',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Afghanistan',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa',
 'Africa (UN)',
 'Africa (UN)',
 'Africa (UN)',
 'Africa (UN)',
 'Africa (UN)',
 'Africa (UN)',
 'Africa (UN)',
 'Africa (U

In [13]:
dcs_clean_df = dcs_df.drop(dcs_df[dcs_df['Country'].isin(no_match)].index.tolist())
dcs_clean_df

Unnamed: 0,Country,Year,Daily Caloric Supply
632,Albania,1990,2568.0
633,Albania,1991,2572.0
634,Albania,1992,2654.0
635,Albania,1993,2795.0
636,Albania,1994,2877.0
...,...,...,...
58093,Zimbabwe,2017,1891.0
58094,Zimbabwe,2018,1908.0
58095,Zimbabwe,2019,
58096,Zimbabwe,2020,


In [14]:
dcs_clean_df = dcs_clean_df.sort_values(['Country','Year'], axis=0)
dcs_clean_df.head(30)

Unnamed: 0,Country,Year,Daily Caloric Supply
632,Albania,1990,2568.0
633,Albania,1991,2572.0
634,Albania,1992,2654.0
635,Albania,1993,2795.0
636,Albania,1994,2877.0
637,Albania,1995,2717.0
638,Albania,1996,2843.0
639,Albania,1997,2725.0
640,Albania,1998,2725.0
641,Albania,1999,2797.0


In [15]:
# Create a complete dataframe with all countries and all the years.

# Define variables to collect the details.
#data_countries = dcs_clean_df['Country'].unique()
data_countries = country_list
years = list(range(1990, 2023))

# Create a helper DataFrame.
helper_df = pd.DataFrame({'Country':np.repeat(data_countries, len(years)), 'Year':np.tile(years, len(data_countries))})

# Merge the helper DataFrame with the original data to ensure the data range is complete.
complete_df = pd.merge(helper_df, dcs_clean_df, on=['Country','Year'], how='left')

In [16]:
dcs_countries_df = pd.DataFrame(data_countries, columns=['Country'])
dcs_countries_df.to_csv('../../Clean_Data/Clean_CSV_Files/dcs_countries.csv', index=False)

In [17]:
complete_df

Unnamed: 0,Country,Year,Daily Caloric Supply
0,Albania,1990,2568.0
1,Albania,1991,2572.0
2,Albania,1992,2654.0
3,Albania,1993,2795.0
4,Albania,1994,2877.0
...,...,...,...
6101,Zimbabwe,2018,1908.0
6102,Zimbabwe,2019,
6103,Zimbabwe,2020,
6104,Zimbabwe,2021,


In [18]:
duplicates = complete_df[complete_df.duplicated(subset=['Country', 'Year'], keep=False)]
duplicates

Unnamed: 0,Country,Year,Daily Caloric Supply
1477,Czechia,2015,3122.0
1478,Czechia,2015,


In [19]:
# Sort DataFrame to isloate the missing values
complete_df = complete_df.sort_values('Daily Caloric Supply', ascending=False)

# Drop duplicates
complete_df = complete_df.drop_duplicates(subset=['Country', 'Year'], keep='first')

In [20]:
complete_df = complete_df.sort_values(['Country','Year'], axis=0)
complete_df.head(5)

Unnamed: 0,Country,Year,Daily Caloric Supply
0,Albania,1990,2568.0
1,Albania,1991,2572.0
2,Albania,1992,2654.0
3,Albania,1993,2795.0
4,Albania,1994,2877.0


In [21]:
# Convert to wide format
wide_df = complete_df.pivot(index='Year', columns='Country', values='Daily Caloric Supply')

# Flatten the hierarchical columns and create 'Country_Series' style column names
wide_df.columns = [f'{col}_Daily Caloric Supply' for col in wide_df.columns]

# Reset the index, so 'Year' becomes a column again
wide_df.reset_index(inplace=True)

# Convert columns to numeric, coerce non-numeric values to NaN
for col in wide_df.columns[0:]:
    wide_df[col] = pd.to_numeric(wide_df[col], errors='coerce')

# Convert to Polars DataFrame
wide_pl_df = pl.from_pandas(wide_df)
    
wide_pl_df.head(5)

Year,Albania_Daily Caloric Supply,Algeria_Daily Caloric Supply,Andorra_Daily Caloric Supply,Angola_Daily Caloric Supply,Antigua and Barbuda_Daily Caloric Supply,Argentina_Daily Caloric Supply,Armenia_Daily Caloric Supply,Aruba_Daily Caloric Supply,Australia_Daily Caloric Supply,Austria_Daily Caloric Supply,Azerbaijan_Daily Caloric Supply,Bahamas_Daily Caloric Supply,Bahrain_Daily Caloric Supply,Bangladesh_Daily Caloric Supply,Barbados_Daily Caloric Supply,Belarus_Daily Caloric Supply,Belgium_Daily Caloric Supply,Belize_Daily Caloric Supply,Benin_Daily Caloric Supply,Bermuda_Daily Caloric Supply,Bhutan_Daily Caloric Supply,Bolivia_Daily Caloric Supply,Botswana_Daily Caloric Supply,Brazil_Daily Caloric Supply,British Virgin Islands_Daily Caloric Supply,Brunei_Daily Caloric Supply,Bulgaria_Daily Caloric Supply,Burkina Faso_Daily Caloric Supply,Burundi_Daily Caloric Supply,Cabo Verde_Daily Caloric Supply,Cambodia_Daily Caloric Supply,Cameroon_Daily Caloric Supply,Canada_Daily Caloric Supply,Central African Republic_Daily Caloric Supply,Chad_Daily Caloric Supply,Chile_Daily Caloric Supply,…,Serbia_Daily Caloric Supply,Seychelles_Daily Caloric Supply,Sierra Leone_Daily Caloric Supply,Singapore_Daily Caloric Supply,Slovakia_Daily Caloric Supply,Slovenia_Daily Caloric Supply,Solomon Islands_Daily Caloric Supply,South Africa_Daily Caloric Supply,South Korea_Daily Caloric Supply,Spain_Daily Caloric Supply,Sri Lanka_Daily Caloric Supply,Sudan_Daily Caloric Supply,Suriname_Daily Caloric Supply,Sweden_Daily Caloric Supply,Switzerland_Daily Caloric Supply,Syria_Daily Caloric Supply,Taiwan (China)_Daily Caloric Supply,Tajikistan_Daily Caloric Supply,Tanzania_Daily Caloric Supply,Thailand_Daily Caloric Supply,Togo_Daily Caloric Supply,Trinidad and Tobago_Daily Caloric Supply,Tunisia_Daily Caloric Supply,Turkmenistan_Daily Caloric Supply,Türkiye_Daily Caloric Supply,Uganda_Daily Caloric Supply,Ukraine_Daily Caloric Supply,United Kingdom_Daily Caloric Supply,United States_Daily Caloric Supply,Uruguay_Daily Caloric Supply,Uzbekistan_Daily Caloric Supply,Vanuatu_Daily Caloric Supply,Venezuela_Daily Caloric Supply,Vietnam_Daily Caloric Supply,Yemen_Daily Caloric Supply,Zambia_Daily Caloric Supply,Zimbabwe_Daily Caloric Supply
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1990,2568.0,2754.0,,1641.0,2467.0,2911.0,,,3174.0,3510.0,,2714.0,,2111.0,3106.0,,,2484.0,2134.0,2785.0,,1938.0,2216.0,2719.0,,2804.0,3129.0,2196.0,,2368.0,1938.0,2034.0,3023.0,1904.0,1648.0,2569.0,…,,,1950.0,,,,2111.0,2830.0,2956.0,3260.0,2167.0,,2445.0,2973.0,3441.0,,2947.0,,2187.0,2180.0,2091.0,2626.0,3154.0,,3775.0,2330.0,,3241.0,3493.0,2527.0,,2567.0,2362.0,1905.0,2048.0,2080.0,2013.0
1991,2572.0,2733.0,,1611.0,2532.0,3010.0,,,3121.0,3556.0,,2607.0,,2116.0,3006.0,,,2521.0,2198.0,2792.0,,2020.0,2224.0,2777.0,,2704.0,3041.0,2233.0,,2269.0,2009.0,2015.0,3042.0,1912.0,1800.0,2601.0,…,,,1991.0,,,,2175.0,2816.0,2950.0,3410.0,2187.0,,2547.0,2942.0,3350.0,,2987.0,,2231.0,2260.0,1930.0,2668.0,3111.0,,3724.0,2272.0,,3208.0,3522.0,2699.0,,2561.0,2423.0,1856.0,2029.0,2006.0,1963.0
1992,2654.0,2865.0,,1625.0,2545.0,3077.0,2184.0,,3125.0,3546.0,2318.0,2503.0,,2111.0,2983.0,3099.0,,2707.0,2142.0,3022.0,,2022.0,2203.0,2771.0,,2831.0,3022.0,2292.0,,2511.0,2037.0,2113.0,3092.0,1904.0,1819.0,2708.0,…,,,2018.0,,,2591.0,2177.0,2796.0,3001.0,3405.0,2153.0,,2507.0,3053.0,3420.0,,2974.0,2045.0,2145.0,2271.0,1896.0,2568.0,3144.0,2561.0,3710.0,2220.0,3367.0,3271.0,3559.0,2762.0,2718.0,2550.0,2522.0,1923.0,2108.0,1983.0,1955.0
1993,2795.0,2865.0,,1564.0,2376.0,3116.0,2099.0,,3042.0,3508.0,2235.0,2420.0,,2010.0,2912.0,3194.0,,2690.0,2155.0,2982.0,,2007.0,2069.0,2774.0,,2801.0,2958.0,2313.0,,2445.0,2026.0,2021.0,3122.0,1877.0,1652.0,2712.0,…,,,2038.0,,2790.0,2781.0,2161.0,2838.0,2956.0,3308.0,2095.0,,2542.0,3137.0,3358.0,,3058.0,2067.0,2100.0,2265.0,1959.0,2546.0,3165.0,2578.0,3729.0,2243.0,3242.0,3218.0,3605.0,2746.0,2707.0,2519.0,2401.0,2026.0,2092.0,1981.0,1943.0
1994,2877.0,2763.0,,1644.0,2255.0,3155.0,2181.0,,3054.0,3557.0,2159.0,2421.0,,2003.0,2905.0,3125.0,,2649.0,2179.0,2817.0,,1959.0,2186.0,2818.0,,2889.0,2873.0,2344.0,,2435.0,2038.0,2032.0,3209.0,1880.0,1979.0,2745.0,…,,,2116.0,,2979.0,2840.0,2288.0,2826.0,2980.0,3304.0,2290.0,,2513.0,3087.0,3332.0,,3064.0,2030.0,2084.0,2363.0,2042.0,2550.0,3113.0,2546.0,3728.0,2229.0,2980.0,3235.0,3665.0,2783.0,2697.0,2530.0,2371.0,2055.0,2051.0,2022.0,1931.0


In [22]:
# Handle the missing data

wide_pl_df = wide_pl_df.interpolate()
wide_pl_df = wide_pl_df.fill_null(strategy='backward')
wide_pl_df = wide_pl_df.fill_null(strategy='forward')
wide_pl_df = wide_pl_df.fill_null(strategy='one')

In [23]:
# Reformat the dataframe with the filled values...
# Convert the DataFrame back to pandas!!!
wide_df = wide_pl_df.to_pandas() 

# Now we can reshape it with pandas functionality
complete_df = wide_df.melt(id_vars='Year', value_vars=wide_df.columns[1:], var_name='Country', value_name='Daily Caloric Supply')

# Removing "_Daily Caloric Supply" from country names
complete_df['Country'] = complete_df['Country'].str.replace('_Daily Caloric Supply', '')

# Creating 'Country_Year' column
complete_df['Country_Year'] = complete_df['Country'] + '_' + complete_df['Year'].astype(str)

complete_df = complete_df.loc[:, ['Country_Year', 'Country', 'Year', 'Daily Caloric Supply']]

In [24]:
complete_df

Unnamed: 0,Country_Year,Country,Year,Daily Caloric Supply
0,Albania_1990,Albania,1990,2568.0
1,Albania_1991,Albania,1991,2572.0
2,Albania_1992,Albania,1992,2654.0
3,Albania_1993,Albania,1993,2795.0
4,Albania_1994,Albania,1994,2877.0
...,...,...,...,...
6100,Zimbabwe_2018,Zimbabwe,2018,1908.0
6101,Zimbabwe_2019,Zimbabwe,2019,1908.0
6102,Zimbabwe_2020,Zimbabwe,2020,1908.0
6103,Zimbabwe_2021,Zimbabwe,2021,1908.0


In [25]:
complete_df.to_csv('../../Clean_Data/Clean_CSV_Files/daily_calorie_supply.csv', index=False)

In [26]:
#fsle_clean_transform_df = fsle_clean_df.set_index('Entity').T
#fsle_clean_transform_df

In [27]:
#fsle_clean_transform_df.to_csv('food_supply_life_expectancy_transform.csv')