In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [36]:
co2_df = pd.read_csv("./data/GCB2022v27_MtCO2_flat.csv")

In [37]:
# Dropping all rows that have all of the following columsn as NaN
co2_df.dropna(how="all", subset=['Coal', 'Oil', 'Gas', 'Cement', 'Flaring', 'Other'], inplace=True)

In [38]:
# fixng ugly name, fixing the year column, making sure goes from 1960-2021
co2_df.rename({"ISO 3166-1 alpha-3":"CountryCode", "Total":"Total_MTCO2", "Per Capita":"MTCO2_per_cap"}, axis=1, inplace=True)
co2_df.Year = co2_df.Year.astype('int')
co2_df = co2_df.loc[(co2_df['Year'] >= 1959) & (co2_df['Year'] <= 2021)]


In [39]:
# Setting the country and year as the index
co2_df.set_index(['CountryCode','Year'], inplace=True)
co2_df.sort_values(['CountryCode','Year'], inplace=True)

In [40]:
# Reading in the country data for GDP per capita
gdp_per_cap_df = pd.read_csv("./data/API_NY.GDP.PCAP.CD_DS2_en_csv_v2_5358417.csv")

In [41]:
# dropping unnecessary columns
gdp_per_cap_df.drop(['Indicator Name', 'Indicator Code', 'Unnamed: 66', "Country Name"], axis = 1, inplace=True)

In [42]:
# renamign columns
gdp_per_cap_df.rename({'Country Code':'CountryCode'}, axis=1, inplace=True)
gdp_per_cap_df.head()

Unnamed: 0,CountryCode,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,ABW,,,,,,,,,,...,25609.955724,26515.67808,26942.307976,28421.386493,28451.273745,29326.708058,30220.594523,31650.760537,24487.86356,29342.100858
1,AFE,162.913034,162.551683,172.00246,199.189238,179.387799,198.230368,209.414665,211.70706,224.239783,...,1759.182395,1730.394686,1719.183721,1538.552268,1443.692371,1628.586788,1564.73434,1512.270553,1363.540741,1549.77273
2,AFG,62.369375,62.443703,60.950364,82.021738,85.511073,105.243196,143.103233,167.165675,134.012768,...,663.141053,651.987862,628.146804,592.476537,520.252064,530.149831,502.056771,500.522664,516.866552,368.754614
3,AFW,106.976475,112.047561,117.730633,122.278715,130.599963,137.186142,142.895375,127.303606,128.365494,...,1953.407033,2149.295219,2243.271464,1876.623483,1645.023767,1585.91193,1731.311792,1749.303317,1683.436391,1757.030626
4,AGO,,,,,,,,,,...,4962.552072,5101.983876,5059.080441,3100.830685,1709.515534,2283.214233,2487.500996,2142.238757,1603.993477,1953.533757


In [43]:
#melting values to get same format as co2_country_year
gdp_per_cap_df = gdp_per_cap_df.melt(id_vars=['CountryCode'], var_name='Year', value_name='GDP_per_cap')

In [44]:
# making sure GDP_per_cap is a float
gdp_per_cap_df.GDP_per_cap = gdp_per_cap_df.GDP_per_cap.astype('float')

In [45]:
# setting index as same

gdp_per_cap_df.set_index(['CountryCode', 'Year'], inplace=True)
gdp_per_cap_df.sort_values(['CountryCode', 'Year'], inplace=True)

In [46]:
# I know this is alittle weird, but merge was being not very cool, 
#so I did this instead. Joined but had index on CountryCode and Year instead of Country
co2_df.join(gdp_per_cap_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Country,Total_MTCO2,Coal,Oil,Gas,Cement,Flaring,Other,MTCO2_per_cap,GDP_per_cap
CountryCode,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ABW,1959,Aruba,0.719371,0.0,0.719371,0.0,0.0,0.00000,,13.561777,
ABW,1960,Aruba,0.618561,0.0,0.618561,0.0,0.0,0.00000,,11.327293,
ABW,1961,Aruba,0.645553,0.0,0.645553,0.0,0.0,0.00000,,11.566762,
ABW,1962,Aruba,0.708942,0.0,0.708942,0.0,0.0,0.00000,,12.507362,
ABW,1963,Aruba,0.679088,0.0,0.679088,0.0,0.0,0.00000,,11.815359,
...,...,...,...,...,...,...,...,...,...,...,...
,1988,Pacific Islands (Palau),0.183200,0.0,0.183200,0.0,0.0,0.00000,,1.877025,
,1989,Pacific Islands (Palau),0.227168,0.0,0.227168,0.0,0.0,0.00000,,2.196758,
,1990,Pacific Islands (Palau),0.234496,0.0,0.234496,0.0,0.0,0.00000,,3.764869,
,1991,Kuwaiti Oil Fires,477.924832,0.0,451.104352,0.0,0.0,26.82048,,0.000000,


In [47]:
co2_df.to_csv("./data/processed_data/co2_country_year.csv")

--- Health Stats ---

In [48]:
health_df = pd.read_csv('./data/HEALTH_STAT.csv')
health_df.head()

Unnamed: 0,VAR,Variable,UNIT,Measure,COU,Country,YEA,Year,Value,Flag Codes,Flags
0,PRHSMBAH,"Bad/very bad health, males aged 15+",PERCALEF,% of population (crude rate),FRA,France,2010,2010,7.8,,
1,PRHSMBAH,"Bad/very bad health, males aged 15+",PERCALEF,% of population (crude rate),FRA,France,2011,2011,7.7,,
2,PRHSMBAH,"Bad/very bad health, males aged 15+",PERCALEF,% of population (crude rate),FRA,France,2012,2012,7.5,,
3,PRHSMBAH,"Bad/very bad health, males aged 15+",PERCALEF,% of population (crude rate),FRA,France,2013,2013,7.9,,
4,PRHSMBAH,"Bad/very bad health, males aged 15+",PERCALEF,% of population (crude rate),FRA,France,2014,2014,7.5,,


In [49]:
#Drop columns we dont need
health_df = health_df.drop(columns=["VAR","UNIT","Measure","YEA","Flag Codes","Flags"])
health_df

Unnamed: 0,Variable,COU,Country,Year,Value
0,"Bad/very bad health, males aged 15+",FRA,France,2010,7.8
1,"Bad/very bad health, males aged 15+",FRA,France,2011,7.7
2,"Bad/very bad health, males aged 15+",FRA,France,2012,7.5
3,"Bad/very bad health, males aged 15+",FRA,France,2013,7.9
4,"Bad/very bad health, males aged 15+",FRA,France,2014,7.5
...,...,...,...,...,...
3511,"Bad/very bad health, total aged 15+",HRV,Croatia,2017,18.4
3512,"Bad/very bad health, total aged 15+",HRV,Croatia,2018,17.2
3513,"Bad/very bad health, total aged 15+",HRV,Croatia,2019,16.9
3514,"Bad/very bad health, total aged 15+",HRV,Croatia,2020,15.0


In [50]:
#We only need values from total population, so drop rows split by demographic
keep_rows = ['Bad/very bad health, total aged 15+',
             'Fair (not good, not bad) health, total aged 15+',
             'Good/very good health, total aged 15+']

split_health_df = health_df[health_df['Variable'].isin(keep_rows)].reset_index(drop=True)
split_health_df

Unnamed: 0,Variable,COU,Country,Year,Value
0,"Bad/very bad health, total aged 15+",SWE,Sweden,2010,5.4
1,"Bad/very bad health, total aged 15+",SWE,Sweden,2011,5.4
2,"Bad/very bad health, total aged 15+",SWE,Sweden,2012,4.9
3,"Bad/very bad health, total aged 15+",SWE,Sweden,2013,4.6
4,"Bad/very bad health, total aged 15+",SWE,Sweden,2014,4.6
...,...,...,...,...,...
1167,"Bad/very bad health, total aged 15+",HRV,Croatia,2016,18.8
1168,"Bad/very bad health, total aged 15+",HRV,Croatia,2017,18.4
1169,"Bad/very bad health, total aged 15+",HRV,Croatia,2018,17.2
1170,"Bad/very bad health, total aged 15+",HRV,Croatia,2019,16.9


In [51]:
#Rename columns
split_health_df.rename({'Variable':'PrecievedHealth', 'Value':'PercentOfPopulation'}, inplace=True, axis=1)
split_health_df

Unnamed: 0,PrecievedHealth,COU,Country,Year,PercentOfPopulation
0,"Bad/very bad health, total aged 15+",SWE,Sweden,2010,5.4
1,"Bad/very bad health, total aged 15+",SWE,Sweden,2011,5.4
2,"Bad/very bad health, total aged 15+",SWE,Sweden,2012,4.9
3,"Bad/very bad health, total aged 15+",SWE,Sweden,2013,4.6
4,"Bad/very bad health, total aged 15+",SWE,Sweden,2014,4.6
...,...,...,...,...,...
1167,"Bad/very bad health, total aged 15+",HRV,Croatia,2016,18.8
1168,"Bad/very bad health, total aged 15+",HRV,Croatia,2017,18.4
1169,"Bad/very bad health, total aged 15+",HRV,Croatia,2018,17.2
1170,"Bad/very bad health, total aged 15+",HRV,Croatia,2019,16.9


In [59]:
#Recode Variable column to be more readable
code = {
    'Bad/very bad health, total aged 15+' : 'Bad',
    'Fair (not good, not bad) health, total aged 15+' : 'Fair',
    'Good/very good health, total aged 15+' : 'good'
}

split_health_df['PrecievedHealth'].mask(split_health_df['PrecievedHealth'] == 'Bad/very bad health, total aged 15+', 'Bad', inplace=True)
split_health_df['PrecievedHealth'].mask(split_health_df['PrecievedHealth'] == 'Fair (not good, not bad) health, total aged 15+', 'Fair', inplace=True)
split_health_df['PrecievedHealth'].mask(split_health_df['PrecievedHealth'] == 'Good/very good health, total aged 15+', 'Good', inplace=True)
split_health_df

Unnamed: 0,PrecievedHealth,COU,Country,Year,PercentOfPopulation
0,Bad,SWE,Sweden,2010,5.4
1,Bad,SWE,Sweden,2011,5.4
2,Bad,SWE,Sweden,2012,4.9
3,Bad,SWE,Sweden,2013,4.6
4,Bad,SWE,Sweden,2014,4.6
...,...,...,...,...,...
1167,Bad,HRV,Croatia,2016,18.8
1168,Bad,HRV,Croatia,2017,18.4
1169,Bad,HRV,Croatia,2018,17.2
1170,Bad,HRV,Croatia,2019,16.9
