In [6]:
import pandas as pd
import numpy as np 

In [7]:
## read and format global datasets 

## Global dataset for average CO2, units: 
global_co2 = pd.read_csv('nasa/co2_formatted.csv')
global_co2['Year'] = [int(i) for i in global_co2.date]
global_co2 = global_co2.groupby('Year').mean()
global_co2.reset_index(inplace=True)
global_co2 = global_co2.drop(['#', 'Unnamed: 1', 'date', 'alized', 'Unnamed: 5', 'of days', 'mon mean'], axis =1)
global_co2.columns = ['Year', 'co2_avg']
global_co2 = global_co2[(global_co2.Year >= 1965) & (global_co2.Year <= 2019)]
global_co2.reset_index(inplace=True)
global_co2 = global_co2.drop(['index'], axis=1)

## Global dataset for temperature 
global_temp = pd.read_csv('nasa/global_temp_formatted.csv')
global_temp = global_temp.drop(['No_Smoothing'], axis=1)
global_temp.columns = ['Year', 'temp']
global_temp = global_temp[(global_temp.Year >= 1965) & (global_temp.Year <= 2019)]#benchmarked by CO2 data
global_temp.reset_index(inplace = True)
global_temp = global_temp.drop(['index'], axis=1)

## Global population dataset 
global_pop = pd.read_csv('absolute-increase-global-population.csv')
global_pop = global_pop.drop(['Entity', 'Code',
                              'Projected absolute population increase (OWID based on HYDE & UN)'], axis =1)
global_pop.dropna(inplace= True)
global_pop.columns = ['Year', 'pop_increase']
global_pop = global_pop[(global_pop.Year >= 1965) & (global_pop.Year <= 2019)] #benchmarked by CO2 data
global_pop.reset_index(inplace = True)
global_pop['pop'] = ''
pop_1957 = 2873306090
global_pop.loc[0, 'pop'] = pop_1957 + global_pop.loc[0, 'pop_increase']

for i in range(1,len(global_pop)):
    global_pop.loc[i, 'pop'] = global_pop.loc[i-1, 'pop'] + global_pop.loc[i, 'pop_increase']

# manually append missing years 
missing_yrs = [2017, 2018, 2019]
missing_pop = [7547858925.0, 7631091040.0, 7713468100.0] 
missing_df = pd.DataFrame({'Year': missing_yrs, 'pop_increase': np.zeros([3,]), 'pop': missing_pop })
global_pop = global_pop.append(missing_df)
global_pop.reset_index(inplace=True)
global_pop = global_pop.drop(['level_0', 'index', 'pop_increase'], axis=1)

## Global Energy Dataset 
# Data is consistent (annual) from 1965 onwards 
# source: https://ourworldindata.org/energy-mix
## info: Primary energy is calculated based on the 'substitution method' which takes account of the inefficiencies 
### in fossil fuel production by converting non-fossil energy into the energy inputs required if they had the same 
### conversion losses as fossil fuels.

global_energy = pd.read_csv('global-energy-substitution.csv')
global_energy = global_energy[global_energy.Year >= 1965]

global_energy['fossil_fuels (TWh)'] = global_energy['Oil (TWh; substituted energy)'] 
+ global_energy['Coal (TWh; substituted energy)']
+ global_energy['Gas (TWh; substituted energy)']

global_energy['renewables (TWh)'] = global_energy['Wind (TWh; substituted energy)'] 
+ global_energy['Hydropower (TWh; substituted energy)'] 
+ global_energy['Nuclear (TWh; substituted energy)'] 
+ global_energy['Traditional bimass (TWh; substituted energy)']
+ global_energy['Other renewables (TWh; substituted energy)'] 
+ global_energy['Biofuels (TWh; substituted energy)'] 
+ global_energy['Solar (TWh; substituted energy)']

global_energy = global_energy.drop(['Entity', 'Code', 'Wind (TWh; substituted energy)',
       'Oil (TWh; substituted energy)', 'Nuclear (TWh; substituted energy)',
       'Hydropower (TWh; substituted energy)',
       'Traditional bimass (TWh; substituted energy)',
       'Other renewables (TWh; substituted energy)',
       'Biofuels (TWh; substituted energy)', 'Solar (TWh; substituted energy)',
       'Coal (TWh; substituted energy)', 'Gas (TWh; substituted energy)'], axis=1)

global_energy['ALL Fuels'] = global_energy['fossil_fuels (TWh)'] + global_energy['renewables (TWh)']
global_energy.reset_index(inplace=True)
global_energy = global_energy.drop(['index'], axis =1)

In [8]:
print(global_co2.columns)
print(global_temp.columns)
print(global_pop.columns)
print(global_energy.columns)

Index(['Year', 'co2_avg'], dtype='object')
Index(['Year', 'temp'], dtype='object')
Index(['Year', 'pop'], dtype='object')
Index(['Year', 'fossil_fuels (TWh)', 'renewables (TWh)', 'ALL Fuels'], dtype='object')


In [9]:
## Aggregate Global Dataset, 1965-2019

df_Global = pd.DataFrame({'Year': global_energy.Year, 'pop': global_pop['pop'], 
                          'co2_avg': global_co2['co2_avg'], 'temp': global_temp['temp'], 
                          'fossil_fuels (TWh)': global_energy['fossil_fuels (TWh)'], 
                          'renewables (Twh)': global_energy['renewables (TWh)'], 
                          'All Fuels': global_energy['ALL Fuels']})

## produce csv of data : 

df_Global.to_csv('output/global_data.csv')

### Country-Level data prep


In [74]:
country_GHG = pd.read_csv('OECD/AIR_GHG.csv')
country_pop = pd.read_csv('OECD/POP_1965-2019.csv')
country_energy = pd.read_csv('OECD/primary-energy.csv')

In [70]:
print(country_GHG.columns)
print(country_pop.columns)
print(country_energy.columns)
print(country_co2.columns)

Index(['COU', 'Country', 'POL', 'Pollutant', 'VAR', 'Variable', 'YEA', 'Year',
       'Unit Code', 'Unit', 'PowerCode Code', 'PowerCode',
       'Reference Period Code', 'Reference Period', 'Value', 'Flag Codes',
       'Flags'],
      dtype='object')
Index(['LOCATION', 'Country', 'SEX', 'Sex', 'AGE', 'Age', 'VAR', 'Variant',
       'TIME', 'Time', 'Unit Code', 'Unit', 'PowerCode Code', 'PowerCode',
       'Reference Period Code', 'Reference Period', 'Value', 'Flag Codes',
       'Flags'],
      dtype='object')
Index(['Entity', 'Code', 'Year', 'Primary energy consumption (TWh)'], dtype='object')
Index(['iso_code', 'country', 'year', 'co2', 'co2_growth_prct',
       'co2_growth_abs', 'consumption_co2', 'trade_co2', 'trade_co2_share',
       'co2_per_capita', 'consumption_co2_per_capita', 'share_global_co2',
       'cumulative_co2', 'share_global_cumulative_co2', 'co2_per_gdp',
       'consumption_co2_per_gdp', 'co2_per_unit_energy', 'cement_co2',
       'coal_co2', 'flaring_co2', 'gas_c

In [137]:
print(np.unique(country_co2[Z'country'))

['Afghanistan' 'Africa' 'Albania' 'Algeria' 'Andorra' 'Angola' 'Anguilla'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Aruba' 'Asia'
 'Asia (excl. China & India)' 'Australia' 'Austria' 'Azerbaijan' 'Bahamas'
 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin'
 'Bermuda' 'Bhutan' 'Bolivia' 'Bonaire Sint Eustatius and Saba'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'British Virgin Islands'
 'Brunei' 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cambodia' 'Cameroon'
 'Canada' 'Cape Verde' 'Central African Republic' 'Chad' 'Chile' 'China'
 'Christmas Island' 'Colombia' 'Comoros' 'Congo' 'Cook Islands'
 'Costa Rica' "Cote d'Ivoire" 'Croatia' 'Cuba' 'Cyprus' 'Czechia'
 'Democratic Republic of Congo' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'EU-27' 'EU-28' 'Ecuador' 'Egypt' 'El Salvador'
 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia' 'Europe'
 'Europe (excl. EU-27)' 'Europe (excl. EU-28)' 'Faeroe Islands' 'Fiji'
 'Finland' 'France' 'French Equatori

In [264]:
country_all = pd.read_csv('OECD/CO2_emissions.csv')
country_all = country_all[(country_all.year >= 1965) & (country_all.year < 2017)]
## source: https://github.com/owid/co2-data {Our world in Data}

## columns that we want to keep: 
cols = ['country', 'year', 'co2', 'co2_per_gdp', 'share_global_co2', 
        'co2_per_capita', 'co2_per_unit_energy', 'primary_energy_consumption', 
        'energy_per_capita', 'energy_per_gdp', 'population', 'gdp']

country_all = country_all[cols]

## countries to keep 

country = ['Australia', 'Austria', 'Belgium', 
           'Brazil', 'Bulgaria', 'Canada', 'Chile', 'China', 'Colombia', 
           'Cyprus', 'Czechia', 'Denmark',  'Ecuador',
           'Finland', 'France', 
           'Germany', 'Greece', 'Hungary', 'Iceland', 'India', 
           'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Italy',
           'Japan', 'South Korea',
           'Pakistan', 'Venezuela',
           'Luxembourg',  'Mexico', 'Netherlands', 'New Zealand', 
           'Norway', 'Poland', 'Portugal',
           'Romania', 'Saudi Arabia', 'Slovakia',
           'South Africa','Spain', 'Sweden', 'Switzerland', 'Turkey', 
           'United Kingdom', 'United States', 'Philippines', 'Egypt', 
           'Vietnam', 'Thailand', 'Hong Kong']

num_countries = len(country)

## dummy variable "data groups"
# Type 1: OPEC + Top Oil Producing Nations 

type_1 = ['Iran', 'Iraq', 'Nigeria', 'Libya', 'United Arab Emirates',
          'Kuwait', 'Saudia Arabia', 'Venezuela', 'United States', 
         'Russia', 'Canada', 'China', 'Brazil',
          'Mexico']

print("Type 1 :", len(type_1))

# Type 2: OECD Countries 
type_2 = ['Australia', 'Austria', 'Belgium', 'Canada', 'Chile', 'Colombia',
         'Denmark', 'Finland', 'France', 'Germany', 'Greece',
         'Hungary', 'Iceland', 'Ireland', 'Italy', 'Japan', 'South Korea',
         'Latvia', 'Lithuania', 'Luxembourg', 'Mexico', 'Netherlands',
         'New Zealand', 'Norway', 'Poland', 'Portugal', 'Slovakia',
         'Slovenia', 'Spain', 'Sweden', 'Switzerland', 'Turkey', 
         'United Kingdom', 'United States']

print("Type 2 :", len(type_2))

# Type 3: Highest Population
type_3 = ['China', 'India', 'United States', 'Indonesia', 'Pakistan', 
          'Brazil', 'Nigeria', 'Bangladesh', 'Russia', 'Mexico', 
          'Japan', 'Ethopia', 'Philippines', 'Egypt', 'Vietnam']

print("Type 3 :", len(type_3))

# Type 4: Most technologically advanced (Top 15)
# source: https://www.gfmag.com/global-data/non-economic-data/best-tech-countries
type_4 = ['Norway', 'Sweden', 'Netherlands', 'Denmark', 'United States', 
          'Singapore', 'Finland', 'United Arab Emirates', 'South Korea', 
          'Hong Kong', 'Switzerland', 'United Kingdom', 'Belgium', 'Canada', 'Australia']

print("Type 4 :", len(type_4))

# Type 5: Highest Renewable Energy Production (BY 2016) 
#source: https://en.wikipedia.org/wiki/List_of_countries_by_renewable_electricity_production  

type_5 = ['China', 'United States', 'India', 'Russia', 'Japan',
          'Canada', 'Brazil', 'South Korea', 'France', 'Germany', 
          'Saudi Arabia', 'United Kingdom', 'Mexico', 'Italy', 'Iran']

print('Type 5 :', len(type_5))

country_all['group1'], country_all['group2'], country_all['group3'], country_all['group4'], country_all['group5'] = np.zeros(len(country_all)), np.zeros(len(country_all)), np.zeros(len(country_all)), np.zeros(len(country_all)), np.zeros(len(country_all))
country_all['growth_co2'] = np.zeros(len(country_all))
country_all['growth_primary_energy'] = np.zeros(len(country_all))
country_all['growth_pop'] = np.zeros(len(country_all))

check = []
for i in country:
    df = country_all[country_all.country == i]
        
    if i in type_1: 
        df.group1 = 1
    if i in type_2: 
        df.group2 = 1
    if i in type_3: 
        df.group3 = 1
    if i in type_4: 
        df.group4 = 1
    if i in type_5: 
        df.group5 = 1
        
    co2, rate_co2 = np.array(df.co2), [0,]
    energy, rate_energy = np.array(df['primary_energy_consumption']), [0,]
    pop, rate_pop = np.array(df['population']), [0,]
    
    for i in range(1, len(co2)): 
        rate_co2.append((co2[i]-co2[i-1])/co2[i-1])
        rate_energy.append((energy[i]-energy[i-1])/energy[i-1])
        rate_pop.append((pop[i]-pop[i-1])/pop[i-1])
    
    df['growth_co2'] = rate_co2
    df['growth_primary_energy'] = rate_energy
    df['growth_pop'] = rate_pop
    
    check.append(df)

country_all = pd.concat(check)
country_all.reset_index(inplace = True)
country_all = country_all.drop(['index'], axis = 1)
print(num_countries)
print(len(np.unique(country_all.country)))

#export to csv 
country_all.to_csv('output/country-level.csv')

Type 1 : 14
Type 2 : 34
Type 3 : 15
Type 4 : 15
Type 5 : 15
51
51


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pa

In [265]:
country_all.loc[np.where(country_all.isnull())[0], :].country

520      Czechia
520      Czechia
520      Czechia
520      Czechia
520      Czechia
          ...   
1995    Slovakia
1995    Slovakia
1995    Slovakia
1995    Slovakia
1996    Slovakia
Name: country, Length: 362, dtype: object

In [267]:
country_all.head()

Unnamed: 0,country,year,co2,co2_per_gdp,share_global_co2,co2_per_capita,co2_per_unit_energy,primary_energy_consumption,energy_per_capita,energy_per_gdp,population,gdp,group1,group2,group3,group4,group5,growth_co2,growth_primary_energy,growth_pop
0,Australia,1965,120.853,0.626,1.072,10.683,0.299,404.19,35727.913,2.094,11313000.0,193000000000.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,Australia,1966,120.219,0.601,1.019,10.37,0.271,443.552,38260.344,2.218,11593000.0,200000000000.0,0.0,1.0,0.0,1.0,0.0,-0.005246,0.097385,0.02475
2,Australia,1967,129.144,0.609,1.061,10.852,0.275,470.455,39534.072,2.219,11900000.0,212000000000.0,0.0,1.0,0.0,1.0,0.0,0.07424,0.060654,0.026481
3,Australia,1968,134.497,0.6,1.048,11.01,0.27,498.948,40843.851,2.227,12216000.0,224000000000.0,0.0,1.0,0.0,1.0,0.0,0.04145,0.060565,0.026555
4,Australia,1969,142.124,0.597,1.038,11.353,0.274,519.445,41492.517,2.183,12519000.0,238000000000.0,0.0,1.0,0.0,1.0,0.0,0.056708,0.04108,0.024804


In [195]:
print(country_GHG.columns)
print(country_pop.columns)
country_GHG.head()

Index(['COU', 'Country', 'POL', 'Pollutant', 'VAR', 'Variable', 'YEA', 'Year',
       'Unit Code', 'Unit', 'PowerCode Code', 'PowerCode',
       'Reference Period Code', 'Reference Period', 'Value', 'Flag Codes',
       'Flags'],
      dtype='object')
Index(['LOCATION', 'Country', 'SEX', 'Sex', 'AGE', 'Age', 'VAR', 'Variant',
       'TIME', 'Time', 'Unit Code', 'Unit', 'PowerCode Code', 'PowerCode',
       'Reference Period Code', 'Reference Period', 'Value', 'Flag Codes',
       'Flags'],
      dtype='object')


Unnamed: 0,COU,Country,POL,Pollutant,VAR,Variable,YEA,Year,Unit Code,Unit,PowerCode Code,PowerCode,Reference Period Code,Reference Period,Value,Flag Codes,Flags
0,AUS,Australia,GHG,Greenhouse gases,TOTAL,Total emissions excluding LULUCF,1990,1990,T_CO2_EQVT,Tonnes of CO2 equivalent,3,Thousands,,,424998.381,,
1,AUS,Australia,GHG,Greenhouse gases,TOTAL,Total emissions excluding LULUCF,1991,1991,T_CO2_EQVT,Tonnes of CO2 equivalent,3,Thousands,,,426015.21,,
2,AUS,Australia,GHG,Greenhouse gases,TOTAL,Total emissions excluding LULUCF,1992,1992,T_CO2_EQVT,Tonnes of CO2 equivalent,3,Thousands,,,430216.38,,
3,AUS,Australia,GHG,Greenhouse gases,TOTAL,Total emissions excluding LULUCF,1993,1993,T_CO2_EQVT,Tonnes of CO2 equivalent,3,Thousands,,,430612.556,,
4,AUS,Australia,GHG,Greenhouse gases,TOTAL,Total emissions excluding LULUCF,1994,1994,T_CO2_EQVT,Tonnes of CO2 equivalent,3,Thousands,,,430653.332,,


In [184]:
country_GHG = pd.read_csv('OECD/AIR_GHG.csv')

cols = ['Country', 'Year', 'Value']
country_ghg = country_GHG[cols]
country_ghg.columns = ['Country', 'Year', 'GHG']


country = ['Algeria', 'Angola', 'Australia', 'Austria', 'Belgium', 
           'Brazil', 'Bulgaria', 'Canada', 'Chile', 
           'Democratic Republic of Congo', 'China', 'Colombia', 
           'Cyprus', 'Czech Republic', 'Denmark', 'Ecuador', 'Ethiopia',
           'Equatorial Guinea', 'Finland', 'France', 'Gabon',
           'Germany', 'Greece', 'Hungary', 'Iceland', 'India', 
           'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Italy',
           'Japan', 'Korea',  'Kuwait', 'Libya', 'Nigeria', 
           'Pakistan', 'Bangladesh', 'Russia', 
           'Qatar', 'United Arab Emirates', 'Venezuela',
           'Luxembourg',  'Mexico', 'Netherlands', 'New Zealand', 
           'Norway', 'OECD - Total', 'Poland', 'Portugal',
           'Romania', 'Saudi Arabia', 'Slovak Republic',
           'South Africa','Spain', 'Sweden', 'Switzerland', 'Turkey', 
           'United Kingdom', 'United States', 'Phillipines', 'Egypt', 
          'Vietnam', 'Thailand', ]

check = []
for i in country:
    df = country_ghg[country_ghg.Country == i]
    check.append(df)
    
country_ghg = pd.concat(check)
country_ghg.reset_index(inplace = True)

country_ghg.to_csv('output/country-all_ghg.csv')