# Data Cleanup

## Setup

### Import Packages

In [195]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from collections import OrderedDict
from datetime import datetime, date
from os import environ
import json

# Set ipython's max row display
pd.set_option('display.max_row', 1000)
# Set iPython's max column width to 50
pd.set_option('display.max_columns', 50)

sns.set_style("darkgrid")
plt.rcParams['figure.figsize'] = [10, 5]

### Global Variables

## Importing Dataset

### Combined dataset

In [311]:
read_name = "../data/bronze_tables/combined_dataset.csv"

df_combined = pd.read_csv(read_name)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [312]:
df_combined.head()

Unnamed: 0.1,Unnamed: 0,Entity_x,Code,Year,Annual CO2 emissions,Entity_y,Fossil fuels (% growth),Entity_x.1,Annual change in primary energy consumption (%),Entity_y.1,Gas Consumption - TWh,Coal Consumption - TWh,Oil Consumption - TWh,Entity_x.2,Fossil fuels (TWh),Entity_y.2,Coal Production - TWh,Oil Production - TWh,Gas Production - TWh,Entity_x.3,Fossil fuels per capita (kWh),Entity_y.3,Fossil fuels (% equivalent primary energy),Entity_x.4,"Gas (TWh, direct energy)","Oil (TWh, direct energy)","Coal (TWh, direct energy)",Entity_y.4,Geo Biomass Other - TWh,Solar Generation - TWh,Wind Generation - TWh,Hydro Generation - TWh,Entity_x.5,prod of Electricity from wind (TWh),prod of Electricity from hydro (TWh),prod of Electricity from solar (TWh),prod of Other renewables including bioenergy (TWh),Entity_y.5,Per capita electricity (kWh),Entity_x.6,Renewables per capita (kWh - equivalent),Entity_y.6,Renewables (% electricity)
0,0,Afghanistan,AFG,1949,14656.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,Afghanistan,AFG,1950,84272.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,Afghanistan,AFG,1951,91600.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,3,Afghanistan,AFG,1952,91600.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,4,Afghanistan,AFG,1953,106256.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [313]:
df_combined = df_combined[df_combined.columns[1:]]

In [314]:
df = df_combined.copy()

#### Unify Entity Columns

In [317]:
entity_columns = []
for col in df.columns:
    if "Entity" in col:
        entity_columns.append(col)

In [318]:
df_entities = df[entity_columns + ['Code']].drop_duplicates(entity_columns)

In [319]:
df['Entity'] = df['Entity_x']

for col in entity_columns:
    df['Entity'] = df['Entity'].fillna(df[col])

In [320]:
df['Entity'].isna().sum()

0

In [321]:
df = df.drop(columns=entity_columns)

In [322]:
df['Entity'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Argentina',
       'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan',
       'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
       'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia',
       'Bonaire Sint Eustatius and Saba', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'British Virgin Islands', 'Brunei',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon',
       'Canada', 'Cape Verde', 'Central African Republic', 'Chad',
       'Chile', 'China', 'Christmas Island', 'Colombia', 'Comoros',
       'Congo', 'Cook Islands', 'Costa Rica', "Cote d'Ivoire", 'Croatia',
       'Cuba', 'Curacao', 'Cyprus', 'Czechia',
       'Democratic Republic of Congo', 'Denmark', 'Djibouti', 'Dominica',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiop

#### Drop Duplicate Columns

In [270]:
cols = list(df.columns)
unique_cols = {}

for col in cols:
    unique_cols[col] = col.split('_', 1)[0]

In [271]:
df = df.rename(columns = unique_cols)

In [272]:
df = df.loc[:,~df.columns.duplicated()]

ordered_columns = ['Entity', *df.columns[:-1]]
df = df[ordered_columns]

In [273]:
df.head()

Unnamed: 0,Entity,Code,Year,Annual CO2 emissions,Fossil fuels (% growth),Annual change in primary energy consumption (%),Gas Consumption - TWh,Coal Consumption - TWh,Oil Consumption - TWh,Fossil fuels (TWh),Coal Production - TWh,Oil Production - TWh,Gas Production - TWh,Fossil fuels per capita (kWh),Fossil fuels (% equivalent primary energy),"Gas (TWh, direct energy)","Oil (TWh, direct energy)","Coal (TWh, direct energy)",Geo Biomass Other - TWh,Solar Generation - TWh,Wind Generation - TWh,Hydro Generation - TWh,prod of Electricity from wind (TWh),prod of Electricity from hydro (TWh),prod of Electricity from solar (TWh),prod of Other renewables including bioenergy (TWh),Per capita electricity (kWh),Renewables per capita (kWh - equivalent),Renewables (% electricity)
0,Afghanistan,AFG,1949,14656.0,,,,,,,,,,,,,,,,,,,,,,,,,
1,Afghanistan,AFG,1950,84272.0,,,,,,,,,,,,,,,,,,,,,,,,,
2,Afghanistan,AFG,1951,91600.0,,,,,,,,,,,,,,,,,,,,,,,,,
3,Afghanistan,AFG,1952,91600.0,,,,,,,,,,,,,,,,,,,,,,,,,
4,Afghanistan,AFG,1953,106256.0,,,,,,,,,,,,,,,,,,,,,,,,,


### Income Level Lookup table

In [274]:
read_name = "../data/bronze_tables/income_level_lookup.xlsx"

df_income_lookup = pd.read_excel(read_name, skiprows=0)

df_income_lookup.head()

Unnamed: 0,Economy,Code,Region,Income group,Lending category,Other (EMU or HIPC)
0,Aruba,ABW,Latin America & Caribbean,High income,,
1,Afghanistan,AFG,South Asia,Low income,IDA,HIPC
2,Angola,AGO,Sub-Saharan Africa,Lower middle income,IBRD,
3,Albania,ALB,Europe & Central Asia,Upper middle income,IBRD,
4,Andorra,AND,Europe & Central Asia,High income,,


In [275]:
df_income_lookup = df_income_lookup[df_income_lookup.columns[:5]]

#### join with combined_df

In [276]:
# df_income_lookup = df_income_lookup.rename(columns={'Economy': 'Entity'})
df = df.set_index('Code').join(df_income_lookup.set_index('Code')).reset_index()

### Population & Area

In [277]:
read_name = "../data/bronze_tables/API_EN.POP.DNST_DS2_en_csv_v2_4701323.csv"

df_population = pd.read_csv(read_name, skiprows=4)

df_population.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,...,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,Aruba,ABW,Population density (people per sq. km of land ...,EN.POP.DNST,,307.966667,312.411111,314.994444,316.827778,318.65,320.566667,322.466667,324.316667,326.3,328.166667,330.233333,332.494444,334.644444,336.261111,336.961111,336.588889,335.366667,333.9,333.177778,333.872222,...,484.888889,494.494444,504.811111,516.066667,527.733333,538.977778,548.577778,555.711111,560.166667,562.366667,563.122222,563.622222,564.805556,566.944444,569.805556,573.138889,576.533333,579.661111,582.583333,585.338889,588.033333,590.611111,593.144444,,
1,Africa Eastern and Southern,AFE,Population density (people per sq. km of land ...,EN.POP.DNST,,9.206929,9.444024,9.690214,9.945378,10.209576,10.482895,10.765896,11.059617,11.365371,11.684171,12.016528,12.362473,12.721761,13.094008,13.478945,13.876693,14.287579,14.711372,15.14866,15.599232,...,25.947604,26.625607,27.321141,28.035481,28.769895,29.52636,30.001659,30.801413,31.628679,32.483559,33.367205,34.279582,35.219737,36.189915,36.879528,37.898051,38.940522,40.004465,41.089451,42.195162,43.319881,44.462045,45.620592,,
2,Afghanistan,AFG,Population density (people per sq. km of land ...,EN.POP.DNST,,14.058547,14.337645,14.631648,14.940699,15.265041,15.60008,15.945197,16.308762,16.702347,17.131463,17.594177,18.078319,18.56548,19.031569,19.455045,19.844369,20.194247,20.454746,20.561857,20.478206,...,30.261978,30.925972,31.859861,33.127872,34.65154,36.307546,37.910996,39.333171,40.527204,41.550591,42.503842,43.534959,44.747269,46.176059,47.776671,49.475786,51.164166,52.762987,54.249311,55.649251,56.992046,58.325678,59.68499,,
3,Africa Western and Central,AFW,Population density (people per sq. km of land ...,EN.POP.DNST,,10.877837,11.10994,11.351399,11.601453,11.859717,12.12633,12.402029,12.687791,12.984878,13.294421,13.616671,13.952068,14.302085,14.669124,15.05382,15.456176,15.876543,16.313802,16.766255,17.232596,...,28.002263,28.761516,29.540244,30.338334,31.157344,32.000505,32.872024,33.77493,34.710713,35.678698,36.677028,37.702723,38.753638,39.829118,40.929916,42.056948,43.211639,44.394917,45.607045,46.847251,48.11408,49.405535,50.720207,,
4,Angola,AGO,Population density (people per sq. km of land ...,EN.POP.DNST,,4.436874,4.498676,4.555554,4.60014,4.628678,4.637286,4.63178,4.629801,4.655231,4.724761,4.845784,5.012405,5.211585,5.423617,5.634069,5.839119,6.043005,6.249117,6.463553,6.690695,...,12.320205,12.727096,13.151101,13.592487,14.052633,14.535555,15.046232,15.588036,16.162593,16.768557,17.402451,18.059096,18.734457,19.427817,20.139508,20.86772,21.61047,22.366552,23.135062,23.916555,24.713072,25.527632,26.362612,,


In [278]:
read_name = "../data/bronze_tables/API_AG.LND.TOTL.K2_DS2_en_csv_v2_4701206.csv"

df_area = pd.read_csv(read_name, skiprows=4)

df_area.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,...,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,Aruba,ABW,Land area (sq. km),AG.LND.TOTL.K2,,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,...,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,,
1,Africa Eastern and Southern,AFE,Land area (sq. km),AG.LND.TOTL.K2,,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,...,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14720190.0,14720240.0,14720230.0,14720270.0,14720236.89,14720270.0,14720960.0,14721240.05,14845170.0,14845130.0,14845090.0,14845140.0,14845150.0,14845140.0,14845150.0,14845160.0,14845120.0,,
2,Afghanistan,AFG,Land area (sq. km),AG.LND.TOTL.K2,,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,...,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,,
3,Africa Western and Central,AFW,Land area (sq. km),AG.LND.TOTL.K2,,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046180.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,...,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045773.0,,
4,Angola,AGO,Land area (sq. km),AG.LND.TOTL.K2,,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,...,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,,


In [279]:
df_pop_area = pd.concat([df_population, df_area])

df_pop_area = df_pop_area[df_population.columns[:-2]].drop(['Indicator Code'], axis=1)

In [280]:
df_pop_area.head(2)

Unnamed: 0,Country Name,Country Code,Indicator Name,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,...,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Aruba,ABW,Population density (people per sq. km of land ...,,307.966667,312.411111,314.994444,316.827778,318.65,320.566667,322.466667,324.316667,326.3,328.166667,330.233333,332.494444,334.644444,336.261111,336.961111,336.588889,335.366667,333.9,333.177778,333.872222,336.45,...,462.283333,474.722222,484.888889,494.494444,504.811111,516.066667,527.733333,538.977778,548.577778,555.711111,560.166667,562.366667,563.122222,563.622222,564.805556,566.944444,569.805556,573.138889,576.533333,579.661111,582.583333,585.338889,588.033333,590.611111,593.144444
1,Africa Eastern and Southern,AFE,Population density (people per sq. km of land ...,,9.206929,9.444024,9.690214,9.945378,10.209576,10.482895,10.765896,11.059617,11.365371,11.684171,12.016528,12.362473,12.721761,13.094008,13.478945,13.876693,14.287579,14.711372,15.14866,15.599232,16.062631,...,24.633762,25.284822,25.947604,26.625607,27.321141,28.035481,28.769895,29.52636,30.001659,30.801413,31.628679,32.483559,33.367205,34.279582,35.219737,36.189915,36.879528,37.898051,38.940522,40.004465,41.089451,42.195162,43.319881,44.462045,45.620592


In [281]:
df_pop_area = df_pop_area.melt(
    id_vars=df_pop_area.columns[:3], 
    value_vars=df_pop_area.columns[3:],
    var_name='Year'
).reset_index(drop=True)

df_pop_area.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Year,value
0,Aruba,ABW,Population density (people per sq. km of land ...,1960,
1,Africa Eastern and Southern,AFE,Population density (people per sq. km of land ...,1960,
2,Afghanistan,AFG,Population density (people per sq. km of land ...,1960,
3,Africa Western and Central,AFW,Population density (people per sq. km of land ...,1960,
4,Angola,AGO,Population density (people per sq. km of land ...,1960,


In [282]:
df_pop_area = df_pop_area.pivot(['Country Code', 'Year'], 'Indicator Name', 'value').reset_index()

In [283]:
df_pop_area['Year'] = df_pop_area['Year'].astype(np.number)

#### join with combined_df

In [284]:
df_pop_area = df_pop_area.rename(columns={'Country Name': 'Entity'})

In [285]:
df_pop_area.head()

Indicator Name,Country Code,Year,Land area (sq. km),Population density (people per sq. km of land area)
0,ABW,1960.0,,
1,ABW,1961.0,180.0,307.966667
2,ABW,1962.0,180.0,312.411111
3,ABW,1963.0,180.0,314.994444
4,ABW,1964.0,180.0,316.827778


In [286]:
df = df.set_index(['Code', 'Year']).join(df_pop_area.set_index(['Country Code', 'Year'])).reset_index()

### Rearrange Columns

In [288]:
cat_cols = ['Entity', 'Year', 'Code', 'Region', 'Income group', 'Lending category']
value_cols = list((set(df.columns) - set(cat_cols)) - {'Country Code'})

In [289]:
df = df[cat_cols + value_cols]

### Basic Analysis

In [291]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3548764 entries, 0 to 3548763
Data columns (total 35 columns):
 #   Column                                               Dtype  
---  ------                                               -----  
 0   Entity                                               object 
 1   Year                                                 int64  
 2   Code                                                 object 
 3   Region                                               object 
 4   Income group                                         object 
 5   Lending category                                     object 
 6   Renewables (% electricity)                           float64
 7   Coal (TWh, direct energy)                            float64
 8   Land area (sq. km)                                   float64
 9   Economy                                              object 
 10  Renewables per capita (kWh - equivalent)             float64
 11  Fossil fuels (% equivale

In [292]:
df.describe()

Unnamed: 0,Year,Renewables (% electricity),"Coal (TWh, direct energy)",Land area (sq. km),Renewables per capita (kWh - equivalent),Fossil fuels (% equivalent primary energy),Fossil fuels per capita (kWh),Annual CO2 emissions,Geo Biomass Other - TWh,Coal Consumption - TWh,Wind Generation - TWh,prod of Electricity from solar (TWh),Per capita electricity (kWh),Coal Production - TWh,Gas Production - TWh,Fossil fuels (TWh),Solar Generation - TWh,Oil Production - TWh,prod of Other renewables including bioenergy (TWh),Population density (people per sq. km of land area),"Gas (TWh, direct energy)","Oil (TWh, direct energy)",prod of Electricity from hydro (TWh),Fossil fuels (% growth),Annual change in primary energy consumption (%),Gas Consumption - TWh,Hydro Generation - TWh,prod of Electricity from wind (TWh),Oil Consumption - TWh
count,3548764.0,1481185.0,15179.0,3413246.0,1122327.0,1115945.0,1115945.0,3424429.0,1103980.0,1115948.0,1103980.0,1866353.0,1524543.0,362327.0,655474.0,1115945.0,1103980.0,665848.0,1849595.0,3191901.0,15179.0,15179.0,1884703.0,1110365.0,2466964.0,1122331.0,1122327.0,1866353.0,1122334.0
mean,1990.835,29.55008,28051.366649,5249864.0,4.521244,86.41344,32565.67,214347400.0,5.271232,749.2411,5.355192,1.071659,3889.834,1896.652969,927.879854,2324.555,1.801836,1702.987533,3.196876,291.7047,20841.881473,38418.751094,37.34946,3.51855,4.529973,556.2669,61.28578,3.18094,1011.397
std,17.94527,32.0891,9862.128852,15097250.0,13.3426,15.55028,34479.09,1742079000.0,32.96575,3740.826,54.27216,17.94764,5077.855,6091.733428,3556.924536,10968.32,23.30776,5833.765701,25.59484,1503.785,9417.580614,9569.777345,236.2517,27.2427,31.88959,2738.557,303.7947,41.82296,4682.507
min,1750.0,0.0,97.0,2.027,0.0,12.8047,155.8074,34.0,0.0,0.0,0.0,0.0,0.0,0.339596,0.0,0.09738055,0.0,0.0,0.0,0.09862452,0.0,0.0,0.0,-49.59083,-95.00508,0.0,0.0,0.0,0.09738056
25%,1976.0,1.30733,19457.904297,21640.0,0.1683346,80.80704,11525.3,593568.0,0.0,3.265833,0.0,0.0,564.2858,37.866039,45.318611,116.4674,0.0,102.93206,0.0,19.56125,13293.952148,33660.101562,0.014,-1.060772,-0.8340657,8.645,0.4206,0.0,64.54946
50%,1992.0,15.99574,25904.46875,199810.0,0.7866168,91.76839,25404.53,4968423.0,0.039,32.54101,0.0,0.0,2427.437,125.336014,140.884018,282.3127,0.0,344.388245,0.0,51.59331,20063.484375,37960.613281,1.52,2.466643,2.337921,52.59425,4.582918,0.0,142.5319
75%,2006.0,53.125,38069.035156,1260000.0,2.784231,97.84651,40255.04,42171000.0,1.299,168.1976,0.05770781,0.001094,5367.686,824.191345,399.114014,950.1196,0.004572,1135.137939,0.35,128.6045,28161.105469,46672.65625,9.815488,6.335747,6.949067,232.5982,22.91953,0.005,414.727
max,2021.0,100.0,45161.207031,129987000.0,153.8834,100.0,308704.2,36702500000.0,762.7827,45161.21,1861.94,1032.501,56781.6,46550.605469,40368.828125,136131.5,1032.501,52181.949219,762.7827,21388.6,40374.605469,53368.628906,4345.99,1553.105,1553.105,40374.61,4345.99,1861.94,53368.63


In [293]:
df.describe(include='object')

Unnamed: 0,Entity,Code,Region,Income group,Lending category,Economy
count,3548764,3548764,3283267,3266991,2248104,3283267
unique,235,235,7,4,3,211
top,World,GBR,Europe & Central Asia,High income,IBRD,United Kingdom
freq,16437,16437,838218,1155284,1119762,16437


### Data Clean-up

In [294]:
x = 'Entity'

#### Unify Entinty Names

In [295]:
df[x] = df[x].str.title()

#### Create Entity Category Column

In [296]:
df[x].nunique()

235

In [309]:
df[x].unique()

array(['United Kingdom', 'World', 'Canada', 'Germany', 'Poland',
       'United States', 'Belgium', 'France', 'Austria', 'Norway',
       'Armenia', 'Azerbaijan', 'Belarus', 'Spain', 'Estonia', 'Georgia',
       'Hungary', 'Kazakhstan', 'Kyrgyzstan', 'Lithuania', 'Latvia',
       'Moldova', 'Russia', 'Tajikistan', 'Turkmenistan', 'Ukraine',
       'Uzbekistan', 'Sweden', 'Denmark', 'Netherlands', 'Ireland',
       'Switzerland', 'India', 'Romania', 'Australia', 'Czechia',
       'Finland', 'Italy', 'Slovakia', 'Turkey', 'Greece', 'Japan',
       'Portugal', 'New Zealand', 'Bulgaria', 'Peru', 'South Africa',
       'Bosnia And Herzegovina', 'Croatia', 'North Macedonia',
       'Montenegro', 'Serbia', 'Slovenia', 'Argentina', 'Indonesia',
       'Malaysia', 'Mexico', 'Vietnam', 'Chile', 'Taiwan', 'China',
       'Brazil', 'Zimbabwe', 'Venezuela', 'South Korea', 'North Korea',
       'Iran', 'Philippines', 'Trinidad And Tobago', 'Egypt', 'Nigeria',
       'Algeria', 'Tunisia', 'Ecuador', 

In [297]:
continents = ['Asia', 'Africa', 'North America', 'South America', 'Europe', "Antarctica"]

countries_official_names = ['Afghanistan', 'Aland Islands', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia, Plurinational State of', 'Bonaire, Sint Eustatius and Saba', 'Bosnia and Herzegovina', 'Botswana', 'Bouvet Island', 'Brazil', 'British Indian Ocean Territory', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'China', 'Christmas Island', 'Cocos (Keeling) Islands', 'Colombia', 'Comoros', 'Congo', 'Congo, The Democratic Republic of the', 'Cook Islands', 'Costa Rica', "Côte d'Ivoire", 'Croatia', 'Cuba', 'Curaçao', 'Cyprus', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Falkland Islands (Malvinas)', 'Faroe Islands', 'Fiji', 'Finland', 'France', 'French Guiana', 'French Polynesia', 'French Southern Territories', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Gibraltar', 'Greece', 'Greenland', 'Grenada', 'Guadeloupe', 'Guam', 'Guatemala', 'Guernsey', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Heard Island and McDonald Islands', 'Holy See (Vatican City State)', 'Honduras', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran, Islamic Republic of', 'Iraq', 'Ireland', 'Isle of Man', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jersey', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', "Korea, Democratic People's Republic of", 'Korea, Republic of', 'Kuwait', 'Kyrgyzstan', "Lao People's Democratic Republic", 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macao', 'Macedonia, Republic of', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Marshall Islands', 'Martinique', 'Mauritania', 'Mauritius', 'Mayotte', 'Mexico', 'Micronesia, Federated States of', 'Moldova, Republic of', 'Monaco', 'Mongolia', 'Montenegro', 'Montserrat', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Caledonia', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Niue', 'Norfolk Island', 'Northern Mariana Islands', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Palestinian Territory, Occupied', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Pitcairn', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar', 'Réunion', 'Romania', 'Russian Federation', 'Rwanda', 'Saint Barthélemy', 'Saint Helena, Ascension and Tristan da Cunha', 'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Martin (French part)', 'Saint Pierre and Miquelon', 'Saint Vincent and the Grenadines', 'Samoa', 'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore', 'Sint Maarten (Dutch part)', 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 'South Georgia and the South Sandwich Islands', 'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'South Sudan', 'Svalbard and Jan Mayen', 'Swaziland', 'Sweden', 'Switzerland', 'Syrian Arab Republic', 'Taiwan, Province of China', 'Tajikistan', 'Tanzania, United Republic of', 'Thailand', 'Timor-Leste', 'Togo', 'Tokelau', 'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Turkmenistan', 'Turks and Caicos Islands', 'Tuvalu', 'Uganda', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States', 'United States Minor Outlying Islands', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela, Bolivarian Republic of', 'Viet Nam', 'Virgin Islands, British', 'Virgin Islands, U.S.', 'Wallis and Futuna', 'Yemen', 'Zambia', 'Zimbabwe']
other_countries = [
       'Antigua And Barbuda', 'Bolivia',
       'Bonaire Sint Eustatius And Saba', 'Bosnia And Herzegovina',
       'British Virgin Islands', 'Brunei', "Cote D'Ivoire", 'Curacao',
       'Czechia', 'Democratic Republic Of Congo', 'Eswatini',
       'Faeroe Islands', 'French Equatorial Africa', 'French West Africa',
       'Iran',
       'Kosovo',  'Laos', 'Leeward Islands',
       'Moldova',
       'North Korea', 'North Macedonia', 'Oceania', 'Palestine',
       'Panama Canal Zone', 'Reunion', 'Russia', 'Ryukyu Islands',
       'Saint Helena', 'Saint Kitts And Nevis',
       'Saint Pierre And Miquelon', 'Saint Vincent And The Grenadines',
       'Sao Tome And Principe', 'South Korea', 'St. Kitts-Nevis-Anguilla',
       'Syria', 'Taiwan', 'Tanzania', 'Timor', 'Trinidad And Tobago',
       'Turks And Caicos Islands',
       'Venezuela', 'Vietnam', 'Wallis And Futuna', 'Ussr',
       'Czechoslovakia', 'Falkland Islands', 'Netherlands Antilles',
       'Serbia And Montenegro', 'United States Virgin Islands',
       'Western Sahara', 'Yugoslavia',
       'Micronesia (Country)', 'Sint Maarten (Dutch Part)'
                  ]
countries = countries_official_names + other_countries

In [298]:
df[x].isna().sum()

0

In [299]:
def assign_category_to_entity(s):

    if "(" in s:
        if "(Bp)" in s:
            return "Bp?"
        if "(Eia)" in s:
            return "Eia?"
        if "(Ember)" in s:
            return "Ember?"
        if s in countries:
            return "Country"
        else:
            return "Exlusive Regions"

    else:
        if s == "World":
            return "World"
        elif s in continents:
            return "Continent"
        elif s in countries:
            return "Country"
        elif "Income" in s:
            return "Income Level"
        else:
            return "Other"

In [300]:
df['Entity_Category'] = df[x].apply(assign_category_to_entity)

In [301]:
df['Entity_Category'].value_counts()

Country      3521953
World          16437
Continent      10374
Name: Entity_Category, dtype: int64

In [310]:
df.loc[df['Entity_Category'] == 'Continent']

Unnamed: 0,Entity,Year,Code,Region,Income group,Lending category,Renewables (% electricity),"Coal (TWh, direct energy)",Land area (sq. km),Economy,Renewables per capita (kWh - equivalent),Fossil fuels (% equivalent primary energy),Fossil fuels per capita (kWh),Annual CO2 emissions,Geo Biomass Other - TWh,Coal Consumption - TWh,Wind Generation - TWh,prod of Electricity from solar (TWh),Per capita electricity (kWh),Coal Production - TWh,Gas Production - TWh,Fossil fuels (TWh),Solar Generation - TWh,Oil Production - TWh,prod of Other renewables including bioenergy (TWh),Population density (people per sq. km of land area),"Gas (TWh, direct energy)","Oil (TWh, direct energy)",prod of Electricity from hydro (TWh),Fossil fuels (% growth),Annual change in primary energy consumption (%),Gas Consumption - TWh,Hydro Generation - TWh,prod of Electricity from wind (TWh),Oil Consumption - TWh,Entity_Category
1133105,Antarctica,1981,ATA,,,,,,180.0,,,,,,,,,,,,,,,,,336.450000,,,,,-0.273222,,,,,Continent
1133106,Antarctica,1981,ATA,,,,,,14571611.0,,,,,,,,,,,,,,,,,16.062631,,,,,-0.273222,,,,,Continent
1133107,Antarctica,1981,ATA,,,,,,652230.0,,,,,,,,,,,,,,,,,20.194838,,,,,-0.273222,,,,,Continent
1133108,Antarctica,1981,ATA,,,,,,9045780.0,,,,,,,,,,,,,,,,,17.712523,,,,,-0.273222,,,,,Continent
1133109,Antarctica,1981,ATA,,,,,,1246700.0,,,,,,,,,,,,,,,,,6.930679,,,,,-0.273222,,,,,Continent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3429478,Antarctica,2019,ATA,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000072,,,,,Continent
3429479,Antarctica,2019,ATA,,,,,,527970.0,,,,,,,,,,,,,,,,,55.234051,,,,,0.000072,,,,,Continent
3429480,Antarctica,2019,ATA,,,,,,1213090.0,,,,,,,,,,,,,,,,,48.271989,,,,,0.000072,,,,,Continent
3429481,Antarctica,2019,ATA,,,,,,743390.0,,,,,,,,,,,,,,,,,24.026465,,,,,0.000072,,,,,Continent


In [302]:
df_entity_lookup = df[['Entity_Category', 'Entity', 'Code', 'Income group', 'Lending category']].drop_duplicates().sort_values(by='Entity_Category')

In [306]:
df_entity_lookup['Entity_Category'].value_counts()

Country      233
Continent      1
World          1
Name: Entity_Category, dtype: int64

In [307]:
df_entity_lookup.head()

Unnamed: 0,Entity_Category,Entity,Code,Income group,Lending category
1133105,Continent,Antarctica,ATA,,
0,Country,United Kingdom,GBR,High income,
6818,Country,Nepal,NPL,Lower middle income,IDA
6825,Country,Papua New Guinea,PNG,Lower middle income,Blend
6829,Country,Paraguay,PRY,Upper middle income,IBRD


In [305]:
df.head()

Unnamed: 0,Entity,Year,Code,Region,Income group,Lending category,Renewables (% electricity),"Coal (TWh, direct energy)",Land area (sq. km),Economy,Renewables per capita (kWh - equivalent),Fossil fuels (% equivalent primary energy),Fossil fuels per capita (kWh),Annual CO2 emissions,Geo Biomass Other - TWh,Coal Consumption - TWh,Wind Generation - TWh,prod of Electricity from solar (TWh),Per capita electricity (kWh),Coal Production - TWh,Gas Production - TWh,Fossil fuels (TWh),Solar Generation - TWh,Oil Production - TWh,prod of Other renewables including bioenergy (TWh),Population density (people per sq. km of land area),"Gas (TWh, direct energy)","Oil (TWh, direct energy)",prod of Electricity from hydro (TWh),Fossil fuels (% growth),Annual change in primary energy consumption (%),Gas Consumption - TWh,Hydro Generation - TWh,prod of Electricity from wind (TWh),Oil Consumption - TWh,Entity_Category
0,United Kingdom,1750,GBR,Europe & Central Asia,High income,,,,,United Kingdom,,,,9350528.0,,,,,,,,,,,,,,,,,,,,,,Country
1,World,1750,OWID_WRL,,,,,,,,,,,9350528.0,,,,,,,,,,,,,,,,,,,,,,World
2,United Kingdom,1751,GBR,Europe & Central Asia,High income,,,,,United Kingdom,,,,9350528.0,,,,,,,,,,,,,,,,,,,,,,Country
3,World,1751,OWID_WRL,,,,,,,,,,,9350528.0,,,,,,,,,,,,,,,,,,,,,,World
4,United Kingdom,1752,GBR,Europe & Central Asia,High income,,,,,United Kingdom,,,,9354192.0,,,,,,,,,,,,,,,,,,,,,,Country


## Save Tables

In [39]:
df.head()

Unnamed: 0,Entity,Year,Code,Region,Income group,Lending category,Fossil fuels (% equivalent primary energy),Fossil fuels (% growth),Gas Production - TWh,Land area (sq. km),Unnamed: 0.1.1.1,Coal Consumption - TWh,Renewables per capita (kWh - equivalent),Wind Generation - TWh,prod of Other renewables including bioenergy (TWh),Fossil fuels per capita (kWh),Fossil fuels (TWh),Oil Production - TWh,Coal Production - TWh,Hydro Generation - TWh,Unnamed: 0.1.1,prod of Electricity from solar (TWh),prod of Electricity from hydro (TWh),Unnamed: 0.1,"Gas (TWh, direct energy)",Annual change in primary energy consumption (%),Geo Biomass Other - TWh,"Coal (TWh, direct energy)",Per capita electricity (kWh),Oil Consumption - TWh,Renewables (% electricity),Population density (people per sq. km of land area),Solar Generation - TWh,Gas Consumption - TWh,"Oil (TWh, direct energy)",Annual CO2 emissions,prod of Electricity from wind (TWh),Entity_Category
0,Afghanistan,1949,AFG,South Asia,Low income,IDA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14656.0,,Country
1,Afghanistan,1950,AFG,South Asia,Low income,IDA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,84272.0,,Country
2,Afghanistan,1951,AFG,South Asia,Low income,IDA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,91600.0,,Country
3,Afghanistan,1952,AFG,South Asia,Low income,IDA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,91600.0,,Country
4,Afghanistan,1953,AFG,South Asia,Low income,IDA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,106256.0,,Country


In [40]:
df.to_csv('../data/silver_tables/yearly_values_per_entity.csv', index=False)

In [41]:
df_entity_lookup.to_csv('../data/silver_tables/entity_lookup.csv', index=False)