# Data Cleanup

## Setup

### Import Packages

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from collections import OrderedDict
from datetime import datetime, date
from os import environ
import json

# Set ipython's max row display
pd.set_option('display.max_row', 1000)
# Set iPython's max column width to 50
pd.set_option('display.max_columns', 50)

sns.set_style("darkgrid")
plt.rcParams['figure.figsize'] = [10, 5]

### Global Variables

## Importing Dataset

### Combined dataset

In [2]:
read_name = "../data/bronze_tables/combined_dataset.csv"

df_combined = pd.read_csv(read_name)

In [3]:
df_combined.head()

Unnamed: 0.2,Unnamed: 0,Entity,Year,Annual CO2 emissions,Fossil fuels (% growth)_x,Annual change in primary energy consumption (%)_x,Unnamed: 0.1,Fossil fuels (% growth)_y,Annual change in primary energy consumption (%)_y,Unnamed: 0.1.1,Fossil fuels (% growth)_x.1,Annual change in primary energy consumption (%)_x.1,Unnamed: 0.1.1.1,Fossil fuels (% growth)_y.1,Annual change in primary energy consumption (%)_y.1,Gas Consumption - TWh_x,Coal Consumption - TWh_x,Oil Consumption - TWh_x,Fossil fuels (TWh)_x,Coal Production - TWh_x,Oil Production - TWh_x,Gas Production - TWh_x,Fossil fuels per capita (kWh)_x,Fossil fuels (% equivalent primary energy)_x,"Gas (TWh, direct energy)_x",...,Renewables per capita (kWh - equivalent)_x.1,Renewables (% electricity)_x.1,Gas Consumption - TWh_y.1,Coal Consumption - TWh_y.1,Oil Consumption - TWh_y.1,Fossil fuels (TWh)_y.1,Coal Production - TWh_y.1,Oil Production - TWh_y.1,Gas Production - TWh_y.1,Fossil fuels per capita (kWh)_y.1,Fossil fuels (% equivalent primary energy)_y.1,"Gas (TWh, direct energy)_y.1","Oil (TWh, direct energy)_y.1","Coal (TWh, direct energy)_y.1",Geo Biomass Other - TWh_y.1,Solar Generation - TWh_y.1,Wind Generation - TWh_y.1,Hydro Generation - TWh_y.1,prod of Electricity from wind (TWh)_y.1,prod of Electricity from hydro (TWh)_y.1,prod of Electricity from solar (TWh)_y.1,prod of Other renewables including bioenergy (TWh)_y.1,Per capita electricity (kWh)_y.1,Renewables per capita (kWh - equivalent)_y.1,Renewables (% electricity)_y.1
0,0,Afghanistan,1949,14656.0,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,Afghanistan,1950,84272.0,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,Afghanistan,1951,91600.0,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
3,3,Afghanistan,1952,91600.0,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,4,Afghanistan,1953,106256.0,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,


In [4]:
df_combined = df_combined[df_combined.columns[1:]]

In [5]:
df = df_combined.copy()

#### Drop Duplicate Columns

In [6]:
cols = list(df.columns)
unique_cols = {}

for col in cols:
    unique_cols[col] = col.split('_', 1)[0]

In [7]:
df = df.rename(columns = unique_cols)

In [8]:
df = df.loc[:,~df.columns.duplicated()]

### Income Level Lookup table

In [9]:
read_name = "../data/bronze_tables/income_level_lookup.xlsx"

df_income_lookup = pd.read_excel(read_name, skiprows=0)

df_income_lookup.head()

Unnamed: 0,Economy,Code,Region,Income group,Lending category,Other (EMU or HIPC)
0,Aruba,ABW,Latin America & Caribbean,High income,,
1,Afghanistan,AFG,South Asia,Low income,IDA,HIPC
2,Angola,AGO,Sub-Saharan Africa,Lower middle income,IBRD,
3,Albania,ALB,Europe & Central Asia,Upper middle income,IBRD,
4,Andorra,AND,Europe & Central Asia,High income,,


In [10]:
df_income_lookup = df_income_lookup[df_income_lookup.columns[:5]]

#### join with combined_df

In [11]:
df_income_lookup = df_income_lookup.rename(columns={'Economy': 'Entity'})
df = df.set_index('Entity').join(df_income_lookup.set_index('Entity')).reset_index()

### Population & Area

In [12]:
read_name = "../data/bronze_tables/API_EN.POP.DNST_DS2_en_csv_v2_4701323.csv"

df_population = pd.read_csv(read_name, skiprows=4)

df_population.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,...,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,Aruba,ABW,Population density (people per sq. km of land ...,EN.POP.DNST,,307.966667,312.411111,314.994444,316.827778,318.65,320.566667,322.466667,324.316667,326.3,328.166667,330.233333,332.494444,334.644444,336.261111,336.961111,336.588889,335.366667,333.9,333.177778,333.872222,...,484.888889,494.494444,504.811111,516.066667,527.733333,538.977778,548.577778,555.711111,560.166667,562.366667,563.122222,563.622222,564.805556,566.944444,569.805556,573.138889,576.533333,579.661111,582.583333,585.338889,588.033333,590.611111,593.144444,,
1,Africa Eastern and Southern,AFE,Population density (people per sq. km of land ...,EN.POP.DNST,,9.206929,9.444024,9.690214,9.945378,10.209576,10.482895,10.765896,11.059617,11.365371,11.684171,12.016528,12.362473,12.721761,13.094008,13.478945,13.876693,14.287579,14.711372,15.14866,15.599232,...,25.947604,26.625607,27.321141,28.035481,28.769895,29.52636,30.001659,30.801413,31.628679,32.483559,33.367205,34.279582,35.219737,36.189915,36.879528,37.898051,38.940522,40.004465,41.089451,42.195162,43.319881,44.462045,45.620592,,
2,Afghanistan,AFG,Population density (people per sq. km of land ...,EN.POP.DNST,,14.058547,14.337645,14.631648,14.940699,15.265041,15.60008,15.945197,16.308762,16.702347,17.131463,17.594177,18.078319,18.56548,19.031569,19.455045,19.844369,20.194247,20.454746,20.561857,20.478206,...,30.261978,30.925972,31.859861,33.127872,34.65154,36.307546,37.910996,39.333171,40.527204,41.550591,42.503842,43.534959,44.747269,46.176059,47.776671,49.475786,51.164166,52.762987,54.249311,55.649251,56.992046,58.325678,59.68499,,
3,Africa Western and Central,AFW,Population density (people per sq. km of land ...,EN.POP.DNST,,10.877837,11.10994,11.351399,11.601453,11.859717,12.12633,12.402029,12.687791,12.984878,13.294421,13.616671,13.952068,14.302085,14.669124,15.05382,15.456176,15.876543,16.313802,16.766255,17.232596,...,28.002263,28.761516,29.540244,30.338334,31.157344,32.000505,32.872024,33.77493,34.710713,35.678698,36.677028,37.702723,38.753638,39.829118,40.929916,42.056948,43.211639,44.394917,45.607045,46.847251,48.11408,49.405535,50.720207,,
4,Angola,AGO,Population density (people per sq. km of land ...,EN.POP.DNST,,4.436874,4.498676,4.555554,4.60014,4.628678,4.637286,4.63178,4.629801,4.655231,4.724761,4.845784,5.012405,5.211585,5.423617,5.634069,5.839119,6.043005,6.249117,6.463553,6.690695,...,12.320205,12.727096,13.151101,13.592487,14.052633,14.535555,15.046232,15.588036,16.162593,16.768557,17.402451,18.059096,18.734457,19.427817,20.139508,20.86772,21.61047,22.366552,23.135062,23.916555,24.713072,25.527632,26.362612,,


In [13]:
read_name = "../data/bronze_tables/API_AG.LND.TOTL.K2_DS2_en_csv_v2_4701206.csv"

df_area = pd.read_csv(read_name, skiprows=4)

df_area.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,...,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,Aruba,ABW,Land area (sq. km),AG.LND.TOTL.K2,,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,...,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,,
1,Africa Eastern and Southern,AFE,Land area (sq. km),AG.LND.TOTL.K2,,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,...,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14571611.0,14720190.0,14720240.0,14720230.0,14720270.0,14720236.89,14720270.0,14720960.0,14721240.05,14845170.0,14845130.0,14845090.0,14845140.0,14845150.0,14845140.0,14845150.0,14845160.0,14845120.0,,
2,Afghanistan,AFG,Land area (sq. km),AG.LND.TOTL.K2,,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,...,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,652230.0,,
3,Africa Western and Central,AFW,Land area (sq. km),AG.LND.TOTL.K2,,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046580.0,9046180.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,...,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045780.0,9045773.0,,
4,Angola,AGO,Land area (sq. km),AG.LND.TOTL.K2,,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,...,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,1246700.0,,


In [14]:
df_pop_area = pd.concat([df_population, df_area])

df_pop_area = df_pop_area[df_population.columns[:-2]].drop(['Indicator Code'], axis=1)

In [15]:
df_pop_area.head(2)

Unnamed: 0,Country Name,Country Code,Indicator Name,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,...,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Aruba,ABW,Population density (people per sq. km of land ...,,307.966667,312.411111,314.994444,316.827778,318.65,320.566667,322.466667,324.316667,326.3,328.166667,330.233333,332.494444,334.644444,336.261111,336.961111,336.588889,335.366667,333.9,333.177778,333.872222,336.45,...,462.283333,474.722222,484.888889,494.494444,504.811111,516.066667,527.733333,538.977778,548.577778,555.711111,560.166667,562.366667,563.122222,563.622222,564.805556,566.944444,569.805556,573.138889,576.533333,579.661111,582.583333,585.338889,588.033333,590.611111,593.144444
1,Africa Eastern and Southern,AFE,Population density (people per sq. km of land ...,,9.206929,9.444024,9.690214,9.945378,10.209576,10.482895,10.765896,11.059617,11.365371,11.684171,12.016528,12.362473,12.721761,13.094008,13.478945,13.876693,14.287579,14.711372,15.14866,15.599232,16.062631,...,24.633762,25.284822,25.947604,26.625607,27.321141,28.035481,28.769895,29.52636,30.001659,30.801413,31.628679,32.483559,33.367205,34.279582,35.219737,36.189915,36.879528,37.898051,38.940522,40.004465,41.089451,42.195162,43.319881,44.462045,45.620592


In [16]:
df_pop_area = df_pop_area.melt(
    id_vars=df_pop_area.columns[:3], 
    value_vars=df_pop_area.columns[3:],
    var_name='Year'
).reset_index(drop=True)

df_pop_area.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Year,value
0,Aruba,ABW,Population density (people per sq. km of land ...,1960,
1,Africa Eastern and Southern,AFE,Population density (people per sq. km of land ...,1960,
2,Afghanistan,AFG,Population density (people per sq. km of land ...,1960,
3,Africa Western and Central,AFW,Population density (people per sq. km of land ...,1960,
4,Angola,AGO,Population density (people per sq. km of land ...,1960,


In [17]:
df_pop_area = df_pop_area.pivot(['Country Name', 'Country Code', 'Year'], 'Indicator Name', 'value').reset_index()

In [18]:
df_pop_area['Year'] = df_pop_area['Year'].astype(np.number)

#### join with combined_df

In [19]:
df_pop_area = df_pop_area.rename(columns={'Country Name': 'Entity'})

In [20]:
df_pop_area.head()

Indicator Name,Entity,Country Code,Year,Land area (sq. km),Population density (people per sq. km of land area)
0,Afghanistan,AFG,1960.0,,
1,Afghanistan,AFG,1961.0,652230.0,14.058547
2,Afghanistan,AFG,1962.0,652230.0,14.337645
3,Afghanistan,AFG,1963.0,652230.0,14.631648
4,Afghanistan,AFG,1964.0,652230.0,14.940699


In [21]:
df = df.set_index(['Entity', 'Year']).join(df_pop_area.set_index(['Entity', 'Year'])).reset_index()

### Rearrange Columns

In [22]:
cat_cols = ['Entity', 'Year', 'Code', 'Region', 'Income group', 'Lending category']
value_cols = list((set(df.columns) - set(cat_cols)) - {'Country Code'})

In [23]:
df = df[cat_cols + value_cols]

### Basic Analysis

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27960 entries, 0 to 27959
Data columns (total 37 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   Entity                                               27960 non-null  object 
 1   Year                                                 27960 non-null  int64  
 2   Code                                                 18173 non-null  object 
 3   Region                                               17664 non-null  object 
 4   Income group                                         17664 non-null  object 
 5   Lending category                                     11122 non-null  object 
 6   Fossil fuels (% equivalent primary energy)           6068 non-null   float64
 7   Fossil fuels (% growth)                              6034 non-null   float64
 8   Gas Production - TWh                                 4053 non-null

In [25]:
df.describe()

Unnamed: 0,Year,Fossil fuels (% equivalent primary energy),Fossil fuels (% growth),Gas Production - TWh,Land area (sq. km),Unnamed: 0.1.1.1,Coal Consumption - TWh,Renewables per capita (kWh - equivalent),Wind Generation - TWh,prod of Other renewables including bioenergy (TWh),Fossil fuels per capita (kWh),Fossil fuels (TWh),Oil Production - TWh,Coal Production - TWh,Hydro Generation - TWh,Unnamed: 0.1.1,prod of Electricity from solar (TWh),prod of Electricity from hydro (TWh),Unnamed: 0.1,"Gas (TWh, direct energy)",Annual change in primary energy consumption (%),Geo Biomass Other - TWh,"Coal (TWh, direct energy)",Per capita electricity (kWh),Oil Consumption - TWh,Renewables (% electricity),Population density (people per sq. km of land area),Solar Generation - TWh,Gas Consumption - TWh,"Oil (TWh, direct energy)",Annual CO2 emissions,prod of Electricity from wind (TWh)
count,27960.0,6068.0,6034.0,4053.0,10559.0,13181.0,6134.0,4862.0,5939.0,8899.0,4840.0,6068.0,4408.0,2500.0,6153.0,13181.0,8939.0,9170.0,13181.0,74.0,12261.0,5948.0,74.0,6220.0,6160.0,7008.0,9717.0,5924.0,6094.0,74.0,24670.0,8953.0
mean,1958.997997,85.831892,3.317557,1775.709862,1433329.0,6590.0,1625.004354,4.339072,13.805802,10.535739,31653.758962,5136.950645,3466.583599,3274.685893,136.73628,6590.0,4.42386,105.856874,6590.0,16669.84959,4.125264,12.44599,23161.696104,3950.509462,2228.388291,30.48827,161.785499,5.047541,1228.19854,30469.016364,326658300.0,12.191714
std,52.198663,14.603942,22.875763,4042.829666,9901976.0,3805.171284,4907.816698,12.693478,81.229086,45.6427,33092.712344,13737.030117,6955.447481,6908.166717,374.128795,3805.171284,37.61014,346.000705,3805.171284,12227.579351,27.879834,45.803265,13521.988256,5026.958829,5840.259523,30.934145,475.88973,38.265463,3413.741868,17904.778196,1677027000.0,81.726038
min,1750.0,12.804697,-49.590828,0.0,10.0,0.0,0.0,0.0,0.0,0.0,155.807388,0.097381,0.0,0.339596,0.0,0.0,0.0,0.0,0.0,0.0,-95.005081,0.0,97.0,0.0,0.097381,0.0,0.098625,0.0,0.0,0.0,34.0,0.0
25%,1934.0,80.50989,-0.53191,44.498001,23180.0,3295.0,4.537538,0.208465,0.0,0.0,10325.921631,149.484257,100.963156,38.597149,1.389,3295.0,0.0,0.1,3295.0,6995.105713,-0.349981,0.0,16180.420654,596.250275,80.953899,3.552361,17.792095,0.0,12.293094,19688.894043,556928.0,0.0
50%,1973.0,90.139271,2.589911,192.537277,111890.0,6590.0,46.895945,0.89916,0.0,0.02,25337.834961,418.682419,467.141678,357.479385,10.712626,6590.0,0.0,3.5655,6590.0,16082.451172,2.46042,0.201,23494.454102,2502.079101,208.418251,18.145459,60.702633,0.0,81.496605,35453.982422,5332958.0,0.0
75%,1998.0,96.528872,5.987856,992.275024,499460.0,9885.0,442.860115,3.228757,0.22132,1.333982,40323.15918,1808.173248,3527.187439,2806.387512,59.555,9885.0,0.01,28.66731,9885.0,25554.728027,6.231642,3.617,30863.200195,5627.8219,999.276443,53.127329,138.575929,0.016565,429.343689,44337.702148,48153090.0,0.04
max,2021.0,100.000015,1553.10498,40368.828125,129987000.0,13180.0,45161.207031,153.883406,1861.939819,762.782654,308704.21875,136131.46875,52181.949219,46550.605469,4345.990234,13180.0,1032.501221,4345.990234,13180.0,40374.605469,1553.10498,762.782654,45161.207031,56781.60156,53368.628906,100.0,7965.878492,1032.501221,40374.605469,53368.628906,36702500000.0,1861.939819


In [26]:
df.describe(include='object')

Unnamed: 0,Entity,Code,Region,Income group,Lending category
count,27960,18173,17664,17664,11122
unique,321,181,7,4,3
top,Europe,GBR,Europe & Central Asia,High income,IBRD
freq,272,272,7103,7516,6856


### Data Clean-up

In [27]:
x = 'Entity'

#### Unify Entinty Names

In [28]:
df[x] = df[x].str.title()

In [29]:
df[x].value_counts()

United Kingdom                                  272
World                                           272
High-Income Countries                           272
Europe                                          272
European Union (28)                             271
Europe (Excl. Eu-27)                            271
North America                                   237
Canada                                          237
North America (Excl. Usa)                       236
European Union (27)                             230
Germany                                         230
United States                                   222
Poland                                          222
France                                          214
Austria                                         204
Belgium                                         193
Spain                                           192
Europe (Excl. Eu-28)                            191
Norway                                          190
Hungary     

#### Create Entity Category Column

In [30]:
df[x].nunique()

321

In [31]:
continents = ['Asia', 'Africa', 'North America', 'South America', 'Europe', "Antarctica"]

countries_official_names = ['Afghanistan', 'Aland Islands', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia, Plurinational State of', 'Bonaire, Sint Eustatius and Saba', 'Bosnia and Herzegovina', 'Botswana', 'Bouvet Island', 'Brazil', 'British Indian Ocean Territory', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'China', 'Christmas Island', 'Cocos (Keeling) Islands', 'Colombia', 'Comoros', 'Congo', 'Congo, The Democratic Republic of the', 'Cook Islands', 'Costa Rica', "Côte d'Ivoire", 'Croatia', 'Cuba', 'Curaçao', 'Cyprus', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Falkland Islands (Malvinas)', 'Faroe Islands', 'Fiji', 'Finland', 'France', 'French Guiana', 'French Polynesia', 'French Southern Territories', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Gibraltar', 'Greece', 'Greenland', 'Grenada', 'Guadeloupe', 'Guam', 'Guatemala', 'Guernsey', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Heard Island and McDonald Islands', 'Holy See (Vatican City State)', 'Honduras', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran, Islamic Republic of', 'Iraq', 'Ireland', 'Isle of Man', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jersey', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', "Korea, Democratic People's Republic of", 'Korea, Republic of', 'Kuwait', 'Kyrgyzstan', "Lao People's Democratic Republic", 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macao', 'Macedonia, Republic of', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Marshall Islands', 'Martinique', 'Mauritania', 'Mauritius', 'Mayotte', 'Mexico', 'Micronesia, Federated States of', 'Moldova, Republic of', 'Monaco', 'Mongolia', 'Montenegro', 'Montserrat', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Caledonia', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Niue', 'Norfolk Island', 'Northern Mariana Islands', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Palestinian Territory, Occupied', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Pitcairn', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar', 'Réunion', 'Romania', 'Russian Federation', 'Rwanda', 'Saint Barthélemy', 'Saint Helena, Ascension and Tristan da Cunha', 'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Martin (French part)', 'Saint Pierre and Miquelon', 'Saint Vincent and the Grenadines', 'Samoa', 'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore', 'Sint Maarten (Dutch part)', 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 'South Georgia and the South Sandwich Islands', 'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'South Sudan', 'Svalbard and Jan Mayen', 'Swaziland', 'Sweden', 'Switzerland', 'Syrian Arab Republic', 'Taiwan, Province of China', 'Tajikistan', 'Tanzania, United Republic of', 'Thailand', 'Timor-Leste', 'Togo', 'Tokelau', 'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Turkmenistan', 'Turks and Caicos Islands', 'Tuvalu', 'Uganda', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States', 'United States Minor Outlying Islands', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela, Bolivarian Republic of', 'Viet Nam', 'Virgin Islands, British', 'Virgin Islands, U.S.', 'Wallis and Futuna', 'Yemen', 'Zambia', 'Zimbabwe']
other_countries = [
       'Antigua And Barbuda', 'Bolivia',
       'Bonaire Sint Eustatius And Saba', 'Bosnia And Herzegovina',
       'British Virgin Islands', 'Brunei', "Cote D'Ivoire", 'Curacao',
       'Czechia', 'Democratic Republic Of Congo', 'Eswatini',
       'Faeroe Islands', 'French Equatorial Africa', 'French West Africa',
       'Iran',
       'Kosovo',  'Laos', 'Leeward Islands',
       'Moldova',
       'North Korea', 'North Macedonia', 'Oceania', 'Palestine',
       'Panama Canal Zone', 'Reunion', 'Russia', 'Ryukyu Islands',
       'Saint Helena', 'Saint Kitts And Nevis',
       'Saint Pierre And Miquelon', 'Saint Vincent And The Grenadines',
       'Sao Tome And Principe', 'South Korea', 'St. Kitts-Nevis-Anguilla',
       'Syria', 'Taiwan', 'Tanzania', 'Timor', 'Trinidad And Tobago',
       'Turks And Caicos Islands',
       'Venezuela', 'Vietnam', 'Wallis And Futuna', 'Ussr',
       'Czechoslovakia', 'Falkland Islands', 'Netherlands Antilles',
       'Serbia And Montenegro', 'United States Virgin Islands',
       'Western Sahara', 'Yugoslavia',
       'Micronesia (Country)', 'Sint Maarten (Dutch Part)'
                  ]
countries = countries_official_names + other_countries

In [32]:
def assign_category_to_entity(s):

    if "(" in s:
        if "(Bp)" in s:
            return "Bp?"
        if "(Eia)" in s:
            return "Eia?"
        if "(Ember)" in s:
            return "Ember?"
        if s in countries:
            return "Country"
        else:
            return "Exlusive Regions"

    else:
        if s == "World":
            return "World"
        elif s in continents:
            return "Continent"
        elif s in countries:
            return "Country"
        elif "Income" in s:
            return "Income Level"
        else:
            return "Other"

In [33]:
df['Entity_Category'] = df[x].apply(assign_category_to_entity)

In [34]:
df['Entity_Category'].value_counts()

Country             21830
Bp?                  1549
Exlusive Regions     1368
Eia?                 1040
Continent             994
Income Level          725
World                 272
Ember?                110
Other                  72
Name: Entity_Category, dtype: int64

In [35]:
df_entity_lookup = df[['Entity_Category', 'Entity', 'Code', 'Income group', 'Lending category']].drop_duplicates().sort_values(by='Entity_Category')

In [36]:
df_entity_lookup['Entity_Category'].value_counts()

Country             240
Eia?                 29
Bp?                  28
Continent             6
Exlusive Regions      6
Ember?                5
Income Level          4
Other                 2
World                 1
Name: Entity_Category, dtype: int64

In [37]:
df_entity_lookup.head()

Unnamed: 0,Entity_Category,Entity,Code,Income group,Lending category
15505,Bp?,Middle East (Bp),,,
15448,Bp?,Middle Africa (Bp),,,
19236,Bp?,Other Middle East (Bp),,,
19190,Bp?,Other Middle Africa (Bp),,,
19133,Bp?,Other Europe (Bp),,,


In [38]:
df.head()

Unnamed: 0,Entity,Year,Code,Region,Income group,Lending category,Fossil fuels (% equivalent primary energy),Fossil fuels (% growth),Gas Production - TWh,Land area (sq. km),Unnamed: 0.1.1.1,Coal Consumption - TWh,Renewables per capita (kWh - equivalent),Wind Generation - TWh,prod of Other renewables including bioenergy (TWh),Fossil fuels per capita (kWh),Fossil fuels (TWh),Oil Production - TWh,Coal Production - TWh,Hydro Generation - TWh,Unnamed: 0.1.1,prod of Electricity from solar (TWh),prod of Electricity from hydro (TWh),Unnamed: 0.1,"Gas (TWh, direct energy)",Annual change in primary energy consumption (%),Geo Biomass Other - TWh,"Coal (TWh, direct energy)",Per capita electricity (kWh),Oil Consumption - TWh,Renewables (% electricity),Population density (people per sq. km of land area),Solar Generation - TWh,Gas Consumption - TWh,"Oil (TWh, direct energy)",Annual CO2 emissions,prod of Electricity from wind (TWh),Entity_Category
0,Afghanistan,1949,AFG,South Asia,Low income,IDA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14656.0,,Country
1,Afghanistan,1950,AFG,South Asia,Low income,IDA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,84272.0,,Country
2,Afghanistan,1951,AFG,South Asia,Low income,IDA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,91600.0,,Country
3,Afghanistan,1952,AFG,South Asia,Low income,IDA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,91600.0,,Country
4,Afghanistan,1953,AFG,South Asia,Low income,IDA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,106256.0,,Country


## Save Tables

In [39]:
df.head()

Unnamed: 0,Entity,Year,Code,Region,Income group,Lending category,Fossil fuels (% equivalent primary energy),Fossil fuels (% growth),Gas Production - TWh,Land area (sq. km),Unnamed: 0.1.1.1,Coal Consumption - TWh,Renewables per capita (kWh - equivalent),Wind Generation - TWh,prod of Other renewables including bioenergy (TWh),Fossil fuels per capita (kWh),Fossil fuels (TWh),Oil Production - TWh,Coal Production - TWh,Hydro Generation - TWh,Unnamed: 0.1.1,prod of Electricity from solar (TWh),prod of Electricity from hydro (TWh),Unnamed: 0.1,"Gas (TWh, direct energy)",Annual change in primary energy consumption (%),Geo Biomass Other - TWh,"Coal (TWh, direct energy)",Per capita electricity (kWh),Oil Consumption - TWh,Renewables (% electricity),Population density (people per sq. km of land area),Solar Generation - TWh,Gas Consumption - TWh,"Oil (TWh, direct energy)",Annual CO2 emissions,prod of Electricity from wind (TWh),Entity_Category
0,Afghanistan,1949,AFG,South Asia,Low income,IDA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14656.0,,Country
1,Afghanistan,1950,AFG,South Asia,Low income,IDA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,84272.0,,Country
2,Afghanistan,1951,AFG,South Asia,Low income,IDA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,91600.0,,Country
3,Afghanistan,1952,AFG,South Asia,Low income,IDA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,91600.0,,Country
4,Afghanistan,1953,AFG,South Asia,Low income,IDA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,106256.0,,Country


In [40]:
df.to_csv('../data/silver_tables/yearly_values_per_entity.csv', index=False)

In [41]:
df_entity_lookup.to_csv('../data/silver_tables/entity_lookup.csv', index=False)