In [63]:
import pandas as pd
import numpy as np
import re
from utils import cleaning

In [64]:
# Load the raw energy-consumption CSV; the path is relative to the working directory.
df = pd.read_csv("dataset/world_energy_consumption.csv")

In [65]:
# Preview the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,country,year,iso_code,population,gdp,biofuel_cons_change_pct,biofuel_cons_change_twh,biofuel_cons_per_capita,biofuel_consumption,biofuel_elec_per_capita,...,solar_share_elec,solar_share_energy,wind_cons_change_pct,wind_cons_change_twh,wind_consumption,wind_elec_per_capita,wind_electricity,wind_energy_per_capita,wind_share_elec,wind_share_energy
0,Afghanistan,1900,AFG,4832414.0,,,,,,,...,,,,,,,,,,
1,Afghanistan,1901,AFG,4879685.0,,,,,,,...,,,,,,,,,,
2,Afghanistan,1902,AFG,4935122.0,,,,,,,...,,,,,,,,,,
3,Afghanistan,1903,AFG,4998861.0,,,,,,,...,,,,,,,,,,
4,Afghanistan,1904,AFG,5063419.0,,,,,,,...,,,,,,,,,,


In [66]:
# Show the column labels of the loaded frame.
df.columns

Index(['country', 'year', 'iso_code', 'population', 'gdp',
       'biofuel_cons_change_pct', 'biofuel_cons_change_twh',
       'biofuel_cons_per_capita', 'biofuel_consumption',
       'biofuel_elec_per_capita',
       ...
       'solar_share_elec', 'solar_share_energy', 'wind_cons_change_pct',
       'wind_cons_change_twh', 'wind_consumption', 'wind_elec_per_capita',
       'wind_electricity', 'wind_energy_per_capita', 'wind_share_elec',
       'wind_share_energy'],
      dtype='object', length=128)

In [67]:
# Print the frame summary; verbose=True requests the full per-column listing
# instead of a truncated one.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22343 entries, 0 to 22342
Data columns (total 128 columns):
 #    Column                                        Dtype  
---   ------                                        -----  
 0    country                                       object 
 1    year                                          int64  
 2    iso_code                                      object 
 3    population                                    float64
 4    gdp                                           float64
 5    biofuel_cons_change_pct                       float64
 6    biofuel_cons_change_twh                       float64
 7    biofuel_cons_per_capita                       float64
 8    biofuel_consumption                           float64
 9    biofuel_elec_per_capita                       float64
 10   biofuel_electricity                           float64
 11   biofuel_share_elec                            float64
 12   biofuel_share_energy                        

In [68]:
# Dump value counts for `df` through the project helper in utils.cleaning.
# NOTE(review): presumably writes its output under 'data_expl/' — confirm
# against the helper's implementation (and whether the directory must exist).
cleaning.dump_value_counts('data_expl/', df)

Subdivide the dataset into smaller sub-datasets:
1. One holds the per-source measures of the primary energy sources (production,
   consumption, shares, etc.).
2. The other holds the overall measures, such as overall energy production.
<br><br>
After that, an EDA will be performed; the split also makes it easier to understand the <br>
dataset's behaviour (missing data per country, missing data per energy source, etc.). <br>
These sub-datasets can be indexed by the identifier columns that carry general information, such as <br>
country, population and gdp.

In [69]:
# Column-name vocabulary used to split the dataset into sub-datasets.

# Prefixes of the per-source columns. The list mixes single sources
# (e.g. 'coal') with aggregated groups (e.g. 'fossil', 'renewables').
energy_src = ['biofuel', 'coal', 'gas', 'oil', 'hydro', 'nuclear',
              'other_renewable', 'other_renewable_exc_biofuel', 'solar', 'wind',
              'fossil', 'low_carbon', 'renewables']

# Columns treated as overall (not per-source) measures in this analysis.
# The annual change in energy consumption is listed once per unit:
# percent ('_pct') and terawatt-hours ('_twh'). Each name must appear only
# once, otherwise selecting df[overall_energy_measures] duplicates a column.
overall_energy_measures = ['energy_cons_change_pct', 'energy_cons_change_twh',
                           'carbon_intensity_elec', 'electricity_generation',
                           'fossil_electricity', 'renewables_electricity',
                           'energy_per_gdp', 'energy_per_capita',
                           'per_capita_electricity', 'primary_energy_consumption']

# General identifier columns shared by every sub-dataset.
identifiers = ['iso_code', 'country', 'year', 'population', 'gdp']

What kind of data is available for each energy source?

In [70]:
# Split the source prefixes into two groups: single ("pure") sources and
# aggregated groups of sources.
pure_energy_src = [
    'biofuel',
    'coal',
    'gas',
    'hydro',
    'nuclear',
    'oil',
    'solar',
    'wind',
]
aggragated_energy_src = [
    'fossil',
    'low_carbon',
    'other_renewable',
    'other_renewable_exc_biofuel',
    'renewables',
]

In [71]:
# Column names for an exploratory summary table of the energy sources.
pure_energy_src_data_cols = [
    'energy_source',
    'description',
    '#_of_measures_available',
]

In [72]:
# Standardise the column names by abbreviating three recurring words:
#   production  -> prod
#   consumption -> cons
#   electricity -> elec
# The renaming is applied to a copy, so `df` keeps its original labels.
df_std = df.copy()

df_std.columns = (
    df.columns
    .str.replace('production', 'prod')
    .str.replace('electricity', 'elec')
    .str.replace('consumption', 'cons')
)


In [73]:
# Collect, for every pure energy source, the standardised columns whose first
# '_'-separated token equals that source name. Despite its name, `n_measures`
# holds the lists of column names (one list per source), not their counts.
n_measures = [
    [col for col in df_std.columns if col.split('_')[0] == energy]
    for energy in pure_energy_src
]
# Pair each source with its list of measure columns, in the same order.
measures_per_energy = dict(zip(pure_energy_src, n_measures))
measures_per_energy

{'biofuel': ['biofuel_cons_change_pct',
  'biofuel_cons_change_twh',
  'biofuel_cons_per_capita',
  'biofuel_cons',
  'biofuel_elec_per_capita',
  'biofuel_elec',
  'biofuel_share_elec',
  'biofuel_share_energy'],
 'coal': ['coal_cons_change_pct',
  'coal_cons_change_twh',
  'coal_cons_per_capita',
  'coal_cons',
  'coal_elec_per_capita',
  'coal_elec',
  'coal_prod_change_pct',
  'coal_prod_change_twh',
  'coal_prod_per_capita',
  'coal_prod',
  'coal_share_elec',
  'coal_share_energy'],
 'gas': ['gas_cons_change_pct',
  'gas_cons_change_twh',
  'gas_cons',
  'gas_elec_per_capita',
  'gas_elec',
  'gas_energy_per_capita',
  'gas_prod_change_pct',
  'gas_prod_change_twh',
  'gas_prod_per_capita',
  'gas_prod',
  'gas_share_elec',
  'gas_share_energy'],
 'hydro': ['hydro_cons_change_pct',
  'hydro_cons_change_twh',
  'hydro_cons',
  'hydro_elec_per_capita',
  'hydro_elec',
  'hydro_energy_per_capita',
  'hydro_share_elec',
  'hydro_share_energy'],
 'nuclear': ['nuclear_cons_change_pct',

For each pure energy src we have the following:
1. Consumption data **cons**
   For **cons** data then we have
   1. Primary fuel consumption if only **cons** in terawatt-hours
   2. **change** --> representing annual change in energy src consumption
      1. **change** can be a percentage with **pct**
      2. **change** can be in terawatt-hours with **twh**
   3. **per_capita** if measured in terawatt-hours per capita
2. Electricity consumption from the energy src with **elec**:
   1. Pure electricity consumption if only **elec** in terawatt-hours
   2. **per_capita** if electricity is measured per capita
3. **share** when accounting for the share among the other energy srcs:
   1. could be **elec** referring to electricity consumption;
   2. could be **energy** when accounting for energy srcs
4. **prod** accounts for prod (elec+thermal?) using the energy source:
   1. if no additional keywords, is the overall production in terawatt hours