#### Imports & Path Setup

In [13]:
import sys
from pathlib import Path

import pandas as pd

# visualization
import plotly.express as px
import plotly.express as px
import plotly.graph_objects as go

In [14]:
# The project root is taken to be the parent of the current working directory
# (presumably the notebook runs from a sub-folder of the repo — TODO confirm).
PROJECT_ROOT = Path.cwd().parents[0]
SRC_DIR = PROJECT_ROOT / "src"

# Prepend src/ to the module search path, once, so that the project-local
# imports further down in this cell can be resolved.
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from my_project.paths import get_paths

# Look up the directories this notebook refers to by their keys.
paths = get_paths(PROJECT_ROOT)
DATA_DIR, RAW_DATA_DIR, PROCESSED_DATA_DIR, LOGS_DIR = (
    paths[key]
    for key in ("DATA_DIR", "RAW_DATA_DIR", "PROCESSED_DATA_DIR", "LOGS_DIR")
)

# Project-local helpers; these must stay below the sys.path insertion above.
from utils.plots import save_static_img
from data.get_data import download_from_url

#### Download Dataset

In [15]:
# Source CSV: a "yearly full release" table in long format, hosted in a public bucket.
file_url = "https://storage.googleapis.com/emb-prod-bkt-publicdata/public-downloads/yearly_full_release_long_format.csv"
# download_from_url is a project helper; its return value is handed straight to
# pd.read_csv (the log output below suggests it is the local file path under RAW_DATA_DIR).
yearly_full_release = download_from_url(file_url, RAW_DATA_DIR)
df = pd.read_csv(yearly_full_release)
# Preview three random rows. No random_state is set, so the rows differ between runs.
df.sample(3)

INFO:data.get_data:Downloading data from https://storage.googleapis.com/emb-prod-bkt-publicdata/public-downloads/yearly_full_release_long_format.csv to /home/zephyr/workspace/Global_Energy_Trends/data/raw/yearly_full_release_long_format.csv...
INFO:data.get_data:Download complete.


Unnamed: 0,Area,ISO 3 code,Year,Area type,Continent,Ember region,EU,OECD,G20,G7,ASEAN,Category,Subcategory,Variable,Unit,Value,YoY absolute change,YoY % change
266627,Reunion,REU,2020,Country or economy,Africa,Africa,0.0,0.0,0.0,0.0,0.0,Capacity,Fuel,Wind,GW,0.02,0.0,0.0
223275,Namibia,NAM,2018,Country or economy,Africa,Africa,0.0,0.0,0.0,0.0,0.0,Electricity generation,Aggregate fuel,Clean,TWh,1.39,-0.31,-18.24
200669,Mali,MLI,2001,Country or economy,Africa,Africa,0.0,0.0,0.0,0.0,0.0,Power sector emissions,Fuel,Hydro,mtCO2,0.01,0.0,0.0


#### Transformations

In [16]:
# Inspect column dtypes and non-null counts of the raw frame before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357052 entries, 0 to 357051
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Area                 357052 non-null  object 
 1   ISO 3 code           335602 non-null  object 
 2   Year                 357052 non-null  int64  
 3   Area type            357052 non-null  object 
 4   Continent            335602 non-null  object 
 5   Ember region         335602 non-null  object 
 6   EU                   335602 non-null  float64
 7   OECD                 335602 non-null  float64
 8   G20                  335602 non-null  float64
 9   G7                   335602 non-null  float64
 10  ASEAN                335602 non-null  float64
 11  Category             357052 non-null  object 
 12  Subcategory          357052 non-null  object 
 13  Variable             357052 non-null  object 
 14  Unit                 357052 non-null  object 
 15  Value            

In [17]:
# Preview three random rows of the raw frame (columns still carry their original names here).
# NOTE(review): no random_state is passed, so the displayed rows change on every run.
df.sample(3)

Unnamed: 0,Area,ISO 3 code,Year,Area type,Continent,Ember region,EU,OECD,G20,G7,ASEAN,Category,Subcategory,Variable,Unit,Value,YoY absolute change,YoY % change
86632,Djibouti,DJI,2011,Country or economy,Africa,Africa,0.0,0.0,0.0,0.0,0.0,Electricity generation,Aggregate fuel,Wind and Solar,TWh,0.0,0.0,
91252,Ecuador,ECU,2008,Country or economy,South America,Latin America and Caribbean,0.0,0.0,0.0,0.0,0.0,Electricity generation,Aggregate fuel,Wind and Solar,TWh,0.0,0.0,
41090,Bolivia,BOL,2018,Country or economy,South America,Latin America and Caribbean,0.0,0.0,0.0,0.0,0.0,Capacity,Aggregate fuel,Renewables,GW,0.97,0.21,27.63


In [18]:
# Clean the raw frame into the long-format table used by the cells below.
# Every step is a no-op when repeated, so this cell can be re-executed safely
# (previously a second run raised KeyError on the column drop).

# data cleaning for column names in df: lower-case, spaces -> underscores.
# regex=False makes the literal replacement explicit; set_axis returns a new
# frame instead of mutating the column labels of the existing one in place.
col_name_clean = df.columns.str.lower().str.replace(" ", "_", regex=False)
df = df.set_axis(col_name_clean, axis=1)

# rename cols to be more descriptive
df = df.rename(columns={
    "variable": "energy_type",
})

# remove unused columns; errors="ignore" keeps a re-run from failing once they are gone
df = df.drop(
    columns=["ember_region", "eu", "oecd", "g20", "g7", "asean", "yoy_absolute_change", "yoy_%_change"],
    errors="ignore",
)

# fail fast with a readable message if a column needed by the filters below,
# or by the area_type split that follows, is missing from the source schema
required_cols = {"area_type", "subcategory", "energy_type", "unit"}
missing_cols = required_cols - set(df.columns)
assert not missing_cols, f"missing expected columns: {sorted(missing_cols)}"

# drop % rows from unit column
df = df[df["unit"] != "%"]

# remove unwanted subcategories
# removing "Total" since it is an aggregate of generation and emissions
# removing "Aggregate fuel": we'll use unaggregated data for more granularity in our analysis.
df = df[~df["subcategory"].isin(["Total", "Aggregate fuel"])]

In [19]:
# Partition the cleaned frame by area_type into a country-level table and a
# region-level table. The area_type column is constant within each part, so it
# is dropped from both results.
df_country = (
    df.loc[df["area_type"].eq("Country or economy")]
    .drop(columns=["area_type"])
)
df_region = (
    df.loc[df["area_type"].eq("Region")]
    .drop(columns=["area_type"])
)

##### Country/Economy Data

In [28]:
# Preview three random rows of the country-level table (no random_state, so rows vary per run).
df_country.sample(3)

Unnamed: 0,area,iso_3_code,year,continent,category,subcategory,energy_type,unit,value
300950,South Sudan,SSD,2014,Africa,Power sector emissions,Fuel,Nuclear,mtCO2,0.0
157100,Iraq,IRQ,2010,Asia,Capacity,Fuel,Wind,GW,
171148,Kiribati,KIR,2008,Oceania,Electricity generation,Fuel,Gas,TWh,0.0
