## Import Libs & Fetch Dataset

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
from pathlib import Path

In [9]:
PROJECT_ROOT = Path.cwd().parents[0]
SRC_DIR = PROJECT_ROOT / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from my_project.paths import get_paths

paths = get_paths(PROJECT_ROOT)
DATA_DIR = paths['DATA_DIR']
RAW_DATA_DIR = paths['RAW_DATA_DIR']
PROCESSED_DATA_DIR = paths['PROCESSED_DATA_DIR']
LOGS_DIR = paths['LOGS_DIR']

from data.ember_api_client import EmberAPI

In [10]:
# get data from Ember API 
api = EmberAPI()

# get yearly generation data
params = {"is_aggregate_series":False, "start_date":"2000",}
yearly_generation_json = api.fetch_and_cache(
    endpoint_name="electricity_generation_yearly",
    fetch_func=api.electricity_generation_yearly,
    params=params
)

# get monthly generation data
# params = {"is_aggregate_series":False, "start_date":"2000-01", }
# monthly_generation_json = api.fetch_and_cache(
#     endpoint_name="electricity_generation_monthly",
#     fetch_func=api.electricity_generation_monthly,
#     params=params
# )

# get monthly installed capacity data
params = {"is_aggregate_series":False, "start_date": "2000-01",}
monthly_capacity_json = api.fetch_and_cache(
    endpoint_name = "electricity_capacity_monthly",
    fetch_func=api.electricity_capacity_monthly,
    params=params
)

2025-12-28 21:08:38 | INFO | Initialized EmberAPI client with base URL: https://api.ember-energy.org
2025-12-28 21:08:38 | INFO | Loading cached data from /home/zephyr/workspace/Global_Energy_Trends/data/raw/electricity_generation_yearly_is_aggregate_series-False_start_date-2000.json
2025-12-28 21:08:38 | INFO | Loading cached data from /home/zephyr/workspace/Global_Energy_Trends/data/raw/electricity_capacity_monthly_is_aggregate_series-False_start_date-2000-01.json


Decided to use yearly instead of monthly generation data since the yearly data is more comprehensive in terms of  coverage. As of Dec2025, the monthly generation data only contains 88 economies/countries whereas the yearly data contains over 210 economies/countries.

As for capacity data, only monthly data is available on the API so we will get it and later merge with IMF energy data. 

In [11]:
# df_generation_monthly = pd.DataFrame(monthly_generation_json.get("data",[]))
df_capacity_monthly = pd.DataFrame(monthly_capacity_json.get("data",[]))
df_generation_yearly = pd.DataFrame(yearly_generation_json.get("data", []))

## Dataset Overview

### Electricity Generation Yearly

In [12]:
df_generation_yearly.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52181 entries, 0 to 52180
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   entity                   52181 non-null  object 
 1   entity_code              49231 non-null  object 
 2   is_aggregate_entity      52181 non-null  bool   
 3   date                     52181 non-null  object 
 4   series                   52181 non-null  object 
 5   is_aggregate_series      52181 non-null  bool   
 6   generation_twh           52181 non-null  float64
 7   share_of_generation_pct  52181 non-null  float64
dtypes: bool(2), float64(2), object(4)
memory usage: 2.5+ MB


In [13]:
df_generation_yearly.isnull().sum()

entity                        0
entity_code                2950
is_aggregate_entity           0
date                          0
series                        0
is_aggregate_series           0
generation_twh                0
share_of_generation_pct       0
dtype: int64

In [14]:
df_generation_yearly[df_generation_yearly['is_aggregate_entity']==True]['entity_code'].isnull().sum()

np.int64(2950)

No missing data. The only nulls are entity codes which are not defined for those aggregated regions such as EU, Asia, etc.

##### Drop unnecessary columns

In [15]:
# dropping this constant since we have already requested unaggregated series during api call 
if "is_aggregate_series" in df_generation_yearly.columns:
    print(df_generation_yearly['is_aggregate_series'].unique())
    print(df_generation_yearly.shape)
    df_generation = df_generation_yearly.drop(columns="is_aggregate_series")
    print(df_generation.shape)

[False]
(52181, 8)
(52181, 7)


In [16]:
# split dataset into aggregated regions and individual country/economy dataset
# aggregated region dataset
df_generation_region = df_generation_yearly[df_generation_yearly['is_aggregate_entity'] == True].copy()
df_generation_country = df_generation_yearly[df_generation_yearly['is_aggregate_entity'] == False].copy()

# drop is_aggregated_entity column since it is constant in each dataset after splitting
df_generation_region = df_generation_region.drop(columns="is_aggregate_entity")
df_generation_country = df_generation_country.drop(columns="is_aggregate_entity")

print(df_generation_region.shape, df_generation_country.shape)

(2950, 7) (49231, 7)


In [17]:
print("Num Countries/Economies: ", df_generation_country['entity'].nunique())
print("Num Aggregated Regions: ", df_generation_region['entity'].nunique())

Num Countries/Economies:  210
Num Aggregated Regions:  13


In [18]:
df_generation_country['entity'].unique().tolist()

['United States',
 'Uruguay',
 'Uzbekistan',
 'Vanuatu',
 'Venezuela',
 'Viet Nam',
 'Virgin Islands (British)',
 'Virgin Islands (U.S.)',
 'Yemen',
 'Zambia',
 'Zimbabwe',
 'Thailand',
 'The Philippines',
 'Togo',
 'Tonga',
 'Trinidad and Tobago',
 'Tunisia',
 'Türkiye',
 'Turkmenistan',
 'Turks and Caicos Islands',
 'Uganda',
 'Ukraine',
 'United Arab Emirates',
 'United Kingdom',
 'Saint Vincent and the Grenadines',
 'Samoa',
 'Sao Tome and Principe',
 'Saudi Arabia',
 'Senegal',
 'Serbia',
 'Seychelles',
 'Sierra Leone',
 'Singapore',
 'Slovakia',
 'Slovenia',
 'Solomon Islands',
 'Somalia',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Montserrat',
 'Morocco',
 'Mozambique',
 'Myanmar',
 'Namibia',
 'Nauru',
 'Nepal',
 'Netherlands',
 'New Caledonia',
 'New Zealand',
 'Nicaragua',
 'Niger',
 'South Africa',
 'South Korea',
 'Spain',
 'Sri Lanka',
 'Sudan',
 'Suriname',
 'Sweden',
 

##### Country/Economy-wise Variables

In [19]:
# check overview with data profiling
from ydata_profiling import ProfileReport

profile_df = ProfileReport(df_generation_country, title="Ember Yearly Energy Dataset Profile", minimal=True)
profile_df.to_file(os.path.join(DATA_DIR, "ember_yearly_energy_profile.html"))
print(f"Profile saved to {DATA_DIR}")

  from .autonotebook import tqdm as notebook_tqdm
2025-12-28 21:08:39 | INFO | Pandas backend loaded 2.3.3
2025-12-28 21:08:39 | INFO | Numpy backend loaded 2.3.5
2025-12-28 21:08:39 | INFO | Pyspark backend NOT loaded
2025-12-28 21:08:39 | INFO | Python backend loaded
100%|██████████| 7/7 [00:00<00:00, 550.27it/s]0<00:00, 51.09it/s, Describe variable: share_of_generation_pct]
Summarize dataset: 100%|██████████| 13/13 [00:00<00:00, 104.66it/s, Completed]                               
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.37s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  9.09it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 610.26it/s]

Profile saved to /home/zephyr/workspace/Global_Energy_Trends/data





In [20]:
df_generation_country.info() # no missing data

<class 'pandas.core.frame.DataFrame'>
Index: 49231 entries, 0 to 52170
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   entity                   49231 non-null  object 
 1   entity_code              49231 non-null  object 
 2   date                     49231 non-null  object 
 3   series                   49231 non-null  object 
 4   is_aggregate_series      49231 non-null  bool   
 5   generation_twh           49231 non-null  float64
 6   share_of_generation_pct  49231 non-null  float64
dtypes: bool(1), float64(2), object(4)
memory usage: 2.7+ MB


{ "entity" : "name of country/economy",
"entity_code" :  "ISO code associated with entity",
"date" : "years which the data record is taken (2000 to 2024) as of 2025-Dec",
"series" : ""}

In [21]:
print(df_generation_country['entity'].nunique()) # 210 individual countries/economies
df_generation_country['entity'].value_counts() # since long format, each have multiple records

210


entity
United States      250
Uruguay            250
Türkiye            250
Tunisia            250
The Philippines    250
                  ... 
Reunion            153
Singapore          150
South Sudan        120
Martinique         115
French Guiana      105
Name: count, Length: 210, dtype: int64