In [None]:
import pandas as pd

In [10]:
# Load the raw electricity-production records.
df = pd.read_csv("data/raw/global_electricity_production_data.csv")

# Parse 'date' as datetime; entries that cannot be parsed become NaT
# (errors='coerce') and those rows are dropped.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])

# Calendar parts: 'year' drives the filter below, 'month' is used to count
# how many distinct months each (country, product) pair reported.
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

# Most recent year present in the data.
# NOTE: this is simply the maximum year; it is NOT checked for completeness.
# Use the 'months_reported' column computed below to spot partial coverage.
latest_year = df['year'].max()
print(f"Latest year in dataset: {latest_year}")

# Keep only that year's rows (copy so later edits do not touch `df`).
df_latest = df[df['year'] == latest_year].copy()

# Per (country, product) summary of the monthly values for the selected year.
# NOTE(review): the column names suggest 'value' is in GWh -- confirm against
# the dataset documentation.
aggregated = (
    df_latest
    .groupby(['country_name', 'product'])
    .agg(
        total_GWh=('value', 'sum'),
        avg_GWh=('value', 'mean'),
        std_GWh=('value', 'std'),
        months_reported=('month', 'nunique')
    )
    .reset_index()
)

# Wide format for clustering later: one row per country, one column per
# product, cells hold total_GWh; missing (country, product) pairs become 0.
# pivot_table aggregates with the mean by default, but every
# (country_name, product) pair is unique after the groupby above, so each
# cell is just that pair's total.
# NOTE(review): the resulting columns include a product literally named
# "Data is estimated for this month", which looks like a flag/footnote rather
# than an energy source -- verify the raw CSV before clustering on it.
aggregated_pivot = aggregated.pivot_table(
    index='country_name',
    columns='product',
    values='total_GWh',
    fill_value=0
)

# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
aggregated_pivot.info()

Latest year in dataset: 2023
<class 'pandas.core.frame.DataFrame'>
Index: 48 entries, Argentina to United States
Data columns (total 16 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Coal, Peat and Manufactured Gases                  48 non-null     float64
 1   Combustible Renewables                             48 non-null     float64
 2   Data is estimated for this month                   48 non-null     float64
 3   Electricity                                        48 non-null     float64
 4   Geothermal                                         48 non-null     float64
 5   Hydro                                              48 non-null     float64
 6   Natural Gas                                        48 non-null     float64
 7   Not Specified                                      48 non-null     float64
 8   Nuclear                                          