In [None]:
import pandas as pd 
import numpy as np
import os
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Input locations are resolved relative to the parent of the current working directory
workspace_root = os.path.dirname(os.getcwd())
data_dir = os.path.join(workspace_root, "data")
csv_file_path = os.path.join(data_dir, "important_variables.csv")

# Read the dataset used throughout the notebook
df = pd.read_csv(filepath_or_buffer=csv_file_path)


In [None]:
# Model configuration: target, key columns and the last year used for training.
TARGET_COL = "WB_WDI_EN_GHG_CO2_PC_CE_AR5"
COUNTRY_COL = "REF_AREA_LABEL"
YEAR_COL = "YEAR"
TRAIN_MAX_YEAR = 2020

# Predictor variables.
# The oil share (WB_WDI_EG_ELC_PETR_ZS, electricity production from oil sources,
# % of total) is intentionally left out to avoid multicollinearity.
features = [
    "WB_WDI_EG_ELC_COAL_ZS",     # Electricity production from coal sources (% of total)
    "WB_WDI_EG_ELC_NGAS_ZS",     # Electricity production from natural gas sources (% of total)
    "WB_WDI_EG_ELC_HYRO_ZS",     # Electricity production from hydroelectric sources (% of total)
    "WB_WDI_EG_ELC_NUCL_ZS",     # Electricity production from nuclear sources (% of total)
    "WB_WDI_EG_ELC_RNWX_ZS",     # Electricity production from renewable sources, excluding hydroelectric (% of total)
    "WB_WDI_NY_GDP_MKTP_KD_ZG",  # GDP growth
]



# 2. Filter dataset

# Keep the target, the key columns and the predictors; drop rows with any missing value
model_columns = [TARGET_COL, COUNTRY_COL, YEAR_COL] + features
df_filtered = df[model_columns].dropna()


# 3. Temporal split

# Years up to and including TRAIN_MAX_YEAR train the model; later years are held out
in_training_period = df_filtered[YEAR_COL] <= TRAIN_MAX_YEAR
after_training_period = df_filtered[YEAR_COL] > TRAIN_MAX_YEAR

train = df_filtered[in_training_period]
test = df_filtered[after_training_period]

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")


# 4. Dummy variables for COUNTRY

# Training: drop_first=True removes the first country level, which becomes the
# baseline absorbed by the intercept.
X_train = pd.get_dummies(train[[COUNTRY_COL] + features], columns=[COUNTRY_COL], drop_first=True)
y_train = train[TARGET_COL].values

# Test: encode EVERY country present (no drop_first) and then project onto the
# training columns. Using drop_first=True here would drop the first country of the
# *test* set, which is not guaranteed to be the training baseline; that country's
# rows would then be scored as if they belonged to the baseline country.
X_test = pd.get_dummies(test[[COUNTRY_COL] + features], columns=[COUNTRY_COL])

# Keep exactly the training columns, in the training order: the baseline country's
# column and countries unseen in training are discarded, and training countries
# absent from the test period are added as all-zero columns.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
y_test = test[TARGET_COL].values


# 5. Train model

# Ordinary least squares on the predictors plus the country dummies
model = LinearRegression()
model.fit(X_train, y_train)


# 6. Evaluate model

# Score the held-out years
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")


# 7. Coefficients

# One row per model input, largest coefficient first
coef_table = pd.DataFrame({"feature": X_train.columns, "coef": model.coef_})
coefficients = coef_table.sort_values(by="coef", ascending=False)

print(coefficients)




Train shape: (4134, 9)
Test shape: (636, 9)
RMSE: 1.4341
R²: 0.9410
                                 feature       coef
122                 REF_AREA_LABEL_Qatar  45.230175
82                 REF_AREA_LABEL_Kuwait  24.217942
13                REF_AREA_LABEL_Bahrain  22.784407
154  REF_AREA_LABEL_United Arab Emirates  22.724272
148   REF_AREA_LABEL_Trinidad and Tobago  19.877507
..                                   ...        ...
163              REF_AREA_LABEL_Zimbabwe  -3.348116
101               REF_AREA_LABEL_Morocco  -3.931649
69                  REF_AREA_LABEL_India  -4.712576
23               REF_AREA_LABEL_Botswana  -5.080978
109                 REF_AREA_LABEL_Niger  -5.787214

[164 rows x 2 columns]


In [54]:
# Simulate increases in renewable energy production


def simulate_renewable_increase(df, increase_pp, rnwx_ratio=0.7, hydro_ratio=0.3):
    """
    Shift electricity-mix shares from fossil fuels to renewables.

    Renewables (RNWX/HYRO) are increased by up to 'increase_pp' percentage
    points in total, and the same amount is removed from fossil fuels
    (COAL/NGAS) in proportion to each fuel's current share. When a row has less
    fossil share than 'increase_pp', the shift is limited to the fossil share
    that is available. The nuclear share is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the five WB_WDI_EG_ELC_*_ZS columns used below. It is not
        modified; a copy is returned.
    increase_pp : float
        Requested total increase of renewables, in percentage points.
    rnwx_ratio, hydro_ratio : float
        Relative weights used to split the increase between non-hydro
        renewables and hydro. They are normalised to sum to 1 so that the
        amount added to renewables always equals the amount cut from fossil
        fuels (e.g. 0.8/0.2 and 4/1 are equivalent).

    Returns
    -------
    pandas.DataFrame
        Copy of 'df' with adjusted coal, gas, hydro and non-hydro renewable
        shares. Missing or non-numeric values in the five electricity-share
        columns are returned as 0; all other columns are left untouched.

    Raises
    ------
    ValueError
        If a ratio is negative or both ratios are zero.
    """
    # Normalise the split so the renewable gain matches the fossil cut exactly
    ratio_sum = rnwx_ratio + hydro_ratio
    if rnwx_ratio < 0 or hydro_ratio < 0 or ratio_sum <= 0:
        raise ValueError(
            "rnwx_ratio and hydro_ratio must be non-negative and not both zero"
        )
    rnwx_share = rnwx_ratio / ratio_sum
    hydro_share = hydro_ratio / ratio_sum

    df_sim = df.copy()

    # columns
    coal  = 'WB_WDI_EG_ELC_COAL_ZS'
    gas   = 'WB_WDI_EG_ELC_NGAS_ZS'
    hydro = 'WB_WDI_EG_ELC_HYRO_ZS'
    rnwx  = 'WB_WDI_EG_ELC_RNWX_ZS'
    nucl  = 'WB_WDI_EG_ELC_NUCL_ZS'  # we don't touch this
    energy_cols = [coal, gas, hydro, rnwx, nucl]

    # ensure float type (unparseable values become NaN)
    for c in energy_cols:
        df_sim[c] = pd.to_numeric(df_sim[c], errors='coerce')

    # fossil fuel available to cut
    fossil_total = df_sim[coal].fillna(0) + df_sim[gas].fillna(0)

    # maximum possible increase without leaving fossil fuels negative
    effective_inc = np.minimum(increase_pp, fossil_total)

    # allocation to RNWX/HYRO according to the normalised ratios
    drnwx  = effective_inc * rnwx_share
    dhydro = effective_inc * hydro_share

    df_sim[rnwx]  = (df_sim[rnwx].fillna(0)  + drnwx).clip(lower=0)
    df_sim[hydro] = (df_sim[hydro].fillna(0) + dhydro).clip(lower=0)

    # distribute the reduction between coal and gas in proportion to their weight;
    # rows without any fossil share get a NaN denominator and therefore no cut
    denom = fossil_total.replace(0, np.nan)
    dcoal = -(effective_inc * (df_sim[coal] / denom)).fillna(0)
    dgas  = -(effective_inc * (df_sim[gas]  / denom)).fillna(0)

    df_sim[coal] = (df_sim[coal].fillna(0) + dcoal).clip(lower=0)
    df_sim[gas]  = (df_sim[gas].fillna(0)  + dgas).clip(lower=0)

    # final cleanup, restricted to the electricity-share columns so that missing
    # values in unrelated columns (target, labels, ...) are not turned into zeros
    df_sim[energy_cols] = (
        df_sim[energy_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
    )

    return df_sim



# 2. Create three scenarios

# Row with the highest year for each country within the test period
latest = test.sort_values(YEAR_COL).groupby(COUNTRY_COL).tail(1)

# (rnwx_ratio, hydro_ratio) used to split a 10 percentage-point renewable increase
scenario_splits = {
    "scenario_1": (0.8, 0.2),  # Solar/wind dominates
    "scenario_2": (0.5, 0.5),  # Balanced
    "scenario_3": (0.3, 0.7),  # Hydroelectric dominates
}

scenarios = {
    label: simulate_renewable_increase(
        latest,
        increase_pp=10,
        rnwx_ratio=rnwx_part,
        hydro_ratio=hydro_part,
    )
    for label, (rnwx_part, hydro_part) in scenario_splits.items()
}




In [55]:
# Display the dictionary of simulated frames (one DataFrame per scenario name)
scenarios

{'scenario_1':       WB_WDI_EN_GHG_CO2_PC_CE_AR5          REF_AREA_LABEL  YEAR  \
 29                       1.672195                 Albania  2024   
 4739                     0.388873                  Zambia  2024   
 4019                     0.425102                   Sudan  2024   
 59                       3.906870                 Algeria  2024   
 539                      6.906184  Bosnia and Herzegovina  2024   
 ...                           ...                     ...   ...   
 3329                     0.810127                Pakistan  2024   
 3359                     3.301479                  Panama  2024   
 929                      9.399266                   China  2024   
 569                      2.993536                Botswana  2024   
 4769                     0.718697                Zimbabwe  2024   
 
       WB_WDI_EG_ELC_COAL_ZS  WB_WDI_EG_ELC_NGAS_ZS  WB_WDI_EG_ELC_HYRO_ZS  \
 29                 0.000000               0.000000              97.700657   
 4739       

In [56]:
# Baseline with the latest year

# Encode every country (no drop_first) and project onto the training columns.
# With drop_first=True the first country of `latest` would be dropped, and if it
# is not the training baseline its rows would be scored as the baseline country.
# Reindexing onto X_train.columns removes the baseline column and any unseen
# country, and adds all-zero columns for training countries that are absent here.
X_latest = pd.get_dummies(latest[[COUNTRY_COL] + features], columns=[COUNTRY_COL])
X_latest = X_latest.reindex(columns=X_train.columns, fill_value=0)

base_pred = model.predict(X_latest)
base_df = pd.DataFrame({
    COUNTRY_COL: latest[COUNTRY_COL].values,
    YEAR_COL: latest[YEAR_COL].values,
    "Pred_CO2_per_capita_Base": base_pred
})


# 3. Prepare data and predict for each scenario

results = {}

for name, df_sim in scenarios.items():
    # Build X for the SCENARIO (don't use latest here), encoded exactly like the
    # baseline above so that both predictions are comparable
    X_sim = pd.get_dummies(df_sim[[COUNTRY_COL] + features], columns=[COUNTRY_COL])
    X_sim = X_sim.reindex(columns=X_train.columns, fill_value=0)

    y_sim_pred = model.predict(X_sim)

    sim_df = pd.DataFrame({
        COUNTRY_COL: df_sim[COUNTRY_COL].values,
        YEAR_COL: df_sim[YEAR_COL].values,
        "Pred_CO2_per_capita": y_sim_pred
    })

    # Compare against baseline for the same country/year
    merged = base_df.merge(sim_df, on=[COUNTRY_COL, YEAR_COL])
    # Negative values mean the scenario predicts lower emissions than the baseline
    merged["Delta_CO2"] = merged["Pred_CO2_per_capita"] - merged["Pred_CO2_per_capita_Base"]

    # Save the final result of the scenario (already compared)
    results[name] = merged


# 4. Summary by scenario

# Mean predicted change per country, most negative (largest reduction) first
for name, df_out in results.items():
    delta_by_country = df_out.groupby(COUNTRY_COL)["Delta_CO2"].mean()
    print(f"\n--- {name} ---")
    print(delta_by_country.sort_values())



--- scenario_1 ---
REF_AREA_LABEL
Lao PDR                          -0.719299
Zambia                           -0.679513
Mauritius                        -0.679513
Sri Lanka                        -0.679513
South Africa                     -0.679513
                                    ...   
Paraguay                          0.000000
Samoa                             0.000000
Seychelles                        0.000000
St. Vincent and the Grenadines    0.000000
Albania                           0.000000
Name: Delta_CO2, Length: 159, dtype: float64

--- scenario_2 ---
REF_AREA_LABEL
Lao PDR                          -0.764152
South Africa                     -0.664574
Zambia                           -0.664574
Mauritius                        -0.664574
Madagascar                       -0.664574
                                    ...   
Paraguay                          0.000000
Samoa                             0.000000
St. Vincent and the Grenadines    0.000000
Seychelles               

In [57]:
# Print the headline figures of every scenario
for name, df_out in results.items():

    print(f"Results for scenario: {name}")

    # 1. Mean Delta_CO2 per country, ordered from most negative to most positive
    country_avg = df_out.groupby(COUNTRY_COL)["Delta_CO2"].mean().sort_values()
    strongest_cuts = country_avg.iloc[:10]   # most negative Delta_CO2
    weakest_cuts = country_avg.iloc[-10:]    # most positive Delta_CO2

    print("\nTop 10 countries with GREATEST emission reduction:")
    print(strongest_cuts)

    print("\nTop 10 countries with LEAST emission reduction (or even increase):")
    print(weakest_cuts)

    # 2. Sum and 3. mean of Delta_CO2 over all rows of the scenario
    delta = df_out["Delta_CO2"]
    global_change = delta.sum()
    mean_global = delta.mean()

    print(f"\nTotal global emission change (sum of Delta_CO2): {global_change:.4f}")
    print(f"Global average of Delta_CO2: {mean_global:.4f}")



Results for scenario: scenario_1

Top 10 countries with GREATEST emission reduction:
REF_AREA_LABEL
Lao PDR        -0.719299
Zambia         -0.679513
Mauritius      -0.679513
Sri Lanka      -0.679513
South Africa   -0.679513
Mongolia       -0.679513
Madagascar     -0.679513
Cambodia       -0.679513
Botswana       -0.679513
Zimbabwe       -0.679513
Name: Delta_CO2, dtype: float64

Top 10 countries with LEAST emission reduction (or even increase):
REF_AREA_LABEL
Malawi                            0.0
Mali                              0.0
Mauritania                        0.0
Nicaragua                         0.0
Pacific island small states       0.0
Paraguay                          0.0
Samoa                             0.0
Seychelles                        0.0
St. Vincent and the Grenadines    0.0
Albania                           0.0
Name: Delta_CO2, dtype: float64

Total global emission change (sum of Delta_CO2): -42.3521
Global average of Delta_CO2: -0.2664
Results for scenario: scena

Scenario 1 allocates most of the 10-percentage-point increase to non-hydro renewables (80% non-hydro, 20% hydro). Scenario 2 splits the increase evenly (50/50), and Scenario 3 prioritizes hydroelectric energy over other renewables (30% non-hydro, 70% hydro).

The results show that prioritizing non-hydro renewables yields a larger global CO2 reduction than either the balanced scenario or the scenario in which hydroelectric energy is prioritized.

In practice, however, the difference between prioritizing non-hydro renewables and prioritizing hydroelectric energy is small. This suggests that focusing on one of the two is better than spreading investment across multiple energy sources.

It is also observed that the countries showing the greatest CO2 reductions are mostly small and low-resource countries.

Strategies would therefore need to be adapted to each country.