In [6]:
# IMPORTS
#ML
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import  r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

import pycountry
import rasterio
from scipy.spatial import cKDTree
from tqdm import tqdm
from datetime import datetime
import geopandas as gpd
import cdsapi
import pygrib

import zipfile
import os
import gdown

In [7]:
# Path of the processed, merged dataset, relative to this notebook's directory.
# NOTE(review): named "output_path" although it is used here as an input —
# presumably it is the output of an earlier processing step; confirm.
output_path = "../data/processed/merged_df.csv"
merged_df = pd.read_csv(output_path)

# Preview the first rows to sanity-check the load.
merged_df.head()


Unnamed: 0,geonameid,name_x,asciiname,alternatenames,latitude,longitude,feature class,feature code,iso alpha 2,cc2,...,status,color_code,name_y,continent,region,iso_3166_1_alpha_2_codes,french_short,geometry,Region Grouped,nearest_hdd
0,292968,Abu Dhabi,Abu Dhabi,"A-pu-that-pi,AEbu Saby,AUH,Aboe Dhabi,Abou Dab...",24.45118,54.39696,P,PPLC,AE,,...,Member State,ARE,United Arab Emirates,Asia,Western Asia,AE,Émirats arabes unis,MULTIPOLYGON (((53.964860000000044 24.17944000...,Asia & Oceania,0.0
1,1138958,Kabul,Kabul,"Cabool,Caboul,Cabul,Cabura,Cabúl,Caubul,KBL,Ka...",34.52813,69.17233,P,PPLC,AF,,...,Member State,AFG,Afghanistan,Asia,Southern Asia,AF,Afghanistan,"POLYGON ((74.91574000000008 37.23733000000004,...",Asia & Oceania,0.0
2,3183875,Tirana,Tirana,"TIA,Terana,Theranda,Tiorana,Tiorána,Tiran,Tira...",41.3275,19.81889,P,PPLC,AL,,...,Member State,ALB,Albania,Europe,Southern Europe,AL,Albanie,POLYGON ((20.071420000000046 42.56091000000003...,Europe,0.285643
3,616052,Yerevan,Yerevan,"Ayrivan,Djerevan,EVN,Eireavan,Eireaván,Ereban,...",40.18111,44.51361,P,PPLC,AM,,...,Member State,ARM,Armenia,Asia,Western Asia,AM,Arménie,POLYGON ((46.540380000000084 38.87559000000004...,Asia & Oceania,0.156981
4,2240449,Luanda,Luanda,"LAD,Loanda,Louanda,Louanta,Luand,Luanda,Luanda...",-8.83682,13.23432,P,PPLC,AO,,...,Member State,AGO,Angola,Africa,Middle Africa,AO,Angola,MULTIPOLYGON (((23.986210000000085 -10.8704599...,Africa,0.0


In [8]:
# TODO: temporary workaround — Taiwan rows are dropped for now; find a way to
# fill in this country's data instead of excluding it.
# `.copy()` makes the filtered frame independent of the frame it was sliced
# from, so later column assignments on `merged_df` do not trigger pandas'
# SettingWithCopyWarning.
merged_df = merged_df[merged_df['Country'] != 'Taiwan'].copy()

In [9]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` computed over the
        observations whose true value is non-zero, or ``nan`` when there is
        no such observation.
    """
    # Coerce to plain arrays so the subtraction is positional; with two pandas
    # objects it would align on index labels instead.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # The percentage error is undefined where the true value is zero; skipping
    # those observations avoids an inf/nan result from division by zero.
    nonzero = actual != 0
    if not nonzero.any():
        return np.nan

    relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    return np.mean(np.abs(relative_error)) * 100

def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and predict on the test features.

    ``model`` must expose ``fit(X, y)`` and ``predict(X)``; it is fitted in
    place. Returns a ``(predictions, y_test)`` tuple where ``y_test`` is the
    object that was passed in, handed back unchanged.
    """
    model.fit(X_train, y_train)
    return model.predict(X_test), y_test

def traditional_train_test_split(X, y, model_class, model_name, test_size=0.2, random_state=42):
    """Evaluate a model on a single random train/test split.

    Parameters
    ----------
    X : features passed to ``train_test_split``.
    y : target values passed to ``train_test_split``.
    model_class : callable
        Called with no arguments to build the estimator, so the estimator
        always uses its default hyper-parameters.
    model_name : str
        Not used by this function; kept so existing callers keep working.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split``; the default reproduces the
        previously hard-coded split.

    Returns
    -------
    dict
        Test-set metrics keyed by "MSE", "R²", "MAE", "RMSE", "MAPE" and
        "WAPE" (the last two expressed in percent).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model = model_class()
    y_pred, _ = train_and_evaluate_model(model, X_train, y_train, X_test, y_test)

    # Computed once and reused for RMSE.
    mse = mean_squared_error(y_test, y_pred)

    metrics = {
        "MSE": mse,
        "R²": r2_score(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
        "RMSE": np.sqrt(mse),
        "MAPE": mean_absolute_percentage_error(y_test, y_pred),
        # Weighted absolute percentage error: total absolute error relative to
        # the total absolute true value.
        "WAPE": np.sum(np.abs(y_test - y_pred)) / np.sum(np.abs(y_test)) * 100,
    }
    return metrics

# Predictor columns used by every linear-regression run below, and the short
# label stored in the "X" column of the results table.
LR_FEATURE_COLUMNS = [
    "nearest_hdd",
    "GDP_2022",
    "Urbanization_Rate_2022",
    "latitude",
    "longitude",
    "Population_2023",
    "Paris_Agreement",
]
LR_FEATURE_LABEL = 'HDD | GDP | URB | Lat-Long | Pop | Paris'

# (label stored in the "Y" column, target column in merged_df), in the order
# the result rows are emitted for each region.
EUI_TARGETS = [
    ("Res EUI", "Residential EUI (kWh/m2/year)"),
    ("Non-Res EUI", "Non-residential EUI (kWh/m2/year)"),
]


def _lr_metrics_for_subset(data, region_label):
    """Fit one LinearRegression per EUI target on ``data``.

    Returns a list with one metrics dict per entry of ``EUI_TARGETS``; each
    dict holds the metrics from ``traditional_train_test_split`` plus the
    identifying keys "Region", "Model", "Y" and "X".
    """
    features = data[LR_FEATURE_COLUMNS]
    rows = []
    for target_label, target_column in EUI_TARGETS:
        metrics = traditional_train_test_split(
            features,
            data[target_column],
            LinearRegression,
            f"LR ({target_label}, {LR_FEATURE_LABEL})",
        )
        metrics['Region'] = region_label
        metrics['Model'] = 'LR'
        metrics['Y'] = target_label
        metrics['X'] = LR_FEATURE_LABEL
        rows.append(metrics)
    return rows


# `assign` returns a new frame, so this does not write into a slice of another
# frame (which is what triggers SettingWithCopyWarning) and is safe to re-run.
merged_df = merged_df.assign(Region=merged_df['Region Grouped'])
regions = merged_df["Region"].unique()
metrics_by_region = []

# One pair of rows (residential, non-residential) per region ...
for region in regions:
    region_data = merged_df[merged_df["Region"] == region]
    metrics_by_region.extend(_lr_metrics_for_subset(region_data, region))

# ... followed by the same pair fitted on the whole dataset.
metrics_by_region.extend(_lr_metrics_for_subset(merged_df, 'Total'))

metrics_by_region_df = pd.DataFrame(metrics_by_region).round(2)

# Put the identifying columns first, then the metric columns in their
# existing order.
id_columns = ["Model", "Y", "X", "Region"]
metric_columns = [col for col in metrics_by_region_df.columns if col not in id_columns]
metrics_by_region_df = metrics_by_region_df[id_columns + metric_columns]


In [10]:
# Display the metrics table ordered by target ('Y'). The sorted frame is only
# shown, not assigned, so metrics_by_region_df keeps its original row order.
metrics_by_region_df.sort_values(by='Y')

Unnamed: 0,Model,Y,X,Region,MSE,R²,MAE,RMSE,MAPE,WAPE
1,LR,Non-Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Asia & Oceania,874.26,0.62,21.36,29.57,14.78,14.72
3,LR,Non-Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Europe,4536.99,-0.35,42.32,67.36,25.64,20.1
5,LR,Non-Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Africa,162.32,0.05,4.73,12.74,8.38,4.61
7,LR,Non-Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Central and South America,41.9,0.36,3.85,6.47,4.03,3.7
9,LR,Non-Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Northern America,1285.15,-1.46,26.82,35.85,9.63,10.29
11,LR,Non-Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Total,1652.65,0.58,27.73,40.65,21.26,20.43
0,LR,Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Asia & Oceania,2393.34,0.45,22.08,48.92,17.44,21.21
2,LR,Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Europe,3201.31,0.06,44.0,56.58,29.17,22.46
4,LR,Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Africa,43.05,-1.12,4.94,6.56,7.51,7.36
6,LR,Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Central and South America,151.31,0.6,9.69,12.3,10.58,11.47
