In [1]:
#IMPORTS 
import cdsapi
import pygrib

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

#from geonamescache import GeonamesCache
import pycountry
import rasterio
from scipy.spatial import cKDTree
from tqdm import tqdm
from datetime import datetime

import zipfile
import os
import gdown

## Countries

In [2]:
countries_zip_url = "https://drive.google.com/uc?id=1UQzdO7suT0BnwKBeNybMG97vM9GIDogA"
countries_zip_file_path = "../allCountries.zip"

# Download the ZIP file if it doesn't exist; otherwise, proceed to read the TXT file.
if not os.path.exists(countries_zip_file_path):
    gdown.download(countries_zip_url, countries_zip_file_path, quiet=False)

with zipfile.ZipFile(countries_zip_file_path) as z:
    countries_txt_filename = "allCountries.txt"

    with z.open(countries_txt_filename) as txt_file:
        countries_df = pd.read_csv(txt_file, sep="\t", header=None)

print(f"\nshape: {countries_df.shape}")
countries_df.head()

  countries_df = pd.read_csv(txt_file, sep="\t", header=None)



shape: (12950185, 19)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,2994701,Roc Meler,Roc Meler,"Roc Mele,Roc Meler,Roc Mélé",42.58765,1.7418,T,PK,AD,"AD,FR",02,,,,0,2811.0,2348,Europe/Andorra,2023-10-03
1,3017832,Pic de les Abelletes,Pic de les Abelletes,"Pic de la Font-Negre,Pic de la Font-Nègre,Pic ...",42.52535,1.73343,T,PK,AD,FR,A9,66.0,663.0,66146.0,0,,2411,Europe/Andorra,2014-11-05
2,3017833,Estany de les Abelletes,Estany de les Abelletes,"Estany de les Abelletes,Etang de Font-Negre,Ét...",42.52915,1.73362,H,LK,AD,FR,A9,,,,0,,2260,Europe/Andorra,2014-11-05
3,3023203,Port Vieux de la Coume d’Ose,Port Vieux de la Coume d'Ose,"Port Vieux de Coume d'Ose,Port Vieux de Coume ...",42.62568,1.61823,T,PASS,AD,,00,,,,0,,2687,Europe/Andorra,2014-11-05
4,3029315,Port de la Cabanette,Port de la Cabanette,"Port de la Cabanette,Porteille de la Cabanette",42.6,1.73333,T,PASS,AD,"AD,FR",B3,9.0,91.0,9139.0,0,,2379,Europe/Andorra,2014-11-05


## EUI

In [3]:
eui_url = "https://drive.google.com/uc?id=12qGq_DLefI1RihIF_RKQUyJtm480-xRC"
eui_df = pd.read_csv(eui_url)

print(f"shape: {eui_df.shape}")
eui_df.head()

shape: (482, 5)


Unnamed: 0,City,Geonames ID,Country,Residential EUI (kWh/m2/year),Non-residential EUI (kWh/m2/year)
0,Nha Trang,1572151,Vietnam,59.096065,112.778867
1,Aberdeen,2657832,United Kingdom,231.302877,259.832393
2,Abidjan,2293538,Cote d'Ivoire,73.830819,105.622137
3,Abu Dhabi,292968,United Arab Emirates,128.447899,226.725457
4,Abuja,2352778,Nigeria,63.955819,103.009079


## Data preperation for EUI

In [4]:
merged_df = pd.merge(
    countries_df, eui_df, left_on=0, right_on="Geonames ID", how="inner"
)
merged_df.rename(columns={4: "latitude", 5: "longitude"}, inplace=True)

merged_df

Unnamed: 0,0,1,2,3,latitude,longitude,6,7,8,9,...,14,15,16,17,18,City,Geonames ID,Country,Residential EUI (kWh/m2/year),Non-residential EUI (kWh/m2/year)
0,292968,Abu Dhabi,Abu Dhabi,"A-pu-that-pi,AEbu Saby,AUH,Aboe Dhabi,Abou Dab...",24.45118,54.39696,P,PPLC,AE,,...,1807000,,6,Asia/Dubai,2024-03-27,Abu Dhabi,292968,United Arab Emirates,128.447899,226.725457
1,1138958,Kabul,Kabul,"Cabool,Caboul,Cabul,Cabura,Cabúl,Caubul,KBL,Ka...",34.52813,69.17233,P,PPLC,AF,,...,4434550,,1798,Asia/Kabul,2024-09-05,Kabul,1138958,Afghanistan,213.167026,144.395840
2,3183875,Tirana,Tirana,"TIA,Terana,Theranda,Tiorana,Tiorána,Tiran,Tira...",41.32750,19.81889,P,PPLC,AL,,...,418495,,113,Europe/Tirane,2023-01-01,Tirana,3183875,Albania,133.717672,101.873579
3,616052,Yerevan,Yerevan,"Ayrivan,Djerevan,EVN,Eireavan,Eireaván,Ereban,...",40.18111,44.51361,P,PPLC,AM,,...,1093485,,994,Asia/Yerevan,2023-11-13,Yerevan,616052,Armenia,198.865302,132.124738
4,2240449,Luanda,Luanda,"LAD,Loanda,Louanda,Louanta,Luand,Luanda,Luanda...",-8.83682,13.23432,P,PPLC,AO,,...,2776168,,73,Africa/Luanda,2024-03-26,Luanda,2240449,Angola,65.343750,104.311100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477,1018725,Bloemfontein,Bloemfontein,"BFN,Bloemfontein,Blumfantehjn,Blumfonteina,Blu...",-29.12107,26.21400,P,PPLA,ZA,,...,556000,,1396,Africa/Johannesburg,2022-08-16,Bloemfontein,1018725,South Africa,78.735991,106.046441
478,3369157,Cape Town,Cape Town,"Altepetl In Cabo,Ar Chab,CPT,Cape Toun,Cape To...",-33.92584,18.42322,P,PPLA,ZA,,...,4710000,,25,Africa/Johannesburg,2024-03-27,Cape Town,3369157,South Africa,71.455819,104.301427
479,909137,Lusaka,Lusaka,"LUN,Lousaka,Louzaka,Lusaca,Lusak,Lusaka,Lusako...",-15.40669,28.28713,P,PPLC,ZM,,...,1267440,,1277,Africa/Lusaka,2019-09-05,Lusaka,909137,Zambia,75.533405,105.500787
480,890299,Harare,Harare,"Arare,Charare,HRE,Harare,Hararensis Urbs,Harar...",-17.82772,31.05337,P,PPLC,ZW,,...,1542813,,1494,Africa/Harare,2019-09-05,Harare,890299,Zimbabwe,69.613147,102.108799


## Region

In [5]:
merged_df['Region'] = merged_df[17].str.split('/').str[0]

region_mapping = {
    'Asia': 'Asia & Oceania',
    'Africa': 'Africa',
    'Europe': 'Europe',
    'America': 'America',
    'Pacific': 'Asia & Oceania',
    'Indian': 'Asia & Oceania',
    'Australia': 'Asia & Oceania',
    'Atlantic': 'Atlantic'
}

merged_df['Region Grouped'] = merged_df['Region'].map(region_mapping)

merged_df.loc[merged_df[1] == 'Praia', 'Region Grouped'] = 'Africa'

In [6]:
merged_df['Region Grouped'].value_counts()

Asia & Oceania    187
Africa            123
Europe             88
America            84
Name: Region Grouped, dtype: int64

In [7]:
merged_df["nearest_hdd"] = np.nan
latitude = merged_df["latitude"]

# Temperature

### Note: The following tempreture data is downloaded via API. I chooes the data as 2m temperature, 2023, April, 22, 3:00, NetCDF.

### Here is the API: 
```
import cdsapi

c = cdsapi.Client()

c.retrieve(
    'reanalysis-era5-single-levels',
    {
        'product_type': 'reanalysis',
        'format': 'netcdf',
        'variable': '2m_temperature',
        'year': '2023',
        'month': '04',
        'day': '22',
        'time': '03:00',
    },
    'download.nc')
```

In [8]:
''' 
c = cdsapi.Client()

c.retrieve(
    "reanalysis-era5-single-levels",
    {
        "product_type": "reanalysis",
        "format": "netcdf",
        "variable": "2m_temperature",
        "year": "2023",
        "month": "04",
        "day": "22",
        "time": "03:00",
    },
    "download.nc",
)
'''

' \nc = cdsapi.Client()\n\nc.retrieve(\n    "reanalysis-era5-single-levels",\n    {\n        "product_type": "reanalysis",\n        "format": "netcdf",\n        "variable": "2m_temperature",\n        "year": "2023",\n        "month": "04",\n        "day": "22",\n        "time": "03:00",\n    },\n    "download.nc",\n)\n'

In [9]:
temp_thresh = 18
file_path = "./download.nc"


def get_hdd(temperature, temp_thresh):
    keep = temperature < temp_thresh
    hdd = np.zeros(temperature.shape)
    hdd[keep] = temp_thresh - temperature[keep]
    return hdd / 24  # Convert from heating degree hours to degree days


with rasterio.open(file_path) as src:
    celsius_factor = -273.15
    temp_conversion = lambda x: x + celsius_factor

    # blocky, blockx = src.block_shapes[0]
    n_lon = src.width
    n_lat = src.height
    left = src.bounds.left
    right = src.bounds.right
    bottom = src.bounds.bottom
    top = src.bounds.top
    new_count = 1
    #new_crs = rasterio.crs.CRS.from_string("EPSG:4326")
    new_crs = {'init': 'epsg:4326'}
    new_transform = rasterio.transform.from_bounds(
        left, bottom, right, top, n_lon, n_lat
    )

    out_profile = {
        "driver": "GTiff",
        "dtype": "float64",
        "nodata": None,
        "width": n_lon,
        "height": n_lat,
        "count": new_count,
        "crs": new_crs,
        "transform": new_transform,
        "tiled": False,
        "interleave": "band",
        # 'blockxsize': blockx,
        # 'blockysize': blocky,
        "compress": "lzw",
        "driver": "GTiff",
        "dtype": "float64",
        "interleave": "band",
        "nodata": 99999999,
        # 'tiled': True}
    }

    yearly_hdd = np.zeros((n_lat, n_lon))

    for i in tqdm(src.indexes):
        # Read the data from this timepoint
        raw_data = src.read(i)
        # Get the quarter from which the data are from
        time = int(src.tags(i)["NETCDF_DIM_valid_time"])
        # Convert the temperature data to Celcius
        date_time = datetime.utcfromtimestamp(time)
        year = date_time.year

        temperature = temp_conversion(raw_data)
        # Compute the HDDs
        chdd = get_hdd(temperature, temp_thresh)

        yearly_hdd += chdd

        # # Accumulate HDDs only into the corresponding quarter----modify to
        # hdd_quarter[quarter] += chdd
lon_arr = np.linspace(left, right, n_lon)
lat_arr = np.linspace(bottom, top, n_lat)
lon_grid, lat_grid = np.meshgrid(lon_arr, lat_arr)
lon_flat = lon_grid.ravel()
lat_flat = lat_grid.ravel()
hdd_flat = yearly_hdd.ravel()

100%|██████████| 1/1 [00:00<00:00, 37.51it/s]


In [10]:
# get nearest point with KDtree
tree = cKDTree(np.column_stack((lat_flat, lon_flat)))

for index, row in tqdm(merged_df.iterrows(), total=merged_df.shape[0]):
    latitude = row["latitude"]
    longitude = row["longitude"]
    # latitude = 45.0
    # longitude = -75.0
    # Query the tree for the nearest point
    dist, idx = tree.query([latitude, longitude])
    # Get the nearest HDD value
    merged_df.at[index, "nearest_hdd"] = hdd_flat[idx]

100%|██████████| 482/482 [00:00<00:00, 13462.89it/s]


## GDP

In [11]:
gdp_data_url = (
    "https://drive.google.com/uc?id=160t-E-kILHcjXzVnFTwD-Us1ulLNtA88&export=download"
)
gdp_data = pd.read_csv(gdp_data_url)

country_gdp = gdp_data[gdp_data["Level"] == "National"][["Country", "2022"]].rename(
    columns={"2022": "GDP_2022"}
)

country_gdp['Country'] = country_gdp['Country'].replace({
    'Argentina urban': 'Argentina',
    'Chili': 'Chile',
    'Russian Federation': 'Russia',
    'United States': 'United States of America',
    'Congo Democratic Republic' : 'Congo, Democratic Republic of the',
    'Congo Brazzaville':'Congo, Republic of the',
    'Saint Lucia':'St. Lucia',
    'Myanmar': 'Burma',
    'Sao Tome & Principe':'Sao Tome and Principe',
    'Timor Leste': 'East Timor',
    'Gambia':'Gambia, The',
    'Guinea Bissau':'Guinea-Bissau',
    'Trinidad & Tobago' :'Trinidad and Tobago',
    'Eswatini':'Swaziland',
    'Central African Republic CAR': 'Central African Republic',
    'Lao':'Laos',
    
    # montenegro independece 2006 from Serbia
  
})

# Perform a left merge to keep all rows from the original dataset
merged_df = pd.merge(
    merged_df, country_gdp, on="Country", how="left"
)

merged_df[merged_df['GDP_2022'].isna()]

Unnamed: 0,0,1,2,3,latitude,longitude,6,7,8,9,...,18,City,Geonames ID,Country,Residential EUI (kWh/m2/year),Non-residential EUI (kWh/m2/year),Region,Region Grouped,nearest_hdd,GDP_2022
295,3193044,Podgorica,Podgorica,"Birziminium,Padgoryca,Podgairitse,Podgairítse,...",42.44111,19.26361,P,PPLC,ME,,...,2023-06-14,Podgorica,3193044,Montenegro,83.06681,106.449024,Europe,Europe,0.276447,
426,1668341,Taipei,Taipei,"GJai Bac,Kota Taipei,Pan Kiao,Pan-ch'iao,Pan-c...",25.05306,121.52639,P,PPLC,TW,,...,2024-09-13,Taipei,1668341,Taiwan,115.34282,189.889419,Asia,Asia & Oceania,0.0,


In [12]:
merged_df.head()

Unnamed: 0,0,1,2,3,latitude,longitude,6,7,8,9,...,18,City,Geonames ID,Country,Residential EUI (kWh/m2/year),Non-residential EUI (kWh/m2/year),Region,Region Grouped,nearest_hdd,GDP_2022
0,292968,Abu Dhabi,Abu Dhabi,"A-pu-that-pi,AEbu Saby,AUH,Aboe Dhabi,Abou Dab...",24.45118,54.39696,P,PPLC,AE,,...,2024-03-27,Abu Dhabi,292968,United Arab Emirates,128.447899,226.725457,Asia,Asia & Oceania,0.0,0.937
1,1138958,Kabul,Kabul,"Cabool,Caboul,Cabul,Cabura,Cabúl,Caubul,KBL,Ka...",34.52813,69.17233,P,PPLC,AF,,...,2024-09-05,Kabul,1138958,Afghanistan,213.167026,144.39584,Asia,Asia & Oceania,0.0,0.462
2,3183875,Tirana,Tirana,"TIA,Terana,Theranda,Tiorana,Tiorána,Tiran,Tira...",41.3275,19.81889,P,PPLC,AL,,...,2023-01-01,Tirana,3183875,Albania,133.717672,101.873579,Europe,Europe,0.285643,0.789
3,616052,Yerevan,Yerevan,"Ayrivan,Djerevan,EVN,Eireavan,Eireaván,Ereban,...",40.18111,44.51361,P,PPLC,AM,,...,2023-11-13,Yerevan,616052,Armenia,198.865302,132.124738,Asia,Asia & Oceania,0.156981,0.786
4,2240449,Luanda,Luanda,"LAD,Loanda,Louanda,Louanta,Luand,Luanda,Luanda...",-8.83682,13.23432,P,PPLC,AO,,...,2024-03-26,Luanda,2240449,Angola,65.34375,104.3111,Africa,Africa,0.0,0.591


In [13]:
# Load the Urbanization Rate dataset, skipping metadata rows if necessary
urbanization_rate_url = (
    "https://drive.google.com/uc?id=1YteyPHAWnJUKG0LWogS98EYnwjRTeZDf&export=download"
)
urbanization_rate = pd.read_csv(urbanization_rate_url, skiprows=4)

urbanization_rate['Country Name'] = urbanization_rate['Country Name'].replace({
    'Argentina urban': 'Argentina',
    'Chili': 'Chile',
    'Russian Federation': 'Russia',
    'United States': 'United States of America',
    'Congo Democratic Republic' : 'Congo, Democratic Republic of the',
    'Congo, Dem. Rep.':'Congo, Democratic Republic of the',
    'Congo Brazzaville':'Congo, Republic of the',
    'Congo, Rep.':'Congo, Republic of the',
    'Saint Lucia':'St. Lucia',
    'Myanmar': 'Burma',
    'Sao Tome & Principe':'Sao Tome and Principe',
    'Timor Leste': 'East Timor',
    'Gambia':'Gambia, The',
    'Guinea Bissau':'Guinea-Bissau',
    'Trinidad & Tobago' :'Trinidad and Tobago',
    'Eswatini':'Swaziland',
    'Central African Republic CAR': 'Central African Republic',
    'Lao':'Laos',
    'Egypt, Arab Rep.':'Egypt',
    'Iran, Islamic Rep.':'Iran',
    'Korea, Rep.':'South Korea',
    'Syrian Arab Republic':'Syria',
    'Turkiye':'Turkey',
    'Viet Nam':'Vietnam',
    'Venezuela, RB':'Venezuela',
    'Yemen, Rep.':'Yemen',
    'Czechia':'Czech Republic'
    # montenegro independece 2006 from Serbia

  
})

# Keep only relevant columns: 'Country Name' and '2022'
urbanization_rate_2022 = urbanization_rate[["Country Name", "2022"]].rename(
    columns={"Country Name": "Country", "2022": "Urbanization_Rate_2022"}
)

# 
# Step 6: Perform a second left merge to add the urbanization rate feature
merged_df = pd.merge(
    merged_df, urbanization_rate_2022, on="Country", how="left"
)


In [14]:
# Step 7: Check for rows with missing Urbanization Rate data
missing_urbanization_rate = merged_df[
    merged_df["Urbanization_Rate_2022"].isna()
]
 
missing_urbanization_rate

Unnamed: 0,0,1,2,3,latitude,longitude,6,7,8,9,...,City,Geonames ID,Country,Residential EUI (kWh/m2/year),Non-residential EUI (kWh/m2/year),Region,Region Grouped,nearest_hdd,GDP_2022,Urbanization_Rate_2022
95,3374333,Praia,Praia,"Braia,Cidade da Praia,Municipio da Praia,Munic...",14.93152,-23.51254,P,PPLC,CV,,...,Praia,3374333,Cape Verde,66.011853,104.377117,Atlantic,Africa,0.0,0.661,
268,1528675,Bishkek,Bishkek,"Bichkek,Biscecum,Bischkek,Bishkek,Bishkek osh,...",42.87,74.59,P,PPLC,KG,,...,Bishkek,1528675,Kyrgyzstan,249.953664,150.607926,Asia,Asia & Oceania,0.34383,0.701,
278,1651944,Vientiane,Vientiane,"Bientian,Fanhyiengh,Nakhon Viangchan,V'ent'jan...",17.96667,102.6,P,PPLC,LA,,...,Vientiane,1651944,Laos,68.064655,111.200553,Asia,Asia & Oceania,0.0,0.62,
401,724443,Košice,Kosice,"Cassovia,KSC,Kaschau,Kassa,Koesice,Koshice,Kos...",48.71395,21.25808,P,PPLA,SK,,...,Kosice,724443,Slovakia,219.991096,285.172131,Europe,Europe,0.670328,0.855,
402,3060972,Bratislava,Bratislava,"An Bhrataslaiv,An Bhratasláiv,BTS,Baratislawa,...",48.14816,17.10674,P,PPLC,SK,,...,Bratislava,3060972,Slovakia,201.402857,279.080585,Europe,Europe,0.604735,0.855,
418,1645457,Dili,Dili,"DIL,Delhi,Dilhi,Dili,Dilis,Dilium,Dilli,Dilly,...",-8.55861,125.57361,P,PPLC,TL,,...,Dili,1645457,East Timor,88.320043,111.436996,Asia,Asia & Oceania,0.0,0.566,
426,1668341,Taipei,Taipei,"GJai Bac,Kota Taipei,Pan Kiao,Pan-ch'iao,Pan-c...",25.05306,121.52639,P,PPLC,TW,,...,Taipei,1668341,Taiwan,115.34282,189.889419,Asia,Asia & Oceania,0.0,,


## Models

In [15]:
merged_df = merged_df.dropna(subset=["GDP_2022", "Urbanization_Rate_2022"])
merged_df.head()


Unnamed: 0,0,1,2,3,latitude,longitude,6,7,8,9,...,City,Geonames ID,Country,Residential EUI (kWh/m2/year),Non-residential EUI (kWh/m2/year),Region,Region Grouped,nearest_hdd,GDP_2022,Urbanization_Rate_2022
0,292968,Abu Dhabi,Abu Dhabi,"A-pu-that-pi,AEbu Saby,AUH,Aboe Dhabi,Abou Dab...",24.45118,54.39696,P,PPLC,AE,,...,Abu Dhabi,292968,United Arab Emirates,128.447899,226.725457,Asia,Asia & Oceania,0.0,0.937,87.543
1,1138958,Kabul,Kabul,"Cabool,Caboul,Cabul,Cabura,Cabúl,Caubul,KBL,Ka...",34.52813,69.17233,P,PPLC,AF,,...,Kabul,1138958,Afghanistan,213.167026,144.39584,Asia,Asia & Oceania,0.0,0.462,26.616
2,3183875,Tirana,Tirana,"TIA,Terana,Theranda,Tiorana,Tiorána,Tiran,Tira...",41.3275,19.81889,P,PPLC,AL,,...,Tirana,3183875,Albania,133.717672,101.873579,Europe,Europe,0.285643,0.789,63.799
3,616052,Yerevan,Yerevan,"Ayrivan,Djerevan,EVN,Eireavan,Eireaván,Ereban,...",40.18111,44.51361,P,PPLC,AM,,...,Yerevan,616052,Armenia,198.865302,132.124738,Asia,Asia & Oceania,0.156981,0.786,63.573
4,2240449,Luanda,Luanda,"LAD,Loanda,Louanda,Louanta,Luand,Luanda,Luanda...",-8.83682,13.23432,P,PPLC,AO,,...,Luanda,2240449,Angola,65.34375,104.3111,Africa,Africa,0.0,0.591,68.081


In [16]:
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [17]:
X_knn = merged_df[["latitude", "longitude"]]
X_lr = merged_df[["nearest_hdd", "GDP_2022","Urbanization_Rate_2022"]]
y_residential = merged_df["Residential EUI (kWh/m2/year)"]
y_non_residential = merged_df["Non-residential EUI (kWh/m2/year)"]

def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return y_pred, y_test

def region_validation(X, y, model_class, model_name):
    regions = merged_df['Region Grouped'].unique()
    y_test_all = np.array([])
    y_pred_all = np.array([])

    for region in regions:
        mask = merged_df['Region Grouped'] == region
        X_train = X[mask]
        y_train = y[mask]
        X_test = X[~mask]
        y_test = y[~mask]

        model = model_class()
        y_pred, _ = train_and_evaluate_model(model, X_train, y_train, X_test, y_test)

        y_test_all = np.concatenate((y_test_all, y_test))
        y_pred_all = np.concatenate((y_pred_all, y_pred))

    metrics = {
        'MSE': mean_squared_error(y_test_all, y_pred_all),
        'R²': r2_score(y_test_all, y_pred_all),
        'MAE': mean_absolute_error(y_test_all, y_pred_all),
        'RMSE': np.sqrt(mean_squared_error(y_test_all, y_pred_all)),
        'MAPE': mean_absolute_percentage_error(y_test_all, y_pred_all),
        'WAPE': np.sum(np.abs(y_test_all - y_pred_all)) / np.sum(np.abs(y_test_all)) * 100
    }
    metrics['Model'] = model_name
    return metrics

def k_fold_validation(X, y, model_class, model_name, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    y_test_all = np.array([])
    y_pred_all = np.array([])

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = model_class()
        y_pred, _ = train_and_evaluate_model(model, X_train, y_train, X_test, y_test)

        y_test_all = np.concatenate((y_test_all, y_test))
        y_pred_all = np.concatenate((y_pred_all, y_pred))

    metrics = {
        'MSE': mean_squared_error(y_test_all, y_pred_all),
        'R²': r2_score(y_test_all, y_pred_all),
        'MAE': mean_absolute_error(y_test_all, y_pred_all),
        'RMSE': np.sqrt(mean_squared_error(y_test_all, y_pred_all)),
        'MAPE': mean_absolute_percentage_error(y_test_all, y_pred_all),
        'WAPE': np.sum(np.abs(y_test_all - y_pred_all)) / np.sum(np.abs(y_test_all)) * 100
    }
    metrics['Model'] = f'K-Fold {model_name}'
    return metrics

metrics_residential = []
metrics_non_residential = []

metrics_residential.append(region_validation(X_knn, y_residential, KNeighborsRegressor, 'KNN (Res EUI, Lat-Long) - Regional'))
metrics_residential.append(region_validation(X_lr, y_residential, LinearRegression, 'LR (Res EUI, HDD|GDP|URB) - Regional'))
metrics_non_residential.append(region_validation(X_knn, y_non_residential, KNeighborsRegressor, 'KNN (Non-Res EUI, Lat-Long) - Regional'))
metrics_non_residential.append(region_validation(X_lr, y_non_residential, LinearRegression, 'LR (Non-Res EUI, HDD|GDP|URB) - Regional'))

n_regions = merged_df['Region Grouped'].nunique()
metrics_residential.append(k_fold_validation(X_knn, y_residential, KNeighborsRegressor, 'KNN (Res EUI, Lat-Long) - K-Fold', n_splits=n_regions))
metrics_residential.append(k_fold_validation(X_lr, y_residential, LinearRegression, 'LR (Res EUI, HDD|GDP|URB) - K-Fold', n_splits=n_regions))
metrics_non_residential.append(k_fold_validation(X_knn, y_non_residential, KNeighborsRegressor, 'KNN (Non-Res EUI, Lat-Long) - K-Fold', n_splits=n_regions))
metrics_non_residential.append(k_fold_validation(X_lr, y_non_residential, LinearRegression, 'LR (Non-Res EUI, HDD|GDP|URB) - K-Fold', n_splits=n_regions))

metrics_residential_df = pd.DataFrame(metrics_residential).round(2)
metrics_non_residential_df = pd.DataFrame(metrics_non_residential).round(2)

metrics_residential_df = metrics_residential_df[['Model'] + [col for col in metrics_residential_df.columns if col != 'Model']]
metrics_non_residential_df = metrics_non_residential_df[['Model'] + [col for col in metrics_non_residential_df.columns if col != 'Model']]


In [18]:
metrics_residential_df 

Unnamed: 0,Model,MSE,R²,MAE,RMSE,MAPE,WAPE
0,"KNN (Res EUI, Lat-Long) - Regional",6486.78,-0.29,56.22,80.54,52.52,51.59
1,"LR (Res EUI, HDD|GDP|URB) - Regional",4134.94,0.18,43.41,64.3,40.38,39.83
2,"K-Fold KNN (Res EUI, Lat-Long) - K-Fold",1866.5,0.63,22.65,43.2,17.91,20.78
3,"K-Fold LR (Res EUI, HDD|GDP|URB) - K-Fold",2004.96,0.6,29.17,44.78,28.49,26.77


In [19]:
metrics_non_residential_df

Unnamed: 0,Model,MSE,R²,MAE,RMSE,MAPE,WAPE
0,"KNN (Non-Res EUI, Lat-Long) - Regional",5813.17,-0.32,50.94,76.24,32.54,35.73
1,"LR (Non-Res EUI, HDD|GDP|URB) - Regional",6635.47,-0.5,57.74,81.46,43.06,40.5
2,"K-Fold KNN (Non-Res EUI, Lat-Long) - K-Fold",1912.67,0.57,19.93,43.73,11.82,13.98
3,"K-Fold LR (Non-Res EUI, HDD|GDP|URB) - K-Fold",2051.51,0.54,28.44,45.29,19.99,19.94
