In [1]:
#IMPORTS 
import cdsapi
import pygrib

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

#from geonamescache import GeonamesCache
import pycountry
import rasterio
from scipy.spatial import cKDTree
from tqdm import tqdm
from datetime import datetime

import zipfile
import os
import gdown

## Countries

In [2]:
countries_zip_url = "https://drive.google.com/uc?id=1UQzdO7suT0BnwKBeNybMG97vM9GIDogA"
countries_zip_file_path = "../allCountries.zip"
countries_txt_filename = "allCountries.txt"

# Fetch the archive only when no local copy exists; later runs reuse the file.
if not os.path.exists(countries_zip_file_path):
    gdown.download(countries_zip_url, countries_zip_file_path, quiet=False)

# Read the tab-separated table straight out of the archive. The file is read
# without a header row, so its columns are addressed by integer position.
with zipfile.ZipFile(countries_zip_file_path) as z, z.open(countries_txt_filename) as txt_file:
    countries_df = pd.read_csv(txt_file, sep="\t", header=None)

print(f"\nshape: {countries_df.shape}")
countries_df.head()

## EUI

In [None]:
# Load the EUI table as CSV directly from its Google Drive download link.
eui_url = "https://drive.google.com/uc?id=12qGq_DLefI1RihIF_RKQUyJtm480-xRC"
eui_df = pd.read_csv(eui_url)

# Report the table dimensions, then preview the first rows.
print(f"shape: {eui_df.shape}")
eui_df.head()

## Data preparation for EUI

In [None]:
# Attach place attributes to each EUI record: column 0 of the countries table
# is matched against the EUI "Geonames ID" column, and the inner join keeps
# only rows present in both tables. Positional columns 4 and 5 are then given
# readable names.
merged_df = countries_df.merge(
    eui_df, left_on=0, right_on="Geonames ID", how="inner"
).rename(columns={4: "latitude", 5: "longitude"})

merged_df

## Region

In [5]:
# The text before the first '/' in column 17 serves as the raw region label.
merged_df['Region'] = merged_df[17].str.split('/').str[0]

# Raw label -> coarser group, written as "group: member labels" and inverted.
# Labels missing from this table become NaN in 'Region Grouped'.
region_mapping = {
    label: group
    for group, labels in {
        'Asia & Oceania': ('Asia', 'Pacific', 'Indian', 'Australia'),
        'Africa': ('Africa',),
        'Europe': ('Europe',),
        'America': ('America',),
        'Atlantic': ('Atlantic',),
    }.items()
    for label in labels
}

merged_df['Region Grouped'] = merged_df['Region'].map(region_mapping)

# Rows named 'Praia' (column 1) are placed in Africa regardless of their mapped group.
is_praia = merged_df[1] == 'Praia'
merged_df.loc[is_praia, 'Region Grouped'] = 'Africa'

In [None]:
# Number of rows per grouped region (value_counts leaves NaN groups out by default).
merged_df['Region Grouped'].value_counts()

In [7]:
# Placeholder column, all NaN here; the nearest-point lookup cell fills it in later.
merged_df["nearest_hdd"] = np.nan
# NOTE(review): this Series does not appear to be read before `latitude` is
# reassigned inside the nearest-point loop further down — confirm it is needed.
latitude = merged_df["latitude"]

# Temperature

### Note: The following temperature data is downloaded via the CDS API. I chose the 2m temperature field for 22 April 2023 at 03:00, in NetCDF format.

### Here is the API: 
```
import cdsapi

c = cdsapi.Client()

c.retrieve(
    'reanalysis-era5-single-levels',
    {
        'product_type': 'reanalysis',
        'format': 'netcdf',
        'variable': '2m_temperature',
        'year': '2023',
        'month': '04',
        'day': '22',
        'time': '03:00',
    },
    'download.nc')
```

In [None]:
# The CDS download request is kept inside a string literal, so it is not
# executed when the notebook runs. The request names "download.nc" as its target.
''' 
c = cdsapi.Client()

c.retrieve(
    "reanalysis-era5-single-levels",
    {
        "product_type": "reanalysis",
        "format": "netcdf",
        "variable": "2m_temperature",
        "year": "2023",
        "month": "04",
        "day": "22",
        "time": "03:00",
    },
    "download.nc",
)
'''

In [None]:
# Base temperature for the heating-degree computation; it is compared against
# temperatures after the -273.15 shift applied in the raster loop below.
temp_thresh = 18
# Raster file opened with rasterio below.
file_path = "./download.nc"


def get_hdd(temperature, temp_thresh):
    """Return the per-cell heating deficit below a base temperature, divided by 24.

    Parameters
    ----------
    temperature : numpy.ndarray
        Temperatures, in the same unit as ``temp_thresh``.
    temp_thresh : float
        Base temperature; only cells strictly below it contribute.

    Returns
    -------
    numpy.ndarray
        float64 array shaped like ``temperature`` holding
        ``(temp_thresh - temperature) / 24`` where the cell is colder than the
        base temperature and 0 elsewhere (NaN cells also yield 0).
    """
    below_base = temperature < temp_thresh
    # Deficit where colder than the base temperature, zero everywhere else.
    # The explicit cast keeps the result float64 whatever the input dtype is.
    deficit = np.where(below_base, temp_thresh - temperature, 0).astype(np.float64)
    return deficit / 24  # Convert from heating degree hours to degree days


# Accumulate heating degree days over every band of the raster, then build the
# flattened coordinate arrays that pair each grid cell with its HDD value.
with rasterio.open(file_path) as src:
    celsius_factor = -273.15
    temp_conversion = lambda x: x + celsius_factor  # shift raw values to Celsius

    n_lon = src.width
    n_lat = src.height
    left = src.bounds.left
    right = src.bounds.right
    bottom = src.bounds.bottom
    top = src.bounds.top
    new_count = 1
    new_crs = {'init': 'epsg:4326'}
    new_transform = rasterio.transform.from_bounds(
        left, bottom, right, top, n_lon, n_lat
    )

    # Profile for a single-band float64 GeoTIFF on the same grid. It is not
    # consumed in this cell. Every key is listed exactly once: with repeated
    # keys in a dict literal the last value silently wins.
    out_profile = {
        "driver": "GTiff",
        "dtype": "float64",
        "nodata": 99999999,
        "width": n_lon,
        "height": n_lat,
        "count": new_count,
        "crs": new_crs,
        "transform": new_transform,
        "tiled": False,
        "interleave": "band",
        "compress": "lzw",
    }

    yearly_hdd = np.zeros((n_lat, n_lon))

    for i in tqdm(src.indexes):
        # Read one band, shift it to Celsius and add its HDD contribution.
        raw_data = src.read(i)
        temperature = temp_conversion(raw_data)
        chdd = get_hdd(temperature, temp_thresh)

        yearly_hdd += chdd

# Coordinates of the pixel centres. Raster row 0 lies at `top`, so latitudes
# must run from top to bottom to stay aligned with the rows of `yearly_hdd`;
# running them bottom-to-top pairs every value with the mirrored latitude.
# The bounds are pixel edges, hence the half-pixel offset to reach the centres.
pixel_width = (right - left) / n_lon
pixel_height = (top - bottom) / n_lat
lon_arr = left + (np.arange(n_lon) + 0.5) * pixel_width
lat_arr = top - (np.arange(n_lat) + 0.5) * pixel_height
lon_grid, lat_grid = np.meshgrid(lon_arr, lat_arr)
lon_flat = lon_grid.ravel()
lat_flat = lat_grid.ravel()
hdd_flat = yearly_hdd.ravel()

In [None]:
# Nearest-grid-point lookup with a KD-tree built over (lat, lon) pairs.
tree = cKDTree(np.column_stack((lat_flat, lon_flat)))

query_lat = merged_df["latitude"].to_numpy(dtype=float)
query_lon = merged_df["longitude"].to_numpy(dtype=float)

# Longitudes are periodic: when a place lies west of the grid but its
# longitude + 360 falls inside the grid (e.g. a grid stored as 0..360), query
# with the shifted value so it does not snap to the grid's western edge.
# For a grid that already covers the place this changes nothing.
shifted_lon = query_lon + 360
needs_wrap = (query_lon < lon_flat.min()) & (shifted_lon <= lon_flat.max())
query_lon = np.where(needs_wrap, shifted_lon, query_lon)

# One batched query for all rows instead of a Python-level loop over rows.
dist, idx = tree.query(np.column_stack((query_lat, query_lon)))

# HDD value of the nearest grid point for every row.
merged_df["nearest_hdd"] = hdd_flat[idx]

## GDP

In [None]:
gdp_data_url = (
    "https://drive.google.com/uc?id=160t-E-kILHcjXzVnFTwD-Us1ulLNtA88&export=download"
)
gdp_data = pd.read_csv(gdp_data_url)

# Keep national-level rows only, reduced to the country name and its 2022 value.
is_national = gdp_data["Level"] == "National"
country_gdp = gdp_data.loc[is_national, ["Country", "2022"]].rename(
    columns={"2022": "GDP_2022"}
)

# Alternative spellings rewritten before the join — presumably to match the
# names used in merged_df's "Country" column (verify against that column).
gdp_country_aliases = {
    'Argentina urban': 'Argentina',
    'Chili': 'Chile',
    'Russian Federation': 'Russia',
    'United States': 'United States of America',
    'Congo Democratic Republic': 'Congo, Democratic Republic of the',
    'Congo Brazzaville': 'Congo, Republic of the',
    'Saint Lucia': 'St. Lucia',
    'Myanmar': 'Burma',
    'Sao Tome & Principe': 'Sao Tome and Principe',
    'Timor Leste': 'East Timor',
    'Gambia': 'Gambia, The',
    'Guinea Bissau': 'Guinea-Bissau',
    'Trinidad & Tobago': 'Trinidad and Tobago',
    'Eswatini': 'Swaziland',
    'Central African Republic CAR': 'Central African Republic',
    'Lao': 'Laos',
    # Montenegro: independent from Serbia since 2006
}
country_gdp['Country'] = country_gdp['Country'].replace(gdp_country_aliases)

# Left join: every row of merged_df is kept, with NaN GDP where no country matches.
merged_df = merged_df.merge(country_gdp, on="Country", how="left")

# Rows that did not receive a GDP value.
merged_df.loc[merged_df['GDP_2022'].isna()]

In [None]:
# Preview the first rows of the table after the GDP column was added.
merged_df.head()

In [13]:
# Load the Urbanization Rate dataset; the first four lines of the file are skipped.
urbanization_rate_url = (
    "https://drive.google.com/uc?id=1YteyPHAWnJUKG0LWogS98EYnwjRTeZDf&export=download"
)
urbanization_rate = pd.read_csv(urbanization_rate_url, skiprows=4)

# Alternative spellings rewritten before the join — presumably to match the
# names used in merged_df's "Country" column (verify against that column).
urbanization_country_aliases = {
    'Argentina urban': 'Argentina',
    'Chili': 'Chile',
    'Russian Federation': 'Russia',
    'United States': 'United States of America',
    'Congo Democratic Republic': 'Congo, Democratic Republic of the',
    'Congo, Dem. Rep.': 'Congo, Democratic Republic of the',
    'Congo Brazzaville': 'Congo, Republic of the',
    'Congo, Rep.': 'Congo, Republic of the',
    'Saint Lucia': 'St. Lucia',
    'Myanmar': 'Burma',
    'Sao Tome & Principe': 'Sao Tome and Principe',
    'Timor Leste': 'East Timor',
    'Gambia': 'Gambia, The',
    'Guinea Bissau': 'Guinea-Bissau',
    'Trinidad & Tobago': 'Trinidad and Tobago',
    'Eswatini': 'Swaziland',
    'Central African Republic CAR': 'Central African Republic',
    'Lao': 'Laos',
    'Egypt, Arab Rep.': 'Egypt',
    # Montenegro: independent from Serbia since 2006
}
urbanization_rate['Country Name'] = urbanization_rate['Country Name'].replace(
    urbanization_country_aliases
)

# Keep only the country name and the 2022 value, renamed for the join.
urbanization_rate_2022 = urbanization_rate.loc[:, ["Country Name", "2022"]].rename(
    columns={"Country Name": "Country", "2022": "Urbanization_Rate_2022"}
)

# Step 6: second left join, adding the urbanization rate as a feature.
merged_df = merged_df.merge(urbanization_rate_2022, on="Country", how="left")


In [None]:
# Step 7: rows that have no urbanization rate after the join
has_no_urbanization_rate = merged_df["Urbanization_Rate_2022"].isna()
missing_urbanization_rate = merged_df.loc[has_no_urbanization_rate]

missing_urbanization_rate

In [None]:
# List the column labels of the merged table.
merged_df.columns

## Models

In [None]:
# Discard every row that lacks a GDP value or an urbanization rate.
merged_df = merged_df.dropna(
    subset=["GDP_2022", "Urbanization_Rate_2022"],
    how="any",
)
merged_df.head()


In [18]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Each absolute error is taken relative to the corresponding ``y_true``
    value, the ratios are averaged, and the mean is scaled by 100. Inputs must
    support element-wise arithmetic (e.g. NumPy arrays). A zero in ``y_true``
    is divided by as-is; no guard is applied.
    """
    relative_errors = np.abs((y_true - y_pred) / y_true)
    return np.mean(relative_errors) * 100

In [26]:
# Feature matrices: coordinates for the KNN models; HDD, GDP and urbanization
# rate for the linear-regression models.
X_knn = merged_df.loc[:, ["latitude", "longitude"]]
X_lr = merged_df.loc[:, ["nearest_hdd", "GDP_2022", "Urbanization_Rate_2022"]]
# Targets: the two EUI columns.
y_residential = merged_df.loc[:, "Residential EUI (kWh/m2/year)"]
y_non_residential = merged_df.loc[:, "Non-residential EUI (kWh/m2/year)"]

def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and predict the test features.

    ``model`` only needs ``fit(X, y)`` and ``predict(X)`` methods.

    Returns
    -------
    tuple
        ``(predictions for X_test, y_test)``; ``y_test`` is handed back unchanged.
    """
    model.fit(X_train, y_train)
    return model.predict(X_test), y_test

def region_validation(X, y, model_class, model_name):
    """Leave-one-region-out validation over ``merged_df['Region Grouped']``.

    For every region, a fresh ``model_class()`` is trained on the rows of all
    *other* regions and evaluated on the held-out region, so each labelled row
    is predicted exactly once by a model that never saw its region. Rows whose
    region label is NaN are never held out (they only take part in training).

    Parameters
    ----------
    X : pandas.DataFrame
        Features, row-aligned with ``merged_df``.
    y : pandas.Series
        Target, row-aligned with ``merged_df``.
    model_class : callable
        Zero-argument factory returning an estimator with ``fit``/``predict``.
    model_name : str
        Label stored under the ``'Model'`` key of the result.

    Returns
    -------
    dict
        MSE, R², MAE, RMSE, MAPE and WAPE computed over the pooled held-out
        predictions, plus ``'Model'``.
    """
    region_labels = merged_df['Region Grouped']
    y_test_all = np.array([])
    y_pred_all = np.array([])

    # NaN never compares equal to anything, so a NaN "region" would select no
    # rows at all; skip missing labels instead of failing on an empty split.
    for region in region_labels.dropna().unique():
        held_out = region_labels == region
        # Train on every other region, test on the held-out one.
        X_train = X[~held_out]
        y_train = y[~held_out]
        X_test = X[held_out]
        y_test = y[held_out]

        model = model_class()
        y_pred, _ = train_and_evaluate_model(model, X_train, y_train, X_test, y_test)

        y_test_all = np.concatenate((y_test_all, y_test))
        y_pred_all = np.concatenate((y_pred_all, y_pred))

    metrics = {
        'MSE': mean_squared_error(y_test_all, y_pred_all),
        'R²': r2_score(y_test_all, y_pred_all),
        'MAE': mean_absolute_error(y_test_all, y_pred_all),
        'RMSE': np.sqrt(mean_squared_error(y_test_all, y_pred_all)),
        'MAPE': mean_absolute_percentage_error(y_test_all, y_pred_all),
        'WAPE': np.sum(np.abs(y_test_all - y_pred_all)) / np.sum(np.abs(y_test_all)) * 100
    }
    metrics['Model'] = model_name
    return metrics

def k_fold_validation(X, y, model_class, model_name, n_splits=5):
    """Shuffled K-fold validation with pooled out-of-fold metrics.

    The rows are split into ``n_splits`` shuffled folds (fixed seed 42). For
    each fold a fresh ``model_class()`` is fitted on the remaining folds and
    used to predict the fold; all out-of-fold predictions are pooled before
    the metrics are computed.

    Returns
    -------
    dict
        MSE, R², MAE, RMSE, MAPE and WAPE over the pooled predictions, plus
        ``'Model'`` set to ``model_name`` prefixed with ``'K-Fold '``.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Seeded with an empty float array so pooling works the same way as
    # growing an initially empty array fold by fold.
    truth_parts = [np.array([])]
    pred_parts = [np.array([])]

    for train_rows, test_rows in splitter.split(X):
        fold_pred, fold_truth = train_and_evaluate_model(
            model_class(),
            X.iloc[train_rows],
            y.iloc[train_rows],
            X.iloc[test_rows],
            y.iloc[test_rows],
        )
        truth_parts.append(fold_truth)
        pred_parts.append(fold_pred)

    y_test_all = np.concatenate(truth_parts)
    y_pred_all = np.concatenate(pred_parts)

    mse = mean_squared_error(y_test_all, y_pred_all)
    absolute_errors = np.abs(y_test_all - y_pred_all)
    metrics = {
        'MSE': mse,
        'R²': r2_score(y_test_all, y_pred_all),
        'MAE': mean_absolute_error(y_test_all, y_pred_all),
        'RMSE': np.sqrt(mse),
        'MAPE': mean_absolute_percentage_error(y_test_all, y_pred_all),
        'WAPE': np.sum(absolute_errors) / np.sum(np.abs(y_test_all)) * 100,
        'Model': f'K-Fold {model_name}',
    }
    return metrics

metrics_residential = []
metrics_non_residential = []

# Each run: (destination list, features, target, estimator class, label).
regional_runs = [
    (metrics_residential, X_knn, y_residential, KNeighborsRegressor, 'KNN (Res EUI, Lat-Long) - Regional'),
    (metrics_residential, X_lr, y_residential, LinearRegression, 'LR (Res EUI, HDD|GDP|URB) - Regional'),
    (metrics_non_residential, X_knn, y_non_residential, KNeighborsRegressor, 'KNN (Non-Res EUI, Lat-Long) - Regional'),
    (metrics_non_residential, X_lr, y_non_residential, LinearRegression, 'LR (Non-Res EUI, HDD|GDP|URB) - Regional'),
]
for destination, features, target, estimator, label in regional_runs:
    destination.append(region_validation(features, target, estimator, label))

# K-fold uses as many folds as there are distinct grouped regions.
n_regions = merged_df['Region Grouped'].nunique()
k_fold_runs = [
    (metrics_residential, X_knn, y_residential, KNeighborsRegressor, 'KNN (Res EUI, Lat-Long) - K-Fold'),
    (metrics_residential, X_lr, y_residential, LinearRegression, 'LR (Res EUI, HDD|GDP|URB) - K-Fold'),
    (metrics_non_residential, X_knn, y_non_residential, KNeighborsRegressor, 'KNN (Non-Res EUI, Lat-Long) - K-Fold'),
    (metrics_non_residential, X_lr, y_non_residential, LinearRegression, 'LR (Non-Res EUI, HDD|GDP|URB) - K-Fold'),
]
for destination, features, target, estimator, label in k_fold_runs:
    destination.append(k_fold_validation(features, target, estimator, label, n_splits=n_regions))

# One row per evaluated model, rounded to two decimals.
metrics_residential_df = pd.DataFrame(metrics_residential).round(2)
metrics_non_residential_df = pd.DataFrame(metrics_non_residential).round(2)

# Put the 'Model' column first and keep the other columns in their existing order.
metrics_residential_df = metrics_residential_df[
    ['Model', *metrics_residential_df.columns.drop('Model')]
]
metrics_non_residential_df = metrics_non_residential_df[
    ['Model', *metrics_non_residential_df.columns.drop('Model')]
]


In [None]:
# Validation metrics for the residential EUI models.
metrics_residential_df 

In [None]:
# Validation metrics for the non-residential EUI models.
metrics_non_residential_df