In [1]:
# IMPORTS
#ML
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import  r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

# from geonamescache import GeonamesCache
import pycountry
import rasterio
from scipy.spatial import cKDTree
from tqdm import tqdm
from datetime import datetime
import geopandas as gpd
import cdsapi
import pygrib

import zipfile
import os
import gdown

## Countries (kyle 1)

In [2]:
countries_zip_url = "https://drive.google.com/uc?id=1UQzdO7suT0BnwKBeNybMG97vM9GIDogA"
countries_zip_file_path = "../../allCountries.zip"

# Download the ZIP file if it doesn't exist; otherwise, proceed to read the TXT file.
if not os.path.exists(countries_zip_file_path):
    gdown.download(countries_zip_url, countries_zip_file_path, quiet=False)

with zipfile.ZipFile(countries_zip_file_path) as z:
    countries_txt_filename = "allCountries.txt"

    with z.open(countries_txt_filename) as txt_file:
        countries_df = pd.read_csv(txt_file, sep="\t", header=None)

print(f"\nshape: {countries_df.shape}")
countries_df.head()

  countries_df = pd.read_csv(txt_file, sep="\t", header=None)



shape: (12950185, 19)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,2994701,Roc Meler,Roc Meler,"Roc Mele,Roc Meler,Roc Mélé",42.58765,1.7418,T,PK,AD,"AD,FR",02,,,,0,2811.0,2348,Europe/Andorra,2023-10-03
1,3017832,Pic de les Abelletes,Pic de les Abelletes,"Pic de la Font-Negre,Pic de la Font-Nègre,Pic ...",42.52535,1.73343,T,PK,AD,FR,A9,66.0,663.0,66146.0,0,,2411,Europe/Andorra,2014-11-05
2,3017833,Estany de les Abelletes,Estany de les Abelletes,"Estany de les Abelletes,Etang de Font-Negre,Ét...",42.52915,1.73362,H,LK,AD,FR,A9,,,,0,,2260,Europe/Andorra,2014-11-05
3,3023203,Port Vieux de la Coume d’Ose,Port Vieux de la Coume d'Ose,"Port Vieux de Coume d'Ose,Port Vieux de Coume ...",42.62568,1.61823,T,PASS,AD,,00,,,,0,,2687,Europe/Andorra,2014-11-05
4,3029315,Port de la Cabanette,Port de la Cabanette,"Port de la Cabanette,Porteille de la Cabanette",42.6,1.73333,T,PASS,AD,"AD,FR",B3,9.0,91.0,9139.0,0,,2379,Europe/Andorra,2014-11-05


## EUI (kyle 2)

In [3]:
eui_url = "https://drive.google.com/uc?id=12qGq_DLefI1RihIF_RKQUyJtm480-xRC"
eui_df = pd.read_csv(eui_url)

print(f"shape: {eui_df.shape}")
eui_df.head()

shape: (482, 5)


Unnamed: 0,City,Geonames ID,Country,Residential EUI (kWh/m2/year),Non-residential EUI (kWh/m2/year)
0,Nha Trang,1572151,Vietnam,59.096065,112.778867
1,Aberdeen,2657832,United Kingdom,231.302877,259.832393
2,Abidjan,2293538,Cote d'Ivoire,73.830819,105.622137
3,Abu Dhabi,292968,United Arab Emirates,128.447899,226.725457
4,Abuja,2352778,Nigeria,63.955819,103.009079


## Data preperation for EUI

In [4]:
merged_df = pd.merge(
    countries_df, eui_df, left_on=0, right_on="Geonames ID", how="inner"
)
merged_df.rename(columns={4: "latitude", 5: "longitude"}, inplace=True)
merged_df

Unnamed: 0,0,1,2,3,latitude,longitude,6,7,8,9,...,14,15,16,17,18,City,Geonames ID,Country,Residential EUI (kWh/m2/year),Non-residential EUI (kWh/m2/year)
0,292968,Abu Dhabi,Abu Dhabi,"A-pu-that-pi,AEbu Saby,AUH,Aboe Dhabi,Abou Dab...",24.45118,54.39696,P,PPLC,AE,,...,1807000,,6,Asia/Dubai,2024-03-27,Abu Dhabi,292968,United Arab Emirates,128.447899,226.725457
1,1138958,Kabul,Kabul,"Cabool,Caboul,Cabul,Cabura,Cabúl,Caubul,KBL,Ka...",34.52813,69.17233,P,PPLC,AF,,...,4434550,,1798,Asia/Kabul,2024-09-05,Kabul,1138958,Afghanistan,213.167026,144.395840
2,3183875,Tirana,Tirana,"TIA,Terana,Theranda,Tiorana,Tiorána,Tiran,Tira...",41.32750,19.81889,P,PPLC,AL,,...,418495,,113,Europe/Tirane,2023-01-01,Tirana,3183875,Albania,133.717672,101.873579
3,616052,Yerevan,Yerevan,"Ayrivan,Djerevan,EVN,Eireavan,Eireaván,Ereban,...",40.18111,44.51361,P,PPLC,AM,,...,1093485,,994,Asia/Yerevan,2023-11-13,Yerevan,616052,Armenia,198.865302,132.124738
4,2240449,Luanda,Luanda,"LAD,Loanda,Louanda,Louanta,Luand,Luanda,Luanda...",-8.83682,13.23432,P,PPLC,AO,,...,2776168,,73,Africa/Luanda,2024-03-26,Luanda,2240449,Angola,65.343750,104.311100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477,1018725,Bloemfontein,Bloemfontein,"BFN,Bloemfontein,Blumfantehjn,Blumfonteina,Blu...",-29.12107,26.21400,P,PPLA,ZA,,...,556000,,1396,Africa/Johannesburg,2022-08-16,Bloemfontein,1018725,South Africa,78.735991,106.046441
478,3369157,Cape Town,Cape Town,"Altepetl In Cabo,Ar Chab,CPT,Cape Toun,Cape To...",-33.92584,18.42322,P,PPLA,ZA,,...,4710000,,25,Africa/Johannesburg,2024-03-27,Cape Town,3369157,South Africa,71.455819,104.301427
479,909137,Lusaka,Lusaka,"LUN,Lousaka,Louzaka,Lusaca,Lusak,Lusaka,Lusako...",-15.40669,28.28713,P,PPLC,ZM,,...,1267440,,1277,Africa/Lusaka,2019-09-05,Lusaka,909137,Zambia,75.533405,105.500787
480,890299,Harare,Harare,"Arare,Charare,HRE,Harare,Hararensis Urbs,Harar...",-17.82772,31.05337,P,PPLC,ZW,,...,1542813,,1494,Africa/Harare,2019-09-05,Harare,890299,Zimbabwe,69.613147,102.108799


In [5]:
# Extract the ISO 3 codes from the 'Country' column
iso3_codes_merged_df = merged_df["Country"]

# Get the unique ISO 3 codes in the 'Country' column
unique_iso3_codes = iso3_codes_merged_df.unique()
num_missing = iso3_codes_merged_df.isna().sum()

In [6]:
# Function to convert country names to ISO 3 codes
def get_iso3_code(country_name):
    try:
        country = pycountry.countries.lookup(country_name)
        return country.alpha_3
    except LookupError:
        return None


# Apply the function to the 'Country' column to create a new 'ISO_alpha3' column
merged_df["ISO_alpha3"] = merged_df["Country"].apply(get_iso3_code)

In [7]:
# Display rows where ISO 3 code is missing
missing_rows = merged_df[merged_df["ISO_alpha3"].isna()]
print("Rows with missing ISO 3 codes:")
missing_rows[["Country", "ISO_alpha3"]]

Rows with missing ISO 3 codes:


Unnamed: 0,Country,ISO_alpha3
61,"Congo, Democratic Republic of the",
62,"Congo, Democratic Republic of the",
63,"Congo, Democratic Republic of the",
64,"Congo, Democratic Republic of the",
66,"Congo, Republic of the",
69,Cote d'Ivoire,
70,Cote d'Ivoire,
71,Cote d'Ivoire,
95,Cape Verde,
142,"Gambia, The",


In [8]:
# Manually assign ISO 3 codes to missing countries
merged_df.loc[merged_df["Country"] == "South Korea", "ISO_alpha3"] = "KOR"
merged_df.loc[
    merged_df["Country"] == "Congo, Democratic Republic of the", "ISO_alpha3"
] = "COD"
merged_df.loc[merged_df["Country"] == "Congo, Republic of the", "ISO_alpha3"] = "COG"
merged_df.loc[merged_df["Country"] == "Cote d'Ivoire", "ISO_alpha3"] = "CIV"
merged_df.loc[merged_df["Country"] == "Cote d'Ivoire", "ISO_alpha3"] = "CIV"
merged_df.loc[merged_df["Country"] == "Cape Verde", "ISO_alpha3"] = "CPV"
merged_df.loc[merged_df["Country"] == "Gambia, The", "ISO_alpha3"] = "GMB"
merged_df.loc[merged_df["Country"] == "St. Lucia", "ISO_alpha3"] = "LCA"
merged_df.loc[merged_df["Country"] == "Burma", "ISO_alpha3"] = "MMR"
merged_df.loc[merged_df["Country"] == "Russia", "ISO_alpha3"] = "RUS"
merged_df.loc[merged_df["Country"] == "Swaziland", "ISO_alpha3"] = "SWZ"
merged_df.loc[merged_df["Country"] == "East Timor", "ISO_alpha3"] = "TLS"
merged_df.loc[merged_df["Country"] == "Turkey", "ISO_alpha3"] = "TUR"

assert merged_df["ISO_alpha3"].isna().sum() == 0, "There are missing values in the ISO_alpha3 column."
assert merged_df.shape[0] == 482, "The number of rows in merged_df is not 482."

# Population

In [9]:
population_path = '../data/raw/population/API_SP.POP.TOTL_DS2_en_csv_v2_31753.csv'
population_df = pd.read_csv(population_path, skiprows=4)
population_2023 = population_df[['Country Name', 'Country Code', '2023']]
population_2023.rename(columns={'2023': 'Population_2023'}, inplace=True)

#taiwan
taiwan_raw = {'Country Name': 'Taiwan', 'Country Code': 'TWN', 'Population_2023': 23894394}
population_2023.loc[len(population_2023)] = taiwan_raw

merged_df = merged_df.merge(population_2023, 
                             left_on='ISO_alpha3', 
                             right_on='Country Code', 
                             how='left')

assert merged_df['Population_2023'].notnull().all(), "Error: There are null values in 'Population_2023'."
merged_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  population_2023.rename(columns={'2023': 'Population_2023'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  population_2023.loc[len(population_2023)] = taiwan_raw


Unnamed: 0,0,1,2,3,latitude,longitude,6,7,8,9,...,18,City,Geonames ID,Country,Residential EUI (kWh/m2/year),Non-residential EUI (kWh/m2/year),ISO_alpha3,Country Name,Country Code,Population_2023
0,292968,Abu Dhabi,Abu Dhabi,"A-pu-that-pi,AEbu Saby,AUH,Aboe Dhabi,Abou Dab...",24.45118,54.39696,P,PPLC,AE,,...,2024-03-27,Abu Dhabi,292968,United Arab Emirates,128.447899,226.725457,ARE,United Arab Emirates,ARE,9516871.0
1,1138958,Kabul,Kabul,"Cabool,Caboul,Cabul,Cabura,Cabúl,Caubul,KBL,Ka...",34.52813,69.17233,P,PPLC,AF,,...,2024-09-05,Kabul,1138958,Afghanistan,213.167026,144.39584,AFG,Afghanistan,AFG,42239854.0
2,3183875,Tirana,Tirana,"TIA,Terana,Theranda,Tiorana,Tiorána,Tiran,Tira...",41.3275,19.81889,P,PPLC,AL,,...,2023-01-01,Tirana,3183875,Albania,133.717672,101.873579,ALB,Albania,ALB,2745972.0
3,616052,Yerevan,Yerevan,"Ayrivan,Djerevan,EVN,Eireavan,Eireaván,Ereban,...",40.18111,44.51361,P,PPLC,AM,,...,2023-11-13,Yerevan,616052,Armenia,198.865302,132.124738,ARM,Armenia,ARM,2777970.0
4,2240449,Luanda,Luanda,"LAD,Loanda,Louanda,Louanta,Luand,Luanda,Luanda...",-8.83682,13.23432,P,PPLC,AO,,...,2024-03-26,Luanda,2240449,Angola,65.34375,104.3111,AGO,Angola,AGO,36684202.0


# HDI - Educational Index - Income Index.csv

In [10]:
HDI_EI_II_path = '../data/raw/HDI_educationalIndex_incomeIndex.csv'
HDI_EI_II_df = pd.read_csv(HDI_EI_II_path)
HDI_EI_II_df = HDI_EI_II_df[["ISO_Code", "Subnational HDI","Educational index", "Income index"]]
merged_df = merged_df.merge(HDI_EI_II_df, 
                             left_on='ISO_alpha3', 
                             right_on='ISO_Code', 
                             how='left')

assert merged_df.loc[merged_df['Country Name'] != 'Taiwan', 
                     ['Subnational HDI', 'Educational index', 'Income index']].notnull().all().all()

# Paris Agreement

In [11]:
paris_agreement_iso_codes = [
    "AFG", "ALB", "DZA", "AND", "AGO", "ATG", "ARG", "AUS", "AUT", "AZE", 
    "BHS", "BHR", "BGD", "BRB", "BLR", "BEL", "BLZ", "BEN", "BTN", "BOL", 
    "BIH", "BWA", "BRA", "BRN", "BGR", "BFA", "BDI", "CPV", "KHM", "CMR", 
    "CAN", "CAF", "TCD", "CHN", "COL", "COM", "COG", "CRI", "CIV", "HRV", 
    "CUB", "CYP", "CZE", "PRK", "COD", "DNK", "DJI", "DMA", "DOM", "EGY", 
    "SLV", "GNQ", "ERI", "EST", "ETH", "EUN", "FJI", "FIN", "FRA", "GAB", 
    "GEO", "DEU", "GHA", "GRC", "GRD", "GTM", "GIN", "GNB", "GUY", "HTI", 
    "HND", "HUN", "ISL", "IND", "IDN", "IRN", "IRL", "ISR", "ITA", "JAM", 
    "JPN", "JOR", "KEN", "KIR", "KWT", "LAO", "LVA", "LBN", "LSO", "LBR", 
    "LBY", "LIE", "LTU", "LUX", "MDG", "MYS", "MDV", "MLI", "MLT", "MHL", 
    "MUS", "MRT", "MEX", "FSM", "MCO", "MNG", "MNE", "MAR", "MOZ", "MMR", 
    "NAM", "NRU", "NPL", "NLD", "NZL", "NER", "NOR", "OMN", "PAK", "PLW", 
    "PAN", "PNG", "PRY", "PER", "PHL", "POL", "PRT", "QAT", "KOR", "ROU", 
    "RUS", "RWA", "KNA", "LCA", "VCT", "WSM", "SMR", "STP", "SEN", "SRB", 
    "SGP", "SVK", "SVN", "SLB", "SOM", "ZAF", "SSD", "ESP", "LKA", "PSE", 
    "SDN", "SUR", "SWZ", "SWE", "CHE", "TJK", "THA", "MKD", "TLS", "TON", 
    "TTO", "TUN", "TUR", "TUV", "UGA", "UKR", "ARE", "GBR", "TZA", "USA", 
    "URY", "VUT", "VEN", "VNM", "ZWE"
]

merged_df['Paris_Agreement'] = merged_df['ISO_alpha3'].apply(lambda x: 1 if x in paris_agreement_iso_codes else 0)
merged_df.head()


Unnamed: 0,0,1,2,3,latitude,longitude,6,7,8,9,...,Non-residential EUI (kWh/m2/year),ISO_alpha3,Country Name,Country Code,Population_2023,ISO_Code,Subnational HDI,Educational index,Income index,Paris_Agreement
0,292968,Abu Dhabi,Abu Dhabi,"A-pu-that-pi,AEbu Saby,AUH,Aboe Dhabi,Abou Dab...",24.45118,54.39696,P,PPLC,AE,,...,226.725457,ARE,United Arab Emirates,ARE,9516871.0,ARE,0.937,0.904,0.998,1
1,1138958,Kabul,Kabul,"Cabool,Caboul,Cabul,Cabura,Cabúl,Caubul,KBL,Ka...",34.52813,69.17233,P,PPLC,AF,,...,144.39584,AFG,Afghanistan,AFG,42239854.0,AFG,0.462,0.381,0.391,1
2,3183875,Tirana,Tirana,"TIA,Terana,Theranda,Tiorana,Tiorána,Tiran,Tira...",41.3275,19.81889,P,PPLC,AL,,...,101.873579,ALB,Albania,ALB,2745972.0,ALB,0.789,0.74,0.76,1
3,616052,Yerevan,Yerevan,"Ayrivan,Djerevan,EVN,Eireavan,Eireaván,Ereban,...",40.18111,44.51361,P,PPLC,AM,,...,132.124738,ARM,Armenia,ARM,2777970.0,ARM,0.786,0.778,0.761,0
4,2240449,Luanda,Luanda,"LAD,Loanda,Louanda,Louanta,Luand,Luanda,Luanda...",-8.83682,13.23432,P,PPLC,AO,,...,104.3111,AGO,Angola,AGO,36684202.0,AGO,0.591,0.533,0.601,1


## Region

In [12]:
world_boundaries_url = "https://drive.google.com/uc?id=1k-2ECd2gwJ9FBz1anMRZy7O85uExAFY_"
world_boundaries_path = "../../world-administrative-boundaries.geojson"

gdown.download(world_boundaries_url, world_boundaries_path, quiet=False)
world_boundaries_df = gpd.read_file(world_boundaries_path)
world_boundaries_df.head()

Downloading...
From: https://drive.google.com/uc?id=1k-2ECd2gwJ9FBz1anMRZy7O85uExAFY_
To: /Users/barbaraflores/Desktop/MIDS/IDS798_Capstone/world-administrative-boundaries.geojson
100%|██████████| 8.58M/8.58M [00:00<00:00, 27.7MB/s]


Unnamed: 0,geo_point_2d,iso3,status,color_code,name,continent,region,iso_3166_1_alpha_2_codes,french_short,geometry
0,"{'lon': 145.67921950822935, 'lat': 15.08852006...",MNP,US Territory,USA,Northern Mariana Islands,Oceania,Micronesia,MP,Northern Mariana Islands,"MULTIPOLYGON (((145.63331 14.91236, 145.62412 ..."
1,"{'lon': 147.1622109044358, 'lat': 44.691217168...",,Sovereignty unsettled,RUS,Kuril Islands,Asia,Eastern Asia,,Kuril Islands,"MULTIPOLYGON (((146.68274 43.70777, 146.66664 ..."
2,"{'lon': 2.551955216777798, 'lat': 46.564502053...",FRA,Member State,FRA,France,Europe,Western Europe,FR,France,"MULTIPOLYGON (((9.44750 42.68305, 9.45014 42.6..."
3,"{'lon': 20.805271723235375, 'lat': 44.03149841...",SRB,Member State,SRB,Serbia,Europe,Southern Europe,RS,Serbie,"POLYGON ((20.26102 46.11485, 20.31403 46.06986..."
4,"{'lon': -56.01239637788298, 'lat': -32.7996453...",URY,Member State,URY,Uruguay,Americas,South America,UY,Uruguay,"POLYGON ((-53.37430 -33.74067, -53.39917 -33.7..."


In [13]:
merged_df = merged_df.merge(world_boundaries_df, left_on='ISO_alpha3', right_on='iso3', how='left')


merged_df['Region Grouped'] = np.where(
    merged_df['region'] == 'Northern America', 
    'Northern America',  
    np.where(
        merged_df['continent'] == 'Americas', 
        'Central and South America',  
        np.where(
            merged_df['continent'].isin(['Asia', 'Oceania']), 
            'Asia & Oceania',  
            merged_df['continent']  
        )
    )
)

assert merged_df['Region Grouped'].notnull().all(), "The 'Region Grouped' column contains null values."
merged_df.groupby(['continent', 'region' , 'Region Grouped']).size().reset_index(name='count').sort_values(by='Region Grouped')


Unnamed: 0,continent,region,Region Grouped,count
0,Africa,Eastern Africa,Africa,34
1,Africa,Middle Africa,Africa,18
2,Africa,Northern Africa,Africa,20
3,Africa,Southern Africa,Africa,17
4,Africa,Western Africa,Africa,38
18,Oceania,Australia and New Zealand,Asia & Oceania,11
13,Asia,Western Asia,Asia & Oceania,15
12,Asia,Southern Asia,Asia & Oceania,90
11,Asia,South-Eastern Asia,Asia & Oceania,31
19,Oceania,Melanesia,Asia & Oceania,4


# Temperature

### Note: The following tempreture data is downloaded via API. I chooes the data as 2m temperature, 2023, April, 22, 3:00, NetCDF.

### Here is the API: 
```
import cdsapi

c = cdsapi.Client()

c.retrieve(
    'reanalysis-era5-single-levels',
    {
        'product_type': 'reanalysis',
        'format': 'netcdf',
        'variable': '2m_temperature',
        'year': '2023',
        'month': '04',
        'day': '22',
        'time': '03:00',
    },
    'download.nc')
```

In [14]:
""" 
c = cdsapi.Client()

c.retrieve(
    "reanalysis-era5-single-levels",
    {
        "product_type": "reanalysis",
        "format": "netcdf",
        "variable": "2m_temperature",
        "year": "2023",
        "month": "04",
        "day": "22",
        "time": "03:00",
    },
    "download.nc",
)
"""

' \nc = cdsapi.Client()\n\nc.retrieve(\n    "reanalysis-era5-single-levels",\n    {\n        "product_type": "reanalysis",\n        "format": "netcdf",\n        "variable": "2m_temperature",\n        "year": "2023",\n        "month": "04",\n        "day": "22",\n        "time": "03:00",\n    },\n    "download.nc",\n)\n'

In [15]:
merged_df["nearest_hdd"] = np.nan
latitude = merged_df["latitude"]

In [16]:
temp_thresh = 18
file_path = "../data/raw/download.nc"


def get_hdd(temperature, temp_thresh):
    keep = temperature < temp_thresh
    hdd = np.zeros(temperature.shape)
    hdd[keep] = temp_thresh - temperature[keep]
    return hdd / 24  # Convert from heating degree hours to degree days


with rasterio.open(file_path) as src:
    celsius_factor = -273.15
    temp_conversion = lambda x: x + celsius_factor

    # blocky, blockx = src.block_shapes[0]
    n_lon = src.width
    n_lat = src.height
    left = src.bounds.left
    right = src.bounds.right
    bottom = src.bounds.bottom
    top = src.bounds.top
    new_count = 1
    # new_crs = rasterio.crs.CRS.from_string("EPSG:4326")
    new_crs = {"init": "epsg:4326"}
    new_transform = rasterio.transform.from_bounds(
        left, bottom, right, top, n_lon, n_lat
    )

    out_profile = {
        "driver": "GTiff",
        "dtype": "float64",
        "nodata": None,
        "width": n_lon,
        "height": n_lat,
        "count": new_count,
        "crs": new_crs,
        "transform": new_transform,
        "tiled": False,
        "interleave": "band",
        # 'blockxsize': blockx,
        # 'blockysize': blocky,
        "compress": "lzw",
        "driver": "GTiff",
        "dtype": "float64",
        "interleave": "band",
        "nodata": 99999999,
        # 'tiled': True}
    }

    yearly_hdd = np.zeros((n_lat, n_lon))

    for i in tqdm(src.indexes):
        # Read the data from this timepoint
        raw_data = src.read(i)
        # Get the quarter from which the data are from
        time = int(src.tags(i)["NETCDF_DIM_valid_time"])
        # Convert the temperature data to Celcius
        date_time = datetime.utcfromtimestamp(time)
        year = date_time.year

        temperature = temp_conversion(raw_data)
        # Compute the HDDs
        chdd = get_hdd(temperature, temp_thresh)

        yearly_hdd += chdd

        # # Accumulate HDDs only into the corresponding quarter----modify to
        # hdd_quarter[quarter] += chdd
lon_arr = np.linspace(left, right, n_lon)
lat_arr = np.linspace(bottom, top, n_lat)
lon_grid, lat_grid = np.meshgrid(lon_arr, lat_arr)
lon_flat = lon_grid.ravel()
lat_flat = lat_grid.ravel()
hdd_flat = yearly_hdd.ravel()

100%|██████████| 1/1 [00:00<00:00, 44.42it/s]


In [17]:
# get nearest point with KDtree
tree = cKDTree(np.column_stack((lat_flat, lon_flat)))

for index, row in tqdm(merged_df.iterrows(), total=merged_df.shape[0]):
    latitude = row["latitude"]
    longitude = row["longitude"]
    # latitude = 45.0
    # longitude = -75.0
    # Query the tree for the nearest point
    dist, idx = tree.query([latitude, longitude])
    # Get the nearest HDD value
    merged_df.at[index, "nearest_hdd"] = hdd_flat[idx]

100%|██████████| 485/485 [00:00<00:00, 23027.10it/s]


## GDP

### GeoJSON Data

In [18]:
gdf = world_boundaries_df
# Extract unique ISO 3 codes from the 'iso3' column in gdf
unique_iso3_gdf = gdf["iso3"].unique()



In [19]:
# Extract unique ISO 3 codes from merged_df
unique_iso3_merged_df = merged_df["ISO_alpha3"].unique()

# Find ISO 3 codes that are only in gdf but not in merged_df
only_in_gdf = set(unique_iso3_gdf) - set(unique_iso3_merged_df)


# Find ISO 3 codes that are only in merged_df but not in gdf
only_in_merged_df = set(unique_iso3_merged_df) - set(unique_iso3_gdf)

In [20]:
# Display rows where ISO 3 code is missing in gdf
missing_gdf_rows = gdf[gdf["iso3"].isna()]
print("Rows with missing ISO 3 codes in gdf:")
print(missing_gdf_rows[["name", "iso3"]])

# Manually assign ISO 3 codes to missing rows (example assignments)
gdf.loc[gdf["name"] == "Kuril Islands", "iso3"] = "RUS"  # If it's part of Russia
gdf.loc[gdf["name"] == "Ma'tan al-Sarra", "iso3"] = "LBY"  # Libya
# Add more assignments as necessary...

# Verify that no ISO 3 codes are missing in gdf
missing_iso3_gdf_after = gdf["iso3"].isna().sum()
print(
    f"Number of missing ISO 3 codes in gdf after manual correction: {missing_iso3_gdf_after}"
)

Rows with missing ISO 3 codes in gdf:
                                           name  iso3
1                                 Kuril Islands  None
10                              Ma'tan al-Sarra  None
12                                       Jersey  None
56                              Madeira Islands  None
90                                   Guantanamo  None
107                            Glorioso Islands  None
108                             Paracel Islands  None
109                                    Guernsey  None
115                                       Abyei  None
120                              Ilemi Triangle  None
121                             Spratly Islands  None
123                           Arunachal Pradesh  None
138                                  Aksai Chin  None
169                                  Midway Is.  None
179                               Jarvis Island  None
207  South Georgia & the South Sandwich Islands  None
212                               Jammu-Kash

In [21]:
# Manually assign ISO 3 codes to remaining missing rows in gdf
gdf.loc[gdf["name"] == "Jersey", "iso3"] = "JEY"  # Jersey
gdf.loc[gdf["name"] == "Madeira Islands", "iso3"] = "PRT"  # Portugal
gdf.loc[gdf["name"] == "Guantanamo", "iso3"] = "CUB"  # Cuba
gdf.loc[gdf["name"] == "Glorioso Islands", "iso3"] = "FRA"  # France
gdf.loc[gdf["name"] == "Paracel Islands", "iso3"] = "CHN"  # China
gdf.loc[gdf["name"] == "Guernsey", "iso3"] = "GGY"  # Guernsey
gdf.loc[gdf["name"] == "Abyei", "iso3"] = "SSD"  # South Sudan
gdf.loc[gdf["name"] == "Ilemi Triangle", "iso3"] = "KEN"  # Kenya
gdf.loc[gdf["name"] == "Spratly Islands", "iso3"] = "CHN"  # China
gdf.loc[gdf["name"] == "Arunachal Pradesh", "iso3"] = "IND"  # India
gdf.loc[gdf["name"] == "Aksai Chin", "iso3"] = "CHN"  # China
gdf.loc[gdf["name"] == "Midway Is.", "iso3"] = "USA"  # United States
gdf.loc[gdf["name"] == "Jarvis Island", "iso3"] = "USA"  # United States
gdf.loc[gdf["name"] == "South Georgia & the South Sandwich Islands", "iso3"] = (
    "SGS"  # UK (territory)
)
gdf.loc[gdf["name"] == "Jammu-Kashmir", "iso3"] = "IND"  # India
gdf.loc[gdf["name"] == "Hala'ib Triangle", "iso3"] = "EGY"  # Egypt

# Verify that no ISO 3 codes are missing in gdf
missing_iso3_gdf_after = gdf["iso3"].isna().sum()
print(
    f"Number of missing ISO 3 codes in gdf after manual correction: {missing_iso3_gdf_after}"
)

Number of missing ISO 3 codes in gdf after manual correction: 0


In [22]:
# Extract unique ISO 3 codes from each dataset
iso3_gdf = set(gdf["iso3"])
iso3_merged_df = set(merged_df["ISO_alpha3"])

# Find discrepancies between the two datasets
iso3_only_in_gdf = iso3_gdf - iso3_merged_df
iso3_only_in_merged_df = iso3_merged_df - iso3_gdf

# Display differences
print(f"ISO 3 codes only in gdf: {iso3_only_in_gdf}")
print(f"ISO 3 codes only in merged_df: {iso3_only_in_merged_df}")

# Check if all sets match
if not iso3_only_in_gdf and not iso3_only_in_merged_df:
    print("The ISO 3 codes match perfectly across both datasets.")
else:
    print("There are discrepancies between the ISO 3 codes in the two datasets.")

ISO 3 codes only in gdf: {'ATG', 'GRD', 'SMR', 'SUR', 'QAT', 'PYF', 'BMU', 'CXR', 'VAT', 'ESH', 'TCA', 'IOT', 'ATF', 'ANT', 'PSE', 'COK', 'BHR', 'KIR', 'MLT', 'FSM', 'BVT', 'MCO', 'HMD', 'KWT', 'MYT', 'LIE', 'MSR', 'GGY', 'TKM', 'CYM', 'VGB', 'HKG', 'FLK', 'BRN', 'JEY', 'SGS', 'SYC', 'ABW', 'EST', 'MHL', 'IMY', 'PLW', 'CCK', 'NCL', 'MYS', 'AND', 'MKD', 'GUF', 'KNA', 'FRO', 'VIR', 'FJI', 'GUM', 'ISL', 'MTQ', 'PRK', 'IRQ', 'PAN', 'SJM', 'GIB', 'TUV', 'BRB', 'LUX', 'MNP', 'VCT', 'TKL', 'GRL', 'PRI', 'AIA', 'GLP', 'PCN', 'CUB', 'NFK', 'ASM', 'BHS', 'LVA', 'OMN', 'NIU', 'REU', 'MAC', 'NRU'}
ISO 3 codes only in merged_df: set()
There are discrepancies between the ISO 3 codes in the two datasets.


### CountryInfo Data (Kyle 3)

In [23]:
# kyle 3
country_info_path = (
    "https://drive.google.com/uc?id=1xfYlruvfAi6yieOd_S69pPYWphckRLr5&export=download"
)

# Load the countryInfo.txt file into a DataFrame
# Skip lines starting with '#' which are comments and handle any malformed lines
country_info_df = pd.read_csv(
    country_info_path, delimiter="\t", comment="#", on_bad_lines="skip"
)
country_info_df

Unnamed: 0,AD,AND,020,AN,Andorra,Andorra la Vella,468,77006,EU,.ad,EUR,Euro,376,AD.1
0,AE,ARE,784,AE,United Arab Emirates,Abu Dhabi,82880.0,9630959,AS,.ae,AED,Dirham,971,
1,AI,AIA,660,AV,Anguilla,The Valley,102.0,13254,,.ai,XCD,Dollar,+1-264,AI-
2,AL,ALB,8,AL,Albania,Tirana,28748.0,2866376,EU,.al,ALL,Lek,355,
3,AM,ARM,51,AM,Armenia,Yerevan,29800.0,2951776,AS,.am,AMD,Dram,374,
4,AR,ARG,32,AR,Argentina,Buenos Aires,2766890.0,44494502,SA,.ar,ARS,Peso,54,@
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,WF,WLF,876,WF,Wallis and Futuna,Mata Utu,274.0,16025,OC,.wf,XPF,Franc,681,
161,YT,MYT,175,MF,Mayotte,Mamoudzou,374.0,279471,AF,.yt,EUR,Euro,262,
162,ZA,ZAF,710,SF,South Africa,Pretoria,1219912.0,57779622,AF,.za,ZAR,Rand,27,
163,ZM,ZMB,894,ZA,Zambia,Lusaka,752614.0,17351822,AF,.zm,ZMW,Kwacha,260,


In [24]:

# Rename the columns to more descriptive names
# Rename the columns to more descriptive names using the actual column names
country_info_df.rename(
    columns={
        "AND": "ISO_alpha3",
        "Andorra": "Country_Name",
        "Andorra la Vella": "Capital",
        "468": "Area_km2",
        "77006": "Population",
        "EU": "Continent",
    },
    inplace=True,
)

country_info_df.head()

Unnamed: 0,AD,ISO_alpha3,020,AN,Country_Name,Capital,Area_km2,Population,Continent,.ad,EUR,Euro,376,AD.1
0,AE,ARE,784,AE,United Arab Emirates,Abu Dhabi,82880.0,9630959,AS,.ae,AED,Dirham,971,
1,AI,AIA,660,AV,Anguilla,The Valley,102.0,13254,,.ai,XCD,Dollar,+1-264,AI-
2,AL,ALB,8,AL,Albania,Tirana,28748.0,2866376,EU,.al,ALL,Lek,355,
3,AM,ARM,51,AM,Armenia,Yerevan,29800.0,2951776,AS,.am,AMD,Dram,374,
4,AR,ARG,32,AR,Argentina,Buenos Aires,2766890.0,44494502,SA,.ar,ARS,Peso,54,@


In [25]:
# Step 1: Extract and display the unique ISO 3 codes from country_info_df
unique_iso3_country_info = country_info_df["ISO_alpha3"].unique()


In [26]:
# Step 2: Compare ISO 3 codes in country_info_df with merged_df
iso3_merged_df = set(merged_df["ISO_alpha3"])
iso3_country_info = set(country_info_df["ISO_alpha3"])

# Find discrepancies between the two datasets
missing_in_country_info = iso3_merged_df - iso3_country_info
missing_in_merged_df = iso3_country_info - iso3_merged_df

# Display the discrepancies
print(f"ISO 3 codes in merged_df but not in country_info_df: {missing_in_country_info}")
print(f"ISO 3 codes in country_info_df but not in merged_df: {missing_in_merged_df}")

ISO 3 codes in merged_df but not in country_info_df: {'TZA', 'ERI', 'TON', 'CAF', 'SLE', 'COM', 'NAM', 'DMA', 'AGO', 'LCA', 'COD', 'DJI', 'YEM', 'MRT', 'IRL', 'BWA', 'WSM', 'ZWE', 'CMR', 'STP', 'BLZ', 'BFA', 'COG', 'BOL', 'MLI', 'MUS', 'GNQ', 'TTO', 'JAM', 'UGA', 'BTN', 'TCD', 'TLS', 'BDI', 'TGO', 'SYR', 'VUT', 'GIN', 'RWA', 'GUY', 'BEN', 'GHA', 'GAB', 'SLB', 'SSD', 'AFG', 'LBY', 'GMB', 'CIV'}
ISO 3 codes in country_info_df but not in merged_df: {'ALA', 'CCK', 'MCO', 'NCL', 'BLM', 'MYS', 'HMD', 'GRL', 'PRI', 'KWT', 'MKD', 'MYT', 'SMR', 'LIE', 'GUF', 'FRO', 'VIR', 'GGY', 'AIA', 'TKM', 'GLP', 'GUM', 'ISL', 'MTQ', 'PYF', 'CUB', 'BMU', 'NFK', 'IRQ', 'CXR', 'PRK', 'PAN', 'VAT', 'SPM', 'SJM', 'ASM', 'HKG', 'BRN', 'JEY', 'BRB', 'LUX', 'BHR', 'LVA', 'OMN', 'WLF', 'MNP', 'IMN', 'MAF', 'MLT', 'FSM', 'EST', 'MHL', 'NIU', 'REU', 'SCG', 'MAC'}


In [27]:
# List of ISO 3 codes in merged_df but not in country_info_df
missing_in_country_info = {
    "WSM",
    "BLZ",
    "BTN",
    "GNQ",
    "IRL",
    "VUT",
    "BWA",
    "TLS",
    "DJI",
    "BDI",
    "SYR",
    "RWA",
    "SLB",
    "MLI",
    "SSD",
    "BFA",
    "BOL",
    "MRT",
    "JAM",
    "YEM",
    "GUY",
    "ERI",
    "TGO",
    "ZWE",
    "NAM",
    "COM",
    "GMB",
    "LBY",
    "MUS",
    "GAB",
    "CIV",
    "UGA",
    "AFG",
    "BEN",
    "GIN",
    "DMA",
    "LCA",
    "TCD",
    "COD",
    "TON",
    "COG",
    "AGO",
    "CMR",
    "GHA",
    "SLE",
    "CAF",
    "TZA",
    "TTO",
    "STP",
}

# Manually create new rows for missing ISO 3 codes and append to country_info_df
for iso3_code in missing_in_country_info:
    new_row = {

        1: iso3_code,  # Set the ISO 3 code

    }
    country_info_df = pd.concat(
        [country_info_df, pd.DataFrame([new_row])], ignore_index=True
    )

# Verify that there are no missing ISO 3 codes in country_info_df compared to merged_df
iso3_country_info = set(country_info_df[1])
missing_in_country_info_after = iso3_merged_df - iso3_country_info
print(
    f"ISO 3 codes in merged_df but not in country_info_df after correction: {missing_in_country_info_after}"
)

ISO 3 codes in merged_df but not in country_info_df after correction: {'HUN', 'TWN', 'GEO', 'AUS', 'KEN', 'VNM', 'DOM', 'ARM', 'RUS', 'NIC', 'KGZ', 'POL', 'BEL', 'LBN', 'LSO', 'SDN', 'PRY', 'TJK', 'UKR', 'JPN', 'PNG', 'EGY', 'NER', 'MEX', 'SOM', 'SWE', 'NOR', 'NZL', 'MOZ', 'SRB', 'MNE', 'BIH', 'SVK', 'MDG', 'VEN', 'NPL', 'GTM', 'MMR', 'CRI', 'SLV', 'ARE', 'IND', 'TUR', 'DEU', 'HRV', 'ECU', 'ITA', 'LTU', 'CZE', 'SWZ', 'ALB', 'DNK', 'PER', 'KAZ', 'SGP', 'JOR', 'THA', 'BRA', 'BGD', 'MDV', 'USA', 'ETH', 'UZB', 'NGA', 'BGR', 'HTI', 'TUN', 'LAO', 'MNG', 'LKA', 'IRN', 'CPV', 'GBR', 'COL', 'HND', 'CHN', 'FRA', 'ZMB', 'IDN', 'NLD', 'CHL', 'ROU', 'MDA', 'ESP', 'PRT', 'PHL', 'GRC', 'SVN', 'PAK', 'FIN', 'BLR', 'LBR', 'MWI', 'SEN', 'ISR', 'ZAF', 'KHM', 'KOR', 'GNB', 'CHE', 'ARG', 'DZA', 'MAR', 'URY', 'CYP', 'SAU', 'CAN', 'AZE', 'AUT'}


In [28]:
# Step 1: Extract unique ISO 3 codes from each dataset
iso3_merged_df = set(merged_df["ISO_alpha3"])
iso3_country_info = set(
    country_info_df[1]
)  # Assuming column 1 contains the ISO 3 codes
iso3_gdf = set(gdf["iso3"])

# Step 2: Compare the ISO 3 codes in each dataset

# Find ISO 3 codes that are in merged_df but not in the other datasets
only_in_merged_df = iso3_merged_df - iso3_country_info - iso3_gdf

# Find ISO 3 codes that are in country_info_df but not in the other datasets
only_in_country_info = iso3_country_info - iso3_merged_df - iso3_gdf

# Find ISO 3 codes that are in gdf but not in the other datasets
only_in_gdf = iso3_gdf - iso3_merged_df - iso3_country_info

# Step 3: Display the differences
print(f"ISO 3 codes only in merged_df: {only_in_merged_df}")
print(f"ISO 3 codes only in country_info_df: {only_in_country_info}")
print(f"ISO 3 codes only in gdf: {only_in_gdf}")

# Step 4: Check if all datasets have the same ISO 3 codes
if not only_in_merged_df and not only_in_country_info and not only_in_gdf:
    print("The ISO 3 codes match perfectly across all three datasets.")
else:
    print("There are discrepancies between the ISO 3 codes in the datasets.")

ISO 3 codes only in merged_df: set()
ISO 3 codes only in country_info_df: {nan}
ISO 3 codes only in gdf: {'ATG', 'GRD', 'SMR', 'SUR', 'QAT', 'PYF', 'BMU', 'CXR', 'VAT', 'ESH', 'TCA', 'IOT', 'ATF', 'ANT', 'PSE', 'COK', 'BHR', 'KIR', 'MLT', 'FSM', 'BVT', 'MCO', 'HMD', 'KWT', 'MYT', 'LIE', 'MSR', 'GGY', 'TKM', 'CYM', 'VGB', 'HKG', 'FLK', 'BRN', 'JEY', 'SGS', 'SYC', 'ABW', 'EST', 'MHL', 'IMY', 'PLW', 'CCK', 'NCL', 'MYS', 'AND', 'MKD', 'GUF', 'KNA', 'FRO', 'VIR', 'FJI', 'GUM', 'ISL', 'MTQ', 'PRK', 'IRQ', 'PAN', 'SJM', 'GIB', 'TUV', 'BRB', 'LUX', 'MNP', 'VCT', 'TKL', 'GRL', 'PRI', 'AIA', 'GLP', 'PCN', 'CUB', 'NFK', 'ASM', 'BHS', 'LVA', 'OMN', 'NIU', 'REU', 'MAC', 'NRU'}
There are discrepancies between the ISO 3 codes in the datasets.


In [29]:
# Step 1: Filter gdf and country_info_df to keep only rows with ISO 3 codes that are in merged_df
iso3_merged_df = set(merged_df["ISO_alpha3"])

# Filter gdf to keep only relevant rows and ensure unique ISO 3 codes
gdf_filtered = gdf[gdf["iso3"].isin(iso3_merged_df)].drop_duplicates(subset="iso3")

# Filter country_info_df to keep only relevant rows and ensure unique ISO 3 codes
country_info_filtered = country_info_df[
    country_info_df["ISO_alpha3"].isin(iso3_merged_df)
].drop_duplicates(subset="ISO_alpha3")

# Step 1: Ensure all datasets use a unified ISO 3 column name

# Rename the ISO 3 code column in gdf_filtered to 'ISO_alpha3' to match merged_df
gdf_filtered.rename(columns={"iso3": "ISO_alpha3"}, inplace=True)

# Step 2: Merge merged_df with gdf_filtered
merged_with_gdf = pd.merge(
    merged_df,
    gdf_filtered,
    on="ISO_alpha3",  # Use the unified column name 'ISO_alpha3' for merging
    how="left",  # Left merge to keep all rows from merged_df
    suffixes=("", "_gdf"),  # Suffix for columns from gdf_filtered
)

# Step 3: Merge the result with country_info_filtered
final_merged = pd.merge(
    merged_with_gdf,
    country_info_filtered,
    on="ISO_alpha3",  # Use the unified column name 'ISO_alpha3' for merging
    how="left",  # Left merge to keep all rows from merged_with_gdf
    suffixes=("", "_country_info"),  # Suffix for columns from country_info_filtered
)

# Step 4: Verify the final merged DataFrame
print(f"Number of rows in the final merged DataFrame: {final_merged.shape[0]}")

# Check that all original columns from merged_df are retained
original_columns = list(merged_df.columns)
final_columns = list(final_merged.columns)

# Find which original columns are missing
missing_columns = [col for col in original_columns if col not in final_columns]
if len(missing_columns) == 0:
    print("All original columns from merged_df are retained.")
else:
    print(f"Missing columns from the final merged DataFrame: {missing_columns}")

# Print the first few rows of the final DataFrame to verify
final_merged.head()

Number of rows in the final merged DataFrame: 485
Missing columns from the final merged DataFrame: [1]


Unnamed: 0,0,1,2,3,latitude,longitude,6,7,8,9,...,Capital,Area_km2,Population,Continent,.ad,EUR,Euro,376,AD.1,1_country_info
0,292968,Abu Dhabi,Abu Dhabi,"A-pu-that-pi,AEbu Saby,AUH,Aboe Dhabi,Abou Dab...",24.45118,54.39696,P,PPLC,AE,,...,Abu Dhabi,82880.0,9630959.0,AS,.ae,AED,Dirham,971.0,,
1,1138958,Kabul,Kabul,"Cabool,Caboul,Cabul,Cabura,Cabúl,Caubul,KBL,Ka...",34.52813,69.17233,P,PPLC,AF,,...,,,,,,,,,,
2,3183875,Tirana,Tirana,"TIA,Terana,Theranda,Tiorana,Tiorána,Tiran,Tira...",41.3275,19.81889,P,PPLC,AL,,...,Tirana,28748.0,2866376.0,EU,.al,ALL,Lek,355.0,,
3,616052,Yerevan,Yerevan,"Ayrivan,Djerevan,EVN,Eireavan,Eireaván,Ereban,...",40.18111,44.51361,P,PPLC,AM,,...,Yerevan,29800.0,2951776.0,AS,.am,AMD,Dram,374.0,,
4,2240449,Luanda,Luanda,"LAD,Loanda,Louanda,Louanta,Luand,Luanda,Luanda...",-8.83682,13.23432,P,PPLC,AO,,...,,,,,,,,,,


### Final Merged without Features

In [30]:
# Compare the 'ISO_alpha3' columns from final_merged and original merged_df
iso3_match = final_merged["ISO_alpha3"].equals(merged_df["ISO_alpha3"])

# Display the result
print(
    f"Are the 'ISO_alpha3' columns in final_merged and merged_df the same? {iso3_match}"
)

Are the 'ISO_alpha3' columns in final_merged and merged_df the same? True


### GDP (missing Taiwan, now 481 rows)

In [31]:
gdp_data_path = (
    "../data/raw/gdp_data.csv"
)
gdp_data = pd.read_csv(gdp_data_path)

gdp_data

Unnamed: 0,Country,Continent,ISO_Code,Level,GDLCODE,Region,1990,1991,1992,1993,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Afghanistan,Asia/Pacific,AFG,National,AFGt,Total,0.284,0.292,0.299,0.307,...,0.475,0.480,0.479,0.483,0.485,0.486,0.492,0.488,0.473,0.462
1,Afghanistan,Asia/Pacific,AFG,Subnat,AFGr101,Central (Kabul Wardak Kapisa Logar Parwan Panj...,0.343,0.352,0.361,0.371,...,0.554,0.554,0.548,0.553,0.555,0.558,0.565,0.561,0.545,0.531
2,Afghanistan,Asia/Pacific,AFG,Subnat,AFGr102,Central Highlands (Bamyan Daikundi),0.296,0.305,0.313,0.322,...,0.488,0.487,0.480,0.482,0.483,0.484,0.489,0.484,0.468,0.459
3,Afghanistan,Asia/Pacific,AFG,Subnat,AFGr103,East (Nangarhar Kunar Laghman Nooristan),0.298,0.306,0.313,0.320,...,0.473,0.469,0.459,0.463,0.465,0.467,0.472,0.468,0.453,0.442
4,Afghanistan,Asia/Pacific,AFG,Subnat,AFGr104,North (Samangan Sar-e-Pul Balkh Jawzjan Faryab),0.272,0.280,0.287,0.295,...,0.479,0.492,0.499,0.503,0.504,0.506,0.512,0.507,0.492,0.481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1979,Zimbabwe,Africa,ZWE,Subnat,ZWEr104,Mashonaland West,0.430,0.432,0.417,0.412,...,0.511,0.520,0.528,0.527,0.528,0.536,0.528,0.522,0.517,0.519
1980,Zimbabwe,Africa,ZWE,Subnat,ZWEr108,Masvingo,0.448,0.451,0.435,0.430,...,0.516,0.529,0.541,0.544,0.547,0.559,0.554,0.548,0.543,0.545
1981,Zimbabwe,Africa,ZWE,Subnat,ZWEr105,Matebeleland North,0.428,0.430,0.416,0.411,...,0.502,0.507,0.510,0.509,0.509,0.516,0.508,0.502,0.498,0.499
1982,Zimbabwe,Africa,ZWE,Subnat,ZWEr106,Matebeleland South,0.440,0.442,0.427,0.422,...,0.515,0.518,0.520,0.527,0.535,0.551,0.551,0.545,0.541,0.542


In [32]:
# Step 1: Filter GDP data to keep only the national-level data for the year 2022
# We will use the 'ISO_Code' column to merge and filter only the "National" level entries
filtered_gdp_data = gdp_data[(gdp_data["Level"] == "National")][["ISO_Code", "2022"]]

# Add a manual entry for Taiwan (TWN), since it is missing in the filtered GDP data
filtered_gdp_data = pd.concat(
    [filtered_gdp_data, pd.DataFrame([{"ISO_Code": "TWN", "2022": None}])],
    ignore_index=True,
)

# Step 2: Merge the filtered GDP data with the final_merged DataFrame
final_merged_with_gdp = pd.merge(
    final_merged.drop(
        columns=["ISO_Code"], errors="ignore"
    ),  # Drop 'ISO_Code' if it already exists to avoid duplication
    filtered_gdp_data,
    left_on="ISO_alpha3",  # Use 'ISO_alpha3' from final_merged as the baseline for merging
    right_on="ISO_Code",  # Matching 'ISO_Code' in the filtered GDP dataset
    how="left",  # Left join to keep all rows from final_merged
    suffixes=("", "_gdp"),  # Suffix to differentiate columns from GDP data
)

# Drop the duplicate 'ISO_Code' column after the merge
final_merged_with_gdp.drop(columns=["ISO_Code"], inplace=True)

# Rename the '2022' column from the GDP dataset to make it clear
final_merged_with_gdp.rename(columns={"2022": "GDP_2022"}, inplace=True)

# Step 3: Verify the final merged DataFrame
print(
    f"Number of rows in the final merged DataFrame with GDP data: {final_merged_with_gdp.shape[0]}"
)

# Check that all original columns from final_merged are retained
original_columns_final = list(final_merged.columns)
final_columns_with_gdp = list(final_merged_with_gdp.columns)

# Find which original columns are missing
missing_columns_gdp_merge = [
    col for col in original_columns_final if col not in final_columns_with_gdp
]
if len(missing_columns_gdp_merge) == 0:
    print(
        "All original columns from final_merged are retained after merging with GDP data."
    )
else:
    print(
        f"Missing columns from the final merged DataFrame after merging with GDP data: {missing_columns_gdp_merge}"
    )

Number of rows in the final merged DataFrame with GDP data: 485
Missing columns from the final merged DataFrame after merging with GDP data: ['ISO_Code']


In [33]:
# Step 4: Check if 'GDP_2022' has 482 non-null values
missing_gdp_entries = final_merged_with_gdp["GDP_2022"].isna().sum()
print(f"Number of missing entries in 'GDP_2022': {missing_gdp_entries}")

Number of missing entries in 'GDP_2022': 1


In [34]:
# Find rows in final_merged_with_gdp where GDP_2022 is missing
missing_gdp_row = final_merged_with_gdp[final_merged_with_gdp["GDP_2022"].isna()]

# Display the rows with missing GDP
print("Row with missing GDP data:")
print(missing_gdp_row[["ISO_alpha3", "Country"]])

Row with missing GDP data:
    ISO_alpha3 Country
429        TWN  Taiwan


### Urbanization Rate (missing Taiwan, now 481 rows)

In [35]:
# Load the Urbanization Rate dataset, skipping metadata rows if necessary
urbanization_rate_url = (
    "https://drive.google.com/uc?id=1YteyPHAWnJUKG0LWogS98EYnwjRTeZDf&export=download"
)
urbanization_rate = pd.read_csv(urbanization_rate_url, skiprows=4)
urbanization_rate

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,Unnamed: 68
0,Aruba,ABW,Urban population (% of total population),SP.URB.TOTL.IN.ZS,50.776000,50.761000,50.746000,50.730000,50.715000,50.700000,...,43.108000,43.192000,43.293000,43.411000,43.546000,43.697000,43.866000,44.052000,44.254000,
1,Africa Eastern and Southern,AFE,Urban population (% of total population),SP.URB.TOTL.IN.ZS,14.563810,14.811410,15.069249,15.347976,15.640195,15.941282,...,34.425584,34.894753,35.358901,35.847598,36.336259,36.828302,37.323699,37.825158,38.335337,
2,Afghanistan,AFG,Urban population (% of total population),SP.URB.TOTL.IN.ZS,8.401000,8.684000,8.976000,9.276000,9.586000,9.904000,...,24.803000,25.020000,25.250000,25.495000,25.754000,26.026000,26.314000,26.616000,26.933000,
3,Africa Western and Central,AFW,Urban population (% of total population),SP.URB.TOTL.IN.ZS,14.705391,15.090123,15.484076,15.897592,16.329208,16.778650,...,44.805863,45.425066,46.039478,46.649426,47.255413,47.857831,48.454806,49.047385,49.635410,
4,Angola,AGO,Urban population (% of total population),SP.URB.TOTL.IN.ZS,10.435000,10.798000,11.204000,11.624000,12.058000,12.504000,...,63.446000,64.149000,64.839000,65.514000,66.177000,66.825000,67.460000,68.081000,68.688000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Kosovo,XKX,Urban population (% of total population),SP.URB.TOTL.IN.ZS,,,,,,,...,,,,,,,,,,
262,"Yemen, Rep.",YEM,Urban population (% of total population),SP.URB.TOTL.IN.ZS,9.100000,9.459000,9.831000,10.216000,10.614000,11.026000,...,34.777000,35.394000,36.016000,36.642000,37.273000,37.908000,38.546000,39.188000,39.831000,
263,South Africa,ZAF,Urban population (% of total population),SP.URB.TOTL.IN.ZS,46.619000,46.793000,46.906000,47.020000,47.134000,47.248000,...,64.828000,65.341000,65.850000,66.355000,66.856000,67.354000,67.847000,68.335000,68.819000,
264,Zambia,ZMB,Urban population (% of total population),SP.URB.TOTL.IN.ZS,18.145000,18.951000,19.785000,20.712000,22.015000,23.372000,...,41.907000,42.438000,42.976000,43.521000,44.072000,44.629000,45.192000,45.761000,46.335000,


In [36]:
# Step 1: Filter the Urbanization Rate data to keep only relevant columns for 2022
filtered_urbanization_rate = urbanization_rate[["Country Code", "2022"]].rename(
    columns={"2022": "Urbanization_Rate_2022"}
)

# Step 2: Merge the filtered Urbanization Rate data with the final_merged_with_gdp DataFrame
final_merged_with_urbanization = pd.merge(
    final_merged_with_gdp.drop(
        columns=["Country Code"], errors="ignore"
    ),  # Drop 'Country Code' if it already exists
    filtered_urbanization_rate,
    left_on="ISO_alpha3",  # Use 'ISO_alpha3' from final_merged_with_gdp as the baseline for merging
    right_on="Country Code",  # Matching 'Country Code' in the filtered Urbanization dataset
    how="left",  # Left join to keep all rows from final_merged_with_gdp
    suffixes=(
        "",
        "_urbanization",
    ),  # Suffix to differentiate columns from Urbanization Rate data
)

# Drop the duplicate 'Country Code' column after the merge
final_merged_with_urbanization.drop(columns=["Country Code"], inplace=True)

# Step 3: Verify the final merged DataFrame with Urbanization Rate data
print(
    f"Number of rows in the final merged DataFrame with Urbanization Rate data: {final_merged_with_urbanization.shape[0]}"
)

# Check that all original columns from final_merged_with_gdp are retained
original_columns_with_gdp = list(final_merged_with_gdp.columns)
final_columns_with_urbanization = list(final_merged_with_urbanization.columns)

# Find which original columns are missing
missing_columns_urbanization_merge = [
    col
    for col in original_columns_with_gdp
    if col not in final_columns_with_urbanization
]
if len(missing_columns_urbanization_merge) == 0:
    print(
        "All original columns from final_merged_with_gdp are retained after merging with Urbanization Rate data."
    )
else:
    print(
        f"Missing columns from the final merged DataFrame after merging with Urbanization Rate data: {missing_columns_urbanization_merge}"
    )

# Step 4: Check if 'Urbanization_Rate_2022' has 482 non-null values
missing_urbanization_entries = (
    final_merged_with_urbanization["Urbanization_Rate_2022"].isna().sum()
)
print(
    f"Number of missing entries in 'Urbanization_Rate_2022': {missing_urbanization_entries}"
)

Number of rows in the final merged DataFrame with Urbanization Rate data: 485
Missing columns from the final merged DataFrame after merging with Urbanization Rate data: ['Country Code']
Number of missing entries in 'Urbanization_Rate_2022': 1


In [37]:
# Step 5: Display the row with missing Urbanization Rate data
missing_urbanization_row = final_merged_with_urbanization[
    final_merged_with_urbanization["Urbanization_Rate_2022"].isna()
]
print("Row with missing Urbanization Rate data:")
print(missing_urbanization_row[["ISO_alpha3", "Country"]])

Row with missing Urbanization Rate data:
    ISO_alpha3 Country
429        TWN  Taiwan


In [38]:
merged_df = final_merged_with_urbanization


## Models

In [39]:
# this is temporal, look for a way to fill this
merged_df = merged_df[merged_df['Country'] != 'Taiwan']

In [40]:
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return y_pred, y_test

def traditional_train_test_split(X, y, model_class, model_name, test_size=0.2):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    model = model_class()
    y_pred, _ = train_and_evaluate_model(model, X_train, y_train, X_test, y_test)
    
    metrics = {
        "MSE": mean_squared_error(y_test, y_pred),
        "R²": r2_score(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "MAPE": mean_absolute_percentage_error(y_test, y_pred),
        "WAPE": np.sum(np.abs(y_test - y_pred)) / np.sum(np.abs(y_test)) * 100,
    }
    return metrics

merged_df.loc[:, 'Region'] = merged_df['Region Grouped']
regions = merged_df["Region"].unique()
metrics_by_region = []

for region in regions:
    region_data = merged_df[merged_df["Region"] == region]
    X_lr = region_data[["nearest_hdd", "GDP_2022", "Urbanization_Rate_2022", "latitude", "longitude", "Population_2023", "Paris_Agreement"]]
    y_residential = region_data["Residential EUI (kWh/m2/year)"]
    y_non_residential = region_data["Non-residential EUI (kWh/m2/year)"]
    
    metrics_lr_residential = traditional_train_test_split(
        X_lr,
        y_residential,
        LinearRegression,
        "LR (Res EUI, HDD | GDP | URB | Lat-Long | Pop | Paris)",
    )
    metrics_lr_residential['Region'] = region
    metrics_lr_residential['Model'] = 'LR'
    metrics_lr_residential['Y'] = 'Res EUI'
    metrics_lr_residential['X'] = 'HDD | GDP | URB | Lat-Long | Pop | Paris'
    metrics_by_region.append(metrics_lr_residential)
    
    metrics_lr_non_residential = traditional_train_test_split(
        X_lr,
        y_non_residential,
        LinearRegression,
        "LR (Non-Res EUI, HDD | GDP | URB | Lat-Long | Pop | Paris)",
    )
    metrics_lr_non_residential['Region'] = region
    metrics_lr_non_residential['Model'] = 'LR'
    metrics_lr_non_residential['Y'] = 'Non-Res EUI'
    metrics_lr_non_residential['X'] = 'HDD | GDP | URB | Lat-Long | Pop | Paris'
    metrics_by_region.append(metrics_lr_non_residential)

X_lr_total = merged_df[["nearest_hdd", "GDP_2022", "Urbanization_Rate_2022", "latitude", "longitude", "Population_2023", "Paris_Agreement"]]
y_residential_total = merged_df["Residential EUI (kWh/m2/year)"]
y_non_residential_total = merged_df["Non-residential EUI (kWh/m2/year)"]

metrics_lr_residential_total = traditional_train_test_split(
    X_lr_total,
    y_residential_total,
    LinearRegression,
    "LR (Res EUI, HDD | GDP | URB | Lat-Long | Pop | Paris)",
)
metrics_lr_residential_total['Region'] = 'Total'
metrics_lr_residential_total['Model'] = 'LR'
metrics_lr_residential_total['Y'] = 'Res EUI'
metrics_lr_residential_total['X'] = 'HDD | GDP | URB | Lat-Long | Pop | Paris'
metrics_by_region.append(metrics_lr_residential_total)

metrics_lr_non_residential_total = traditional_train_test_split(
    X_lr_total,
    y_non_residential_total,
    LinearRegression,
    "LR (Non-Res EUI, HDD | GDP | URB | Lat-Long | Pop | Paris)",
)
metrics_lr_non_residential_total['Region'] = 'Total'
metrics_lr_non_residential_total['Model'] = 'LR'
metrics_lr_non_residential_total['Y'] = 'Non-Res EUI'
metrics_lr_non_residential_total['X'] = 'HDD | GDP | URB | Lat-Long | Pop | Paris'
metrics_by_region.append(metrics_lr_non_residential_total)

metrics_by_region_df = pd.DataFrame(metrics_by_region).round(2)
metrics_by_region_df = metrics_by_region_df[["Model", "Y", "X", "Region"] + [col for col in metrics_by_region_df.columns if col not in ["Model", "Y", "X", "Region"]]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df.loc[:, 'Region'] = merged_df['Region Grouped']


In [42]:
metrics_by_region_df.sort_values(by='Y')

Unnamed: 0,Model,Y,X,Region,MSE,R²,MAE,RMSE,MAPE,WAPE
1,LR,Non-Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Asia & Oceania,874.26,0.62,21.36,29.57,14.78,14.72
3,LR,Non-Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Europe,4536.99,-0.35,42.32,67.36,25.64,20.1
5,LR,Non-Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Africa,162.32,0.05,4.73,12.74,8.38,4.61
7,LR,Non-Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Central and South America,41.9,0.36,3.85,6.47,4.03,3.7
9,LR,Non-Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Northern America,1285.15,-1.46,26.82,35.85,9.63,10.29
11,LR,Non-Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Total,1652.65,0.58,27.73,40.65,21.26,20.43
0,LR,Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Asia & Oceania,2393.34,0.45,22.08,48.92,17.44,21.21
2,LR,Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Europe,3201.31,0.06,44.0,56.58,29.17,22.46
4,LR,Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Africa,43.05,-1.12,4.94,6.56,7.51,7.36
6,LR,Res EUI,HDD | GDP | URB | Lat-Long | Pop | Paris,Central and South America,151.31,0.6,9.69,12.3,10.58,11.47
