## Imports

In [1]:
# IMPORTS
#ML
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pycountry
import rasterio
from scipy.spatial import cKDTree
from tqdm import tqdm
from datetime import datetime
import geopandas as gpd

import zipfile
import os
import gdown

# Contry data

In [2]:
country_info_path = "https://drive.google.com/uc?id=1xfYlruvfAi6yieOd_S69pPYWphckRLr5&export=download"

column_names = [
    'Country_Code',        # Alpha-2 code
    'ISO_Alpha_3',        # Alpha-3 code
    'Numeric_Code',        # Numeric code
    'Alpha_2',            # Alpha-2 code (duplicate)
    'Country_Name',       # Name of the country
    'Capital',            # Capital city
    'Area',               # Area in square kilometers
    'Population',         # Population
    'Region',             # Region
    'TLD',                # Top-level domain
    'Currency_Code',      # Currency code
    'Currency_Name',      # Currency name
    'Currency_Numeric',   # Numeric currency code
    'Additional_Info'     # Additional information
]

country_info_df = pd.read_csv(
    country_info_path, delimiter="\t", comment="#", on_bad_lines="skip", header=None
)

country_info_df.columns = column_names
country_info_df.head()

Unnamed: 0,Country_Code,ISO_Alpha_3,Numeric_Code,Alpha_2,Country_Name,Capital,Area,Population,Region,TLD,Currency_Code,Currency_Name,Currency_Numeric,Additional_Info
0,AD,AND,20,AN,Andorra,Andorra la Vella,468.0,77006,EU,.ad,EUR,Euro,376,AD
1,AE,ARE,784,AE,United Arab Emirates,Abu Dhabi,82880.0,9630959,AS,.ae,AED,Dirham,971,
2,AI,AIA,660,AV,Anguilla,The Valley,102.0,13254,,.ai,XCD,Dollar,+1-264,AI-
3,AL,ALB,8,AL,Albania,Tirana,28748.0,2866376,EU,.al,ALL,Lek,355,
4,AM,ARM,51,AM,Armenia,Yerevan,29800.0,2951776,AS,.am,AMD,Dram,374,


## Countries 

In [3]:
countries_zip_url = "https://drive.google.com/uc?id=1UQzdO7suT0BnwKBeNybMG97vM9GIDogA"
countries_zip_file_path = "../../allCountries.zip"

# Download the ZIP file if it doesn't exist; otherwise, proceed to read the TXT file.
if not os.path.exists(countries_zip_file_path):
    gdown.download(countries_zip_url, countries_zip_file_path, quiet=False)

with zipfile.ZipFile(countries_zip_file_path) as z:
    countries_txt_filename = "allCountries.txt"

    with z.open(countries_txt_filename) as txt_file:
        countries_df = pd.read_csv(txt_file, sep="\t", header=None)


# https://download.geonames.org/export/dump/
countries_df.columns = [
    'geonameid',         
    'name',             
    'asciiname',        
    'alternatenames',  
    'latitude',         
    'longitude',       
    'feature class',    
    'feature code',      
    'iso alpha 2',      
    'cc2',              
    'admin1 code',     
    'admin2 code',       
    'admin3 code',      
    'admin4 code',   
    'population',      
    'elevation',       
    'dem',             
    'timezone',          
    'modification date'  
]

print(f"\nshape: {countries_df.shape}")
countries_df.head()

  countries_df = pd.read_csv(txt_file, sep="\t", header=None)



shape: (12950185, 19)


Unnamed: 0,geonameid,name,asciiname,alternatenames,latitude,longitude,feature class,feature code,iso alpha 2,cc2,admin1 code,admin2 code,admin3 code,admin4 code,population,elevation,dem,timezone,modification date
0,2994701,Roc Meler,Roc Meler,"Roc Mele,Roc Meler,Roc Mélé",42.58765,1.7418,T,PK,AD,"AD,FR",02,,,,0,2811.0,2348,Europe/Andorra,2023-10-03
1,3017832,Pic de les Abelletes,Pic de les Abelletes,"Pic de la Font-Negre,Pic de la Font-Nègre,Pic ...",42.52535,1.73343,T,PK,AD,FR,A9,66.0,663.0,66146.0,0,,2411,Europe/Andorra,2014-11-05
2,3017833,Estany de les Abelletes,Estany de les Abelletes,"Estany de les Abelletes,Etang de Font-Negre,Ét...",42.52915,1.73362,H,LK,AD,FR,A9,,,,0,,2260,Europe/Andorra,2014-11-05
3,3023203,Port Vieux de la Coume d’Ose,Port Vieux de la Coume d'Ose,"Port Vieux de Coume d'Ose,Port Vieux de Coume ...",42.62568,1.61823,T,PASS,AD,,00,,,,0,,2687,Europe/Andorra,2014-11-05
4,3029315,Port de la Cabanette,Port de la Cabanette,"Port de la Cabanette,Porteille de la Cabanette",42.6,1.73333,T,PASS,AD,"AD,FR",B3,9.0,91.0,9139.0,0,,2379,Europe/Andorra,2014-11-05


## EUI

In [4]:
eui_url = "https://drive.google.com/uc?id=12qGq_DLefI1RihIF_RKQUyJtm480-xRC"
eui_df = pd.read_csv(eui_url)

print(f"shape: {eui_df.shape}")
eui_df.head()

shape: (482, 5)


Unnamed: 0,City,Geonames ID,Country,Residential EUI (kWh/m2/year),Non-residential EUI (kWh/m2/year)
0,Nha Trang,1572151,Vietnam,59.096065,112.778867
1,Aberdeen,2657832,United Kingdom,231.302877,259.832393
2,Abidjan,2293538,Cote d'Ivoire,73.830819,105.622137
3,Abu Dhabi,292968,United Arab Emirates,128.447899,226.725457
4,Abuja,2352778,Nigeria,63.955819,103.009079


In [5]:
merged_df = pd.merge(
    countries_df, eui_df, left_on="geonameid", right_on="Geonames ID", how="inner"
)
assert merged_df.shape[0] == eui_df.shape[0]
print(f"shape: {merged_df.shape}")
merged_df.head()

shape: (482, 24)


Unnamed: 0,geonameid,name,asciiname,alternatenames,latitude,longitude,feature class,feature code,iso alpha 2,cc2,...,population,elevation,dem,timezone,modification date,City,Geonames ID,Country,Residential EUI (kWh/m2/year),Non-residential EUI (kWh/m2/year)
0,292968,Abu Dhabi,Abu Dhabi,"A-pu-that-pi,AEbu Saby,AUH,Aboe Dhabi,Abou Dab...",24.45118,54.39696,P,PPLC,AE,,...,1807000,,6,Asia/Dubai,2024-03-27,Abu Dhabi,292968,United Arab Emirates,128.447899,226.725457
1,1138958,Kabul,Kabul,"Cabool,Caboul,Cabul,Cabura,Cabúl,Caubul,KBL,Ka...",34.52813,69.17233,P,PPLC,AF,,...,4434550,,1798,Asia/Kabul,2024-09-05,Kabul,1138958,Afghanistan,213.167026,144.39584
2,3183875,Tirana,Tirana,"TIA,Terana,Theranda,Tiorana,Tiorána,Tiran,Tira...",41.3275,19.81889,P,PPLC,AL,,...,418495,,113,Europe/Tirane,2023-01-01,Tirana,3183875,Albania,133.717672,101.873579
3,616052,Yerevan,Yerevan,"Ayrivan,Djerevan,EVN,Eireavan,Eireaván,Ereban,...",40.18111,44.51361,P,PPLC,AM,,...,1093485,,994,Asia/Yerevan,2023-11-13,Yerevan,616052,Armenia,198.865302,132.124738
4,2240449,Luanda,Luanda,"LAD,Loanda,Louanda,Louanta,Luand,Luanda,Luanda...",-8.83682,13.23432,P,PPLC,AO,,...,2776168,,73,Africa/Luanda,2024-03-26,Luanda,2240449,Angola,65.34375,104.3111


## Adding ISO 3 Code

In [6]:
# Adding ISO CODE 3
alpha_2_to_alpha_3 = {country.alpha_2: country.alpha_3 for country in pycountry.countries}
merged_df.loc[:, 'ISO_alpha3'] = merged_df['iso alpha 2'].map(alpha_2_to_alpha_3)

# Manually correcting the missing country code for Namibia by assigning 'NAM' because country code is null for Nambia
merged_df.loc[merged_df['Country'] == 'Namibia', 'ISO_alpha3'] = 'NAM'

print(f"shape: {merged_df.shape}")

assert merged_df["ISO_alpha3"].isna().sum() == 0, "There are missing values in the ISO_alpha3 column."
assert merged_df.shape[0] == 482, "The number of rows in merged_df is not 482."

shape: (482, 25)


# Population

In [7]:
population_path = '../data/raw/population/API_SP.POP.TOTL_DS2_en_csv_v2_31753.csv'
population_df = pd.read_csv(population_path, skiprows=4)
population_2023 = population_df[['Country Code', '2023']]

population_2023.rename(columns={
    '2023': 'Population_2023',
    'Country Code': 'ISO_alpha3'
}, inplace=True)

#taiwan
taiwan_raw = {'Country Name': 'Taiwan', 'ISO_alpha3': 'TWN', 'Population_2023': 23894394}
population_2023.loc[len(population_2023)] = taiwan_raw

merged_df = merged_df.merge(population_2023, on='ISO_alpha3', how='left')

assert merged_df['Population_2023'].notnull().all(), "Error: There are null values in 'Population_2023'."
print(f"shape: {merged_df.shape}")
merged_df.head()

shape: (482, 26)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  population_2023.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  population_2023.loc[len(population_2023)] = taiwan_raw


Unnamed: 0,geonameid,name,asciiname,alternatenames,latitude,longitude,feature class,feature code,iso alpha 2,cc2,...,dem,timezone,modification date,City,Geonames ID,Country,Residential EUI (kWh/m2/year),Non-residential EUI (kWh/m2/year),ISO_alpha3,Population_2023
0,292968,Abu Dhabi,Abu Dhabi,"A-pu-that-pi,AEbu Saby,AUH,Aboe Dhabi,Abou Dab...",24.45118,54.39696,P,PPLC,AE,,...,6,Asia/Dubai,2024-03-27,Abu Dhabi,292968,United Arab Emirates,128.447899,226.725457,ARE,9516871.0
1,1138958,Kabul,Kabul,"Cabool,Caboul,Cabul,Cabura,Cabúl,Caubul,KBL,Ka...",34.52813,69.17233,P,PPLC,AF,,...,1798,Asia/Kabul,2024-09-05,Kabul,1138958,Afghanistan,213.167026,144.39584,AFG,42239854.0
2,3183875,Tirana,Tirana,"TIA,Terana,Theranda,Tiorana,Tiorána,Tiran,Tira...",41.3275,19.81889,P,PPLC,AL,,...,113,Europe/Tirane,2023-01-01,Tirana,3183875,Albania,133.717672,101.873579,ALB,2745972.0
3,616052,Yerevan,Yerevan,"Ayrivan,Djerevan,EVN,Eireavan,Eireaván,Ereban,...",40.18111,44.51361,P,PPLC,AM,,...,994,Asia/Yerevan,2023-11-13,Yerevan,616052,Armenia,198.865302,132.124738,ARM,2777970.0
4,2240449,Luanda,Luanda,"LAD,Loanda,Louanda,Louanta,Luand,Luanda,Luanda...",-8.83682,13.23432,P,PPLC,AO,,...,73,Africa/Luanda,2024-03-26,Luanda,2240449,Angola,65.34375,104.3111,AGO,36684202.0


# HDI - Educational Index - Income Index.csv

In [8]:
merged_df.columns

Index(['geonameid', 'name', 'asciiname', 'alternatenames', 'latitude',
       'longitude', 'feature class', 'feature code', 'iso alpha 2', 'cc2',
       'admin1 code', 'admin2 code', 'admin3 code', 'admin4 code',
       'population', 'elevation', 'dem', 'timezone', 'modification date',
       'City', 'Geonames ID', 'Country', 'Residential EUI (kWh/m2/year)',
       'Non-residential EUI (kWh/m2/year)', 'ISO_alpha3', 'Population_2023'],
      dtype='object')

In [9]:
HDI_EI_II_path = '../data/raw/HDI_educationalIndex_incomeIndex.csv'
HDI_EI_II_df = pd.read_csv(HDI_EI_II_path)
HDI_EI_II_df = HDI_EI_II_df[["ISO_Code", "Subnational HDI","Educational index", "Income index"]]
HDI_EI_II_df.rename(columns={'ISO_Code':'ISO_alpha3' }, inplace=True)


merged_df = merged_df.merge(HDI_EI_II_df, 
                             on='ISO_alpha3', 
                             how='left')

assert merged_df.loc[merged_df['Country'] != 'Taiwan', 
                     ['Subnational HDI', 'Educational index', 'Income index']].notnull().all().all()
print(f"shape: {merged_df.shape}")

shape: (482, 29)


# GDP

In [10]:
gdp_data_path = (
    "../data/raw/gdp_data.csv"
)
gdp_df = pd.read_csv(gdp_data_path)
gdp_df = gdp_df[gdp_df["Level"]=="National"]

gdp_df = gdp_df[['ISO_Code', '2022']]
gdp_df.rename(columns={'2022': 'GDP_2022', 'ISO_Code':'ISO_alpha3' }, inplace=True)


merged_df = merged_df.merge(gdp_df, 
                             on='ISO_alpha3', 
                             how='left')

assert merged_df.loc[merged_df['Country'] != 'Taiwan', 
                     ['GDP_2022']].notnull().all().all()
print(f"shape: {merged_df.shape}")

shape: (482, 30)


# Urbanization rate

In [11]:
print(merged_df.columns)


Index(['geonameid', 'name', 'asciiname', 'alternatenames', 'latitude',
       'longitude', 'feature class', 'feature code', 'iso alpha 2', 'cc2',
       'admin1 code', 'admin2 code', 'admin3 code', 'admin4 code',
       'population', 'elevation', 'dem', 'timezone', 'modification date',
       'City', 'Geonames ID', 'Country', 'Residential EUI (kWh/m2/year)',
       'Non-residential EUI (kWh/m2/year)', 'ISO_alpha3', 'Population_2023',
       'Subnational HDI', 'Educational index', 'Income index', 'GDP_2022'],
      dtype='object')


In [12]:
# Load the Urbanization Rate dataset, skipping metadata rows if necessary
urbanization_rate_path = (
    "https://drive.google.com/uc?id=1YteyPHAWnJUKG0LWogS98EYnwjRTeZDf&export=download"
)
urbanization_rate_df = pd.read_csv(urbanization_rate_path, skiprows=4)


urbanization_rate_df = urbanization_rate_df[["Country Code", "2022"]].rename(
    columns={"2022": "Urbanization_Rate_2022" , 'Country Code':'ISO_alpha3'}
)

merged_df = merged_df.merge(urbanization_rate_df, 
                             on='ISO_alpha3', 
                             how='left')

assert merged_df.loc[merged_df['Country'] != 'Taiwan', 
                     ['Urbanization_Rate_2022']].notnull().all().all()
print(f"shape: {merged_df.shape}")

shape: (482, 31)


# Paris Agreement

In [13]:
paris_agreement_iso_codes = [
    "AFG", "ALB", "DZA", "AND", "AGO", "ATG", "ARG", "AUS", "AUT", "AZE", 
    "BHS", "BHR", "BGD", "BRB", "BLR", "BEL", "BLZ", "BEN", "BTN", "BOL", 
    "BIH", "BWA", "BRA", "BRN", "BGR", "BFA", "BDI", "CPV", "KHM", "CMR", 
    "CAN", "CAF", "TCD", "CHN", "COL", "COM", "COG", "CRI", "CIV", "HRV", 
    "CUB", "CYP", "CZE", "PRK", "COD", "DNK", "DJI", "DMA", "DOM", "EGY", 
    "SLV", "GNQ", "ERI", "EST", "ETH", "EUN", "FJI", "FIN", "FRA", "GAB", 
    "GEO", "DEU", "GHA", "GRC", "GRD", "GTM", "GIN", "GNB", "GUY", "HTI", 
    "HND", "HUN", "ISL", "IND", "IDN", "IRN", "IRL", "ISR", "ITA", "JAM", 
    "JPN", "JOR", "KEN", "KIR", "KWT", "LAO", "LVA", "LBN", "LSO", "LBR", 
    "LBY", "LIE", "LTU", "LUX", "MDG", "MYS", "MDV", "MLI", "MLT", "MHL", 
    "MUS", "MRT", "MEX", "FSM", "MCO", "MNG", "MNE", "MAR", "MOZ", "MMR", 
    "NAM", "NRU", "NPL", "NLD", "NZL", "NER", "NOR", "OMN", "PAK", "PLW", 
    "PAN", "PNG", "PRY", "PER", "PHL", "POL", "PRT", "QAT", "KOR", "ROU", 
    "RUS", "RWA", "KNA", "LCA", "VCT", "WSM", "SMR", "STP", "SEN", "SRB", 
    "SGP", "SVK", "SVN", "SLB", "SOM", "ZAF", "SSD", "ESP", "LKA", "PSE", 
    "SDN", "SUR", "SWZ", "SWE", "CHE", "TJK", "THA", "MKD", "TLS", "TON", 
    "TTO", "TUN", "TUR", "TUV", "UGA", "UKR", "ARE", "GBR", "TZA", "USA", 
    "URY", "VUT", "VEN", "VNM", "ZWE"
]

merged_df['Paris_Agreement'] = merged_df['ISO_alpha3'].apply(lambda x: 1 if x in paris_agreement_iso_codes else 0)

## Region

In [14]:
world_boundaries_url = "https://drive.google.com/uc?id=1k-2ECd2gwJ9FBz1anMRZy7O85uExAFY_"
world_boundaries_path = "../../world-administrative-boundaries.geojson"

gdown.download(world_boundaries_url, world_boundaries_path, quiet=False)
world_boundaries_df = gpd.read_file(world_boundaries_path)
world_boundaries_df.head()

Downloading...
From: https://drive.google.com/uc?id=1k-2ECd2gwJ9FBz1anMRZy7O85uExAFY_
To: /Users/barbaraflores/Desktop/MIDS/IDS798_Capstone/world-administrative-boundaries.geojson
100%|██████████| 8.58M/8.58M [00:00<00:00, 20.0MB/s]


Unnamed: 0,geo_point_2d,iso3,status,color_code,name,continent,region,iso_3166_1_alpha_2_codes,french_short,geometry
0,"{'lon': 145.67921950822935, 'lat': 15.08852006...",MNP,US Territory,USA,Northern Mariana Islands,Oceania,Micronesia,MP,Northern Mariana Islands,"MULTIPOLYGON (((145.63331 14.91236, 145.62412 ..."
1,"{'lon': 147.1622109044358, 'lat': 44.691217168...",,Sovereignty unsettled,RUS,Kuril Islands,Asia,Eastern Asia,,Kuril Islands,"MULTIPOLYGON (((146.68274 43.70777, 146.66664 ..."
2,"{'lon': 2.551955216777798, 'lat': 46.564502053...",FRA,Member State,FRA,France,Europe,Western Europe,FR,France,"MULTIPOLYGON (((9.44750 42.68305, 9.45014 42.6..."
3,"{'lon': 20.805271723235375, 'lat': 44.03149841...",SRB,Member State,SRB,Serbia,Europe,Southern Europe,RS,Serbie,"POLYGON ((20.26102 46.11485, 20.31403 46.06986..."
4,"{'lon': -56.01239637788298, 'lat': -32.7996453...",URY,Member State,URY,Uruguay,Americas,South America,UY,Uruguay,"POLYGON ((-53.37430 -33.74067, -53.39917 -33.7..."


In [15]:
merged_df = merged_df.merge(world_boundaries_df, left_on='ISO_alpha3', right_on='iso3', how='left')


merged_df['Region Grouped'] = np.where(
    merged_df['region'] == 'Northern America', 
    'Northern America',  
    np.where(
        merged_df['continent'] == 'Americas', 
        'Central and South America',  
        np.where(
            merged_df['continent'].isin(['Asia', 'Oceania']), 
            'Asia & Oceania',  
            merged_df['continent']  
        )
    )
)

assert merged_df['Region Grouped'].notnull().all(), "The 'Region Grouped' column contains null values."
merged_df.groupby(['continent', 'region' , 'Region Grouped']).size().reset_index(name='count').sort_values(by='Region Grouped')


Unnamed: 0,continent,region,Region Grouped,count
0,Africa,Eastern Africa,Africa,34
1,Africa,Middle Africa,Africa,18
2,Africa,Northern Africa,Africa,20
3,Africa,Southern Africa,Africa,17
4,Africa,Western Africa,Africa,38
18,Oceania,Australia and New Zealand,Asia & Oceania,11
13,Asia,Western Asia,Asia & Oceania,15
12,Asia,Southern Asia,Asia & Oceania,90
11,Asia,South-Eastern Asia,Asia & Oceania,31
19,Oceania,Melanesia,Asia & Oceania,4


# Temperature

### Note: The following tempreture data is downloaded via API. I chooes the data as 2m temperature, 2023, April, 22, 3:00, NetCDF.

### Here is the API: 
```
import cdsapi

c = cdsapi.Client()

c.retrieve(
    'reanalysis-era5-single-levels',
    {
        'product_type': 'reanalysis',
        'format': 'netcdf',
        'variable': '2m_temperature',
        'year': '2023',
        'month': '04',
        'day': '22',
        'time': '03:00',
    },
    'download.nc')
```

In [16]:
""" 
c = cdsapi.Client()

c.retrieve(
    "reanalysis-era5-single-levels",
    {
        "product_type": "reanalysis",
        "format": "netcdf",
        "variable": "2m_temperature",
        "year": "2023",
        "month": "04",
        "day": "22",
        "time": "03:00",
    },
    "download.nc",
)
"""

' \nc = cdsapi.Client()\n\nc.retrieve(\n    "reanalysis-era5-single-levels",\n    {\n        "product_type": "reanalysis",\n        "format": "netcdf",\n        "variable": "2m_temperature",\n        "year": "2023",\n        "month": "04",\n        "day": "22",\n        "time": "03:00",\n    },\n    "download.nc",\n)\n'

In [17]:
merged_df["nearest_hdd"] = np.nan
latitude = merged_df["latitude"]

In [18]:
temp_thresh = 18
file_path = "../data/raw/download.nc"

def get_hdd(temperature, temp_thresh):
    keep = temperature < temp_thresh
    hdd = np.zeros(temperature.shape)
    hdd[keep] = temp_thresh - temperature[keep]
    return hdd / 24  # Convert from heating degree hours to degree days

with rasterio.open(file_path) as src:
    celsius_factor = -273.15
    temp_conversion = lambda x: x + celsius_factor

    # blocky, blockx = src.block_shapes[0]
    n_lon = src.width
    n_lat = src.height
    left = src.bounds.left
    right = src.bounds.right
    bottom = src.bounds.bottom
    top = src.bounds.top
    new_count = 1
    # new_crs = rasterio.crs.CRS.from_string("EPSG:4326")
    new_crs = {"init": "epsg:4326"}
    new_transform = rasterio.transform.from_bounds(
        left, bottom, right, top, n_lon, n_lat
    )
    
    out_profile = {
        "driver": "GTiff",
        "dtype": "float64",
        "nodata": None,
        "width": n_lon,
        "height": n_lat,
        "count": new_count,
        "crs": new_crs,
        "transform": new_transform,
        "tiled": False,
        "interleave": "band",
        # 'blockxsize': blockx,
        # 'blockysize': blocky,
        "compress": "lzw",
        "driver": "GTiff",
        "dtype": "float64",
        "interleave": "band",
        "nodata": 99999999,
        # 'tiled': True}
    }

    yearly_hdd = np.zeros((n_lat, n_lon))

    for i in tqdm(src.indexes):
        # Read the data from this timepoint
        raw_data = src.read(i)
        # Get the quarter from which the data are from
        time = int(src.tags(i)["NETCDF_DIM_valid_time"])
        # Convert the temperature data to Celcius
        date_time = datetime.utcfromtimestamp(time)
        year = date_time.year

        temperature = temp_conversion(raw_data)
        # Compute the HDDs
        chdd = get_hdd(temperature, temp_thresh)

        yearly_hdd += chdd

        # # Accumulate HDDs only into the corresponding quarter----modify to
        # hdd_quarter[quarter] += chdd
lon_arr = np.linspace(left, right, n_lon)
lat_arr = np.linspace(bottom, top, n_lat)
lon_grid, lat_grid = np.meshgrid(lon_arr, lat_arr)
lon_flat = lon_grid.ravel()
lat_flat = lat_grid.ravel()
hdd_flat = yearly_hdd.ravel()

100%|██████████| 1/1 [00:00<00:00, 45.25it/s]


In [19]:
# get nearest point with KDtree
tree = cKDTree(np.column_stack((lat_flat, lon_flat)))

for index, row in tqdm(merged_df.iterrows(), total=merged_df.shape[0]):
    latitude = row["latitude"]
    longitude = row["longitude"]
    # latitude = 45.0
    # longitude = -75.0
    # Query the tree for the nearest point
    dist, idx = tree.query([latitude, longitude])
    # Get the nearest HDD value
    merged_df.at[index, "nearest_hdd"] = hdd_flat[idx]

100%|██████████| 485/485 [00:00<00:00, 23428.90it/s]


In [20]:
print(f"\nshape: {countries_df.shape}")
merged_df.head()


shape: (12950185, 19)


Unnamed: 0,geonameid,name_x,asciiname,alternatenames,latitude,longitude,feature class,feature code,iso alpha 2,cc2,...,status,color_code,name_y,continent,region,iso_3166_1_alpha_2_codes,french_short,geometry,Region Grouped,nearest_hdd
0,292968,Abu Dhabi,Abu Dhabi,"A-pu-that-pi,AEbu Saby,AUH,Aboe Dhabi,Abou Dab...",24.45118,54.39696,P,PPLC,AE,,...,Member State,ARE,United Arab Emirates,Asia,Western Asia,AE,Émirats arabes unis,"MULTIPOLYGON (((53.96486 24.17944, 53.95500 24...",Asia & Oceania,0.0
1,1138958,Kabul,Kabul,"Cabool,Caboul,Cabul,Cabura,Cabúl,Caubul,KBL,Ka...",34.52813,69.17233,P,PPLC,AF,,...,Member State,AFG,Afghanistan,Asia,Southern Asia,AF,Afghanistan,"POLYGON ((74.91574 37.23733, 74.80873 37.22423...",Asia & Oceania,0.0
2,3183875,Tirana,Tirana,"TIA,Terana,Theranda,Tiorana,Tiorána,Tiran,Tira...",41.3275,19.81889,P,PPLC,AL,,...,Member State,ALB,Albania,Europe,Southern Europe,AL,Albanie,"POLYGON ((20.07142 42.56091, 20.10208 42.53347...",Europe,0.285643
3,616052,Yerevan,Yerevan,"Ayrivan,Djerevan,EVN,Eireavan,Eireaván,Ereban,...",40.18111,44.51361,P,PPLC,AM,,...,Member State,ARM,Armenia,Asia,Western Asia,AM,Arménie,"POLYGON ((46.54038 38.87559, 46.51639 38.87804...",Asia & Oceania,0.156981
4,2240449,Luanda,Luanda,"LAD,Loanda,Louanda,Louanta,Luand,Luanda,Luanda...",-8.83682,13.23432,P,PPLC,AO,,...,Member State,AGO,Angola,Africa,Middle Africa,AO,Angola,"MULTIPOLYGON (((23.98621 -10.87046, 23.98805 -...",Africa,0.0


In [22]:
output_path = "../data/processed"
os.makedirs(output_path, exist_ok=True)

merged_df.to_csv(os.path.join(output_path, "merged_df.csv"), index=False)