## Imports

In [1]:
# IMPORTS
#ML
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pycountry
import rasterio
from scipy.spatial import cKDTree
from tqdm import tqdm
from datetime import datetime
import geopandas as gpd

import zipfile
import os
import gdown

# Country data

In [None]:
# Tab-separated country reference table, read straight from Google Drive.
country_info_path = "https://drive.google.com/uc?id=1xfYlruvfAi6yieOd_S69pPYWphckRLr5&export=download"

# Labels assigned positionally to the parsed columns (the file has no header row).
# NOTE(review): the per-column meanings are taken from the labels themselves;
# confirm them against the layout of the source file.
column_names = [
    "Country_Code",      # alpha-2 code
    "ISO_Alpha_3",       # alpha-3 code
    "Numeric_Code",      # numeric code
    "Alpha_2",           # second two-letter code column
    "Country_Name",
    "Capital",
    "Area",              # presumably square kilometres - confirm
    "Population",
    "Region",
    "TLD",               # top-level domain
    "Currency_Code",
    "Currency_Name",
    "Currency_Numeric",
    "Additional_Info",
]

# comment="#" stops parsing a line at the first "#"; rows with an unexpected
# number of fields are skipped instead of raising.
country_info_df = pd.read_csv(
    country_info_path,
    sep="\t",
    header=None,
    comment="#",
    on_bad_lines="skip",
)
country_info_df.columns = column_names

country_info_df.head()

## Countries 

In [None]:
# GeoNames "allCountries" archive: downloaded once and cached two directories up.
countries_zip_url = "https://drive.google.com/uc?id=1UQzdO7suT0BnwKBeNybMG97vM9GIDogA"
countries_zip_file_path = "../../allCountries.zip"
countries_txt_filename = "allCountries.txt"

# Only hit the network when there is no local copy of the ZIP yet.
if not os.path.exists(countries_zip_file_path):
    gdown.download(countries_zip_url, countries_zip_file_path, quiet=False)

# Stream the TXT member out of the archive without extracting it to disk.
# NOTE(review): with pandas' default NA parsing the literal string "NA" is read
# as a missing value - presumably why Namibia's alpha-2 code is patched by hand
# in the ISO step; confirm.
with zipfile.ZipFile(countries_zip_file_path) as z, z.open(countries_txt_filename) as txt_file:
    countries_df = pd.read_csv(txt_file, sep="\t", header=None)

# Column layout as documented at https://download.geonames.org/export/dump/
countries_df.columns = [
    "geonameid", "name", "asciiname", "alternatenames",
    "latitude", "longitude",
    "feature class", "feature code",
    "iso alpha 2", "cc2",
    "admin1 code", "admin2 code", "admin3 code", "admin4 code",
    "population", "elevation", "dem",
    "timezone", "modification date",
]

print(f"\nshape: {countries_df.shape}")
countries_df.head()

## EUI

In [None]:
# Load the EUI dataset as a CSV directly from its Google Drive download URL.
eui_url = "https://drive.google.com/uc?id=12qGq_DLefI1RihIF_RKQUyJtm480-xRC"
eui_df = pd.read_csv(eui_url)

# Report the (rows, columns) size and preview the first rows.
print(f"shape: {eui_df.shape}")
eui_df.head()

In [None]:
# Attach the GeoNames attributes to each EUI record by GeoNames id.
# An inner join keeps only ids that appear in both tables.
merged_df = countries_df.merge(
    eui_df,
    how="inner",
    left_on="geonameid",
    right_on="Geonames ID",
)

# The join must neither drop nor duplicate EUI rows overall.
assert len(merged_df) == len(eui_df)
print(f"shape: {merged_df.shape}")
merged_df.head()

## Adding ISO 3 Code

In [None]:
# Derive the ISO alpha-3 code from the GeoNames alpha-2 code via pycountry.
alpha_2_to_alpha_3 = dict(
    (country.alpha_2, country.alpha_3) for country in pycountry.countries
)
merged_df.loc[:, 'ISO_alpha3'] = merged_df['iso alpha 2'].map(alpha_2_to_alpha_3)

# Namibia's country code is null in the merged data, so both codes are set by hand.
namibia_rows = merged_df['Country'] == 'Namibia'
merged_df.loc[namibia_rows, 'ISO_alpha3'] = 'NAM'
merged_df.loc[namibia_rows, 'iso alpha 2'] = 'NA'

print(f"shape: {merged_df.shape}")

# Sanity checks: every row has an alpha-3 code and the row count is unchanged.
assert not merged_df["ISO_alpha3"].isna().any(), "There are missing values in the ISO_alpha3 column."
assert len(merged_df) == 482, "The number of rows in merged_df is not 482."

# Population

In [None]:
# Country-level population table; the first 4 lines of the file are skipped
# before the header row is read.
population_path = '../data/01_raw/population.csv'
population_df = pd.read_csv(population_path, skiprows=4)

# Keep only the country code and the 2023 figure. A non-inplace rename returns
# an independent frame, so nothing below mutates a selection of population_df
# (the original in-place edits are the pattern behind SettingWithCopyWarning).
population_2023 = population_df[['Country Code', '2023']].rename(
    columns={
        '2023': 'Population_2023',
        'Country Code': 'ISO_alpha3',
    }
)

# Taiwan is added by hand with a hard-coded figure. Only the keys matching
# existing columns are used; 'Country Name' is not a column of population_2023
# and is dropped explicitly here rather than silently.
taiwan_raw = {'Country Name': 'Taiwan', 'ISO_alpha3': 'TWN', 'Population_2023': 23894394}
taiwan_row = pd.DataFrame([taiwan_raw], columns=population_2023.columns)
population_2023 = pd.concat([population_2023, taiwan_row], ignore_index=True)

# Left join keeps every merged_df row, including any without a population match.
merged_df = merged_df.merge(population_2023, on='ISO_alpha3', how='left')

# Every row must have a population value, and the join must not change the row count.
assert merged_df['Population_2023'].notnull().all(), "Error: There are null values in 'Population_2023'."
assert merged_df.shape[0] == 482, "The number of rows in merged_df is not 482."
print(f"shape: {merged_df.shape}")
merged_df.head()

# HDI - Educational Index - Income Index.csv

In [None]:
# Inspect the column names accumulated on merged_df so far.
merged_df.columns

In [None]:
# HDI, educational index and income index, keyed by ISO alpha-3 code.
HDI_EI_II_path = '../data/01_raw/HDI_educationalIndex_incomeIndex.csv'
hdi_index_columns = ["Subnational HDI", "Educational index", "Income index"]

# Select the key plus the three index columns, then align the key name with merged_df.
HDI_EI_II_df = pd.read_csv(HDI_EI_II_path)
HDI_EI_II_df = HDI_EI_II_df[["ISO_Code", *hdi_index_columns]].rename(
    columns={'ISO_Code': 'ISO_alpha3'}
)

merged_df = merged_df.merge(HDI_EI_II_df, how='left', on='ISO_alpha3')

# Taiwan is the only country allowed to lack these indices.
not_taiwan = merged_df['Country'] != 'Taiwan'
assert merged_df.loc[not_taiwan, hdi_index_columns].notnull().all().all()
assert len(merged_df) == 482, "The number of rows in merged_df is not 482."
print(f"shape: {merged_df.shape}")

# GDP

In [None]:
# GDP table; only rows whose "Level" is "National" are used.
gdp_data_path = "../data/01_raw/gdp_data.csv"
gdp_df = pd.read_csv(gdp_data_path)

# Filter to national rows and keep the key plus the 2022 value in one step,
# then rename both to the names used in merged_df.
is_national = gdp_df["Level"] == "National"
gdp_df = gdp_df.loc[is_national, ['ISO_Code', '2022']].rename(
    columns={'2022': 'GDP_2022', 'ISO_Code': 'ISO_alpha3'}
)

merged_df = merged_df.merge(gdp_df, how='left', on='ISO_alpha3')

# Taiwan is the only country allowed to lack a GDP value.
not_taiwan = merged_df['Country'] != 'Taiwan'
assert merged_df.loc[not_taiwan, 'GDP_2022'].notnull().all()
assert len(merged_df) == 482, "The number of rows in merged_df is not 482."
print(f"shape: {merged_df.shape}")

# Urbanization rate

In [None]:
# Print the current merged_df column names before the next dataset is joined.
print(merged_df.columns)


In [None]:
# Urbanization-rate table from Google Drive; the first 4 lines are skipped
# before the header row is read.
urbanization_rate_path = "https://drive.google.com/uc?id=1YteyPHAWnJUKG0LWogS98EYnwjRTeZDf&export=download"
urbanization_rate_df = pd.read_csv(urbanization_rate_path, skiprows=4)

# Keep the country code and the 2022 value, then rename to merged_df's conventions.
urbanization_rate_df = urbanization_rate_df[["Country Code", "2022"]]
urbanization_rate_df = urbanization_rate_df.rename(
    columns={"2022": "Urbanization_Rate_2022", 'Country Code': 'ISO_alpha3'}
)

merged_df = merged_df.merge(urbanization_rate_df, how='left', on='ISO_alpha3')

# Row count must be unchanged; Taiwan is the only country allowed to lack a rate.
assert len(merged_df) == 482, "The number of rows in merged_df is not 482."
not_taiwan = merged_df['Country'] != 'Taiwan'
assert merged_df.loc[not_taiwan, 'Urbanization_Rate_2022'].notnull().all()
print(f"shape: {merged_df.shape}")

# Paris Agreement

In [None]:
# Three-letter codes flagged as Paris Agreement parties.
paris_agreement_iso_codes = [
    "AFG", "ALB", "DZA", "AND", "AGO", "ATG", "ARG", "AUS", "AUT", "AZE",
    "BHS", "BHR", "BGD", "BRB", "BLR", "BEL", "BLZ", "BEN", "BTN", "BOL",
    "BIH", "BWA", "BRA", "BRN", "BGR", "BFA", "BDI", "CPV", "KHM", "CMR",
    "CAN", "CAF", "TCD", "CHN", "COL", "COM", "COG", "CRI", "CIV", "HRV",
    "CUB", "CYP", "CZE", "PRK", "COD", "DNK", "DJI", "DMA", "DOM", "EGY",
    "SLV", "GNQ", "ERI", "EST", "ETH", "EUN", "FJI", "FIN", "FRA", "GAB",
    "GEO", "DEU", "GHA", "GRC", "GRD", "GTM", "GIN", "GNB", "GUY", "HTI",
    "HND", "HUN", "ISL", "IND", "IDN", "IRN", "IRL", "ISR", "ITA", "JAM",
    "JPN", "JOR", "KEN", "KIR", "KWT", "LAO", "LVA", "LBN", "LSO", "LBR",
    "LBY", "LIE", "LTU", "LUX", "MDG", "MYS", "MDV", "MLI", "MLT", "MHL",
    "MUS", "MRT", "MEX", "FSM", "MCO", "MNG", "MNE", "MAR", "MOZ", "MMR",
    "NAM", "NRU", "NPL", "NLD", "NZL", "NER", "NOR", "OMN", "PAK", "PLW",
    "PAN", "PNG", "PRY", "PER", "PHL", "POL", "PRT", "QAT", "KOR", "ROU",
    "RUS", "RWA", "KNA", "LCA", "VCT", "WSM", "SMR", "STP", "SEN", "SRB",
    "SGP", "SVK", "SVN", "SLB", "SOM", "ZAF", "SSD", "ESP", "LKA", "PSE",
    "SDN", "SUR", "SWZ", "SWE", "CHE", "TJK", "THA", "MKD", "TLS", "TON",
    "TTO", "TUN", "TUR", "TUV", "UGA", "UKR", "ARE", "GBR", "TZA", "USA",
    "URY", "VUT", "VEN", "VNM", "ZWE",
]

# 1 when the row's alpha-3 code is in the list above, 0 otherwise.
is_party = merged_df['ISO_alpha3'].isin(paris_agreement_iso_codes)
merged_df['Paris_Agreement'] = is_party.astype('int64')

## Region

In [None]:
# World administrative boundaries (GeoJSON), cached two directories up.
world_boundaries_url = "https://drive.google.com/uc?id=1k-2ECd2gwJ9FBz1anMRZy7O85uExAFY_"
world_boundaries_path = "../../world-administrative-boundaries.geojson"

# Download only when no local copy exists, matching how the allCountries
# archive is handled; previously the file was re-downloaded on every run.
if not os.path.exists(world_boundaries_path):
    gdown.download(world_boundaries_url, world_boundaries_path, quiet=False)

world_boundaries_df = gpd.read_file(world_boundaries_path)

# Drop the separate 'Azores Islands' entry.
# NOTE(review): presumably it shares an iso3 code with another row and would
# duplicate rows when merged_df is joined on iso3 - confirm.
world_boundaries_df = world_boundaries_df[world_boundaries_df['name'] != 'Azores Islands']
world_boundaries_df.head()

In [None]:
# Bring continent/region attributes onto each row via the alpha-3 code.
merged_df = merged_df.merge(
    world_boundaries_df, how='left', left_on='ISO_alpha3', right_on='iso3'
)

# Build the grouped region from the lowest-priority rule upwards:
#   1. fall back to the continent itself,
#   2. Asia and Oceania are pooled,
#   3. remaining Americas rows become Central and South America,
#   4. the 'Northern America' region overrides everything else.
continent_col = merged_df['continent']
asia_oceania_or_continent = np.where(
    continent_col.isin(['Asia', 'Oceania']), 'Asia & Oceania', continent_col
)
americas_or_rest = np.where(
    continent_col == 'Americas', 'Central and South America', asia_oceania_or_continent
)
merged_df['Region Grouped'] = np.where(
    merged_df['region'] == 'Northern America', 'Northern America', americas_or_rest
)

assert len(merged_df) == 482, "The number of rows in merged_df is not 482."
assert merged_df['Region Grouped'].notnull().all(), "The 'Region Grouped' column contains null values."

# Summary of how continent/region pairs map onto the grouped region.
(
    merged_df.groupby(['continent', 'region', 'Region Grouped'])
    .size()
    .reset_index(name='count')
    .sort_values(by='Region Grouped')
)


# HDD

In [None]:
# HDD table from the interim data folder, keyed by geonameid.
HDD_path = "../data/02_interim/HDD.csv"

# Prefix the yearly statistics with "hdd_" so they stay distinguishable from
# the identically named CDD columns.
hdd_column_names = {
    'total_year': 'hdd_total_year',
    'average_year': 'hdd_average_year',
    'variance_year': 'hdd_variance_year',
}
HDD_df = pd.read_csv(HDD_path).rename(columns=hdd_column_names)

merged_df = merged_df.merge(HDD_df, how='left', on='geonameid')

# Every row must have received an HDD total.
assert merged_df['hdd_total_year'].notnull().all()
print(f"shape: {merged_df.shape}")
merged_df.head()

# 

# CDD

In [None]:
# CDD table from the interim data folder, keyed by geonameid.
CDD_path = "../data/02_interim/CDD.csv"
# Read the CDD file itself. The original passed HDD_path here, which filled
# the cdd_* columns with a copy of the HDD values.
CDD_df = pd.read_csv(CDD_path)

# Prefix the yearly statistics with "cdd_" so they do not collide with the
# identically named HDD statistics.
CDD_df = CDD_df.rename(columns={
    'total_year': 'cdd_total_year',
    'average_year': 'cdd_average_year',
    'variance_year': 'cdd_variance_year'
})

# Left join keeps every merged_df row; the assert then requires a CDD total on each.
merged_df = merged_df.merge(CDD_df, on='geonameid', how='left')
assert merged_df['cdd_total_year'].notnull().all()
print(f"shape: {merged_df.shape}")
merged_df.head()

In [None]:
# Persist the fully merged table, creating the processed-data folder if needed.
output_path = "../data/03_processed"
os.makedirs(output_path, exist_ok=True)

merged_csv_path = os.path.join(output_path, "merged_df.csv")
merged_df.to_csv(merged_csv_path, index=False)