In [1]:
import sys
import os
from datetime import datetime
import time
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Disable row truncation: displayed DataFrames/Series show every row.
pd.options.display.max_rows = None
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor

In [2]:
# Make the sibling `src` directory importable (resolved against the current
# working directory). The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path.
src_dir = os.path.abspath("../src")
if src_dir not in sys.path:
    sys.path.append(src_dir)
from lib import (
    train_and_evaluate_models,
    convert_results_to_dataframe,
    create_eui_comparison_plots,
    evaluate_model_strategies,
    calculate_average_metrics,
    create_error_distribution_plots,
    grid_search_best_params,
)

In [3]:
# NOTE: despite its name, `output_path` is the CSV this notebook reads from.
output_path = "../data/03_processed/merged_df.csv"

merged_df = (
    pd.read_csv(output_path)
    .rename(columns={"total_year": "hdd_total_year"})
    # GDP per capita
    .assign(GDP_per_capita=lambda df: df["GDP_2022"] / df["Population_2023"])
    # Exclude Taiwan (temporary)
    .loc[lambda df: df["Country"] != "Taiwan"]
)

# Disabled random 80/20 train flag, kept for reference:
# np.random.seed(123)
# merged_df['is_train'] = np.random.choice([1, 0], size=len(merged_df), p=[0.8, 0.2])

# Train / Test

In [4]:
# Paths of the two CSV files holding the train/test split tables.
train_test_split_original_data_path = (
    "../data/03_processed/train_test_split_original_data.csv"
)
train_test_split_new_data_path = "../data/03_processed/train_test_split_new_data.csv"

train_test_split_original_data = pd.read_csv(train_test_split_original_data_path)
train_test_split_new_data = pd.read_csv(train_test_split_new_data_path)

# Stack both tables into a single frame with a fresh 0..n-1 index.
split_frames = [train_test_split_original_data, train_test_split_new_data]
train_test_split = pd.concat(split_frames, ignore_index=True)

# Left join on (index, geonameid): every row of merged_df is kept, and rows
# without a matching split entry get NaN in the columns brought in by the join.
merged_df = pd.merge(
    merged_df, train_test_split, how="left", on=["index", "geonameid"]
)

# We want to keep original data: retain only rows whose source is "World Bank CURB".
is_world_bank_curb = merged_df["source"] == "World Bank CURB"
merged_df = merged_df[is_world_bank_curb]

In [5]:


# Active model features mapped to their short labels, in model-input order.
# `features` below is derived from this mapping, so commenting an entry in or
# out here toggles the feature and its abbreviation together.
feature_abbreviations = {
    "hdd_total_year": "HDD",
    "cdd_total_year": "CDD",
    # "GDP_per_capita": "GDP",
    "Urbanization_Rate_2022": "URB",
    "latitude": "Lat",
    "longitude": "Long",
    # "Paris_Agreement": "Paris",
    "Subnational HDI": "HDI",
    "Educational index": "EDU",
    "Income index": "Income",
    "2m_temperature_2023_avg": "Temp",
    "2m_dewpoint_temperature_2023_avg": "Dew",
    "total_precipitation_2023": "Precip",
    # "longitude_sin": "Long_Sin",
    # "longitude_cos": "Long_Cos",
    # "latitude_sin": "Lat_Sin",
    # "latitude_cos": "Lat_Cos",
    # "population_density": "Pop_Dens",
    #
    # Image data (disabled)
    # "pca_result": "PCA",
    # **{f"Cluster_{i}": f"C{i}" for i in range(1, 20)},  # Cluster_1 .. Cluster_19
    # Further disabled image candidates with no abbreviation defined yet:
    # "K_mean_label", "C_mean_label", "GMM_label"
}

# Ordered list of feature column names (dicts preserve insertion order).
features = list(feature_abbreviations)

# Target column names, one per building category.
target_columns = [
    "Residential EUI (kWh/m2/year)",
    "Non-residential EUI (kWh/m2/year)",
]

# Region labels passed to the evaluation helpers.
regions = [
    "Asia & Oceania",
    "Europe",
    "Africa",
    "Central and South America",
    "Northern America",
]


# Drop Rows with Null Image Data

In [6]:
# Shape before filtering.
print(merged_df.shape)

# Keep only the rows that have a non-null `pca_result` value.
has_pca_result = merged_df["pca_result"].notna()
merged_df = merged_df[has_pca_result]

# Shape after filtering.
print(merged_df.shape)

(481, 67)
(473, 67)


# CatBoost - Grid Search

In [7]:
# Hyper-parameter grid for CatBoost. Only `iterations` is varied right now;
# the other candidates are kept below, disabled, for later experiments.
cat_param_grid = dict(
    iterations=[100, 200, 300],
    # depth=[4, 6, 8],
    # learning_rate=[0.01, 0.05, 0.1],
    # subsample=[0.7, 0.8, 0.9],
    # subsample=[0.9, 1.0],
    # colsample_bylevel=[0.7, 0.9],
    # l2_leaf_reg=[1, 3, 5],
    # random_seed=[42],
    # bootstrap_type=["Bernoulli"],
    # loss_function=["MAPE"],
    # eval_metric=["MAPE"],
    # verbose=[False],
)

# Run the project's grid-search helper over the grid; it yields three values:
# the best parameter set, its score, and the results of every combination.
cat_search_outcome = grid_search_best_params(
    merged_df=merged_df,
    regions=regions,
    features=features,
    model_class=CatBoostRegressor,
    param_grid=cat_param_grid,
    feature_abbreviations=feature_abbreviations,
)
cat_best_params, cat_best_score, cat_all_results = cat_search_outcome

# Keep the winning parameters under the shorter name as well.
cat_params = cat_best_params

# Save the grid search results to a timestamped CSV (minute resolution).
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
cat_grid_search_results = convert_results_to_dataframe(cat_all_results, features)

# DataFrame.to_csv does not create missing parent directories; make sure the
# target folder exists so the save cannot fail after the search has finished.
gridsearch_dir = "../results/gridsearch"
os.makedirs(gridsearch_dir, exist_ok=True)

cat_grid_search_results.to_csv(
    f"{gridsearch_dir}/{timestamp}_cat_grid_search_results.csv"
)

Evaluating combination 1/3: {'iterations': 100}
Learning rate set to 0.192124
0:	learn: 56.9529674	total: 58.2ms	remaining: 5.76s
1:	learn: 51.5770968	total: 58.9ms	remaining: 2.89s
2:	learn: 47.8834676	total: 59.7ms	remaining: 1.93s
3:	learn: 44.6964877	total: 60.4ms	remaining: 1.45s
4:	learn: 41.9891766	total: 61.1ms	remaining: 1.16s
5:	learn: 38.5855416	total: 61.8ms	remaining: 968ms
6:	learn: 36.5770339	total: 62.4ms	remaining: 829ms
7:	learn: 34.0123861	total: 63.1ms	remaining: 726ms
8:	learn: 31.7640782	total: 64.1ms	remaining: 648ms
9:	learn: 30.8596201	total: 64.7ms	remaining: 583ms
10:	learn: 29.3170093	total: 65.4ms	remaining: 529ms
11:	learn: 27.3957868	total: 66.1ms	remaining: 485ms
12:	learn: 26.2498186	total: 66.8ms	remaining: 447ms
13:	learn: 25.3961104	total: 67.4ms	remaining: 414ms
14:	learn: 24.1683483	total: 68.1ms	remaining: 386ms
15:	learn: 23.4050207	total: 68.8ms	remaining: 361ms
16:	learn: 22.4200902	total: 69.7ms	remaining: 340ms
17:	learn: 21.5237791	total: 70

# 