In [1]:
import sys
import os
from datetime import datetime
import time
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

pd.set_option("display.max_rows", None)
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor

In [2]:
# Make the project-local `lib` module importable by appending ../src to the
# module search path. os.path.abspath resolves "../src" against the current
# working directory, so this presumably expects the kernel to be started from
# the notebook's own folder — TODO confirm.
sys.path.append(os.path.abspath("../src"))
# Project helpers for training, evaluation, plotting and grid search.
from lib import (
    train_and_evaluate_models,
    convert_results_to_dataframe,
    create_eui_comparison_plots,
    evaluate_model_strategies,
    calculate_average_metrics,
    create_error_distribution_plots,
    grid_search_best_params,
)

In [3]:
# Number of image clusters. It names the feature set and selects the
# Cluster_{n_clusters}_{i} columns (i = 0 .. n_clusters - 1) used as features below.
n_clusters = 10  # values listed in the original note: 5, 10 or 20

In [4]:
# Location of the processed, merged data set that this notebook reads.
output_path = "../data/03_processed/merged_df.csv"

# Load the data and prepare it in a single pipeline:
#   1. rename "total_year" to "hdd_total_year";
#   2. derive GDP per capita from the GDP_2022 and Population_2023 columns;
#   3. exclude Taiwan (marked as a temporary exclusion in the original note).
merged_df = (
    pd.read_csv(output_path)
    .rename(columns={"total_year": "hdd_total_year"})
    .assign(
        GDP_per_capita=lambda frame: frame["GDP_2022"] / frame["Population_2023"]
    )
    .loc[lambda frame: frame["Country"] != "Taiwan"]
)

# np.random.seed(123)
# merged_df['is_train'] = np.random.choice([1, 0], size=len(merged_df), p=[0.8, 0.2])

# Train / Test

In [5]:
# Paths of the two split-assignment files.
train_test_split_original_data_path = "../data/03_processed/train_test_split_original_data.csv"
train_test_split_new_data_path = "../data/03_processed/train_test_split_new_data.csv"

# Read both files and stack them into a single table with a fresh index.
train_test_split_original_data = pd.read_csv(train_test_split_original_data_path)
train_test_split_new_data = pd.read_csv(train_test_split_new_data_path)
train_test_split = pd.concat(
    [train_test_split_original_data, train_test_split_new_data],
    ignore_index=True,
)

# Left-join the split table onto the data by ("index", "geonameid"), then keep
# only the rows whose source is "World Bank CURB" (the original data, per the
# original note).
merged_df = (
    pd.merge(merged_df, train_test_split, how="left", on=["index", "geonameid"])
    .loc[lambda frame: frame["source"].eq("World Bank CURB")]
)

In [6]:
# Label for this feature configuration, derived from the cluster count.
feature_selection = "{}_clusters".format(n_clusters)

# Names of the image-cluster columns for the selected cluster count.
_cluster_feature_names = [f"Cluster_{n_clusters}_{idx}" for idx in range(n_clusters)]

# Model inputs: tabular features, the image PCA feature, then the cluster columns.
# The sin/cos encodings of longitude/latitude ("longitude_sin", "longitude_cos",
# "latitude_sin", "latitude_cos") are currently disabled.
features = [
    *(
        "hdd_total_year",
        "cdd_total_year",
        "GDP_per_capita",
        "Urbanization_Rate_2022",
        "latitude",
        "longitude",
        "Paris_Agreement",
        "Subnational HDI",
        "Educational index",
        "Income index",
        "2m_temperature_2023_avg",
        "2m_dewpoint_temperature_2023_avg",
        "total_precipitation_2023",
        "population_density",
    ),
    # Image data
    "pca_result",
    *_cluster_feature_names,
]

# Human-readable label for every feature column (including the disabled
# sin/cos encodings), followed by one label per image cluster.
feature_abbreviations = {
    **dict(
        [
            ("hdd_total_year", "Heating Degree Days"),
            ("cdd_total_year", "Cooling Degree Days"),
            ("GDP_per_capita", "GDP per Capita"),
            ("Urbanization_Rate_2022", "Urbanization Rate"),
            ("latitude", "Latitude"),
            ("longitude", "Longitude"),
            ("Paris_Agreement", "Paris Agreement"),
            ("Subnational HDI", "Human Development Index"),
            ("Educational index", "Educational Index"),
            ("Income index", "Income Index"),
            ("2m_temperature_2023_avg", "Avg Temperature"),
            ("2m_dewpoint_temperature_2023_avg", "Avg Dewpoint Temp"),
            ("total_precipitation_2023", "Precipitation"),
            ("longitude_sin", "Longitude Sin"),
            ("longitude_cos", "Longitude Cos"),
            ("latitude_sin", "Latitude Sin"),
            ("latitude_cos", "Latitude Cos"),
            ("population_density", "Population Density"),
            # Image-based features
            ("pca_result", "First Principal Component (PCA)"),
        ]
    ),
    **{
        column: f"Fuzzy C-means Image Cluster {idx}"
        for idx, column in enumerate(_cluster_feature_names)
    },
}

# Target columns: residential and non-residential EUI.
target_columns = [
    "Residential EUI (kWh/m2/year)",
    "Non-residential EUI (kWh/m2/year)",
]

# Region names passed to the evaluation helpers.
regions = [
    "Asia & Oceania",
    "Europe",
    "Africa",
    "Central and South America",
    "Northern America",
]


# Drop rows with missing image features

In [7]:
# Shape before filtering.
print(merged_df.shape)

# Keep only the rows that have a value in the image PCA column.
merged_df = merged_df.loc[lambda frame: frame["pca_result"].notna()]

# Shape after filtering.
print(merged_df.shape)

(481, 78)
(473, 78)


# CatBoost - Grid Search

In [8]:
# Parameter grid for CatBoost. Only "iterations" is searched at the moment;
# the commented-out entries are further candidates that are currently disabled.
cat_param_grid = {
    "iterations": [100, 200, 300],
    # "depth": [4, 6, 8],
    # "learning_rate": [0.01, 0.05, 0.1],
    # "subsample": [0.7, 0.8, 0.9],
    # "subsample": [0.9, 1.0],
    # "colsample_bylevel": [0.7, 0.9],
    # "l2_leaf_reg": [1, 3, 5],
    # "random_seed": [42],
    # "bootstrap_type": ["Bernoulli"],
    # "loss_function": ["MAPE"],
    # "eval_metric": ["MAPE"],
    # "verbose": [False],
}

# Run the grid search (described in the original note as using
# cross-validation; the details live in lib.grid_search_best_params).
# It returns the best parameter set, its score and the results of every
# combination that was evaluated.
cat_best_params, cat_best_score, cat_all_results = grid_search_best_params(
    merged_df=merged_df,
    regions=regions,
    features=features,
    model_class=CatBoostRegressor,
    param_grid=cat_param_grid,
    feature_abbreviations=feature_abbreviations,
)

# Store the best parameters
cat_params = cat_best_params

# Save the grid search results under a timestamped file name.
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
cat_grid_search_results = convert_results_to_dataframe(cat_all_results, features)

# to_csv does not create missing parent directories. Create the output folder
# first so a fresh checkout does not fail here after the whole search has run.
cat_grid_search_dir = "../results/gridsearch"
os.makedirs(cat_grid_search_dir, exist_ok=True)
cat_grid_search_results.to_csv(
    f"{cat_grid_search_dir}/{timestamp}_cat_grid_search_results.csv"
)

Evaluating combination 1/3: {'iterations': 100}
Learning rate set to 0.192124
0:	learn: 57.8633088	total: 58ms	remaining: 5.74s
1:	learn: 52.0900074	total: 59.6ms	remaining: 2.92s
2:	learn: 48.8729439	total: 60.5ms	remaining: 1.96s
3:	learn: 44.9982482	total: 61.4ms	remaining: 1.47s
4:	learn: 41.9231447	total: 62.4ms	remaining: 1.19s
5:	learn: 39.7653698	total: 63.4ms	remaining: 994ms
6:	learn: 38.2042222	total: 64.4ms	remaining: 856ms
7:	learn: 35.4193628	total: 65.2ms	remaining: 750ms
8:	learn: 33.4491749	total: 66.1ms	remaining: 669ms
9:	learn: 31.7956298	total: 67.2ms	remaining: 605ms
10:	learn: 30.8570862	total: 68ms	remaining: 550ms
11:	learn: 29.9010479	total: 69.1ms	remaining: 506ms
12:	learn: 29.0659667	total: 70.7ms	remaining: 473ms
13:	learn: 26.8507717	total: 71.7ms	remaining: 441ms
14:	learn: 25.0341140	total: 72.8ms	remaining: 413ms
15:	learn: 24.3706496	total: 73.5ms	remaining: 386ms
16:	learn: 23.7293111	total: 75.2ms	remaining: 367ms
17:	learn: 22.2344722	total: 76.3ms

# 