In [1]:
import sys
import os
from datetime import datetime
import time
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
pd.set_option('display.max_rows', None)
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb


In [2]:
# Put ../src on the import path so that `lib` can be imported below
# (presumably `lib` lives in ../src -- confirm against the repository layout).
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
src_path = os.path.abspath('../src')
if src_path not in sys.path:
    sys.path.append(src_path)

# Project helpers used throughout the notebook.
from lib import (
    train_and_evaluate_models,
    convert_results_to_dataframe,
    create_eui_comparison_plots,
    evaluate_model_strategies,
    calculate_average_metrics,
    create_error_distribution_plots,
    grid_search_best_params
)

In [3]:
# Number of image clusters; it determines which Cluster_<n_clusters>_<i> columns
# are appended to the feature list further down in this notebook.
n_clusters = 10  # options noted by the author: 5, 10 or 20

In [4]:
# Location of the processed, merged dataset that this notebook reads.
output_path = "../data/03_processed/merged_df.csv"

# Load the data, rename the yearly total column to its HDD name, derive
# GDP per capita, and exclude Taiwan (temporary exclusion).
merged_df = (
    pd.read_csv(output_path)
    .rename(columns={'total_year': 'hdd_total_year'})
    .assign(GDP_per_capita=lambda frame: frame['GDP_2022'] / frame['Population_2023'])
    .loc[lambda frame: frame['Country'] != 'Taiwan']
)

#np.random.seed(123)
#merged_df['is_train'] = np.random.choice([1, 0], size=len(merged_df), p=[0.8, 0.2])

# Train / Test

In [5]:
# The train/test assignment is stored in two CSV files.
train_test_split_original_data_path = "../data/03_processed/train_test_split_original_data.csv"
train_test_split_new_data_path = "../data/03_processed/train_test_split_new_data.csv"

train_test_split_original_data = pd.read_csv(train_test_split_original_data_path)
train_test_split_new_data = pd.read_csv(train_test_split_new_data_path)

# Stack both split tables into a single lookup table with a fresh index.
train_test_split = pd.concat(
    [train_test_split_original_data, train_test_split_new_data],
    ignore_index=True,
)

# Attach the split information to every row (left join on the two key columns),
# then keep only the original data, i.e. rows whose source is "World Bank CURB".
merged_df = (
    pd.merge(merged_df, train_test_split, how='left', on=['index', 'geonameid'])
    .loc[lambda frame: frame["source"] == "World Bank CURB"]
)

# Drop Rows with Null Image Features

In [6]:
# Shape before filtering.
print(merged_df.shape)

# Discard every row whose image-derived 'pca_result' value is missing.
merged_df = merged_df[merged_df['pca_result'].notna()]

# Shape after filtering.
print(merged_df.shape)

(481, 78)
(473, 78)


In [7]:
# Label for the current feature configuration, derived from the cluster count.
feature_selection = "{}_clusters".format(n_clusters)

# Model input columns: tabular features first, then the image-based features
# (the PCA component followed by one column per image cluster).
# The sin/cos encodings of latitude and longitude are intentionally left out.
features = [
    'hdd_total_year',
    'cdd_total_year',
    'GDP_per_capita',
    'Urbanization_Rate_2022',
    'latitude',
    'longitude',
    'Paris_Agreement',
    'Subnational HDI',
    'Educational index',
    'Income index',
    '2m_temperature_2023_avg',
    '2m_dewpoint_temperature_2023_avg',
    'total_precipitation_2023',
    'population_density',
    # Image data
    'pca_result',
    *(f'Cluster_{n_clusters}_{i}' for i in range(n_clusters)),
]

# Human-readable label for each column name. The sin/cos entries are kept
# even though those columns are not in `features` at the moment.
feature_abbreviations = {
    'hdd_total_year': 'Heating Degree Days',
    'cdd_total_year': 'Cooling Degree Days',
    'GDP_per_capita': 'GDP per Capita',
    'Urbanization_Rate_2022': 'Urbanization Rate',
    'latitude': 'Latitude',
    'longitude': 'Longitude',
    'Paris_Agreement': 'Paris Agreement',
    'Subnational HDI': 'Human Development Index',
    'Educational index': 'Educational Index',
    'Income index': 'Income Index',
    '2m_temperature_2023_avg': 'Avg Temperature',
    '2m_dewpoint_temperature_2023_avg': 'Avg Dewpoint Temp',
    'total_precipitation_2023': 'Precipitation',
    'longitude_sin': 'Longitude Sin',
    'longitude_cos': 'Longitude Cos',
    'latitude_sin': 'Latitude Sin',
    'latitude_cos': 'Latitude Cos',
    'population_density': 'Population Density',
    # Image-based features
    'pca_result': 'First Principal Component (PCA)',
    **{
        f'Cluster_{n_clusters}_{i}': f'Fuzzy C-means Image Cluster {i}'
        for i in range(n_clusters)
    },
}

# Target columns (residential and non-residential EUI).
target_columns = ['Residential EUI (kWh/m2/year)', 'Non-residential EUI (kWh/m2/year)']

# Region names passed to the evaluation helpers.
regions = [
    'Asia & Oceania',
    'Europe',
    'Africa',
    'Central and South America',
    'Northern America',
]


# XGBoost - Grid Search

In [8]:
# Define parameter grid for XGBoost.
# Only 'n_estimators' is searched at the moment; the other hyperparameters are
# kept as commented-out candidates that can be re-enabled to widen the search.
xgb_param_grid = {
    'n_estimators': [100, 200],
    #'max_depth': [4, 6, 8],
    #'learning_rate': [0.01, 0.05, 0.1, 0.2],
    #'subsample': [0.8, 0.9],
    #'colsample_bytree': [0.7, 0.9],
    #'min_child_weight': [1, 3],
    #'gamma': [0, 0.1],
    #'random_state': [42]
}

# Run grid search with cross-validation.
# NOTE(review): in the recorded run, n_estimators=100 and n_estimators=200 report
# exactly the same MAPE for every strategy. Verify that grid_search_best_params
# really forwards each parameter combination to the model it trains.
xgb_best_params, xgb_best_score, xgb_all_results = grid_search_best_params(
    merged_df=merged_df,
    regions=regions,
    features=features,
    model_class=xgb.XGBRegressor,
    param_grid=xgb_param_grid,
    feature_abbreviations=feature_abbreviations
)

# Store the best parameters
xgb_params = xgb_best_params

# Save the grid search results under a timestamped file name.
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
xgb_grid_search_results = convert_results_to_dataframe(xgb_all_results, features)

# Create the output directory first: to_csv does not create missing parent
# directories, so on a fresh checkout the write would otherwise fail after the
# whole grid search has already run.
gridsearch_dir = '../results/gridsearch'
os.makedirs(gridsearch_dir, exist_ok=True)
xgb_grid_search_results.to_csv(f'{gridsearch_dir}/{timestamp}_xgb_grid_search_results.csv')

Evaluating combination 1/2: {'n_estimators': 100}
Strategy 'within_domain' - Overall Average MAPE: 13.07%
Strategy 'cross_domain' - Overall Average MAPE: 19.76%
Strategy 'all_domain' - Overall Average MAPE: 12.54%
Evaluating combination 2/2: {'n_estimators': 200}
Strategy 'within_domain' - Overall Average MAPE: 13.07%
Strategy 'cross_domain' - Overall Average MAPE: 19.76%
Strategy 'all_domain' - Overall Average MAPE: 12.54%

=== Best Parameters for cross_domain strategy ===
{'n_estimators': 100}
Cross-domain Average CV MAPE: 19.76%


# 