In [1]:
import os
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# Lift pandas' row-display limit so displayed DataFrames are not truncated.
pd.set_option('display.max_rows', None)

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Both inputs are converted to float NumPy arrays before the arithmetic, so
    plain lists/tuples are accepted and pandas objects are compared element by
    element in positional order instead of being aligned on index labels.

    Parameters
    ----------
    y_true : array-like
        Observed values. Zeros are not special-cased: a zero entry makes the
        corresponding term (and therefore the result) ``inf`` or ``nan``.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    numpy.floating
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [2]:
# Load the merged dataset from the processed-data folder.
output_path = "../data/processed/merged_df.csv"
merged_df = pd.read_csv(output_path)

# Temporary: Taiwan is excluded until there is a way to fill it in (TODO).
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so adding a column below does not raise SettingWithCopyWarning.
merged_df = merged_df[merged_df['Country'] != 'Taiwan'].copy()

# Create Train & Test: each row is flagged 1 (train) with probability 0.8 or
# 0 (test) with probability 0.2; the seed makes the draw reproducible.
np.random.seed(42)
merged_df['is_train'] = np.random.choice([1, 0], size=len(merged_df), p=[0.8, 0.2])

In [3]:

def calculate_metrics(y_true, y_pred):
    """Compute the regression error metrics reported in the result tables.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, same shape as ``y_true``.

    Returns
    -------
    dict
        Keys ``"MSE"``, ``"R²"``, ``"MAE"``, ``"RMSE"``, ``"MAPE"`` and
        ``"WAPE"``. MAPE and WAPE are percentages. Zeros in ``y_true`` are not
        special-cased, so MAPE becomes ``inf``/``nan`` if any are present.
    """
    # Plain float arrays: the element-wise formulas below then pair values by
    # position and can never be distorted by pandas index-label alignment.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    errors = y_true - y_pred

    # Computed once and reused for RMSE instead of calling sklearn twice.
    mse = mean_squared_error(y_true, y_pred)

    return {
        "MSE": mse,
        "R²": r2_score(y_true, y_pred),
        "MAE": mean_absolute_error(y_true, y_pred),
        "RMSE": np.sqrt(mse),
        "MAPE": np.mean(np.abs(errors / y_true)) * 100,
        "WAPE": np.sum(np.abs(errors)) / np.sum(np.abs(y_true)) * 100,
    }

def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Fit a new LinearRegression on the training split and score the test split.

    Returns the metrics dict that ``calculate_metrics`` produces for the
    test-set predictions.
    """
    fitted = LinearRegression().fit(X_train, y_train)
    predictions = fitted.predict(X_test)
    return calculate_metrics(y_test, predictions)

def add_metadata(metrics, region, target, strategy, features_used):
    """Attach the descriptive fields of a run to ``metrics`` and return it.

    The dict is updated in place and the same object is returned. The model
    name is always 'Linear Regression'; ``features_used`` is stored as a single
    comma-separated string.
    """
    # Build the joined string first so a failure here leaves ``metrics`` untouched.
    joined_features = ', '.join(features_used)

    metrics['Region'] = region
    metrics['Target'] = target
    metrics['Strategy'] = strategy
    metrics['Model'] = 'Linear Regression'
    metrics['Features Used'] = joined_features
    return metrics

def reorder_columns(df):
    """Return ``df`` restricted to the known report columns, in report order.

    Columns missing from ``df`` are skipped; columns of ``df`` that are not
    part of the report layout are dropped from the result.
    """
    report_layout = (
        'Region', 'Target', 'Strategy', 'Model', 'Features Used', 'Features Abbreviated',
        'MSE', 'R²', 'MAE', 'RMSE', 'MAPE', 'WAPE',
    )
    available = set(df.columns)
    return df[[name for name in report_layout if name in available]]


def within_domain(df, features, target_columns):
    """Train and test inside each region separately.

    For every distinct value of ``df['Region Grouped']`` one boolean mask is
    drawn from NumPy's global random state (draws below 0.8 go to training,
    the rest to testing). The same mask is used for all targets of that region.

    Returns a DataFrame with one row per (region, target), rounded to two
    decimals and arranged by ``reorder_columns``.
    """
    records = []
    region_labels = df['Region Grouped']

    for region in region_labels.unique():
        region_rows = df[region_labels == region]
        in_train = np.random.rand(len(region_rows)) < 0.8
        in_test = ~in_train

        for target in target_columns:
            scores = train_and_evaluate(
                region_rows.loc[in_train, features],
                region_rows.loc[in_test, features],
                region_rows.loc[in_train, target],
                region_rows.loc[in_test, target],
            )
            records.append(add_metadata(scores, region, target, 'Within-Domain', features))

    summary = pd.DataFrame(records).round(2)
    return reorder_columns(summary)

def cross_domain(df, features, target_columns):
    """Leave-one-region-out evaluation.

    Each distinct value of ``df['Region Grouped']`` is held out in turn: the
    model is trained on the rows of all other regions and tested on the rows
    of the held-out region.

    Returns a DataFrame with one row per (held-out region, target), rounded to
    two decimals and arranged by ``reorder_columns``.
    """
    records = []
    region_labels = df['Region Grouped']

    for held_out_region in region_labels.unique():
        other_regions = df[region_labels != held_out_region]
        held_out_rows = df[region_labels == held_out_region]

        for target in target_columns:
            scores = train_and_evaluate(
                other_regions[features],
                held_out_rows[features],
                other_regions[target],
                held_out_rows[target],
            )
            records.append(
                add_metadata(scores, held_out_region, target, 'Cross-Domain', features)
            )

    summary = pd.DataFrame(records).round(2)
    return reorder_columns(summary)

def domain_adaptation(df, features, target_columns):
    """Train on a pooled random split and test region by region.

    A single boolean mask over all rows is drawn from NumPy's global random
    state (draws below 0.8 go to training). The model is fitted on the pooled
    training rows of every region and evaluated, for each region, on that
    region's rows outside the training mask.

    Returns a DataFrame with one row per (region, target), rounded to two
    decimals and arranged by ``reorder_columns``.
    """
    records = []
    region_labels = df['Region Grouped']
    regions = region_labels.unique()

    in_train = np.random.rand(len(df)) < 0.8
    held_out = ~in_train
    pooled_train = df[in_train]
    X_train_pooled = pooled_train[features]

    for region in regions:
        region_test = df[(region_labels == region) & held_out]

        for target in target_columns:
            scores = train_and_evaluate(
                X_train_pooled,
                region_test[features],
                pooled_train[target],
                region_test[target],
            )
            records.append(add_metadata(scores, region, target, 'Domain-Adaptation', features))

    summary = pd.DataFrame(records).round(2)
    return reorder_columns(summary)

def add_abbreviated_feature_column(df, features, feature_abbreviations):
    """Return a copy of ``df`` with a constant 'Features Abbreviated' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table. It is not modified; a new frame is returned, which also
        avoids SettingWithCopyWarning when ``df`` is a selection of another frame.
    features : iterable of str
        Feature names, in the order they should appear.
    feature_abbreviations : mapping
        Feature name -> short label. Names without an entry are kept as is.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the column 'Features Abbreviated', whose value in every row
        is the short labels joined with ' | '.
    """
    abbreviated = ' | '.join(feature_abbreviations.get(name, name) for name in features)
    return df.assign(**{'Features Abbreviated': abbreviated})

def append_total_row(results_df):
    """Append one 'Total' row per target to a strategy's results table.

    Each 'Total' row holds the plain (unweighted) mean of the per-region
    metric columns for that target, so e.g. the 'Total' RMSE is the mean of
    the regional RMSEs, not the RMSE of pooled predictions. Strategy, model
    and feature descriptions are copied from the first row of ``results_df``;
    'Features Abbreviated' is 'N/A' when that column is absent.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Table with 'Target', 'Strategy', 'Model', 'Features Used' and the
        metric columns 'MSE', 'R²', 'MAE', 'RMSE', 'MAPE', 'WAPE'.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the 'Total' rows, re-indexed from 0 and
        arranged by ``reorder_columns``.
    """
    metric_columns = ['MSE', 'R²', 'MAE', 'RMSE', 'MAPE', 'WAPE']

    targets = results_df['Target'].unique()
    if len(targets) == 0:
        # Nothing to aggregate: return the (re-indexed) input unchanged.
        return reorder_columns(results_df.reset_index(drop=True))

    # Descriptive fields are identical for every 'Total' row of this table.
    shared_fields = {
        'Strategy': results_df['Strategy'].iloc[0],
        'Model': results_df['Model'].iloc[0],
        'Features Used': results_df['Features Used'].iloc[0],
        'Features Abbreviated': (
            results_df['Features Abbreviated'].iloc[0]
            if 'Features Abbreviated' in results_df.columns
            else 'N/A'
        ),
    }

    # Collect plain records and build the frame once, instead of growing a
    # DataFrame with concat inside the loop.
    total_rows = []
    for target in targets:
        metrics_avg = results_df.loc[results_df['Target'] == target, metric_columns].mean()
        total_rows.append({
            'Region': 'Total',
            'Target': target,
            **shared_fields,
            **metrics_avg.to_dict(),
        })

    combined = pd.concat([results_df, pd.DataFrame(total_rows)], ignore_index=True)
    return reorder_columns(combined)


def run_all_strategies(df, features, target_columns, seed=42):
    """Seed NumPy's global RNG, then run the three evaluation strategies in order.

    Returns a 3-tuple of result tables:
    (within-domain, cross-domain, domain-adaptation).
    """
    np.random.seed(seed)

    # Order matters: the strategies consume the seeded random state in sequence.
    strategies = (within_domain, cross_domain, domain_adaptation)
    return tuple(strategy(df, features, target_columns) for strategy in strategies)

def filter_and_concatenate(*dfs):
    """Stack the 'Total' rows of every given results table into one DataFrame.

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Result tables that each contain a 'Region' column.

    Returns
    -------
    pandas.DataFrame
        Rows whose 'Region' equals 'Total', in the order the tables were
        passed, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no tables are passed.
    """
    total_rows = [df[df['Region'] == 'Total'] for df in dfs]
    return pd.concat(total_rows, ignore_index=True)


In [4]:
# Predictor columns passed to every strategy.
features = [
    "nearest_hdd",
    "GDP_2022",
    "Urbanization_Rate_2022",
    "latitude",
    "longitude",
    "Population_2023",
    "Paris_Agreement",
]

# Short labels shown in the 'Features Abbreviated' column of the result tables.
feature_abbreviations = dict(
    nearest_hdd="HDD",
    GDP_2022="GDP",
    Urbanization_Rate_2022="URB",
    latitude="Lat",
    longitude="Long",
    Population_2023="Pop",
    Paris_Agreement="Paris",
)

# Columns to predict; each one gets its own model per strategy.
target_columns = [
    "Residential EUI (kWh/m2/year)",
    "Non-residential EUI (kWh/m2/year)",
]

# Run the three strategies, then give each table the abbreviated feature
# column and its per-target 'Total' rows.
within_domain_results, cross_domain_results, domain_adaptation_results = (
    append_total_row(
        add_abbreviated_feature_column(strategy_results, features, feature_abbreviations)
    )
    for strategy_results in run_all_strategies(
        df=merged_df,
        features=features,
        target_columns=target_columns,
        seed=42,
    )
)

# Keep only the 'Total' rows of each strategy for a side-by-side comparison.
concatenated_results = filter_and_concatenate(
    within_domain_results,
    cross_domain_results,
    domain_adaptation_results,
)

concatenated_results

Unnamed: 0,Region,Target,Strategy,Model,Features Used,Features Abbreviated,MSE,R²,MAE,RMSE,MAPE,WAPE
0,Total,Residential EUI (kWh/m2/year),Within-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,929.068,0.386,20.4,25.408,15.894,14.604
1,Total,Non-residential EUI (kWh/m2/year),Within-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,1457.406,-0.892,24.172,31.774,14.468,12.486
2,Total,Residential EUI (kWh/m2/year),Cross-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,2123.31,-0.416,33.394,41.186,33.776,27.508
3,Total,Non-residential EUI (kWh/m2/year),Cross-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,9672.768,-15.8,70.33,86.898,49.086,46.574
4,Total,Residential EUI (kWh/m2/year),Domain-Adaptation,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,1518.924,-0.33,27.084,35.554,21.252,20.722
5,Total,Non-residential EUI (kWh/m2/year),Domain-Adaptation,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,2864.274,-13.212,37.034,45.182,21.908,20.282


In [5]:
# SAVE RESULTS

results_dir = '../results/'
os.makedirs(results_dir, exist_ok=True)

# One timestamp (minute resolution) shared by every file written in this run.
date_str = datetime.now().strftime("%Y%m%d_%H%M")

# File-name suffix -> table; files are written in this order.
tables_to_save = {
    'within_domain': within_domain_results,
    'cross_domain': cross_domain_results,
    'domain_adaptation': domain_adaptation_results,
    'Total': concatenated_results,
}

for suffix, table in tables_to_save.items():
    table.to_csv(
        os.path.join(results_dir, f'results_{date_str}_{suffix}.csv'),
        index=False,
    )

In [6]:
# 'Total' rows ordered by target, then strategy (both descending), re-indexed from 0.
concatenated_results.sort_values(['Target', 'Strategy'], ascending=False, ignore_index=True)

Unnamed: 0,Region,Target,Strategy,Model,Features Used,Features Abbreviated,MSE,R²,MAE,RMSE,MAPE,WAPE
0,Total,Residential EUI (kWh/m2/year),Within-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,929.068,0.386,20.4,25.408,15.894,14.604
1,Total,Residential EUI (kWh/m2/year),Domain-Adaptation,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,1518.924,-0.33,27.084,35.554,21.252,20.722
2,Total,Residential EUI (kWh/m2/year),Cross-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,2123.31,-0.416,33.394,41.186,33.776,27.508
3,Total,Non-residential EUI (kWh/m2/year),Within-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,1457.406,-0.892,24.172,31.774,14.468,12.486
4,Total,Non-residential EUI (kWh/m2/year),Domain-Adaptation,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,2864.274,-13.212,37.034,45.182,21.908,20.282
5,Total,Non-residential EUI (kWh/m2/year),Cross-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,9672.768,-15.8,70.33,86.898,49.086,46.574


In [7]:
# Within-Domain table: one row per (region, target) plus the appended 'Total' rows.
within_domain_results

Unnamed: 0,Region,Target,Strategy,Model,Features Used,Features Abbreviated,MSE,R²,MAE,RMSE,MAPE,WAPE
0,Asia & Oceania,Residential EUI (kWh/m2/year),Within-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,833.79,0.73,18.67,28.88,18.75,18.7
1,Asia & Oceania,Non-residential EUI (kWh/m2/year),Within-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,596.4,0.71,17.36,24.42,12.25,12.12
2,Europe,Residential EUI (kWh/m2/year),Within-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,2882.02,0.38,43.64,53.68,27.01,21.26
3,Europe,Non-residential EUI (kWh/m2/year),Within-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,3930.52,0.12,47.93,62.69,27.31,22.04
4,Africa,Residential EUI (kWh/m2/year),Within-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,26.83,0.23,4.2,5.18,6.21,6.08
5,Africa,Non-residential EUI (kWh/m2/year),Within-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,174.87,0.02,4.56,13.22,8.6,4.47
6,Central and South America,Residential EUI (kWh/m2/year),Within-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,134.04,0.45,9.77,11.58,11.93,12.02
7,Central and South America,Non-residential EUI (kWh/m2/year),Within-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,70.4,-0.28,6.85,8.39,6.78,6.55
8,Northern America,Residential EUI (kWh/m2/year),Within-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,768.66,0.14,25.72,27.72,15.57,14.96
9,Northern America,Non-residential EUI (kWh/m2/year),Within-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,2514.84,-5.03,44.16,50.15,17.4,17.25


In [8]:
# Cross-Domain table: one row per (held-out region, target) plus the appended 'Total' rows.
cross_domain_results

Unnamed: 0,Region,Target,Strategy,Model,Features Used,Features Abbreviated,MSE,R²,MAE,RMSE,MAPE,WAPE
0,Asia & Oceania,Residential EUI (kWh/m2/year),Cross-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,3507.62,-0.05,46.1,59.23,73.58,50.21
1,Asia & Oceania,Non-residential EUI (kWh/m2/year),Cross-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,28985.18,-15.4,129.15,170.25,106.96,96.9
2,Europe,Residential EUI (kWh/m2/year),Cross-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,4843.73,0.16,54.66,69.6,34.77,26.03
3,Europe,Non-residential EUI (kWh/m2/year),Cross-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,5997.01,0.2,52.9,77.44,25.41,23.2
4,Africa,Residential EUI (kWh/m2/year),Cross-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,370.22,-1.4,14.78,19.24,19.78,20.69
5,Africa,Non-residential EUI (kWh/m2/year),Cross-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,2170.15,-30.98,40.93,46.58,38.98,39.0
6,Central and South America,Residential EUI (kWh/m2/year),Cross-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,339.4,-0.17,15.19,18.42,19.08,18.6
7,Central and South America,Non-residential EUI (kWh/m2/year),Cross-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,1921.0,-26.6,39.44,43.83,38.74,37.62
8,Northern America,Residential EUI (kWh/m2/year),Cross-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,1555.58,-0.62,36.24,39.44,21.67,22.01
9,Northern America,Non-residential EUI (kWh/m2/year),Cross-Domain,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,9290.5,-6.22,89.23,96.39,35.34,36.15


In [9]:
# Domain-Adaptation table: one row per (region, target) plus the appended 'Total' rows.
domain_adaptation_results

Unnamed: 0,Region,Target,Strategy,Model,Features Used,Features Abbreviated,MSE,R²,MAE,RMSE,MAPE,WAPE
0,Asia & Oceania,Residential EUI (kWh/m2/year),Domain-Adaptation,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,2583.23,0.29,25.57,50.83,25.56,27.53
1,Asia & Oceania,Non-residential EUI (kWh/m2/year),Domain-Adaptation,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,1153.4,-0.43,23.9,33.96,19.46,18.81
2,Europe,Residential EUI (kWh/m2/year),Domain-Adaptation,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,2960.8,0.07,47.79,54.41,27.18,21.5
3,Europe,Non-residential EUI (kWh/m2/year),Domain-Adaptation,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,2826.47,-0.0,34.8,53.16,23.6,16.5
4,Africa,Residential EUI (kWh/m2/year),Domain-Adaptation,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,397.43,-1.65,13.34,19.94,16.97,18.45
5,Africa,Non-residential EUI (kWh/m2/year),Domain-Adaptation,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,349.75,-36.99,14.27,18.7,13.47,13.57
6,Central and South America,Residential EUI (kWh/m2/year),Domain-Adaptation,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,215.28,-0.13,12.1,14.67,16.25,15.7
7,Central and South America,Non-residential EUI (kWh/m2/year),Domain-Adaptation,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,518.14,-7.3,18.29,22.76,18.29,17.54
8,Northern America,Residential EUI (kWh/m2/year),Domain-Adaptation,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,1437.88,-0.23,36.62,37.92,20.3,20.43
9,Northern America,Non-residential EUI (kWh/m2/year),Domain-Adaptation,Linear Regression,"nearest_hdd, GDP_2022, Urbanization_Rate_2022,...",HDD | GDP | URB | Lat | Long | Pop | Paris,9473.61,-21.34,93.91,97.33,34.72,34.99
