In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor


In [30]:
# Location of the processed, merged dataset (relative to this notebook).
output_path = "../data/03_processed/merged_df.csv"

# Load and prepare the frame in a single chain:
#   1. rename 'total_year' to 'HDD_total_year',
#   2. derive GDP per capita from the GDP_2022 and Population_2023 columns,
#   3. drop the Taiwan rows (temporary exclusion).
merged_df = (
    pd.read_csv(output_path)
    .rename(columns={'total_year': 'HDD_total_year'})
    .assign(GDP_per_capita=lambda frame: frame['GDP_2022'] / frame['Population_2023'])
    .loc[lambda frame: frame['Country'] != 'Taiwan']
)


# Disabled in-notebook random 80/20 train flag, kept for reference only:
#np.random.seed(123)
#merged_df['is_train'] = np.random.choice([1, 0], size=len(merged_df), p=[0.8, 0.2])

# Train / Test

In [31]:
# Paths of the two stored split tables.
train_test_split_original_data_path = "../data/03_processed/train_test_split_original_data.csv"
train_test_split_new_data_path = "../data/03_processed/train_test_split_new_data.csv"

train_test_split_original_data = pd.read_csv(train_test_split_original_data_path)
train_test_split_new_data = pd.read_csv(train_test_split_new_data_path)

# Stack both split tables into one lookup with a fresh 0..n-1 index.
train_test_split = pd.concat(
    [train_test_split_original_data, train_test_split_new_data],
    ignore_index=True,
)

# Left join on ('index', 'geonameid'): every row of merged_df is kept, and rows
# without a match in the split table get NaN in the columns coming from it.
merged_df = pd.merge(
    merged_df,
    train_test_split,
    how='left',
    on=['index', 'geonameid'],
)

# Keep only the original data, i.e. rows whose source is "World Bank CURB".
merged_df = merged_df.loc[merged_df["source"].eq("World Bank CURB")]

# Training subset: rows flagged is_train == 1. The copy makes it independent
# of merged_df.
train_df = merged_df.loc[merged_df['is_train'].eq(1)].copy()


In [32]:
# Single source of truth for the model inputs: each key is a feature column
# name, each value is the short label used when displaying results.
# The insertion order of this mapping defines the feature order.
feature_abbreviations = {
    "hdd_total_year": "HDD",
    "cdd_total_year": "CDD",
    "GDP_per_capita": "GDP",
    "Urbanization_Rate_2022": "URB",
    # Raw coordinates are disabled; the sin/cos columns below are used.
    # "latitude": "Lat",
    # "longitude": "Long",
    "Paris_Agreement": "Paris",
    "Subnational HDI": "HDI",
    "Educational index": "EDU",
    "Income index": "Income",
    "2m_temperature_2023_avg": "Temp",
    "2m_dewpoint_temperature_2023_avg": "Dew",
    "total_precipitation_2023": "Precip",
    "longitude_sin": "Long_Sin",
    "longitude_cos": "Long_Cos",
    "latitude_sin": "Lat_Sin",
    "latitude_cos": "Lat_Cos",
    "population_density": "Pop_Dens",
}

# Ordered list of feature column names, derived from the mapping above so the
# two can never drift apart.
features = list(feature_abbreviations)

# Columns to predict; one model is fitted per entry.
target_columns = [
    "Residential EUI (kWh/m2/year)",
    "Non-residential EUI (kWh/m2/year)",
]


In [33]:

# Hyperparameters shared by every per-target forest. The fixed random_state
# makes the fitted forests reproducible from run to run.
rf_params = dict(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    random_state=42,
    n_jobs=-1,
)

models = {}                 # target column -> fitted RandomForestRegressor
importance_by_target = {}   # target column -> feature_importances_ array

# Fit one forest per target on the same training features.
for target in target_columns:
    X = train_df[features]
    y = train_df[target]

    rf = RandomForestRegressor(**rf_params).fit(X, y)

    models[target] = rf
    importance_by_target[target] = rf.feature_importances_

# One row per feature, one column per target, plus the row-wise mean across
# targets in 'Average'; rows are ordered from highest to lowest average.
importance_df = (
    pd.DataFrame(importance_by_target, index=features)
    .assign(Average=lambda table: table.mean(axis=1))
    .sort_values('Average', ascending=False)
)

# Display copy whose index carries the short feature labels.
importance_df_abbr = importance_df.copy()
importance_df_abbr.index = [feature_abbreviations[name] for name in importance_df.index]


importance_df_abbr

Unnamed: 0,Residential EUI (kWh/m2/year),Non-residential EUI (kWh/m2/year),Average
Income,0.023792,0.619573,0.321683
Temp,0.356251,0.101552,0.228901
Lat_Sin,0.197256,0.036012,0.116634
Lat_Cos,0.14489,0.023393,0.084142
Dew,0.047699,0.071941,0.05982
Long_Cos,0.035593,0.026291,0.030942
HDI,0.030204,0.029154,0.029679
HDD,0.033884,0.017364,0.025624
Long_Sin,0.029354,0.012595,0.020974
EDU,0.014543,0.026129,0.020336
