In [1]:
!pip install lightgbm -q
!pip install category_encoders -q
!pip install xgboost -q
!pip install timeseries-fastai -q
!pip install pandas

import os
import glob
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import category_encoders as ce
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor

import matplotlib.ticker as ticker
import re
from sklearn.model_selection import GridSearchCV
from joblib import dump, load



In [15]:
import joblib

Python 3.10.10


In [2]:
# Loading the Datasets
filepath = 'data/schemas/warm_up/'

# Building information
b_1 = pd.read_csv(filepath + 'Building_1.csv')
b_2 = pd.read_csv(filepath + 'Building_2.csv')
b_3 = pd.read_csv(filepath + 'Building_3.csv')

# Other information
carbon_int = pd.read_csv(filepath + 'carbon_intensity.csv')
pricing    = pd.read_csv(filepath + 'pricing.csv')
weather    = pd.read_csv(filepath + 'weather.csv')

# Building level combine the dfs
comb_b_1 = pd.concat([b_1.reset_index(drop=True),
                      carbon_int.reset_index(drop=True),
                      pricing.reset_index(drop=True),
                      weather.reset_index(drop=True)], axis=1)

comb_b_2 = pd.concat([b_2.reset_index(drop=True),
                      carbon_int.reset_index(drop=True),
                      pricing.reset_index(drop=True),
                      weather.reset_index(drop=True)], axis=1)

comb_b_3 = pd.concat([b_3.reset_index(drop=True),
                      carbon_int.reset_index(drop=True),
                      pricing.reset_index(drop=True),
                      weather.reset_index(drop=True)], axis=1)

# Make a list of the buildings
b_list = [comb_b_1,comb_b_2,comb_b_3]

In [3]:
# Check if the dataframes contain inf

# Building 1
d = np.isfinite(comb_b_1) 

# Building 2
d = np.isfinite(comb_b_2) 


# Building 3
d = np.isfinite(comb_b_3) 

In [4]:
# Fix the titles
b_list_clear = []

for b in b_list:
    regex = re.compile(r"\[|\]|<", re.IGNORECASE)
    b.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in b.columns.values]
    b_list_clear.append(b)

In [5]:
# XGBoost Models

def XGBoost_Model(X_train, X_test, y_train, y_test,hpt):

    reg = XGBRegressor(n_estimators=1000)
    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            early_stopping_rounds=50,
           verbose=False)
     
    y_pred  = reg.predict(X_test)
    
    # Generate the df
    df = pd.DataFrame(
        {'Actual Value': y_test,
        'Predicted Value': y_pred
        })

    return df, reg
    

In [11]:
# LightGBM Models

# Generating the LightGBM

def LightGBM_Model(X_train, X_test, y_train, y_test,hpt):

    if hpt == True:
        params = {
            'max_depth':        [3, 4, 5],
            'num_leaves':       [10, 15, 20],
            'learning_rate':    [0.05, 0.1, 0.15],
            'n_estimators':     [50, 100, 200],
            'subsample':        [0.5, 0.7, 0.9],
            'colsample_bytree': [0.5, 0.7, 0.9],
            'reg_alpha':        [0.01, 0.1, 1],
            'reg_lambda':       [0.01, 0.1, 1],
            'verbose':[-1]
        }
    
        lgb_mean = LGBMRegressor(boosting_type='gbdt', objective='regression')
        grid_search_mean = GridSearchCV(lgb_mean, params, cv=5, n_jobs=-1)
        grid_search_mean.fit(X_train, y_train)
        
        y_pred_mean  = grid_search_mean.predict(X_test)
    
        # Generate the df
        df = pd.DataFrame(
            {'Actual Value': y_test,
             'Predicted Value': y_pred_mean
            })
     
        return df, grid_search_mean
    
    
    else:
        lgb_params = {
        'n_jobs': 1,
        'max_depth': 4,
        'min_data_in_leaf': 10,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'subsample': 0.9,
        'n_estimators': 80,
        'learning_rate': 0.1,
        'colsample_bytree': 0.9,
        'steps':48,
        }
        
        # fitting the model
        gbm = LGBMRegressor(**lgb_params)
        gbm.fit(X_train, y_train)
        
        y_pred = gbm.predict(X_test)
        
        # Generate the df
        df = pd.DataFrame(
            {'Actual Value': y_test,
             'Predicted Value': y_pred
            })
     
        return df, gbm

In [12]:
model_type = 'lgb'

In [18]:
# 1.) Cooling Load (kWh)
i = 1
for b in b_list_clear:
    
    # Load the feature importance
    f_l = pd.read_csv('data/features/feature_importance_Cooling_Load__kWh_.csv')
    
    # Generate the x,y
    X = b[f_l['feature']]
    y = b['Cooling Load (kWh)']

    # Generate the test,train 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=False)

    if model_type == 'xgb':
        df, xgb = XGBoost_Model(X_train, X_test, y_train, y_test,False)
        xgb.save_model('my_models/models/cooling_load_model_b'+str(i)+'.json')
    if model_type == 'lgb':
        df, lgb = LightGBM_Model(X_train, X_test, y_train, y_test,True)
        joblib.dump(lgb, 'my_models/models/cooling_load_model_b'+str(i)+'_hyper.pkl')
        #lgb.booster_.save_model('my_models/models/cooling_load_model_b'+str(i)+'_hyper.txt')
    i = i + 1


In [None]:
# 2.) DHW Load (kWh)
i = 1
for b in b_list_clear:

    # Generate the x,y
    X = b
    y = b['DHW Heating (kWh)']

    # Generate the test,train 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=False)

    
    if model_type == 'xgb':
        df, xgb = XGBoost_Model(X_train, X_test, y_train, y_test,False)
        xgb.save_model('my_models/models/dhw_load_model_b'+str(i)+'.json')
    if model_type == 'lgb':
        df, lgb = LightGBM_Model(X_train, X_test, y_train, y_test,True)
        joblib.dump(lgb, 'my_models/models/dhw_load_model_b'+str(i)+'_hyper.pkl')
        #lgb.booster_.save_model('my_models/models/dhw_load_model_b'+str(i)+'_hyper.txt')
    i = i + 1

In [None]:
# 3.) Equipment Electric Power (kWh)
i = 1
for b in b_list_clear:
    
    # Generate the x,y
    X = b
    y = b['Equipment Electric Power (kWh)']

    # Generate the test,train 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=False)
    
    
    if model_type == 'xgb':
        df, xgb = XGBoost_Model(X_train, X_test, y_train, y_test,False)
        xgb.save_model('my_models/models/Equipment_Electric_Power_model_b'+str(i)+'.json')
    if model_type == 'lgb':
        df, lgb = LightGBM_Model(X_train, X_test, y_train, y_test,True)
        joblib.dump(lgb, 'my_models/models/Equipment_Electric_Power_model_b'+str(i)+'_hyper.pkl')
        #lgb.booster_.save_model('my_models/models/Equipment_Electric_Power_model_b'+str(i)+'_hyper.txt')
    i = i + 1

In [None]:
# Neighbour Level: Carbon Intensity (kgCO2e/kWh) ; Solar Generation (W/kW)

# 1.) Carbon Intensity (kgCO2e/kWh)
# combine the datasets to one since we only have one CI 
comb = pd.concat([b_list_clear[0].reset_index(drop=True),
                  b_list_clear[1].reset_index(drop=True),
                  b_list_clear[2].reset_index(drop=True)])
    
# Generate the x,y
X = comb
y = comb['kg_CO2/kWh']

# Generate the test,train 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=False)

    
if model_type == 'xgb':
    df, xgb = XGBoost_Model(X_train, X_test, y_train, y_test,False)
    xgb.save_model('my_models/models/Carbon_Intensity_Power_model'+str(i)+'.json')
if model_type == 'lgb':
    df, lgb = LightGBM_Model(X_train, X_test, y_train, y_test,True)
    joblib.dump(lgb, 'my_models/models/Carbon_Intensity_Power_model_hyper.pkl')
    #lgb.booster_.save_model('my_models/models/Carbon_Intensity_Power_model_hyper.txt')


In [None]:
# 3.) Solar Generation (W/kW)
sg = []
i = 1

for b in b_list_clear:
    
    # Load the feature importance
    f_l = pd.read_csv('data/features/feature_importance_Solar_Generation__W_kW_.csv')

    # Generate the x,y
    X = b[f_l['feature']]
    y = b['Solar Generation (W/kW)']

    # Generate the test,train 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=False)
    
    if model_type == 'xgb':
        df, xgb = XGBoost_Model(X_train, X_test, y_train, y_test,False)
        xgb.save_model('my_models/models/solar_generation_model_b'+str(i)+'.json')
    if model_type == 'lgb':
        df, lgb = LightGBM_Model(X_train, X_test, y_train, y_test,True)
        joblib.dump(lgb, 'my_models/models/solar_generation_model_b'+str(i)+'_hyper.pkl')
        #lgb.booster_.save_model('my_models/models/solar_generation_model_b'+str(i)+'_hyper.txt')
        
    sg.append(df)
    i = i + 1

### FastAI Testing

In [14]:
from timeseries_fastai.imports import *
from timeseries_fastai.core import *
from timeseries_fastai.data import *
from timeseries_fastai.models import *

In [16]:
PATH = Path.cwd().parent
df_train, df_test = load_df_ucr(PATH, 'Adiac')
x_cols = df_train.columns[0:-2].to_list()
dls = TSDataLoaders.from_dfs(df_train, df_test, x_cols=x_cols, label_col='target', bs=16)
dls.show_batch()

Loading files from: /home/philaupk/work/CityLearn_Competition/Adiac
Error loading files: /home/philaupk/work/CityLearn_Competition/Adiac


TypeError: cannot unpack non-iterable NoneType object

In [None]:
inception = create_inception(1, len(dls.vocab))
learn = Learner(dls, inception, metrics=[accuracy])
learn.fit_one_cycle(1, 1e-3)

## Feature Selection

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.feature_selection import mutual_info_regression


# Save the important features into files
b = 1
for data in b_list_clear:
    feature_selection(data,'Cooling Load (kWh)')
    feature_selection(data,'DHW Heating (kWh)')
    feature_selection(data,'Equipment Electric Power (kWh)')
    feature_selection(data,'kg_CO2/kWh')
    feature_selection(data,'Solar Generation (W/kW)')
    b = b + 1

In [30]:
def feature_selection(data,obs_feature):
    # Split the dataset into features and target
    X = data
    y = data[obs_feature]
    
    # Apply Information Gain
    ig = mutual_info_regression(X, y)

    # Create a dictionary of feature importance scores
    feature_scores = {}
    i = 0
    for (columnName, columnData) in data.iteritems():
        feature_scores[columnName] = ig[i]
        i = i + 1
    # Sort the features by importance score in descending order
    sorted_features = sorted(feature_scores.items(), key=lambda x: x[1], reverse=True)

    f_l = []
    s_l = []
    # Print the feature importance scores and the sorted features
    for feature, score in sorted_features:
        if score > 0.10:
            # save the features
            f_l.append(feature)
            s_l.append(score)
            
    dic = {'feature': f_l, 'score': s_l}
    df = pd.DataFrame(dic)
    obs_feature = obs_feature.replace(" ", "_")
    obs_feature = obs_feature.replace(")", "_")
    obs_feature = obs_feature.replace("(", "_")
    obs_feature = obs_feature.replace("/", "_")
    df.to_csv('data/features/feature_importance_'+str(obs_feature)+'.csv')
    