In [2]:
# change root path one level up
import os
os.chdir("..")


In [3]:
import numpy as np
import pandas as pd
import pickle
import pytz
import time

# lightgbm and optuna
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm
# import datetime and timedelta
from datetime import datetime, timedelta




# CAT_FEATURES = ['stationID', 'spaceID', 'siteID', 'clusterID', 'userID', 'paymentRequired', 'connectionTime_Weekday', 'connectionTime_Hour', 'connectionTime_Month', 'connectionTime_is_holiday', 'connectionTime_is_weekend', 'connectionTime_weekday_hour']
CAT_FEATURES = ["userID"]
NUM_FEATURES = [
    "connectionTime_hour_x",
    "connectionTime_hour_y",
    "connectionTime_month_x",
    "connectionTime_month_y",
    "connectionTime_weekday_x",
    "connectionTime_weekday_y",
]
TARGET = 'kWhDelivered'


In [4]:
sessions = pd.read_csv("./data/caltech_test_data.csv", index_col=[0, 1])
for cols in CAT_FEATURES:
    sessions[cols] = sessions[cols].astype("category")
sessions = sessions[NUM_FEATURES + CAT_FEATURES + [TARGET]]
sessions


Unnamed: 0_level_0,Unnamed: 1_level_0,connectionTime_hour_x,connectionTime_hour_y,connectionTime_month_x,connectionTime_month_y,connectionTime_weekday_x,connectionTime_weekday_y,userID,kWhDelivered
connection_time_copy,_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-04-30,5bc915caf9af8b0dad3c0660,-5.000000e-01,8.660254e-01,0.892254,-0.451533,9.555728e-01,0.294755,22,47.808
2018-05-07,5bc917d0f9af8b0dc677b8bb,-2.588190e-01,9.659258e-01,0.565554,0.824711,9.659258e-01,0.258819,61,27.683
2018-05-11,5bc9190ff9af8b0dc677b9c1,-5.000000e-01,8.660254e-01,-0.232105,0.972691,-7.330519e-01,-0.680173,22,17.485
2018-05-14,5bc919c3f9af8b0dc677ba32,8.770390e-14,1.000000e+00,-0.742684,0.669642,9.749279e-01,0.222521,66,11.795
2018-05-14,5bc919c3f9af8b0dc677ba62,-2.588190e-01,-9.659258e-01,-0.802694,0.596391,8.045978e-01,0.593820,61,3.076
...,...,...,...,...,...,...,...,...,...
2019-12-30,5e1fbc9ff9af8b5391bcd6fc,-7.071068e-01,7.071068e-01,0.968119,-0.250491,9.438833e-01,0.330279,743,5.814
2019-12-30,5e1fbc9ff9af8b5391bcd700,-1.000000e+00,-9.512791e-14,0.974267,-0.225399,9.009689e-01,0.433884,1124,6.509
2019-12-31,5e210e1ff9af8b57bb4f54fa,-5.000000e-01,8.660254e-01,0.998482,-0.055088,3.653410e-01,0.930874,743,4.793
2019-12-31,5e210e1ff9af8b57bb4f54fd,-9.659258e-01,2.588190e-01,0.999571,-0.029276,2.588190e-01,0.965926,1124,6.618


In [5]:
# find dates 30, 60, 120, 240, 360, 480, 600 days before 2020-12-01 and make a list of them
dates = []
for i in [30, 60, 120, 240, 360, 480]:
    dates.append((datetime(2019, 11, 1) - timedelta(days=i)).strftime("%Y-%m-%d"))
dates


['2019-10-02',
 '2019-09-02',
 '2019-07-04',
 '2019-03-06',
 '2018-11-06',
 '2018-07-09']

In [6]:
# make 6 partitions for training starting with dates[i] and ending with datetime(2020, 12, 1)
train_sessions = []
for i in range(len(dates)):
    train_sessions.append(sessions.loc[dates[i] : "2019-11-01"])
    # print number of sessions in each partition
    print(train_sessions[i].shape)


(218, 8)
(488, 8)
(962, 8)
(2342, 8)
(4022, 8)
(4601, 8)


In [7]:
# testing data is collected from Dec. 1, 2018 to Jan. 1, 2019
test = sessions.loc[(slice("2019-12-01", "2020-01-01"), slice(None)), :]

In [10]:
def get_similar_sessions(train_test_ds, feats, session_idx, n=5, target=TARGET):
    train_ds_scaled = train_test_ds[feats]
    # start timer
    start = time.time()
    # if categorical features are in feats
    if any([x in CAT_FEATURES for x in feats]):
        # dummy encode categorical features
        train_ds_scaled = pd.get_dummies(train_ds_scaled, columns=CAT_FEATURES)
    historical = train_ds_scaled[:session_idx]
    # calculate the cosine similarity for the row of session_idx
    cosine_sim = cosine_similarity(
        historical, train_ds_scaled.loc[session_idx].values.reshape(1, -1)
    )
    # make a dataframe
    cosine_sim_df = pd.DataFrame(cosine_sim, index=historical.index)
    # show the top 10 most similar sessions
    top = (
        train_test_ds[:session_idx]
        .loc[cosine_sim_df.nlargest(n, columns=0).index][target]
        .values
    )
    # calculate mean of top n similar sessions
    top_mean = top.mean()
    runtime = time.time() - start
    # clean up
    del train_ds_scaled, historical, cosine_sim, cosine_sim_df, top
    return top_mean, runtime

# evaluate smape of lightgbm model
def smape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / (y_true + y_pred))) * 100


def mae(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

# save distribution of errors
def save_error_dist(y_true, y_pred, name):
    error = y_pred - y_true
    # save as a picle file
    with open('./results/'+name+'.pkl', "wb") as f:
        pickle.dump(error, f)


In [12]:
results = pd.DataFrame(
    columns=["smape", "mae"], index=[30, 60, 120, 240, 360, 480]
)
for i, ds in enumerate(train_sessions):
    # size of the dataset
    print(ds.shape)
    concat = pd.concat(
        [ds[NUM_FEATURES + [TARGET]], test[NUM_FEATURES + [TARGET]]]
    )
    test_fore = test.copy()
    rt_list = []
    for j in range(20):
        for idx in test.index.tolist():
            # get similar sessions
            top_mean, rt = get_similar_sessions(concat, NUM_FEATURES, idx)
            test_fore.loc[idx, TARGET+"_forecast"] = top_mean
            rt_list.append(rt)
    print(f"Runtime: {np.mean(rt_list)}")
    # evaluate smape of similar sessions model
    print(
        "smape of similar sessions model: ",
        smape(test_fore[TARGET], test_fore[TARGET+"_forecast"]),
    )
    results.iloc[i]["smape"] = smape(
        test_fore[TARGET], test_fore[TARGET+"_forecast"]
    )
    # evaluate mae of similar sessions model
    print(
        "mae of similar sessions model: ",
        mae(test_fore[TARGET], test_fore[TARGET+"_forecast"]),
    )
    results.iloc[i]["mae"] = mae(
        test_fore[TARGET], test_fore[TARGET+"_forecast"]
    )
    save_error_dist(test_fore[TARGET], test_fore[TARGET+"_forecast"], "similar_sessions_energy_"+str(results.index[i]))

(218, 8)
Runtime: 0.0008562879197916407
smape of similar sessions model:  28.752663928707683
mae of similar sessions model:  4.033828025477707
(488, 8)
Runtime: 0.0008205079728630698
smape of similar sessions model:  28.656051691358446
mae of similar sessions model:  4.21057247487615
(962, 8)
Runtime: 0.0008700814216759554
smape of similar sessions model:  27.766354093737057
mae of similar sessions model:  4.031037161712668
(2342, 8)
Runtime: 0.0009966427353537007
smape of similar sessions model:  27.55799471483465
mae of similar sessions model:  3.8502973156404807
(4022, 8)
Runtime: 0.0010702827174192782
smape of similar sessions model:  27.166989104667916
mae of similar sessions model:  3.807233621372965
(4601, 8)
Runtime: 0.0010907584694540425
smape of similar sessions model:  27.526375314072272
mae of similar sessions model:  3.862169830460623


In [60]:
# save to csv
results.to_csv("./results/similar_sessions_energy_nologs.csv")
results


Unnamed: 0,smape,mae
30,28.7527,4.03383
60,28.6561,4.21057
120,27.7664,4.03104
240,27.558,3.8503
360,27.167,3.80723
480,27.5264,3.86217


In [13]:
results = pd.DataFrame(
    columns=["smape", "mae"], index=[30, 60, 120, 240, 360, 480]
)
for i, ds in enumerate(train_sessions):
    # size of the dataset
    print(ds.shape)
    concat = pd.concat(
        [ds[NUM_FEATURES + CAT_FEATURES +[TARGET]], test[NUM_FEATURES + CAT_FEATURES + [TARGET]]]
    )
    test_fore = test.copy()
    rt_list = []
    for idx in test.index.tolist():
        # get similar sessions
        top_mean, rt = get_similar_sessions(concat, NUM_FEATURES + CAT_FEATURES, idx)
        test_fore.loc[idx, TARGET+"_forecast"] = top_mean
    # evaluate smape of similar sessions model
    print(
        "smape of similar sessions model: ",
        smape(test_fore[TARGET], test_fore[TARGET+"_forecast"]),
    )
    results.iloc[i]["smape"] = smape(
        test_fore[TARGET], test_fore[TARGET+"_forecast"]
    )
    # evaluate mae of similar sessions model
    print(
        "mae of similar sessions model: ",
        mae(test_fore[TARGET], test_fore[TARGET+"_forecast"]),
    )
    results.iloc[i]["mae"] = mae(
        test_fore[TARGET], test_fore[TARGET+"_forecast"]
    )
    save_error_dist(test_fore[TARGET], test_fore[TARGET+"_forecast"], "similar_sessions_energy_"+str(results.index[i]))


(218, 8)
smape of similar sessions model:  12.449173288268135
mae of similar sessions model:  1.6448828025477706
(488, 8)
smape of similar sessions model:  10.410934038869113
mae of similar sessions model:  1.4127496705590938
(962, 8)
smape of similar sessions model:  7.69987776059069
mae of similar sessions model:  1.1023312513107908
(2342, 8)
smape of similar sessions model:  7.883141017188168
mae of similar sessions model:  1.07233735626175
(4022, 8)
smape of similar sessions model:  8.079263076632401
mae of similar sessions model:  1.1948530028308564
(4601, 8)
smape of similar sessions model:  7.967738639866748
mae of similar sessions model:  1.1772853293838192


In [58]:
# save to csv
results.to_csv("./results/similar_sessions_energy.csv")
results

Unnamed: 0,smape,mae
30,12.4492,1.64488
60,10.4109,1.41275
120,7.69988,1.10233
240,7.88314,1.07234
360,8.07926,1.19485
480,7.96774,1.17729
