In [1]:
# change root path one level up
import os
os.chdir("..")


In [34]:
import numpy as np
import pandas as pd
import pytz
import time

# lightgbm and optuna
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm
# import datetime and timedelta
from datetime import datetime, timedelta




# CAT_FEATURES = ['stationID', 'spaceID', 'siteID', 'clusterID', 'userID', 'paymentRequired', 'connectionTime_Weekday', 'connectionTime_Hour', 'connectionTime_Month', 'connectionTime_is_holiday', 'connectionTime_is_weekend', 'connectionTime_weekday_hour']
CAT_FEATURES = ["userID"]
NUM_FEATURES = [
    "connectionTime_hour_x",
    "connectionTime_hour_y",
    "connectionTime_month_x",
    "connectionTime_month_y",
    "connectionTime_weekday_x",
    "connectionTime_weekday_y",
]
TARGET = 'parking_time'


In [38]:
sessions = pd.read_csv("./data/caltech_test_data.csv", index_col=[0, 1])
for cols in CAT_FEATURES:
    sessions[cols] = sessions[cols].astype("category")
sessions = sessions[NUM_FEATURES + CAT_FEATURES + [TARGET]]
sessions


Unnamed: 0_level_0,Unnamed: 1_level_0,connectionTime_hour_x,connectionTime_hour_y,connectionTime_month_x,connectionTime_month_y,connectionTime_weekday_x,connectionTime_weekday_y,userID,parking_time
connection_time_copy,_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-04-30,5bc915caf9af8b0dad3c0660,-5.000000e-01,8.660254e-01,0.892254,-0.451533,9.555728e-01,0.294755,22,9.335000
2018-05-07,5bc917d0f9af8b0dc677b8bb,-2.588190e-01,9.659258e-01,0.565554,0.824711,9.659258e-01,0.258819,61,10.611944
2018-05-11,5bc9190ff9af8b0dc677b9c1,-5.000000e-01,8.660254e-01,-0.232105,0.972691,-7.330519e-01,-0.680173,22,9.028333
2018-05-14,5bc919c3f9af8b0dc677ba32,8.770390e-14,1.000000e+00,-0.742684,0.669642,9.749279e-01,0.222521,66,9.321944
2018-05-14,5bc919c3f9af8b0dc677ba62,-2.588190e-01,-9.659258e-01,-0.802694,0.596391,8.045978e-01,0.593820,61,1.121944
...,...,...,...,...,...,...,...,...,...
2019-12-30,5e1fbc9ff9af8b5391bcd6fc,-7.071068e-01,7.071068e-01,0.968119,-0.250491,9.438833e-01,0.330279,743,2.066667
2019-12-30,5e1fbc9ff9af8b5391bcd700,-1.000000e+00,-9.512791e-14,0.974267,-0.225399,9.009689e-01,0.433884,1124,7.735556
2019-12-31,5e210e1ff9af8b57bb4f54fa,-5.000000e-01,8.660254e-01,0.998482,-0.055088,3.653410e-01,0.930874,743,1.963889
2019-12-31,5e210e1ff9af8b57bb4f54fd,-9.659258e-01,2.588190e-01,0.999571,-0.029276,2.588190e-01,0.965926,1124,7.338333


In [39]:
# find dates 30, 60, 120, 240, 360, 480, 600 days before 2020-12-01 and make a list of them
dates = []
for i in [30, 60, 120, 240, 360, 480]:
    dates.append((datetime(2019, 11, 1) - timedelta(days=i)).strftime("%Y-%m-%d"))


In [40]:
# make 6 partitions for training starting with dates[i] and ending with datetime(2020, 12, 1)
train_sessions = []
for i in range(len(dates)):
    train_sessions.append(sessions.loc[dates[i] : "2019-12-01"])
    # print number of sessions in each partition
    print(train_sessions[i].shape)


(403, 8)
(673, 8)
(1147, 8)
(2527, 8)
(4207, 8)
(4786, 8)


In [41]:
# testing data is collected from Dec. 1, 2018 to Jan. 1, 2019
test = sessions.loc[(slice("2019-12-01", "2020-01-01"), slice(None)), :]

In [42]:
def get_similar_sessions(train_test_ds, feats, session_idx, n=5, target=TARGET):
    train_ds_scaled = train_test_ds[feats]
    # start timer
    start = time.time()
    # if categorical features are in feats
    if any([x in CAT_FEATURES for x in feats]):
        # dummy encode categorical features
        train_ds_scaled = pd.get_dummies(train_ds_scaled, columns=CAT_FEATURES)
    historical = train_ds_scaled[:session_idx]
    # calculate the cosine similarity for the row of session_idx
    cosine_sim = cosine_similarity(
        historical, train_ds_scaled.loc[session_idx].values.reshape(1, -1)
    )
    # make a dataframe
    cosine_sim_df = pd.DataFrame(cosine_sim, index=historical.index)
    # show the top 10 most similar sessions
    top = (
        train_test_ds[:session_idx]
        .loc[cosine_sim_df.nlargest(n, columns=0).index][target]
        .values
    )
    # calculate mean of top n similar sessions
    top_mean = top.mean()
    runtime = time.time() - start
    # clean up
    del train_ds_scaled, historical, cosine_sim, cosine_sim_df, top
    return top_mean, runtime

# evaluate smape of lightgbm model
def smape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / (y_true + y_pred))) * 100


def mae(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

In [43]:
results = pd.DataFrame(
    columns=["smape", "mae"], index=[30, 60, 120, 240, 360, 480]
)
for i, ds in enumerate(train_sessions):
    # size of the dataset
    print(ds.shape)
    concat = pd.concat(
        [ds[NUM_FEATURES + [TARGET]], test[NUM_FEATURES + [TARGET]]]
    )
    test_fore = test.copy()
    rt_list = []
    for j in range(20):
        for idx in test.index.tolist():
            # get similar sessions
            top_mean, rt = get_similar_sessions(concat, NUM_FEATURES, idx)
            test_fore.loc[idx, TARGET+"_forecast"] = top_mean
            rt_list.append(rt)
    print(f"Runtime: {np.mean(rt_list)}")
    # evaluate smape of similar sessions model
    print(
        "smape of similar sessions model: ",
        smape(test_fore[TARGET], test_fore[TARGET+"_forecast"]),
    )
    results.iloc[i]["smape"] = smape(
        test_fore[TARGET], test_fore[TARGET+"_forecast"]
    )
    # evaluate mae of similar sessions model
    print(
        "mae of similar sessions model: ",
        mae(test_fore[TARGET], test_fore[TARGET+"_forecast"]),
    )
    results.iloc[i]["mae"] = mae(
        test_fore[TARGET], test_fore[TARGET+"_forecast"]
    )
    


(403, 8)
Runtime: 0.0008425283583865803
smape of similar sessions model:  15.843320417331519
mae of similar sessions model:  2.262368365180467
(673, 8)
Runtime: 0.0008742376497596692
smape of similar sessions model:  15.086385773421599
mae of similar sessions model:  2.138595895258316
(1147, 8)
Runtime: 0.0009438989268746345
smape of similar sessions model:  15.047853158170119
mae of similar sessions model:  2.146058032554848
(2527, 8)
Runtime: 0.001126349503826943
smape of similar sessions model:  15.666397046529243
mae of similar sessions model:  2.22184076433121
(4207, 8)
Runtime: 0.0010430021650472264
smape of similar sessions model:  15.394637110968173
mae of similar sessions model:  2.1537084217975937
(4786, 8)
Runtime: 0.001050163150592974
smape of similar sessions model:  15.573088169064562
mae of similar sessions model:  2.176268931351734


In [44]:
# save to csv
results.to_csv("./results/similar_sessions_duration_nologs.csv")
results


Unnamed: 0,smape,mae
30,15.8433,2.26237
60,15.0864,2.1386
120,15.0479,2.14606
240,15.6664,2.22184
360,15.3946,2.15371
480,15.5731,2.17627


In [45]:
results = pd.DataFrame(
    columns=["smape", "mae"], index=[30, 60, 120, 240, 360, 480]
)
for i, ds in enumerate(train_sessions):
    # size of the dataset
    print(ds.shape)
    concat = pd.concat(
        [ds[NUM_FEATURES + CAT_FEATURES +[TARGET]], test[NUM_FEATURES + CAT_FEATURES + [TARGET]]]
    )
    test_fore = test.copy()
    rt_list = []
    for j in range(20):
        for idx in test.index.tolist():
            # get similar sessions
            top_mean, rt = get_similar_sessions(concat, NUM_FEATURES + CAT_FEATURES, idx)
            test_fore.loc[idx, TARGET+"_forecast"] = top_mean
            rt_list.append(rt)
    print(f"Runtime: {np.mean(rt_list)}")
    # evaluate smape of similar sessions model
    print(
        "smape of similar sessions model: ",
        smape(test_fore[TARGET], test_fore[TARGET+"_forecast"]),
    )
    results.iloc[i]["smape"] = smape(
        test_fore[TARGET], test_fore[TARGET+"_forecast"]
    )
    # evaluate mae of similar sessions model
    print(
        "mae of similar sessions model: ",
        mae(test_fore[TARGET], test_fore[TARGET+"_forecast"]),
    )
    results.iloc[i]["mae"] = mae(
        test_fore[TARGET], test_fore[TARGET+"_forecast"]
    )
    


(403, 8)
Runtime: 0.0019114610495840668
smape of similar sessions model:  13.12606917322082
mae of similar sessions model:  1.8045774946921445
(673, 8)
Runtime: 0.0018522778893731962
smape of similar sessions model:  12.778724816123821
mae of similar sessions model:  1.7499263977353152
(1147, 8)
Runtime: 0.0022472327681863384
smape of similar sessions model:  12.615940785896754
mae of similar sessions model:  1.777007077140835
(2527, 8)
Runtime: 0.0026304683108238657
smape of similar sessions model:  11.495878483377636
mae of similar sessions model:  1.6188156404812457
(4207, 8)
Runtime: 0.003145094510096653
smape of similar sessions model:  11.040804754874722
mae of similar sessions model:  1.5895031847133758
(4786, 8)
Runtime: 0.003287547636943258
smape of similar sessions model:  10.925311460482815
mae of similar sessions model:  1.5803404104741685


In [46]:
# save to csv
results.to_csv("./results/similar_sessions_duration.csv")
results

Unnamed: 0,smape,mae
30,13.1261,1.80458
60,12.7787,1.74993
120,12.6159,1.77701
240,11.4959,1.61882
360,11.0408,1.5895
480,10.9253,1.58034
