In [2]:
# Make the parent folder the working directory so later relative paths are
# resolved from there. NOTE: this is not idempotent - every re-run of this
# cell moves one more level up.
import os

os.chdir(os.pardir)


In [3]:
import sys
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytz
import seaborn as sns

# import us holidays
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

from acnportal import acnsim
from acnportal.acndata import DataClient

# Suffixes of the calendar feature columns that are treated as categorical.
CAT_COLS = ["Hour", "Weekday", "weekday_hour", "Month", "is_holiday", "is_weekend"]


In [None]:
# Report the library / interpreter versions this notebook was run with.
version_report = {
    "pandas version: ": pd.__version__,
    "numpy version: ": np.__version__,
    "python version: ": sys.version,
}
for label, version in version_report.items():
    print(label, version)


In [None]:
# import data used in the paper by Lee et al. (2019)
# The token can be supplied through the ACN_DATA_API_TOKEN environment
# variable; the literal is only kept as a fallback so the notebook still runs.
# NOTE(review): a credential committed to a notebook should be rotated and
# removed from the source.
api_token = os.environ.get(
    "ACN_DATA_API_TOKEN", "7dtw3CHtDeqjlG4PZ6OIzbtAnsOJRa8jqcdHUupFduM"
)
TIMEZONE = pytz.timezone("America/Los_Angeles")
client = DataClient(api_token)
site = "caltech"
# pytz zones must be attached with localize(): passing them as ``tzinfo=``
# silently uses the zone's historical LMT offset instead of PST/PDT.
start = TIMEZONE.localize(datetime(2018, 5, 1))
end = TIMEZONE.localize(datetime(2019, 1, 1))
docs = client.get_sessions_by_time(site, start, end)
# keep only sessions with userID not None (this also materialises the result
# of get_sessions_by_time into a list)
docs = [doc for doc in docs if doc["userID"] is not None]


In [None]:
# materialise the session documents as a list ...
docs = [*docs]
# ... and build one DataFrame row per session
docs_df = pd.DataFrame(data=docs)
docs_df


In [None]:
# users with more than 20 sessions in the window downloaded above
# (5/1/2018 to 1/1/2019)
user_counts = docs_df["userID"].value_counts()
users = user_counts.loc[lambda counts: counts > 20].index
users


In [None]:
# import data: the 750 days of sessions that end on 1/1/2021
# The token can be supplied through the ACN_DATA_API_TOKEN environment
# variable; the literal is only kept as a fallback so the notebook still runs.
# NOTE(review): a credential committed to a notebook should be rotated and
# removed from the source.
api_token = os.environ.get(
    "ACN_DATA_API_TOKEN", "7dtw3CHtDeqjlG4PZ6OIzbtAnsOJRa8jqcdHUupFduM"
)
TIMEZONE = pytz.timezone("America/Los_Angeles")
client = DataClient(api_token)
site = "caltech"
# pytz zones must be attached with localize(): passing them as ``tzinfo=``
# silently uses the zone's historical LMT offset instead of PST/PDT.
end = TIMEZONE.localize(datetime(2021, 1, 1))
start = end - timedelta(days=750)
docs = client.get_sessions_by_time(site, start, end)
# keep only sessions with userID not None (this already yields a list, so no
# extra list() call is needed)
docs = [doc for doc in docs if doc["userID"] is not None]
# list into dataframe
docs_df = pd.DataFrame(docs)
# keep only the sessions of the frequent users selected earlier (`users`);
# .copy() makes the result an independent frame so the columns added in later
# cells do not trigger SettingWithCopyWarning
docs_df = docs_df[docs_df["userID"].isin(users)].copy()
docs_df


In [None]:
def process_timestamp(df, time_col="timestamp"):
    """Replace ``df[time_col]`` in place with hours elapsed since 2018-01-01.

    The column is parsed with ``pd.to_datetime``, any timezone is dropped
    (keeping the local wall-clock time), and the offset from
    2018-01-01 00:00 is floor-divided into whole hours (stored as floats).

    Args:
        df: DataFrame that holds the column; it is modified in place.
        time_col: name of the datetime-like column to convert.

    Returns:
        None.
    """
    reference = pd.to_datetime("2018-01-01")
    wall_clock = pd.to_datetime(df[time_col]).dt.tz_localize(None)
    elapsed_seconds = (wall_clock - reference).dt.total_seconds()
    df[time_col] = elapsed_seconds // 3600


def add_features(df, time_col="timestamp"):
    """Add calendar features derived from ``df[time_col]`` to ``df`` in place.

    New columns (all prefixed with ``time_col + "_"``):

    * ``Weekday``, ``Hour``, ``Month``, ``weekday_hour`` - calendar parts and
      their interaction, stored as categoricals.
    * ``hour_x/_y``, ``month_x/_y``, ``weekday_x/_y`` - cosine / sine encodings
      computed from the hour counter produced by ``process_timestamp``.
    * ``is_holiday`` - 1 when the local calendar day is a US federal holiday.
    * ``is_weekend`` - 1 on Saturday and Sunday.

    ``df[time_col]`` itself is overwritten by ``process_timestamp`` (hours
    elapsed since 2018-01-01).

    Args:
        df: DataFrame with a datetime column ``time_col``; modified in place.
        time_col: name of the datetime column to derive the features from.

    Returns:
        None.
    """
    weekday_col = time_col + "_Weekday"
    hour_col = time_col + "_Hour"

    df[weekday_col] = df[time_col].dt.dayofweek
    df[hour_col] = df[time_col].dt.hour
    df[time_col + "_Month"] = df[time_col].dt.month
    # time interactions
    df[time_col + "_weekday_hour"] = (
        df[weekday_col].astype(str) + "-" + df[hour_col].astype(str)
    )

    # Local calendar day of every row (timezone dropped, time of day zeroed).
    # It has to be captured here, before process_timestamp() overwrites the
    # column with an hour counter.
    local_days = df[time_col].dt.tz_localize(None).dt.normalize()

    process_timestamp(df, time_col)

    # apply cyclic encoding of periodic features
    df[time_col + "_hour_x"] = np.cos(2 * np.pi * df[time_col] / 24)
    df[time_col + "_hour_y"] = np.sin(2 * np.pi * df[time_col] / 24)

    # 30.4 days is used as the length of a month
    df[time_col + "_month_x"] = np.cos(2 * np.pi * df[time_col] / (30.4 * 24))
    df[time_col + "_month_y"] = np.sin(2 * np.pi * df[time_col] / (30.4 * 24))

    df[time_col + "_weekday_x"] = np.cos(2 * np.pi * df[time_col] / (7 * 24))
    df[time_col + "_weekday_y"] = np.sin(2 * np.pi * df[time_col] / (7 * 24))

    # Holidays: look them up over the span actually covered by the data, so
    # rows outside a fixed window are not silently left unflagged.
    if local_days.notna().any():
        us_holidays = calendar().holidays(
            start=local_days.min(), end=local_days.max()
        )
        holiday_mask = local_days.isin(us_holidays)
    else:
        # empty (or all-NaT) input: there is no date range to query
        holiday_mask = pd.Series(False, index=df.index)
    df[time_col + "_is_holiday"] = holiday_mask.astype(np.int8)

    # is weekend: dayofweek runs Monday=0 .. Sunday=6, so Saturday/Sunday = 5/6
    df[time_col + "_is_weekend"] = df[weekday_col].isin([5, 6]).astype(np.int8)

    categorical_suffixes = [
        "Hour",
        "Weekday",
        "weekday_hour",
        "Month",
        "is_holiday",
        "is_weekend",
    ]
    # transform the calendar columns to categorical dtype
    for suffix in categorical_suffixes:
        column = time_col + "_" + suffix
        df[column] = df[column].astype("category")


In [None]:
def _first_user_input(field):
    """Return ``field`` of the first ``userInputs`` entry of every session."""
    return docs_df["userInputs"].apply(lambda inputs: inputs[0][field])


connection = docs_df["connectionTime"]

# calendar day of the connection; kept because connectionTime is dropped later
docs_df["connection_time_copy"] = connection.dt.date

# durations, expressed in hours (seconds / 60 / 60)
docs_df["parking_time"] = (
    (docs_df["disconnectTime"] - connection).dt.total_seconds() / 60 / 60
)
# NOTE(review): measured from connectionTime to doneChargingTime - confirm that
# this is the intended definition of "idle time"
docs_df["idle_time"] = (
    (docs_df["doneChargingTime"] - connection).dt.total_seconds() / 60 / 60
)

# values the user entered for the session (first userInputs entry)
docs_df["kWhRequested"] = _first_user_input("kWhRequested")
docs_df["Requested_parking_time"] = _first_user_input("minutesAvailable") / 60
docs_df["paymentRequired"] = _first_user_input("paymentRequired")

# hour of the day, day of the week, month
docs_df["hour"] = connection.dt.hour
docs_df["weekday"] = connection.dt.dayofweek
docs_df["month"] = connection.dt.month

# add calendar features
add_features(docs_df, time_col="connectionTime")


In [None]:
# columns that are no longer needed once the features above are derived
unused_columns = [
    "userInputs",
    "sessionID",
    "timezone",
    "doneChargingTime",
    "disconnectTime",
    "connectionTime",
]

# drop them and index the frame by (connection date, session _id)
docs_df = docs_df.drop(columns=unused_columns).set_index(
    ["connection_time_copy", "_id"]
)


In [None]:
# display the processed frame, now indexed by (connection_time_copy, _id)
docs_df


In [None]:
# Write the processed sessions to CSV.
# NOTE(review): the path is relative to the current working directory, which
# the first cell already moved one level up with os.chdir("..") - confirm that
# "../data" resolves to the intended folder.
docs_df.to_csv(path_or_buf="../data/caltech_test_data.csv")
