In [None]:
import joblib
import warnings
import pandas as pd
import numpy as np
import gc
from datetime import datetime, timedelta
from glob import glob
from itertools import product
from tqdm import tqdm
from sklearn import preprocessing

# Suppress every warning category for the remainder of the notebook.
warnings.filterwarnings("ignore")

# Each period is a two-element list: [start, end].
dataset_period = [datetime(2013, 7, 1), datetime(2017, 10, 1)]
# The test window is the last 80 days of the dataset.
test_period = [dataset_period[1] - timedelta(days=80), dataset_period[1]]
# The validation window is the 40 days immediately before the test window.
valid_period = [test_period[0] - timedelta(days=40), test_period[0]]
# Training runs from the dataset start up to the validation start.
train_period = [dataset_period[0], valid_period[0]]
# Frequency alias handed to `.dt.floor` / `.dt.ceil` and `pd.date_range` below.
predict_time = "H"

# Load Data

In [None]:
# The three input tables are stored under different keys of one HDF5 file.
raw_store_path = "process_data_201306_201709/citibike_raw.h5"
df_raw, station_info, weather = (
    pd.read_hdf(raw_store_path, key=table_key)
    for table_key in ("raw", "info", "weather")
)

In [None]:
# Keep only stations whose `earliest` timestamp is before the end of the
# training window, then keep only trips that both start and end at one of
# the remaining stations.
station_info = station_info.loc[station_info["earliest"] < train_period[1]]
kept_station_ids = station_info["stationid"]
df_raw = df_raw.loc[
    df_raw["startstationid"].isin(kept_station_ids)
    & df_raw["endstationid"].isin(kept_station_ids)
]

# Flow

In [None]:
# One-hot encode the rider attributes as int8 indicator columns.
# dummy_na=True adds an extra "<column>_nan" indicator for missing values.
rider_attribute_columns = ["usertype", "gender"]
df_raw = pd.get_dummies(
    df_raw, columns=rider_attribute_columns, dummy_na=True, dtype=np.int8
)

In [None]:
# Indicator columns (from the usertype/gender one-hot encoding) that are
# summed per station and time bucket.
category_columns = [
    "usertype_Customer",
    "usertype_Subscriber",
    "usertype_nan",
    "gender_0.0",
    "gender_2.0",
    "gender_nan",
]


def _count_trips(trips, station_col, time_col, name):
    """Count the rows of `trips` for every (station_col, time_col) pair.

    Returns a frame with the columns `stationid`, `time` and `name`.
    """
    return (
        trips.groupby([station_col, time_col])
        .size()
        .reset_index(name=name)
        .rename(columns={time_col: "time", station_col: "stationid"})
    )


def _sum_categories(trips, station_col, time_col, suffix):
    """Sum `category_columns` for every (station_col, time_col) pair.

    Returns a frame with `stationid`, `time` and one `<category><suffix>`
    column per entry of `category_columns`.
    """
    return (
        # The built-in groupby sum replaces passing `np.sum` to `.agg`, which
        # newer pandas versions deprecate; the result is the same.
        trips.groupby([station_col, time_col])[category_columns]
        .sum()
        .add_suffix(suffix)
        .reset_index()
        .rename(columns={time_col: "time", station_col: "stationid"})
    )


# Round the timestamps once on narrow frames instead of copying the whole
# (wide) trip table for each of the six aggregations.
station_columns = ["startstationid", "endstationid"]
trips_floor = df_raw[station_columns].assign(
    starttime=df_raw.starttime.dt.floor(predict_time),
    stoptime=df_raw.stoptime.dt.floor(predict_time),
)
trips_ceil = df_raw[station_columns + category_columns].assign(
    starttime=df_raw.starttime.dt.ceil(predict_time),
    stoptime=df_raw.stoptime.dt.ceil(predict_time),
)

# Flows: trips are assigned to the time bucket they fall into (rounded down).
flow_in = _count_trips(trips_floor, "endstationid", "stoptime", "flow_in")
flow_out = _count_trips(trips_floor, "startstationid", "starttime", "flow_out")

# Returns/rentals: timestamps are rounded up, and trips whose start and stop
# round up to the same bucket are excluded.
crossing_trips = trips_ceil[trips_ceil.starttime != trips_ceil.stoptime]
bike_return = _count_trips(crossing_trips, "endstationid", "stoptime", "bike_return")
bike_rent = _count_trips(crossing_trips, "startstationid", "starttime", "bike_rent")

# Rider-category totals, keyed by the rounded-up timestamp.
category_in = _sum_categories(trips_ceil, "endstationid", "stoptime", "_in")
category_out = _sum_categories(trips_ceil, "startstationid", "starttime", "_out")

# The rounded helper frames are only needed inside this cell.
del trips_floor, trips_ceil, crossing_trips

In [None]:
# Time buckets covering dataset_period; the end bound itself is excluded, so
# for the hourly setting the last bucket is 2017-09-30 23:00.
grid_times = pd.date_range(dataset_period[0], dataset_period[1], freq=predict_time)
grid_times = grid_times[grid_times < dataset_period[1]]

# Dense (time, stationid) grid built without materialising a Python list of
# tuples. Rows are time-major (all stations for the first timestamp, then the
# next timestamp, ...), the same order itertools.product would give; the
# positional trim after the lag features relies on that order.
features = pd.MultiIndex.from_product(
    [grid_times, station_info.stationid], names=["time", "stationid"]
).to_frame(index=False)

# Left-join every hourly table so grid rows without trips are kept.
for hourly_table in (flow_in, flow_out, bike_return, bike_rent, category_in, category_out):
    features = features.merge(hourly_table, on=["time", "stationid"], how="left")

# Grid rows with no matching trips become 0, then every column except `time`
# (including `stationid`) is stored as int16.
features = features.fillna(0)
features = features.astype({column: "int16" for column in features.columns[1:]})

# Release the intermediate tables; they are all merged into `features` now.
del flow_in, flow_out, bike_return, bike_rent, category_in, category_out, hourly_table
gc.collect()

# Shift features

In [None]:
# dayofweek >= 5 marks Saturday and Sunday; the flag is stored as int8.
weekend_flag = (features["time"].dt.dayofweek >= 5).astype("int8")

# Index by (time, is_weekend, stationid) so the lag features can group on
# index levels 1 and 2.
features = features.assign(is_weekend=weekend_flag).set_index(
    ["time", "is_weekend", "stationid"]
)

In [None]:
%%time
shift_column = ['flow_in', 'flow_out', 'bike_return', 'bike_rent']
features = pd.concat(
    [
        features.rename(columns={"flow_in": "y_in", "flow_out": "y_out", "bike_return":"bike_return_b1hour", "bike_rent":"bike_rent_b1hour"}),
        features[["flow_in", "flow_out"]].groupby(level=2).shift(1, fill_value=-1).add_suffix("_b1hour"),
        features[shift_column].groupby(level=2).shift(2, fill_value=-1).add_suffix("_b2hour"),
        features[shift_column].groupby(level=[1, 2]).shift(24, fill_value=-1).add_suffix("_b1day"),
        features[shift_column].groupby(level=[1, 2])
        .shift(24 * 2, fill_value=-1)
        .add_suffix("_b2day"),
        features[shift_column].groupby(level=[1, 2])
        .shift(24 * 3, fill_value=-1)
        .add_suffix("_b3day"),
        features[shift_column].groupby(level=[1, 2])
        .shift(24 * 4, fill_value=-1)
        .add_suffix("_b4day"),
        features[shift_column].groupby(level=2).shift(24 * 7, fill_value=-1).add_suffix("_b1week"),
        features[shift_column].groupby(level=2).shift(24 * 14, fill_value=-1).add_suffix("_b2week"),
    ],
    axis=1,
).iloc[24 * 14 * len(station_info) :]

# Dummy Encoding

In [None]:
# Turn the (time, is_weekend, stationid) index levels back into columns.
features = features.reset_index()

# Derive the calendar fields from the timestamp column.
timestamp_parts = features["time"].dt
features = features.assign(
    month=timestamp_parts.month,
    dayofweek=timestamp_parts.dayofweek,
    hour=timestamp_parts.hour,
)

# One-hot encode the calendar fields as int8; drop_first=True omits the first
# level of each field.
calendar_columns = ["month", "dayofweek", "hour"]
features = pd.get_dummies(
    features, columns=calendar_columns, drop_first=True, dtype=np.int8
)

# Weather

In [None]:
# Z-Score normalize
# Every column except the first is scaled.
norm_col = weather.columns[1:]
# Fit the mean/std on the training window only, so statistics from the
# validation and test periods do not leak into the features.
# NOTE(review): assumes the first weather column is `time` (the later merge
# joins on it) and that the training window contains weather rows -- confirm.
train_weather = weather.loc[weather["time"] < train_period[1], norm_col]
weather[norm_col] = (weather[norm_col] - train_weather.mean()) / train_weather.std()

In [None]:
# Same time buckets as the feature grid: dataset_period with the end bound
# excluded (last hourly bucket is 2017-09-30 23:00).
time = pd.date_range(dataset_period[0], dataset_period[1], freq=predict_time)
time = pd.DataFrame({"time": time[time < dataset_period[1]]})

# Left-join so every bucket gets a row, then carry the last known observation
# forward into buckets without a weather record. `.ffill()` replaces the
# deprecated `fillna(method="ffill")`. Buckets before the first observation
# stay NaN.
weather = time.merge(weather, on=["time"], how="left")
weather = weather.ffill()

In [None]:
# Attach the weather columns to every feature row with the same timestamp;
# the left join keeps all feature rows.
features = pd.merge(features, weather, how="left", on="time")

# Alive DataFrame

In [None]:
# Take an independent copy: a new column is written into alive_df below, and
# writing into a column slice of `features` is chained assignment.
alive_df = features[["stationid", "time"]].copy()

In [None]:
# Per-station [earliest, latest] window, looked up by station id in one
# vectorised pass instead of re-scanning the whole frame for every station.
# If a station id is duplicated, its first row is used.
station_window = station_info.drop_duplicates("stationid").set_index("stationid")
window_start = alive_df["stationid"].map(station_window["earliest"])
window_end = alive_df["stationid"].map(station_window["latest"])

# is_alive is 1 when the row's time lies inside the station's window (both
# bounds inclusive) and 0 otherwise. assign() builds a new frame, so nothing
# is written into a slice of `features`.
inside_window = (alive_df["time"] >= window_start) & (alive_df["time"] <= window_end)
alive_df = alive_df.assign(is_alive=inside_window.astype("int8"))

In [None]:
# All outputs go into one HDF5 file. The first write uses mode="w"; the
# remaining tables are added under their own keys with mode="r+".
feature_store_path = 'process_data_201306_201709/features.h5'
features.to_hdf(feature_store_path, key="features", mode="w")
for store_key, table in (("raw", df_raw), ("info", station_info), ("alive", alive_df)):
    table.to_hdf(feature_store_path, key=store_key, mode="r+")