In [7]:
import duckdb
import pyarrow.dataset as ds
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [2]:
dataset = ds.dataset("data/status", format="parquet", partitioning="hive")

In [3]:
con = duckdb.connect()
con = con.register("status", dataset)

In [5]:
station_ids = con.execute("select distinct(station_id) from status").df()["station_id"].values
len(station_ids)

472

In [77]:
for station_id in station_ids[:1]:
    df_query = f"""
    WITH base_status AS (select
        station_id,
        hour,
        num_bikes_available,
        num_bikes_disabled,
        num_docks_available,
        num_docks_disabled,
        status,
        make_timestamp(year, month, day, hour, minute, 0.0) as ts,
    from
        status
    where
        make_timestamp(year, month, day, hour, minute, 0.0)
        between make_timestamp(2022, 6, 1, 0, 0, 0.0)
        and make_timestamp(2022, 9, 1, 0, 0, 0.0) and
        station_id = {station_id} and
        status = 'IN_SERVICE')"""
    df_query += " union ".join([
    f"""
    select
        station_id,
        hour,
        dayofweek(ts) as dow,
        num_bikes_available,
        num_bikes_disabled,
        num_docks_available,
        num_docks_disabled,
        minute(lead(ts, {i}) over (
            order by ts asc
        ) - ts)  as minutes_bt_check,
        lead(num_bikes_available, {i}) over (
            order by ts asc
        ) as remaining_bikes_available,
    from
        base_status
    """ for i in range(1, 17, 3)])
    dataset_df = con.execute(df_query).df()

In [78]:
dataset_df["remaining_bikes_available"] = (dataset_df["remaining_bikes_available"]>0).astype(int)

In [79]:
len(dataset_df)

38922

In [80]:
dataset_df["remaining_bikes_available"].value_counts(normalize=True)

1    0.991881
0    0.008119
Name: remaining_bikes_available, dtype: float64

In [142]:
FEATURES_ORDER = ["hour", "dow", "num_bikes_available", "num_bikes_disabled", "num_docks_available", "num_docks_disabled", "minutes_bt_check"]
TARGET = "remaining_bikes_available"
CLASS_WEIGHT = {0: 10000, 1: 1}
OHE_SLICE = [0, 1]
SS_SLICE = slice(2,7)
TEST_SIZE = 0.2

In [159]:
pipeline = make_pipeline(
    ColumnTransformer([("ohe",  OneHotEncoder(sparse=False), OHE_SLICE), ("ss",  StandardScaler(), SS_SLICE)]),
    RandomForestClassifier(n_estimators=10, max_depth=10, class_weight=CLASS_WEIGHT))
dataset_df = dataset_df.dropna()
X_train, X_test, y_train, y_test = train_test_split(dataset_df[FEATURES_ORDER].values, dataset_df[TARGET].values, test_size=TEST_SIZE, shuffle=False)
pipeline.fit(X_train, y_train)
rf_metrics = confusion_matrix(y_test, pipeline.predict(X_test), normalize="true").ravel()

In [160]:
tn, fp, fn, tp = rf_metrics
print(f"tn: {tn}, fp: {fp}, fn: {fn}, tp: {tp}")

tn: 0.971830985915493, fp: 0.028169014084507043, fn: 0.0697523661350966, tp: 0.9302476338649034


In [132]:
num_round = 10
param = {'scale_pos_weight': 1/100}
bst = lgb.train(param, train_data, num_round)

71

In [133]:
bst.predict(X_test)


7713

In [105]:
pipeline.predict(X_test)

array([1, 1, 1, ..., 1, 1, 1])