In [27]:
import duckdb
import pyarrow.dataset as ds
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error

In [3]:
dataset = ds.dataset("data/status", format="parquet", partitioning="hive")

In [4]:
con = duckdb.connect()
con = con.register("status", dataset)

In [9]:
dfs_to_concat = []
for i in range(1,16):
    auxdf = con.execute(
        f"""
        select
            make_timestamp(year, month, day, hour, minute, 0.0) as ts,
            station_id,
            hour,
            minute,
            dayofweek(make_timestamp(year, month, day, hour, minute, 0.0)) as dow,
            num_bikes_available,
            num_bikes_disabled,
            num_docks_available,
            num_docks_disabled,
            minute(lead(make_timestamp(year, month, day, hour, minute, 0.0), {i}) over (
                partition by station_id
                order by make_timestamp(year, month, day, hour, minute, 0.0) asc
            ) - make_timestamp(year, month, day, hour, minute, 0.0))  as minutes_bt_check,
            lead(num_bikes_available, {i}) over (
                partition by station_id
                order by make_timestamp(year, month, day, hour, minute, 0.0) asc
            ) as bikes_available,
        from
            status
        where
            (month = 3 or month = 4) and
            station_id = 3 and
            status = 'IN_SERVICE' and
            year = 2022"""
    ).df()
    dfs_to_concat.append(auxdf)
mins_df = pd.concat(dfs_to_concat)
del dfs_to_concat
mins_df.head()

Unnamed: 0,ts,station_id,hour,minute,dow,num_bikes_available,num_bikes_disabled,num_docks_available,num_docks_disabled,minutes_bt_check,bikes_available
0,2022-03-01 01:23:00,3,1,23,2,1,1,18,0,1.0,1.0
1,2022-03-01 01:24:00,3,1,24,2,1,1,18,0,1.0,1.0
2,2022-03-01 01:25:00,3,1,25,2,1,1,18,0,14.0,1.0
3,2022-03-01 01:39:00,3,1,39,2,1,1,18,0,2.0,1.0
4,2022-03-01 01:41:00,3,1,41,2,1,1,18,0,3.0,1.0


In [13]:
dataset_df = mins_df[(mins_df["num_bikes_available"] == 0) & (mins_df["bikes_available"] == 1)]

In [28]:
FEATURES_ORDER = ["hour", "dow", "num_bikes_disabled", "num_docks_available", "num_docks_disabled"]
rf_pipeline = make_pipeline(ColumnTransformer([("ohe",  OneHotEncoder(sparse=False), [0, 1]), ("ss",  StandardScaler(), slice(2,5))]),
                            MLPRegressor((128, 128, 128)))

In [29]:
train_size = int(0.77*len(dataset_df))
X_train = dataset_df[:train_size][FEATURES_ORDER].values
y_train = dataset_df[:train_size]["minutes_bt_check"].values
X_test = dataset_df[train_size:][FEATURES_ORDER].values
y_test = dataset_df[train_size:]["minutes_bt_check"].values

In [30]:
rf_pipeline.fit(X_train, y_train)

In [31]:
mean_absolute_error(y_test, rf_pipeline.predict(X_test))

6.43161756195796

In [33]:
from models.s3 import S3Client
from io import BytesIO 
import joblib 

s3_cli = S3Client()

In [34]:
with BytesIO() as mem_f:
    joblib.dump(rf_pipeline, mem_f)
    mem_f.seek(0)
    s3_cli.client.Bucket("frame").upload_fileobj(Key="models/current_eta_model.joblib", Fileobj=mem_f)
    




In [37]:
with BytesIO() as mem_f:
    s3_cli.client.Bucket("frame").download_fileobj(Key="models/current_eta_model.joblib", Fileobj=mem_f)
    mem_f.seek(0)
    loaded_model = joblib.load(mem_f)

loaded_model.predict(X_test[:3])



array([7.84605797, 7.61528874, 7.61528874])