In [1]:
import duckdb
import pyarrow.dataset as ds
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from models.eta_model_trainer import ETAModelTrainer

2023-02-04 20:03:09,705 - youconfigme.youconfigme - INFO - searching for config on /home/dml/proyectos/bicisba/research/models/settings.ini
2023-02-04 20:03:09,706 - youconfigme.youconfigme - INFO - searching for config on /home/dml/proyectos/bicisba/research/settings.ini


In [2]:
dataset = ds.dataset("data/status", format="parquet", partitioning="hive")

In [3]:
con = duckdb.connect()
con = con.register("status", dataset)

In [46]:
station_id = 3
df_query = f"""
WITH base_status AS (select
    station_id,
    hour,
    num_bikes_available,
    num_bikes_disabled,
    num_docks_available,
    num_docks_disabled,
    status,
    make_timestamp(year, month, day, hour, minute, 0.0) as ts,
from
    status
where
    year = 2022 and
    month >= 1 and
    month <= 4 and
    station_id = {station_id} and
    status = 'IN_SERVICE')"""
df_query += " union ".join([
f"""
select
    station_id,
    hour,
    dayofweek(ts) as dow,
    num_bikes_available,
    num_bikes_disabled,
    num_docks_available,
    num_docks_disabled,
    minute(lead(ts, {i}) over (
        order by ts asc
    ) - ts)  as minutes_bt_check,
    lead(num_bikes_available, {i}) over (
        order by ts asc
    ) as bikes_available,
from
    base_status
""" for i in range(1, 16)])
mins_df = con.execute(df_query).df()

In [47]:
dataset_df = mins_df[(mins_df["num_bikes_available"] == 0) & (mins_df["bikes_available"] > 0)]
len(dataset_df)

6824

In [48]:
FEATURES_ORDER = ["hour", "dow", "num_bikes_disabled", "num_docks_available", "num_docks_disabled"]
rf_pipeline = make_pipeline(ColumnTransformer([("ohe",  OneHotEncoder(sparse=False), [0, 1]), ("ss",  StandardScaler(), slice(2,5))]),
                            MLPRegressor((128, 128, 128)))

In [56]:
X_train, X_test, y_train, y_test = train_test_split(dataset_df[FEATURES_ORDER], dataset_df["minutes_bt_check"], test_size=0.2, shuffle=False)

In [57]:
rf_pipeline.fit(X_train, y_train)

In [58]:
mean_absolute_error(y_test, rf_pipeline.predict(X_test))

6.5640508659359025

In [59]:
from models.s3 import S3Client
from io import BytesIO 
import joblib 

s3_cli = S3Client()

In [60]:
with BytesIO() as mem_f:
    joblib.dump(rf_pipeline, mem_f)
    mem_f.seek(0)
    s3_cli.client.Bucket("frame").upload_fileobj(Key="models/current_eta_model.joblib", Fileobj=mem_f) 




In [61]:
with BytesIO() as mem_f:
    s3_cli.client.Bucket("frame").download_fileobj(Key="models/current_eta_model.joblib", Fileobj=mem_f)
    mem_f.seek(0)
    loaded_model = joblib.load(mem_f)

loaded_model.predict(X_test[:3])



array([ 9.01762462, 14.37548957,  8.96427488])

# Whole pipeline

In [2]:
CURRENT_DATE = "2022/09/10"
avail_model_trainer = ETAModelTrainer()
dataset_df = avail_model_trainer.create_dataset(CURRENT_DATE)



In [3]:
avail_model_trainer.train_all_stations(dataset_df)



ValueError: Found unknown categories [1] in column 0 during transform

In [None]:
DATES = pd.date_range(end=CURRENT_DATE, periods=3)
avail_model_trainer.dump_stations_pipelines(DATES[-1], current=True)

In [5]:
len(dataset_df)/len(dataset_df.station_id.unique())

2453.3217665615143