In [75]:
from models.s3 import S3Client
from models.availability_model_trainer import AvailabilityModelTrainer

from io import BytesIO
import joblib
import duckdb
import pyarrow.dataset as ds
import pandas as pd
import tempfile
import os

In [2]:
# Project S3 wrapper; the cells below reach the bucket through s3_cli.client.Bucket("frame").
s3_cli = S3Client()

In [60]:
# Training window configuration.
CURRENT_DATE = "2022/09/10"  # last day included in the window
TRAINING_PERIOD = 21         # number of consecutive days in the window

# One timestamp per day, ending on CURRENT_DATE (inclusive).
DATES = pd.date_range(
    end=CURRENT_DATE,
    periods=TRAINING_PERIOD,
    freq="D",
)

In [61]:
# Preview of the hive-style S3 prefix for a single day of the window.
# Built from the integer date attributes rather than strftime('%-m' / '%-d'):
# the '-' (no zero padding) flag is a glibc extension and is not available on
# every platform.
preview_day = DATES[1]
f"silver/status/year={preview_day.year}/month={preview_day.month}/day={preview_day.day}"

'silver/status/year=2022/month=8/day=22'

In [None]:
# Local scratch directory that mirrors the bucket's key layout for the
# downloaded parquet files.
temp_dir = tempfile.TemporaryDirectory()
status_bucket = s3_cli.client.Bucket("frame")

for day in DATES:
    # The trailing "/" is required: S3 prefix filtering is plain string
    # matching, so ".../day=1" would also match day=10 .. day=19 and pull in
    # days outside the training window.
    # The prefix is built from integer attributes instead of
    # strftime('%-m' / '%-d'), whose '-' flag is platform-specific.
    day_prefix = f"silver/status/year={day.year}/month={day.month}/day={day.day}/"
    for parquet_object in status_bucket.objects.filter(Prefix=day_prefix):
        if parquet_object.key.endswith("/"):
            # Folder-marker key: there is no file content to download.
            continue
        parquet_temp_path = temp_dir.name + "/" + parquet_object.key
        os.makedirs(os.path.dirname(parquet_temp_path), exist_ok=True)
        status_bucket.download_file(Key=parquet_object.key, Filename=parquet_temp_path)


In [65]:
# Open the downloaded files as one hive-partitioned parquet dataset and make
# it queryable from DuckDB under the name "status".
dataset = ds.dataset(
    f"{temp_dir.name}/silver/status",
    partitioning="hive",
    format="parquet",
)
con = duckdb.connect().register("status", dataset)

Hay más de 6M de registros por día para todas las estaciones, lo que se vuelve intratable para un período largo de tiempo. Por eso entrenamos estación por estación.

In [66]:
# Array with every distinct station_id present in the "status" table.
distinct_stations = con.execute("select distinct(station_id) from status").df()
station_ids = distinct_stations["station_id"].values

In [77]:
# Train one availability model per station.
#
# For each station, every IN_SERVICE reading is paired with the reading that
# comes i rows later (i = 1..15, ordered by timestamp). Each pair yields:
#   - minutes_bt_check: minutes elapsed between the two readings
#   - bikes_available:  num_bikes_available at the later reading
# The 15 per-offset frames are stacked and handed to the trainer.
avail_model_trainer = AvailabilityModelTrainer()
# NOTE: only the first three stations are processed here.
for station_id in station_ids[:3]:
    dfs_to_concat = []
    for i in range(1,16):
        # minutes_bt_check uses datediff('minute', ...) rather than
        # minute(<interval>): minute() only extracts the minutes component of
        # the interval, so a gap of 75 minutes would be reported as 15.
        auxdf = con.execute(
            f"""
            select
                hour,
                dayofweek(make_timestamp(year, month, day, hour, minute, 0.0)) as dow,
                num_bikes_available,
                num_bikes_disabled,
                num_docks_available,
                num_docks_disabled,
                datediff(
                    'minute',
                    make_timestamp(year, month, day, hour, minute, 0.0),
                    lead(make_timestamp(year, month, day, hour, minute, 0.0), {i}) over (
                        partition by station_id
                        order by make_timestamp(year, month, day, hour, minute, 0.0) asc
                    )
                ) as minutes_bt_check,
                lead(num_bikes_available, {i}) over (
                    partition by station_id
                    order by make_timestamp(year, month, day, hour, minute, 0.0) asc
                ) as bikes_available,
            from
                status
            where
                station_id = {station_id} and
                status = 'IN_SERVICE'
            """
        ).df()
        dfs_to_concat.append(auxdf)
    mins_df = pd.concat(dfs_to_concat)
    # Release the per-offset frames before training.
    del dfs_to_concat
    # Binary flag: 1 when at least one bike is available at the later reading.
    mins_df["bikes_a"] = (mins_df["bikes_available"]>0).astype(int)
    # dropna() removes rows with any null, including the rows at the end of
    # the series that have no later reading (lead() returned NULL).
    avail_model_trainer.train_station(station_id, mins_df.dropna())

In [78]:
# Persist the trained per-station pipelines, tagged with the last day of the
# training window. The recorded output of this cell is an AccessDenied error
# raised by the PutObject operation.
last_training_day = DATES[-1]
avail_model_trainer.dump_stations_pipelines(last_training_day)



ClientError: An error occurred (AccessDenied) when calling the PutObject operation: Access Denied.

In [None]:
# Round-trip check: download the model artifact for the last training day into
# memory, deserialize it, and score three rows.
model_key = "models/" + DATES[-1].strftime('year=%Y/month=%m/%d') + ".joblib"
models_bucket = s3_cli.client.Bucket("frame")

with BytesIO() as model_buffer:
    models_bucket.download_fileobj(Key=model_key, Fileobj=model_buffer)
    model_buffer.seek(0)  # rewind so joblib reads from the start of the buffer
    # NOTE(review): joblib.load unpickles the payload; only load artifacts
    # from a trusted bucket.
    loaded_model = joblib.load(model_buffer)

# NOTE(review): test_set and FEATURES_ORDER are not defined in the visible
# cells; they must exist in the kernel for this line to run.
loaded_model.predict_proba(test_set[:3][FEATURES_ORDER].values)

In [79]:
# Remove the temporary directory that holds the downloaded parquet files.
temp_dir.cleanup()