In [3]:
import os
import pandas as pd
import requests
from io import StringIO
from capymoa.stream import stream_from_file
from capymoa.regressor import AdaptiveRandomForestRegressor
from capymoa.evaluation import RegressionEvaluator

def download_dataset():
    """Download the bus-stop time-series CSV and return it as a DataFrame.

    The first CSV column is renamed to ``"timestamp"`` and parsed with
    ``pd.to_datetime``; every other column is returned as read.

    Returns:
        pandas.DataFrame: the parsed dataset with a ``"timestamp"`` column
        in first position.

    Raises:
        Exception: if the server responds with a status other than 200.
        requests.exceptions.RequestException: propagated from
            ``requests.get`` on connection failure or timeout.
    """
    url = "https://huggingface.co/datasets/labiaufba/SSA_StopBusTimeSeries_5/raw/main/loader_03-05_2024.csv"
    # requests has no default timeout; without one a stalled server would
    # block the whole pipeline indefinitely.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        raise Exception("❌ Failed to download dataset.")
    df = pd.read_csv(StringIO(response.text))
    df = df.rename(columns={df.columns[0]: "timestamp"})
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    return df

def preprocess_stop_data(df, stop_name, stop_id):
    """Build the per-stop feature frame for one stop column.

    Rows where the timestamp or the stop's value is missing are dropped,
    and the stop column is renamed to ``"actual"``.

    Args:
        df: DataFrame with a datetime ``"timestamp"`` column and a column
            named ``stop_name``.
        stop_name: label of the column to extract.
        stop_id: value stored in every row of the ``"stop_id"`` column.

    Returns:
        A new DataFrame with columns ``timestamp``, ``actual``, ``hour``
        (hour of day), ``day`` (whole days since the earliest remaining
        timestamp) and ``stop_id``. ``df`` itself is not modified.
    """
    observed = df[["timestamp", stop_name]].dropna()
    observed = observed.rename(columns={stop_name: "actual"})

    stamps = observed["timestamp"]
    # The day offset is measured from the first timestamp that survived dropna.
    elapsed = stamps - stamps.min()

    return observed.assign(
        hour=stamps.dt.hour,
        day=elapsed.dt.days,
        stop_id=stop_id,
    )

def run_prediction_per_stop(df_stop, stop_id, limit=1000):
    """Run predict-then-train regression over one stop's data.

    The ``hour``, ``day``, ``stop_id`` and ``actual`` columns of ``df_stop``
    are written to a temporary CSV, loaded as a stream with a numeric
    target, and fed to an ``AdaptiveRandomForestRegressor`` (ensemble size
    10). Each instance is predicted first and only then used for training.

    Args:
        df_stop: frame with ``timestamp``, ``hour``, ``day``, ``stop_id``
            and ``actual`` columns.
        stop_id: identifier used to name the temporary CSV file.
        limit: maximum number of instances to process.

    Returns:
        pandas.DataFrame with one row per processed instance and columns
        ``timestamp``, ``actual``, ``predicted`` (prediction rounded to the
        nearest integer) and ``error`` (absolute difference between
        ``actual`` and ``predicted``).
    """
    temp_path = f"temp_stop_{stop_id}.csv"
    results = []

    try:
        # Prepare temp CSV
        df_stop[["hour", "day", "stop_id", "actual"]].to_csv(temp_path, index=False)

        # Load stream
        stream = stream_from_file(temp_path, target_type="numeric")
        model = AdaptiveRandomForestRegressor(
            schema=stream.get_schema(), ensemble_size=10
        )

        # Pull the column once: indexing df_stop.iloc[i] inside the loop
        # would build a fresh row Series on every iteration.
        timestamps = df_stop["timestamp"].tolist()

        # Predict
        for i, instance in enumerate(stream):
            if i >= limit:
                break
            pred = model.predict(instance)
            if pred is None:
                # No prediction available for this instance; fall back to zero.
                pred = 0.0
            model.train(instance)

            actual = instance.y_value
            predicted = round(pred)
            results.append({
                "timestamp": timestamps[i],
                "actual": actual,
                "predicted": predicted,
                "error": abs(actual - predicted),
            })
    finally:
        # Clean temp file even when loading or prediction raised.
        if os.path.exists(temp_path):
            os.remove(temp_path)

    return pd.DataFrame(results)

def main():
    """Download the dataset and write one predictions CSV per stop column.

    Every column after the first is treated as a stop; its numeric ID is
    its position among those columns. Results are saved to
    ``predictions_stop_<ID>.csv`` in the current working directory.
    """
    print("🚍 Starting bus stop prediction pipeline")
    dataset = download_dataset()

    # Skip the first column (the timestamp); enumerate supplies the
    # positional ID directly.
    for stop_num, stop_name in enumerate(dataset.columns[1:]):
        print(f"🔁 Processing stop: {stop_name} (ID: {stop_num})")

        stop_frame = preprocess_stop_data(dataset, stop_name, stop_num)
        predictions = run_prediction_per_stop(stop_frame, stop_num)

        file_path = f"predictions_stop_{stop_num}.csv"
        predictions.to_csv(file_path, index=False)
        print(f"✅ Saved {file_path}")

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm


🚍 Starting bus stop prediction pipeline
🔁 Processing stop: 125960550 (ID: 0)
✅ Saved predictions_stop_0.csv
🔁 Processing stop: 230565994 (ID: 1)
✅ Saved predictions_stop_1.csv
🔁 Processing stop: 258781031 (ID: 2)
✅ Saved predictions_stop_2.csv
🔁 Processing stop: 43768720 (ID: 3)
✅ Saved predictions_stop_3.csv
🔁 Processing stop: 44072192 (ID: 4)
✅ Saved predictions_stop_4.csv
🔁 Processing stop: 44783654 (ID: 5)
✅ Saved predictions_stop_5.csv
🔁 Processing stop: 44783914 (ID: 6)
✅ Saved predictions_stop_6.csv
🔁 Processing stop: 44784438 (ID: 7)
✅ Saved predictions_stop_7.csv
🔁 Processing stop: 45833547 (ID: 8)
✅ Saved predictions_stop_8.csv
🔁 Processing stop: 47568123 (ID: 9)
✅ Saved predictions_stop_9.csv
