In [2]:
!pip install polars

Collecting polars
  Downloading polars-1.29.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Downloading polars-1.29.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.8 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.8/34.8 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m0m eta [36m0:00:01[0m
[?25hInstalling collected packages: polars
Successfully installed polars-1.29.0


In [8]:
import polars as pl
from functools import partial

target_schema = {
    "VendorID": pl.Int32,
    "tpep_pickup_datetime": pl.Datetime,
    "tpep_dropoff_datetime": pl.Datetime,
    "passenger_count": pl.Float64,
    "trip_distance": pl.Float64,
    "RatecodeID": pl.Int32,
    "store_and_fwd_flag": pl.String,
    "PULocationID": pl.Int32,
    "DOLocationID": pl.Int32,
    "payment_type": pl.Int32,
    "fare_amount": pl.Float64,
    "extra": pl.Float64,
    "mta_tax": pl.Float64,
    "tip_amount": pl.Float64,
    "tolls_amount": pl.Float64,
    "improvement_surcharge": pl.Float64,
    "total_amount": pl.Float64,
    "congestion_surcharge": pl.Float64,
    "airport_fee": pl.Float64,
    "month": pl.String
}


year = '2023'
months = ['01', '02']

# Create LazyFrames for each month with parallel processing
yellow_trip_data = pl.concat(
    [
        pl.scan_parquet(
            f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month}.parquet'
        )
        .with_columns(month=pl.lit(f'{year}-{month}'))
        .cast(target_schema, strict=False)
        .drop("airport_fee")
        for month in months
    ],
    how='vertical'
)

# Calculate duration in minutes using native datetime operations
yellow_trip_data = yellow_trip_data.with_columns(
    duration=(
        (pl.col("tpep_dropoff_datetime") - pl.col("tpep_pickup_datetime")).dt.total_seconds() / 60
    )
).collect()

ColumnNotFoundError: airport_fee

Resolved plan until failure:

	---> FAILED HERE RESOLVING 'sink' <---
 WITH_COLUMNS:
 ["2023-02".alias("month")] 
  Parquet SCAN [https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet]
  PROJECT */19 COLUMNS

In [None]:
features_needed = ['tpep_pickup_datetime', 'duration', 'PULocationID', 'DOLocationID']
preprocess_data = yellow_trip_data["duration"].clone().to_numpy()

In [None]:
import pymc as pm
import numpy as np
import pandas as pd
import arviz as az
from sklearn.preprocessing import OneHotEncoder

# Data preparation
def preprocess_data(df):
    # Extract temporal features
    df['hour'] = df['tpep_pickup_datetime'].dt.hour
    df['dayofweek'] = df['tpep_pickup_datetime'].dt.dayofweek
    
    # One-hot encode locations with first category dropped
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    pickup_encoded = encoder.fit_transform(df[['PULocationID']])
    dropoff_encoded = encoder.fit_transform(df[['DOLocationID']])
    
    # Combine features
    temporal_features = df[['hour', 'dayofweek']]
    X = np.hstack([temporal_features, pickup_encoded, dropoff_encoded])
    
    return X, df['duration'].values

# Model specification
def build_weibull_model(X, y):
    with pm.Model() as model:
        # Regression coefficients (including intercept)
        beta = pm.Normal('beta', mu=0, sigma=1, shape=X.shape[1])
        
        # Shape parameter (constrained positive)
        alpha = pm.HalfNormal('alpha', sigma=1)
        
        # Linear predictor for scale parameter
        log_scale = pm.math.dot(X, beta)
        scale = pm.math.exp(log_scale)
        
        # Weibull likelihood
        pm.Weibull('duration', alpha=alpha, beta=scale, observed=y)
        
    return model

# Usage example
# if __name__ == "__main__":
    # Load your DataFrame (df must contain: pickup_datetime, pickup_location, dropoff_location, duration)
    # df = pd.read_csv(...)
    
    # X, y = preprocess_data(df)
    
    # Build and sample model
    # model = build_weibull_model(X, y)
    # with model:
    #     trace = pm.sample(2000, tune=1000, target_accept=0.95)
    
    # Analyze results
    # print(az.summary(trace, var_names=['beta', 'alpha']))
    # pm.plot_trace(trace, var_names=['beta', 'alpha'])