### Authentication Steps
It looks like the `gcloud` command is not installed or not in your PATH. You have two ways to fix this:

1.  **Install the Google Cloud SDK:**
    *   Download and install from [cloud.google.com/sdk](https://cloud.google.com/sdk/docs/install).
    *   Restart your terminal.
    *   Run `gcloud auth application-default login`.

2.  **Use a Service Account Key (Easier if you have the file):**
    *   Download a Service Account Key (JSON) from the Google Cloud Console.
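    *   Place it in the project root (the folder that contains your `.env` file).
    *   Add `GOOGLE_APPLICATION_CREDENTIALS=<your-key-file>.json` to `.env`, then run the cell below; it loads `.env` and converts the key path to an absolute path.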

In [1]:
# Load environment variables from .env file
from dotenv import load_dotenv
import os
from pathlib import Path

# 1. Find the project root
def find_project_root(start, marker=".env"):
    """Return the directory to treat as the project root when started from `start`.

    The first candidate follows this notebook's convention: if `start` is a
    directory named 'notebooks', its parent is used; otherwise `start` itself.
    When `marker` is not present in that candidate, the nearest ancestor that
    contains `marker` is returned instead, so the notebook also works when the
    kernel is launched from a nested folder (e.g. notebooks/eda/).

    Args:
        start: Directory to start from (str or Path), normally the kernel's cwd.
        marker: Name of the file/directory whose presence identifies the root.

    Returns:
        Path: The chosen root. Falls back to the first candidate when no
        directory on the way up contains `marker`.
    """
    start = Path(start)
    default_root = start.parent if start.name == 'notebooks' else start
    if (default_root / marker).exists():
        return default_root
    # Walk upwards, nearest ancestor first.
    for candidate in default_root.parents:
        if (candidate / marker).exists():
            return candidate
    return default_root


current_dir = Path(os.getcwd())
project_root = find_project_root(current_dir)

# 2. Load .env from project root
env_path = project_root / '.env'
if env_path.exists():
    load_dotenv(dotenv_path=env_path)
    print(f"Loaded .env from {env_path}")
else:
    print("Warning: .env file not found in project root.")

# 3. Fix the key path to be absolute
key_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
if key_path:
    if not os.path.isabs(key_path):
        # A relative value is interpreted relative to the project root, and the
        # environment variable is rewritten so later cells see the absolute path.
        full_key_path = project_root / key_path
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = str(full_key_path)
        print(f"Set GOOGLE_APPLICATION_CREDENTIALS to: {full_key_path}")
    else:
        full_key_path = Path(key_path)
        print(f"Using existing absolute path: {key_path}")

    # Surface a wrong or missing key file here rather than as a less obvious
    # failure when a client is created in a later cell.
    if not full_key_path.is_file():
        print(f"Warning: credentials file does not exist: {full_key_path}")
else:
    print("Warning: GOOGLE_APPLICATION_CREDENTIALS not found in .env file.")

Loaded .env from /Users/danherman/Desktop/headway-prediction/.env
Set GOOGLE_APPLICATION_CREDENTIALS to: /Users/danherman/Desktop/headway-prediction/time-series-478616-69afc5f87832.json


In [2]:
from google.cloud import bigquery
import pandas as pd
import os

# Initialize the BigQuery client used by the query cell further down.
# No project or credentials are passed explicitly, so authentication must
# already be configured before this cell runs: either via
# 'gcloud auth application-default login' or by setting the
# GOOGLE_APPLICATION_CREDENTIALS environment variable.
client = bigquery.Client()

In [3]:
# Define the SQL Query
# Selects stop-level arrivals for routes 'A' and 'H' between 2025-06-06 and
# 2025-12-06 (inclusive of the whole last day), ordered by arrival time.
# The time window is half-open: [2025-06-06 00:00:00, 2025-12-07 00:00:00).
# An inclusive "<= 23:59:59" bound would drop any arrival stamped with
# fractional seconds inside the final second of the last day.
# NOTE(review): the TIMESTAMP literals carry no time zone, so BigQuery reads
# them as UTC — confirm that UTC boundaries are what is intended here.
query = """
SELECT
  trip_uid,
  route_id,
  direction,
  stop_id,
  stop_name,
  stop_lat,
  stop_lon,
  arrival_time
FROM
  `time-series-478616.mta_historical_v3.clean`
WHERE
  route_id IN ('A', 'H')
  AND arrival_time >= TIMESTAMP('2025-06-06 00:00:00')
  AND arrival_time < TIMESTAMP('2025-12-07 00:00:00')
ORDER BY
  arrival_time ASC
"""

In [4]:
print("Running query... this may take a moment depending on data size.")

# Submit the query, then download the result into a DataFrame.
# create_bqstorage_client=False explicitly disables the BQ Storage API for the
# download to avoid the permission error hit with it.
job_config = bigquery.QueryJobConfig()
query_job = client.query(query, job_config=job_config)

try:
    # Wait for the query to finish (with a timeout of 300 seconds) so the cell
    # cannot hang indefinitely.
    df = query_job.result(timeout=300).to_dataframe(create_bqstorage_client=False)
except Exception as e:
    print(f"Query failed or timed out: {e}")
    # Re-raise instead of swallowing: the following cells need `df`, and
    # continuing would either fail with a NameError or silently reuse a stale
    # `df` left over from an earlier run of this cell.
    raise
else:
    print(f"Query complete. Downloaded {len(df)} rows.")

Running query... this may take a moment depending on data size.
Query complete. Downloaded 2268764 rows.


In [5]:
# Preview the data: display the first rows of the downloaded query result
df.head()

Unnamed: 0,trip_uid,route_id,direction,stop_id,stop_name,stop_lat,stop_lon,arrival_time
0,1749151110_A..S57R,A,S,A38S,Fulton St,40.710197,-74.007691,2025-06-06 00:00:00+00:00
1,1749153120_A..N55R,A,N,H06N,Beach 67 St,40.590927,-73.796924,2025-06-06 00:00:02+00:00
2,1749149220_A..S58R,A,S,H03S,Howard Beach-JFK Airport,40.660476,-73.830301,2025-06-06 00:00:07+00:00
3,1749153300_A..S57R,A,S,A06S,181 St,40.851695,-73.937969,2025-06-06 00:00:17+00:00
4,1749147750_A..S58R,A,S,H11S,Far Rockaway-Mott Av,40.603995,-73.755405,2025-06-06 00:00:23+00:00


In [6]:
# Write the downloaded arrivals to disk in two formats:
#   - Parquet, the preferred copy because it preserves data types
#   - CSV, written without the index column
output_dir = "data"
parquet_path = os.path.join(output_dir, "nyc_subway_a_line_arrivals_2025.parquet")
csv_path = os.path.join(output_dir, "nyc_subway_a_line_arrivals_2025.csv")

# The target folder must exist before either writer runs; an already existing
# folder is not an error.
os.makedirs(output_dir, exist_ok=True)

df.to_parquet(parquet_path)
df.to_csv(csv_path, index=False)

print(f"Data saved to:\n- {parquet_path}\n- {csv_path}")

Data saved to:
- data/nyc_subway_a_line_arrivals_2025.parquet
- data/nyc_subway_a_line_arrivals_2025.csv
