In [1]:

import pandas as pd
import requests
import geopandas as gpd
import os
from io import BytesIO
from io import StringIO
import osmnx as ox
import os



# -----------------------------
# 1️⃣ Setup directories
# -----------------------------

In [2]:
# Folder that holds the raw downloads, expressed relative to the
# notebook's working directory.
BASE_DIR = "../data_raw"
# exist_ok=True keeps the cell re-runnable when the folder is already present.
os.makedirs(name=BASE_DIR, exist_ok=True)

# -----------------------------
# 2️⃣ Download NYC Taxi Data (3 months)
# -----------------------------

In [3]:
# Months of yellow-taxi trip records to fetch, as YYYY-MM strings.
months = ["2023-01", "2023-02", "2023-03"]
for month in months:
    # The remote parquet file and the local CSV copy share the same stem.
    file_stem = f"yellow_tripdata_{month}"
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file_stem}.parquet"
    print(f"Downloading {month} data...")
    # pandas reads the parquet file straight from the URL.
    df = pd.read_parquet(url)
    # Write a CSV copy (without the index column) under BASE_DIR.
    df.to_csv(f"{BASE_DIR}/{file_stem}.csv", index=False)
    print(f"Saved: {file_stem}.csv")

print("✅ NYC Taxi Data (3 months) saved.")

Downloading 2023-01 data...
Saved: yellow_tripdata_2023-01.csv
Downloading 2023-02 data...
Saved: yellow_tripdata_2023-02.csv
Downloading 2023-03 data...
Saved: yellow_tripdata_2023-03.csv
✅ NYC Taxi Data (3 months) saved.


# -----------------------------
# 3️⃣ Download NOAA Weather Data
# -----------------------------

In [5]:
# Announce the start of the NOAA weather download step.
print("Downloading NOAA weather data...")

Downloading NOAA weather data...


In [6]:
# Download daily TMAX/TMIN/PRCP summaries for station USW00094728 covering
# 2023-01-01 .. 2023-03-31 and store them as a CSV under BASE_DIR.
#
# A free NOAA token can be requested at https://www.ncdc.noaa.gov/cdo-web/token.
# The token is read from the NOAA_TOKEN environment variable so that no
# credential is stored in (or committed with) the notebook.
NOAA_TOKEN = os.environ.get("NOAA_TOKEN")
BASE_DIR = "../data_raw"  # Replace with your desired output directory

# NOAA API URL
url_weather = (
    "https://www.ncei.noaa.gov/access/services/data/v1?"
    "dataset=daily-summaries&stations=USW00094728&"
    "startDate=2023-01-01&endDate=2023-03-31&format=csv&"
    "dataElements=TMAX,TMIN,PRCP"
)

# Send the token header only when a token is configured.
# NOTE(review): this endpoint is believed to work without a token -- confirm;
# if it does require one, raise_for_status() below reports the failure.
headers = {"token": NOAA_TOKEN} if NOAA_TOKEN else {}

# A timeout stops a stalled connection from hanging the kernel indefinitely
# (requests applies no timeout by default).
response = requests.get(url_weather, headers=headers, timeout=60)
response.raise_for_status()  # Raise error if request fails

# Convert the CSV response body to a DataFrame
weather_df = pd.read_csv(StringIO(response.text))

# Ensure output directory exists
os.makedirs(BASE_DIR, exist_ok=True)

# Save CSV
output_path = os.path.join(BASE_DIR, "weather_2023_Q1.csv")
weather_df.to_csv(output_path, index=False)

print(f"✅ NOAA weather data saved to {output_path}")

✅ NOAA weather data saved to ../data_raw\weather_2023_Q1.csv


# -----------------------------
# 4️⃣ Download OSM (NYC boundaries)
# -----------------------------

In [7]:
# print("Downloading OSM boundaries for NYC...")
# osm_url = "https://data.cityofnewyork.us/api/geospatial/7t3b-ywvw?method=export&format=GeoJSON"
# osm_gdf = gpd.read_file(osm_url)
# osm_gdf.to_file(f"{BASE_DIR}/nyc_osm.geojson", driver='GeoJSON')

# print("✅ OSM NYC boundary data saved.")

# print("🎉 All extraction completed successfully!")


# Destination folder for the boundary file; created if it does not exist yet.
BASE_DIR = "../data_raw"
os.makedirs(BASE_DIR, exist_ok=True)

print("🌍 Downloading NYC boundary using OSMNX...")

# Where the GeoJSON copy of the boundary is written.
boundary_path = f"{BASE_DIR}/nyc_osm.geojson"

# Geocode the place name with OSMnx to obtain a GeoDataFrame, then write it
# out using the GeoJSON driver.
osm_gdf = ox.geocode_to_gdf("New York City, New York, USA")
osm_gdf.to_file(boundary_path, driver="GeoJSON")

print("✅ NYC boundary data saved successfully (from OSMNX).")

🌍 Downloading NYC boundary using OSMNX...
✅ NYC boundary data saved successfully (from OSMNX).


  _init_gdal_data()
