In [2]:
from pathlib import Path

DATA_DIR = Path(r"C:\Users\arpit\Documents\CareerFoundry\Python_dashboard\citibike-dashboard\Data\2022-citibike-tripdata\2022-citibike-tripdata")
OUTPUT_DIR = Path(r"C:\Users\arpit\Documents\CareerFoundry\Python_dashboard\citibike-dashboard\Output")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# look for all .zip files in that folder
zip_files = sorted(DATA_DIR.glob("*.zip"))
len(zip_files), [z.name for z in zip_files[:3]]


(12,
 ['202201-citibike-tripdata.zip',
  '202202-citibike-tripdata.zip',
  '202203-citibike-tripdata.zip'])

In [3]:
import pandas as pd, zipfile

def read_citibike_zip(zippath: Path) -> pd.DataFrame:
    with zipfile.ZipFile(zippath) as zf:
        csv_members = [m for m in zf.namelist() if m.lower().endswith(".csv")]
        if not csv_members:
            raise FileNotFoundError(f"No CSV found inside {zippath.name}")
        with zf.open(csv_members[0]) as f:
            df = pd.read_csv(f, low_memory=False)

    # normalize started_at
    lower = {c.lower(): c for c in df.columns}
    if "started_at" not in df.columns:
        for cand in ("starttime","start_time","started at"):
            if cand in lower:
                df.rename(columns={lower[cand]: "started_at"}, inplace=True)
                break
    if "started_at" not in df.columns:
        raise KeyError(f"'started_at' not found in {zippath.name}")

    df["started_at"] = pd.to_datetime(df["started_at"], errors="coerce", utc=True)
    df["date"] = df["started_at"].dt.date
    return df


In [4]:
all_months = []
for zp in zip_files:
    print("Reading:", zp.name)
    mdf = read_citibike_zip(zp)
    all_months.append(mdf)

df_2022 = pd.concat(all_months, ignore_index=True)
df_2022.shape, df_2022.head()


Reading: 202201-citibike-tripdata.zip
Reading: 202202-citibike-tripdata.zip
Reading: 202203-citibike-tripdata.zip
Reading: 202204-citibike-tripdata.zip
Reading: 202205-citibike-tripdata.zip
Reading: 202206-citibike-tripdata.zip
Reading: 202207-citibike-tripdata.zip
Reading: 202208-citibike-tripdata.zip
Reading: 202209-citibike-tripdata.zip
Reading: 202210-citibike-tripdata.zip
Reading: 202211-citibike-tripdata.zip
Reading: 202212-citibike-tripdata.zip


((8878119, 14),
             ride_id  rideable_type                       started_at  \
 0  63AF72AB3CD47753   classic_bike 2022-01-13 21:36:47.689000+00:00   
 1  9C0DAD8C1E0EA571   classic_bike 2022-01-16 17:56:23.889000+00:00   
 2  9576DDD8920974F5  electric_bike 2022-01-18 07:10:04.799000+00:00   
 3  962A466CC3AC6781   classic_bike 2022-01-22 12:10:10.225000+00:00   
 4  C2585407BA0FE3E9   classic_bike 2022-01-08 16:35:16.497000+00:00   
 
                   ended_at                start_station_name start_station_id  \
 0  2022-01-13 21:46:02.024                   5 Ave & E 63 St          6904.06   
 1  2022-01-16 18:03:50.269  Grand Army Plaza & Plaza St West          4010.15   
 2  2022-01-18 07:20:54.450                  W 20 St & 10 Ave          6306.01   
 3  2022-01-22 12:20:06.899                   W 54 St & 9 Ave          6920.03   
 4  2022-01-08 16:45:33.279              Sharon St & Olive St          5323.05   
 
               end_station_name end_station_id  start_la

In [7]:
from pathlib import Path
from getpass import getpass
import os

# set your repo root (adjust only if your path is different)
repo_root = Path(r"C:\Users\arpit\Documents\CareerFoundry\Python_dashboard\citibike-dashboard")

# prompt for your NOAA token (input is hidden)
token = getpass("Paste your NOAA token here (input is hidden): ")

# set env var for this notebook session
os.environ["NOAA_TOKEN"] = token

# also save a .env file for later use (safe to keep locally; don't push it)
(env_path := repo_root / ".env").write_text(f"NOAA_TOKEN={token}\n", encoding="utf-8")
print(f" Saved token to: {env_path}")
print("Has token:", bool(os.getenv("NOAA_TOKEN")))


Paste your NOAA token here (input is hidden):  ········


✅ Saved token to: C:\Users\arpit\Documents\CareerFoundry\Python_dashboard\citibike-dashboard\.env
Has token: True


In [2]:
#   monthly zips merge with NOAA weather, save 100k sample
import os, zipfile, requests
import pandas as pd
from pathlib import Path
from datetime import datetime
from getpass import getpass

# ---------- Config ----------
YEAR = 2022
STATION_ID = "GHCND:USW00014732"      # LaGuardia
SAMPLE_ROWS = 100_000
# ----------------------------

# Detect project root (works if notebook is in Notebooks/ or repo root)
CWD = Path.cwd().resolve()
cands = [CWD, CWD.parent, CWD.parent.parent, CWD.parent.parent.parent]
PROJECT_ROOT = next((p for p in cands if (p / "Data").exists()), CWD)
DATA_ROOT = PROJECT_ROOT / "Data"
OUTPUT_DIR = PROJECT_ROOT / "Output"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# 1) Find ONLY monthly zips (YYYYMM-citibike-tripdata*.zip) recursively under Data/
all_zips = sorted(DATA_ROOT.glob("**/*.zip"))
monthly_zips = [
    z for z in all_zips
    if len(z.name) >= 10
       and z.name[:6].isdigit()            # YYYYMM...
       and "citibike-tripdata" in z.name.lower()
]
assert monthly_zips, f"No monthly zips like YYYYMM-citibike-tripdata*.zip found under {DATA_ROOT}"
print(f"Found {len(monthly_zips)} monthly ZIPs for {YEAR}. First 3:", [z.name for z in monthly_zips[:3]])

# 2) Helper to read one month ZIP; skip archives that don't actually contain a CSV
def read_citibike_zip(zippath: Path) -> pd.DataFrame:
    with zipfile.ZipFile(zippath) as zf:
        csv_members = [m for m in zf.namelist() if m.lower().endswith(".csv")]
        if not csv_members:
            raise FileNotFoundError(f"No CSV inside {zippath.name}")
        with zf.open(csv_members[0]) as f:
            df = pd.read_csv(f, low_memory=False)

    # normalize started_at
    lower = {c.lower(): c for c in df.columns}
    if "started_at" not in df.columns:
        for cand in ("starttime","start_time","started at"):
            if cand in lower:
                df.rename(columns={lower[cand]: "started_at"}, inplace=True)
                break
    if "started_at" not in df.columns:
        raise KeyError(f"'started_at' not found after normalization for {zippath.name}")

    df["started_at"] = pd.to_datetime(df["started_at"], errors="coerce", utc=True)
    df["date"] = df["started_at"].dt.date
    return df

# 3) Load & combine months (robust: skip any unexpected bad zip)
parts = []
for zp in monthly_zips:
    try:
        print("Reading:", zp.name)
        parts.append(read_citibike_zip(zp))
    except Exception as e:
        print(f"!! Skipping {zp.name}: {e}")
        continue

df_2022 = pd.concat(parts, ignore_index=True)
print("Trips combined shape:", df_2022.shape)

# 4) NOAA token: env -> .env -> prompt once (saved locally for reuse)
token = os.getenv("NOAA_TOKEN")
if not token:
    env_path = PROJECT_ROOT / ".env"
    if env_path.exists():
        for line in env_path.read_text(encoding="utf-8").splitlines():
            if line.strip().startswith("NOAA_TOKEN="):
                token = line.split("=", 1)[1].strip()
                break
if not token:
    token = getpass("Paste your NOAA token (hidden): ")
    (PROJECT_ROOT / ".env").write_text(f"NOAA_TOKEN={token}\n", encoding="utf-8")
assert token, "NOAA token missing."

# 5) Fetch NOAA daily temps (prefer TAVG; fallback to mean(TMIN, TMAX))
BASE_URL = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data"
HEADERS = {"token": token}
def fetch_noaa(datatype, start, end, station=STATION_ID, limit=1000):
    params = {
        "datasetid": "GHCND", "datatypeid": datatype, "stationid": station,
        "startdate": start, "enddate": end, "limit": limit, "units": "metric"
    }
    r = requests.get(BASE_URL, headers=HEADERS, params=params, timeout=30)
    r.raise_for_status()
    return r.json().get("results", [])

start, end = f"{YEAR}-01-01", f"{YEAR}-12-31"
res_tavg = fetch_noaa("TAVG", start, end)

if res_tavg:
    dates = [it["date"] for it in res_tavg]
    vals  = [it["value"] for it in res_tavg]  # tenths of °C
    df_weather = pd.DataFrame({
        "date": [datetime.strptime(x, "%Y-%m-%dT%H:%M:%S").date() for x in dates],
        "avgTemp": [v/10 for v in vals]
    })
else:
    tmin = pd.DataFrame(fetch_noaa("TMIN", start, end))
    tmax = pd.DataFrame(fetch_noaa("TMAX", start, end))
    if tmin.empty or tmax.empty:
        raise RuntimeError("NOAA returned no data for TAVG and TMIN/TMAX.")
    tmin["date"] = pd.to_datetime(tmin["date"]).dt.date
    tmax["date"] = pd.to_datetime(tmax["date"]).dt.date
    tmin["TMIN"] = tmin["value"].astype(float)/10.0
    tmax["TMAX"] = tmax["value"].astype(float)/10.0
    df_weather = tmin.merge(tmax, on="date", how="outer")
    df_weather["avgTemp"] = df_weather[["TMIN", "TMAX"]].mean(axis=1)
    df_weather = df_weather[["date","avgTemp"]]

df_weather = df_weather.sort_values("date").reset_index(drop=True)
print("Weather days:", len(df_weather))

# 6) Merge & save a commit-friendly 100k sample
df_merge = df_2022.merge(df_weather, how="left", on="date")
sample = df_merge.sample(min(SAMPLE_ROWS, len(df_merge)), random_state=42)
sample_path = OUTPUT_DIR / f"citibike_weather_{YEAR}_sample_{SAMPLE_ROWS//1000}k.csv"
sample.to_csv(sample_path, index=False)

print("Merged shape:", df_merge.shape)
print("Saved sample CSV:", sample_path)
try:
    display(sample.head())
except Exception:
    pass


Found 12 monthly ZIPs for 2022. First 3: ['202201-citibike-tripdata.zip', '202202-citibike-tripdata.zip', '202203-citibike-tripdata.zip']
Reading: 202201-citibike-tripdata.zip
Reading: 202202-citibike-tripdata.zip
Reading: 202203-citibike-tripdata.zip
Reading: 202204-citibike-tripdata.zip
Reading: 202205-citibike-tripdata.zip
Reading: 202206-citibike-tripdata.zip
Reading: 202207-citibike-tripdata.zip
Reading: 202208-citibike-tripdata.zip
Reading: 202209-citibike-tripdata.zip
Reading: 202210-citibike-tripdata.zip
Reading: 202211-citibike-tripdata.zip
Reading: 202212-citibike-tripdata.zip
Trips combined shape: (8878119, 14)
Weather days: 365
Merged shape: (8878119, 15)
Saved sample CSV: C:\Users\arpit\Documents\CareerFoundry\Python_dashboard\citibike-dashboard\Output\citibike_weather_2022_sample_100k.csv


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,date,avgTemp
7684352,55D8341CBE041629,classic_bike,2022-11-18 07:22:57.286000+00:00,2022-11-18 07:24:51.457,1 Ave & E 6 St,5626.15,E 6 St & Avenue B,5584.04,40.726331,-73.986169,40.724537,-73.981854,member,2022-11-18,0.45
6968415,B82C3CD4A7162310,classic_bike,2022-10-30 14:58:04.899000+00:00,2022-10-30 15:37:46.486,West St & Liberty St,5184.08,11 Ave & W 59 St,7059.01,40.711444,-74.014847,40.771497,-73.99046,casual,2022-10-30,1.25
8397215,05DE7F6CC9C07083,classic_bike,2022-12-07 20:01:14.344000+00:00,2022-12-07 20:10:56.276,E 78 St & 2 Ave,7057.07,Grand Army Plaza & Central Park S,6839.1,40.772797,-73.955778,40.764397,-73.973715,member,2022-12-07,1.38
6208529,AD2E356505F570DF,electric_bike,2022-09-15 13:57:18.129000+00:00,2022-09-15 14:05:34.197,Suffolk St & Stanton St,5445.02,Ave A & E 14 St,5779.11,40.720525,-73.985271,40.730311,-73.980472,member,2022-09-15,2.18
5232252,2E33A57760B13F8C,electric_bike,2022-08-31 16:23:49.261000+00:00,2022-08-31 16:51:49.143,West St & Chambers St,5329.03,Dock St & Front St,4903.09,40.717661,-74.013138,40.702709,-73.99253,casual,2022-08-31,2.56
