In [None]:
# Mount our Google Drive so we can use for storage

from google.colab import drive
drive.mount("/content/drive", force_remount=True)   # opens an auth popup

Mounted at /content/drive


In [None]:
# Create a new directory in our Drive for our dataset

%%bash
DATA_DIR="/content/drive/MyDrive/datasets/nyc_taxi"
mkdir -p "$DATA_DIR"

In [None]:
# Download our dataset from Kaggle and save it in Google Drive to avoid redownloading

from google.colab import drive
drive.mount("/content/drive")

import os, kagglehub, pathlib

# 1️⃣  Tell kagglehub to cache inside the Drive folder you made
os.environ["KAGGLEHUB_CACHE"] = "/content/drive/MyDrive/datasets/nyc_taxi"

# 2️⃣  Download the dataset (runs only once!)
path = kagglehub.dataset_download("jeffsinsel/nyc-fhvhv-data")
print("Files landed in:", path)

Mounted at /content/drive
Files landed in: /content/drive/MyDrive/datasets/nyc_taxi/datasets/jeffsinsel/nyc-fhvhv-data/versions/4


In [None]:
# Take a look at all the files we have after downloading our data

import os, pathlib
from google.colab import drive
drive.mount("/content/drive")

DATA_DIR = "/content/drive/MyDrive/datasets/nyc_taxi/datasets/jeffsinsel/nyc-fhvhv-data"

# 1) Verify the directory really exists
assert os.path.exists(DATA_DIR), f"⚠️  Path does not exist: {DATA_DIR}"

# 2) Recursively gather all files (skip sub-dirs)
all_files = [str(p) for p in pathlib.Path(DATA_DIR).rglob("*") if p.is_file()]

print(f"Found {len(all_files)} files")
for f in all_files[:100]:          # show the first 20
    print("  •", f)

Found 63 files
  • /content/drive/MyDrive/datasets/nyc_taxi/datasets/jeffsinsel/nyc-fhvhv-data/4.complete
  • /content/drive/MyDrive/datasets/nyc_taxi/datasets/jeffsinsel/nyc-fhvhv-data/versions/4/data_dictionary_trip_records_hvfhs.pdf
  • /content/drive/MyDrive/datasets/nyc_taxi/datasets/jeffsinsel/nyc-fhvhv-data/versions/4/fhvhv_tripdata_2019-02.parquet
  • /content/drive/MyDrive/datasets/nyc_taxi/datasets/jeffsinsel/nyc-fhvhv-data/versions/4/fhvhv_tripdata_2019-03.parquet
  • /content/drive/MyDrive/datasets/nyc_taxi/datasets/jeffsinsel/nyc-fhvhv-data/versions/4/fhvhv_tripdata_2019-04.parquet
  • /content/drive/MyDrive/datasets/nyc_taxi/datasets/jeffsinsel/nyc-fhvhv-data/versions/4/fhvhv_tripdata_2019-05.parquet
  • /content/drive/MyDrive/datasets/nyc_taxi/datasets/jeffsinsel/nyc-fhvhv-data/versions/4/fhvhv_tripdata_2019-06.parquet
  • /content/drive/MyDrive/datasets/nyc_taxi/datasets/jeffsinsel/nyc-fhvhv-data/versions/4/fhvhv_tripdata_2019-07.parquet
  • /content/drive/MyDrive/datas

In [None]:
# Merge together all of our .parquet files to have 1 dataset that we can use

!pip install -q "polars[all]==1.27.1"

import polars as pl, pathlib, time, os

# We'll grab the files from our drive and save them there as well
SRC_DIR   = "/content/drive/MyDrive/datasets/nyc_taxi/datasets/jeffsinsel/nyc-fhvhv-data/versions/4"
DEST_FILE = "/content/drive/MyDrive/datasets/nyc_taxi/fhvhv_all_years.zstd.parquet"

files = [str(p) for p in pathlib.Path(SRC_DIR).rglob("*.parquet")]
print("Shards:", len(files))

lazy_frames = []
for f in files:
    lf = pl.scan_parquet(f)

    # add or cast wav_match_flag so every shard has Utf8
    if "wav_match_flag" in lf.columns:
        lf = lf.with_columns(pl.col("wav_match_flag").cast(pl.Utf8))
    else:
        lf = lf.with_columns(pl.lit(None, dtype=pl.Utf8).alias("wav_match_flag"))

    lazy_frames.append(lf)

t0 = time.time()
(pl.concat(lazy_frames, how="diagonal_relaxed")      # tolerate missing cols
   .sink_parquet(DEST_FILE, compression="zstd"))     # streaming write
print(f"✅  written in {time.time()-t0:.1f}s ; size {os.path.getsize(DEST_FILE)/1e9:.2f} GB")


Shards: 46


  if "wav_match_flag" in lf.columns:


✅  written in 350.7s ; size 20.53 GB


In [None]:
# Load full dataframe into memory, take a peek at our data to understand it,
# then we can do our feature engineering, etc., afterwards

# ╔════════════════════════════════════════════════════════════╗
# ║  EAGER PEEK  (336 GB RAM, v2-8 TPU)                        ║
# ╚════════════════════════════════════════════════════════════╝
from google.colab import drive
drive.mount("/content/drive")

import os, subprocess, time, psutil, polars as pl

PARQ_DRIVE = "/content/drive/MyDrive/datasets/nyc_taxi/fhvhv_all_years.zstd.parquet"
PARQ_LOCAL = "/content/fhvhv_all_years.zstd.parquet"   # SSD copy

# ── 1️⃣ Copy to SSD with progress (only if needed) ────────────────────────
if (not os.path.exists(PARQ_LOCAL) or
        os.path.getsize(PARQ_LOCAL) != os.path.getsize(PARQ_DRIVE)):
    print("➤  Copying Parquet from Drive to SSD …")
    t0 = time.time()
    subprocess.run([
        "rsync", "-ah", "--info=progress2", "--no-inc-recursive",
        PARQ_DRIVE, PARQ_LOCAL
    ], check=True)
    print(f"   ✅  Copied in {time.time()-t0:.1f}s\n")
else:
    print("✔  Parquet already on SSD — skipping copy\n")

# ── 2️⃣ Read entire file eagerly (multithreaded) ──────────────────────────
t0 = time.time()
df = pl.read_parquet(PARQ_LOCAL, low_memory=False)    # uses all CPU cores
print(f"Loaded full table in {time.time()-t0:.1f}s | rows={len(df):,}")

# ── 3️⃣ Configure Polars console for full-width display ───────────────────
pl.Config.set_tbl_cols(100)
pl.Config.set_tbl_rows(10)
pl.Config.set_fmt_str_lengths(40)

print("\n── First 5 rows ─────────────────────────────────────────────")
print(df.head(5))

print("\n── Null counts ──────────────────────────────────────────────")
print(df.null_count())

print("\n── Numeric describe() ───────────────────────────────────────")
print(df.select(pl.col(pl.NUMERIC_DTYPES)).describe())

rss = psutil.Process().memory_info().rss / 1e9
print(f"\n✅  Done | RAM in use {rss:.1f} GB")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
➤  Copying Parquet from Drive to SSD …
   ✅  Copied in 1473.7s

Loaded full table in 16.5s | rows=745,287,023

── First 5 rows ─────────────────────────────────────────────
shape: (5, 24)
┌─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┐
│ hvf ┆ dis ┆ ori ┆ req ┆ on_ ┆ pic ┆ dro ┆ PUL ┆ DOL ┆ tri ┆ tri ┆ bas ┆ tol ┆ bcf ┆ sal ┆ con ┆ air ┆ tip ┆ dri ┆ sha ┆ sha ┆ acc ┆ wav ┆ wav │
│ hs_ ┆ pat ┆ gin ┆ ues ┆ sce ┆ kup ┆ pof ┆ oca ┆ oca ┆ p_m ┆ p_t ┆ e_p ┆ ls  ┆ --- ┆ es_ ┆ ges ┆ por ┆ s   ┆ ver ┆ red ┆ red ┆ ess ┆ _re ┆ _ma │
│ lic ┆ chi ┆ ati ┆ t_d ┆ ne_ ┆ _da ┆ f_d ┆ tio ┆ tio ┆ ile ┆ ime ┆ ass ┆ --- ┆ f64 ┆ tax ┆ tio ┆ t_f ┆ --- ┆ _pa ┆ _re ┆ _ma ┆ _a_ ┆ que ┆ tch │
│ ens ┆ ng_ ┆ ng_ ┆ ate ┆ dat ┆ tet ┆ ate ┆ nID ┆ nID ┆ s   ┆ --- ┆ eng ┆ f64 ┆     ┆ --- ┆ n_s ┆ e

  print(df.select(pl.col(pl.NUMERIC_DTYPES)).describe())


shape: (9, 13)
┌─────┬─────┬─────┬────────┬───────┬───────┬───────┬───────┬───────┬───────┬───────┬───────┬───────┐
│ sta ┆ PUL ┆ DOL ┆ trip_m ┆ trip_ ┆ base_ ┆ tolls ┆ bcf   ┆ sales ┆ conge ┆ airpo ┆ tips  ┆ drive │
│ tis ┆ oca ┆ oca ┆ iles   ┆ time  ┆ passe ┆ ---   ┆ ---   ┆ _tax  ┆ stion ┆ rt_fe ┆ ---   ┆ r_pay │
│ tic ┆ tio ┆ tio ┆ ---    ┆ ---   ┆ nger_ ┆ f64   ┆ f64   ┆ ---   ┆ _surc ┆ e     ┆ f64   ┆ ---   │
│ --- ┆ nID ┆ nID ┆ f64    ┆ f64   ┆ fare  ┆       ┆       ┆ f64   ┆ harge ┆ ---   ┆       ┆ f64   │
│ str ┆ --- ┆ --- ┆        ┆       ┆ ---   ┆       ┆       ┆       ┆ ---   ┆ f64   ┆       ┆       │
│     ┆ f64 ┆ f64 ┆        ┆       ┆ f64   ┆       ┆       ┆       ┆ f64   ┆       ┆       ┆       │
╞═════╪═════╪═════╪════════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡
│ cou ┆ 7.4 ┆ 7.4 ┆ 7.4528 ┆ 7.452 ┆ 7.452 ┆ 7.452 ┆ 7.452 ┆ 7.452 ┆ 7.447 ┆ 3.308 ┆ 7.452 ┆ 7.452 │
│ nt  ┆ 528 ┆ 528 ┆ 7023e8 ┆ 87023 ┆ 87023 ┆ 87023 ┆ 87023 ┆ 87023 ┆ 73982 ┆

In [1]:
# prompt: Write code to export 'df.head(5)' as a csv file

df.head(5).to_csv('df_head.csv', index=False)


NameError: name 'df' is not defined

In [None]:
# Clean and add target column (target = pre-tip total)
import polars as pl, numpy as np

# In-RAM df from your peek
df = df.filter(             # a) remove impossible rows
        (pl.col("base_passenger_fare") >= 0) &
        (pl.col("trip_miles") > 0) &
        (pl.col("trip_miles") < 200) &
        (pl.col("trip_time") > 60) &
        (pl.col("trip_time") < 4*60*60)          # <4h
     ).with_columns(        # b) target = pre-tip total
        (
            pl.col("base_passenger_fare") + pl.col("tolls") +
            pl.col("bcf") + pl.col("sales_tax") +
            pl.col("congestion_surcharge") + pl.col("airport_fee")
        ).alias("target_amount")
     )

# c) Drop zones with <300 trips
zone_counts = df.group_by("PULocationID").len()
valid = zone_counts.filter(pl.col("len") >= 300)["PULocationID"]
df = df.filter(
        pl.col("PULocationID").is_in(valid) &
        pl.col("DOLocationID").is_in(valid)
     )

print("Rows after clean:", len(df))

NameError: name 'df' is not defined

In [None]:
from google.colab import auth
auth.authenticate_user()                      # OAuth popup once

In [None]:
!gcloud config set project nyc-taxi-ml

Updated property [core/project].


In [None]:
BUCKET="nyc-taxi-fhv-460946772036"            # your bucket name
SRC="/content/fhvhv_all_years.zstd.parquet"

# -m  : multi-threaded
# -o  : enable parallel composite uploads for files > 150 MB
!gsutil -m -o "GSUtil:parallel_composite_upload_threshold=150M" \
      cp $SRC gs://$BUCKET/

Copying file:///content/fhvhv_all_years.zstd.parquet [Content-Type=application/octet-stream]...


In [None]:
!gsutil -m cp /content/fhvhv_all_years.zstd.parquet gs://nyc-taxi-fhv-460946772036/

Copying file:///content/fhvhv_all_years.zstd.parquet [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/ [0/1 files][    0.0 B/ 19.1 GiB]   0% Done                                    ResumableUploadAbortException: 401 Anonymous caller does not have storage.objects.create access to the Google Cloud Storage object. Permission 'sto