# Part 1: Data Ingestion
## Programmatic Download (5 marks)

In [27]:
pip install pandas polars duckdb pyarrow

Note: you may need to restart the kernel to use updated packages.


In [28]:
import os
import sys
import requests
import pandas as pd
import polars as pl
from datetime import datetime
import time
from pathlib import Path

In [29]:
import requests

TRIP_DATA_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet"
ZONE_LOOKUP_URL = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"

DATA_DIR = Path("data/raw")
TRIP_DATA_PATH = DATA_DIR / "yellow_tripdata_2024-01.parquet"
ZONE_LOOKUP_PATH = DATA_DIR / "taxi_zone_lookup.csv"

In [30]:
def download_file(url: str, destination: Path, max_retries: int = 3, chunk_size: int = 8192):
    for attempt in range(max_retries):
        try:
            print(f"Downloading {url}...")
            print(f"Attempt {attempt + 1} of {max_retries}")
            
            # Start timer
            start_time = time.time()
            
            # Stream download with progress
            response = requests.get(url, stream=True, timeout=30)
            response.raise_for_status()
            
            # Get total file size
            total_size = int(response.headers.get('content-length', 0))
            
            # Download with progress
            downloaded = 0
            with open(destination, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        
                        # Print progress every 5MB
                        if total_size > 0 and downloaded % (5 * 1024 * 1024) < chunk_size:
                            percent = (downloaded / total_size) * 100
                            mb_downloaded = downloaded / (1024 * 1024)
                            mb_total = total_size / (1024 * 1024)
                            print(f"  Progress: {mb_downloaded:.1f}/{mb_total:.1f} MB ({percent:.1f}%)")
            
            # Calculate download time
            download_time = time.time() - start_time
            file_size_mb = destination.stat().st_size / (1024 * 1024)
            speed = file_size_mb / download_time if download_time > 0 else 0
            
            print(f"✓ Download completed: {file_size_mb:.2f} MB in {download_time:.1f}s ({speed:.1f} MB/s)")
            return True
            
        except requests.exceptions.RequestException as e:
            print(f"✗ Download failed (attempt {attempt + 1}): {str(e)}")
            if attempt < max_retries - 1:
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"  Waiting {wait_time}s before retry...")
                time.sleep(wait_time)
            else:
                print(f"✗ All retry attempts failed for {url}")
                return False
    
    return False

In [31]:
# Download trip data
print("="*60)
print("DOWNLOADING NYC TAXI TRIP DATA")
print("="*60)

trip_download_success = download_file(TRIP_DATA_URL, TRIP_DATA_PATH)
if not trip_download_success:
    print("✗ Failed to download trip data. Exiting...")
    sys.exit(1)

print("\n" + "="*60)
print("DOWNLOADING TAXI ZONE LOOKUP DATA")
print("="*60)

zone_download_success = download_file(ZONE_LOOKUP_URL, ZONE_DATA_PATH)
if not zone_download_success:
    print("✗ Failed to download zone data. Exiting...")
    sys.exit(1)

DOWNLOADING NYC TAXI TRIP DATA
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet...
Attempt 1 of 3


FileNotFoundError: [Errno 2] No such file or directory: 'data\\raw\\yellow_tripdata_2024-01.parquet'