In [13]:
import os
import re
import requests
import logging
import numpy as np
import pandas as pd


BASE_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data"
MONTHS = ["2025-04"]
CAB_TYPES = ["yellow", "green"]
LOCAL_DIR = "../nyc_tlc_data"
COLUMNS = [
    "driver_id",
    "pu_location_id",
    "do_location_id",
    "vendor_id",
    "ratecode_id",
    "pickup_datetime",
    "dropoff_datetime",
    "trip_distance",
    "fare_amount",
    "total_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "ehail_fee",
    "improvement_surcharge",
    "congestion_surcharge",
    "cbd_congestion_fee",
    "passenger_count",
    "payment_type",
    "trip_type",
    "cab_type",
    "store_and_fwd_flag",
]


def download_file(cab: str, month: str):
    file_name = f"{cab}_tripdata_{month}.parquet"
    local_path = os.path.join(LOCAL_DIR, file_name)

    if not os.path.exists(local_path):
        url = f"{BASE_URL}/{file_name}"
        resp = requests.get(url, stream=True)
        resp.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    else:
        logging.warning(f"File {local_path} already exists. Skipping download.")
    return local_path


def prepare_month(file_path: str):

    df_iter = pd.read_parquet(file_path, engine="pyarrow")

    # Генерируем driver_id
    np.random.seed(42)
    df_iter["driver_id"] = np.random.randint(1, 1001, size=len(df_iter))

    # Приводим pickup/dropoff к datetime
    df_iter["pickup_datetime"] = pd.to_datetime(
        df_iter.get("lpep_pickup_datetime", df_iter.get("tpep_pickup_datetime")),
        errors="coerce",
    )
    df_iter["dropoff_datetime"] = pd.to_datetime(
        df_iter.get("lpep_dropoff_datetime", df_iter.get("tpep_dropoff_datetime")),
        errors="coerce",
    )

    # Дропаем строки с некорректными датами
    df_iter = df_iter.dropna(subset=["pickup_datetime", "dropoff_datetime"])

    # Функция для конвертации в snake_case
    def to_snake_case(name):
        name = re.sub(r"([A-Z]+)", r"_\1", name).lower()
        name = re.sub(r"^_", "", name)  # удаляем ведущий _
        return name

    df_iter.columns = [to_snake_case(col) for col in df_iter.columns]

    # Все колонки, содержащие "_id" — UInt32
    for col in df_iter.columns:
        if "_id" in col:
            df_iter[col] = df_iter[col].astype("UInt32")

    # passenger_count — UInt8
    if "passenger_count" in df_iter.columns:
        df_iter["passenger_count"] = df_iter["passenger_count"].astype("UInt8")

    # payment_type и trip_type — UInt8
    for col in ["payment_type", "trip_type"]:
        if col in df_iter.columns:
            df_iter[col] = df_iter[col].astype("UInt8")

    # Остальные числовые поля — float32
    float_cols = [
        "trip_distance",
        "fare_amount",
        "total_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "ehail_fee",
        "improvement_surcharge",
        "congestion_surcharge",
        "cbd_congestion_fee",
    ]
    for col in float_cols:
        if col in df_iter.columns:
            df_iter[col] = df_iter[col].astype("float32")

    # Строковые поля
    str_cols = ["cab_type", "store_and_fwd_flag"]
    for col in str_cols:
        if col in df_iter.columns:
            df_iter[col] = df_iter[col].astype("string")

    # Сохраняем только существующие колонки
    df_iter = df_iter[[col for col in COLUMNS if col in df_iter.columns]]

    return df_iter


for cab in CAB_TYPES:
    for month in MONTHS:
        file_path = download_file(cab, month)
        df = prepare_month(file_path)
        # display(df.head())
        df.info()
        print(df.isna().sum())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3970553 entries, 0 to 3970552
Data columns (total 18 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   driver_id              UInt32        
 1   vendor_id              UInt32        
 2   ratecode_id            UInt32        
 3   pickup_datetime        datetime64[us]
 4   dropoff_datetime       datetime64[us]
 5   trip_distance          float32       
 6   fare_amount            float32       
 7   total_amount           float32       
 8   extra                  float32       
 9   mta_tax                float32       
 10  tip_amount             float32       
 11  tolls_amount           float32       
 12  improvement_surcharge  float32       
 13  congestion_surcharge   float32       
 14  cbd_congestion_fee     float32       
 15  passenger_count        UInt8         
 16  payment_type           UInt8         
 17  store_and_fwd_flag     string        
dtypes: UInt32(3), UInt8(2)