In [1]:
import sys
from pathlib import Path

ROOT = Path.cwd().parent          # eine Ebene über notebooks/
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

import pandas as pd
import numpy as np

weather = pd.read_csv("../data/weather_hourly_clean.csv")
taxi = pd.read_csv("../data/Taxi_final_1M_clean.csv")

# 1. Prepare Taxi-Data

In [2]:
# Zeitspalten in echte Datumswerte umwandeln (UTC)
#    Daten werden nicht auf UTC standardisiert, sondern in der lokalen Zeitzone belassen, um 
#    leichtere interpretiertbarkeit zu gewährleisten

taxi["tpep_pickup_datetime"]  = pd.to_datetime(taxi["tpep_pickup_datetime"]) # ohne utc=True
taxi["tpep_dropoff_datetime"] = pd.to_datetime(taxi["tpep_dropoff_datetime"]) # ohne utc=True

# Auf ganze Stunde runden
taxi["pickup_hour"] = taxi["tpep_pickup_datetime"].dt.floor("H")

# ---------------------------------------------------------
# 2) Aggregation: Nachfrage pro Stunde und Zone
#    (plus optionale Zusatzfeatures)
# ---------------------------------------------------------
taxi_grouped = (
    taxi
    .groupby(["pickup_hour", "PULocationID"])
    .agg(
        trip_count      = ("PULocationID", "size"),
        mean_distance   = ("trip_distance", "mean"),
        mean_duration   = ("trip_duration", "mean"),
        mean_speed      = ("average_speed", "mean"),
        mean_fare       = ("fare_amount", "mean"),
        mean_tip        = ("tip_amount", "mean"),
        mean_passengers = ("passenger_count", "mean")
    )
    .reset_index()
)

# 1a) Stadtweite Aggregation pro Stunde
city = (
    taxi_grouped
    .groupby("pickup_hour", as_index=False)
    .agg(
        trip_count    = ("trip_count", "sum"),   # Summe aller Trips in der Stadt
        mean_distance = ("mean_distance", "mean"),
        mean_duration = ("mean_duration", "mean"),
        mean_speed    = ("mean_speed", "mean"),
        mean_fare     = ("mean_fare", "mean"),
        mean_tip      = ("mean_tip", "mean"),
        mean_passengers = ("mean_passengers", "mean"),
    )
)

print("City shape:", city.shape)

  taxi["pickup_hour"] = taxi["tpep_pickup_datetime"].dt.floor("H")


City shape: (8749, 8)


# 2. Prepare Weather Data

In [3]:
# ---------------------------------------------------------
# 3) Wetterdaten laden (stündlich, bereits bereinigt)
# ---------------------------------------------------------

# time-Spalte in datetime (UTC) umwandeln, falls nicht schon geschehen --> in unserem Fall nicht benötigt,
# da Taxi Daten bereits in America/New_York sind und die Wetter Daten in der gleichen Zone gefetcht wurden
weather["time"] = pd.to_datetime(weather["time"]) # ohne utc=True

num_cols = weather.columns.drop("time")
weather[num_cols] = weather[num_cols].apply(
    lambda s: pd.to_numeric(s, errors="coerce")
)

print("Weather shape:", weather.shape)

Weather shape: (8760, 14)


# 3. Merge Taxi x Weather

In [4]:
# --------------------------------------------------------------
# 4) Merge: Taxi-Nachfrage pro Stunde+Zone  ×  Wetter pro Stunde
# --------------------------------------------------------------
# 3. Merge Taxi x Weather (Stadt + Stunde)
df = city.merge(
    weather,
    left_on="pickup_hour",
    right_on="time",
    how="left",
)

df = df.drop(columns=["time"])
print("Merged shape (city-level):", df.shape)

Merged shape (city-level): (8749, 21)


# 4. Create Features for model

In [5]:
# 4. Create Features for model
df["hour"] = df["pickup_hour"].dt.hour
df["dow"] = df["pickup_hour"].dt.dayofweek
df["is_weekend"] = df["dow"].isin([5, 6]).astype(int)

df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
df["dow_sin"]  = np.sin(2 * np.pi * df["dow"] / 7)
df["dow_cos"]  = np.cos(2 * np.pi * df["dow"] / 7)

df.to_csv("../data/taxi_weather_merged-2.csv", index=False)