# TODO:
- Prüfen, ob dieser Code stimmt (Rundungen korrekt? Grouping korrekt? Droppen der time-Spalte korrekt? Aggregation korrekt? Merge korrekt? Neue Features korrekt?)
- check wegen timezone
- linreg model implementieren
- linreg model optimieren

In [None]:
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path.
# NOTE(review): this is the project root only if the kernel runs from
# notebooks/ — confirm.
ROOT = Path.cwd().parent
if not sys.path.count(str(ROOT)):
    sys.path.insert(0, str(ROOT))

import pandas as pd
import numpy as np

# Load the two raw inputs; the paths are relative to the current working directory.
weather = pd.read_csv(filepath_or_buffer="../data/weather_hourly.csv")
taxi = pd.read_csv(filepath_or_buffer="../data/Taxi_final_1M_clean.csv")

# 1. Prepare Taxi Data

In [None]:
# Parse the pickup/dropoff columns into timezone-aware (UTC) datetimes.
# NOTE(review): utc=True *labels* timestamps that carry no offset as UTC; it
# does not convert them from local time. If the CSV stores local wall-clock
# times, localize them to the real zone first (.dt.tz_localize) and then
# convert to UTC, otherwise the hourly join with weather is shifted — TODO confirm.
taxi["tpep_pickup_datetime"] = pd.to_datetime(
    taxi["tpep_pickup_datetime"], utc=True
)
taxi["tpep_dropoff_datetime"] = pd.to_datetime(
    taxi["tpep_dropoff_datetime"], utc=True
)

# Truncate (floor, not round-to-nearest) each pickup time to the start of its
# hour; this is the hourly key used for grouping and for the weather merge.
# The lowercase "h" alias is used because the uppercase "H" alias is
# deprecated since pandas 2.2.
taxi["pickup_hour"] = taxi["tpep_pickup_datetime"].dt.floor("h")

# ---------------------------------------------------------
# 2) Aggregation: demand per hour and pickup zone
#    (plus optional extra features)
# ---------------------------------------------------------
# One output row per (pickup_hour, PULocationID) group.
# trip_count is the number of rows in the group (the demand figure); all other
# columns are per-group means of the named source column.
_group_keys = ["pickup_hour", "PULocationID"]
_zone_hour_aggs = {
    "trip_count": ("PULocationID", "size"),
    "mean_distance": ("trip_distance", "mean"),
    "mean_duration": ("trip_duration", "mean"),
    "mean_speed": ("average_speed", "mean"),
    "mean_fare": ("fare_amount", "mean"),
    "mean_tip": ("tip_amount", "mean"),
    "mean_passengers": ("passenger_count", "mean"),
}
# reset_index() turns the group keys back into ordinary columns.
taxi_grouped = taxi.groupby(_group_keys).agg(**_zone_hour_aggs).reset_index()

# 2. Prepare Weather Data

In [None]:
# ---------------------------------------------------------
# 3) Weather data: enforce column types
# ---------------------------------------------------------

# Parse the time column into timezone-aware (UTC) datetimes.
weather["time"] = pd.to_datetime(weather["time"], utc=True)

# Every column except `time` is treated as numeric. Values that cannot be
# parsed as numbers become NaN (errors="coerce") instead of raising.
num_cols = weather.columns.drop("time")
weather[num_cols] = weather[num_cols].apply(pd.to_numeric, errors="coerce")

print("Weather shape:", weather.shape)

# 3. Merge Taxi x Weather

In [None]:
# ---------------------------------------------------------
# 4) Merge: hourly taxi demand per zone  x  hourly weather
# ---------------------------------------------------------
# Weather is treated as citywide: every zone in the same hour receives the
# same weather row. how="left" keeps every taxi row; hours without a matching
# weather timestamp get NaN in the weather columns.
# validate="many_to_one" makes pandas raise a MergeError if `time` is not
# unique in `weather`, instead of silently duplicating taxi rows.
df = taxi_grouped.merge(
    weather,
    left_on="pickup_hour",
    right_on="time",
    how="left",
    validate="many_to_one",
)

# After the merge, weather's `time` is redundant with `pickup_hour`
# (or NaT where no weather row matched), so drop it.
df = df.drop(columns=["time"])

print("Merged shape:", df.shape)

# 4. Create Features for model

In [None]:
# ---------------------------------------------------------
# 5) Calendar/time features for the model
# ---------------------------------------------------------
# NOTE(review): hour and weekday are read from pickup_hour as stored, which is
# UTC. If local-time demand patterns are wanted, convert with .dt.tz_convert
# before extracting them — confirm.
_pickup = df["pickup_hour"].dt
df["hour"] = _pickup.hour
df["dow"] = _pickup.dayofweek  # Monday=0 ... Sunday=6
df["is_weekend"] = (df["dow"] >= 5).astype(int)  # Saturday=5, Sunday=6

# Cyclic encoding: map hour (period 24) and weekday (period 7) onto the unit
# circle so that the last and first values of each cycle end up adjacent.
_hour_angle = 2 * np.pi * df["hour"] / 24
_dow_angle = 2 * np.pi * df["dow"] / 7
df["hour_sin"] = np.sin(_hour_angle)
df["hour_cos"] = np.cos(_hour_angle)
df["dow_sin"] = np.sin(_dow_angle)
df["dow_cos"] = np.cos(_dow_angle)

# ---------------------------------------------------------
# 6) Optional: persist the merged dataset for later modelling
# ---------------------------------------------------------
# index=False keeps the DataFrame's row index out of the file.
df.to_csv(path_or_buf="../data/taxi_weather_zone_hourly.csv", index=False)