# 0. Überblick und Setup
In diesem Notebook gehen wir auf das Vorlesungskapitel 2 mit den Inhalten **Lage + Streuung + Verteilungsform** ein.

In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys, os
sys.path.append(os.path.abspath("."))
from src.utils_stats import *
from src.utils_plots import *

PATH = "../data/Taxi_final_1M.csv"
df = pd.read_csv(PATH)

print(80*"=")
print(f"Datensatz geladen: {df.shape[0]:,} Zeilen, {df.shape[1]} Spalten")
print(80*"-")
print("Erste Zeilen:", "\n", df.head(3))
print(80*"-")
print(df.describe())
print(80*"-")
print(df.info())
print(80*"=")

Datensatz geladen: 999,997 Zeilen, 17 Spalten
--------------------------------------------------------------------------------
Erste Zeilen: 
   tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  trip_distance  \
0  2023-02-06 18:31:28   2023-02-06 18:41:28              1.0            2.8   
1  2023-01-13 12:22:41   2023-01-13 12:54:42              1.0            0.0   
2  2023-01-24 12:53:51   2023-01-24 13:06:09              5.0            2.3   

   PULocationID  DOLocationID  payment_type  fare_amount  extra  tip_amount  \
0           162           113             1         11.4    2.5         2.0   
1            76            61             1         22.0    0.0         0.0   
2           142            48             2         12.1    0.0         0.0   

   tolls_amount  improvement_surcharge  total_amount  congestion_surcharge  \
0           0.0                    1.0          19.9                   2.5   
1           0.0                    1.0          23.5           

# 1. Lagekennzahlen

In [13]:
vars_num = ["trip_distance","trip_duration","average_speed","fare_amount","tip_amount","total_amount"]

def location_row(s):
    s = s.dropna()
    return pd.Series({
        "n": s.size,
        "Median": s.median(),
        "Mean": s.mean(),
        "TrimmedMean(10%)": trimmed_mean(s, 0.10)
    })

location_summary = pd.concat([location_row(df[v]).rename(v) for v in vars_num], axis=1).T
display(location_summary.style.format("{:,.2f}"))

Unnamed: 0,n,Median,Mean,TrimmedMean(10%)
trip_distance,999997.0,2.9,6.38,3.74
trip_duration,999997.0,12.6,17.45,14.13
average_speed,999677.0,24.8,inf,26.43
fare_amount,999997.0,13.5,19.46,15.65
tip_amount,999997.0,2.8,3.51,2.46
total_amount,999997.0,21.0,28.39,23.64


# 2. Streuungskennzahlen

In [14]:
def spread_summary(s):
    s = s.dropna()
    q1, q3 = s.quantile([.25, .75])
    IQR = q3 - q1
    MAD = (s - s.median()).abs().median()
    return pd.Series({
        "IQR": IQR,
        "MAD": MAD,
        "Var": np.var(s, ddof=1),
        "SD": np.std(s, ddof=1)
    })

spread_tbl = pd.concat([spread_summary(df[v]).rename(v) for v in vars_num], axis=1).T
display(spread_tbl)

Unnamed: 0,IQR,MAD,Var,SD
trip_distance,3.8,1.5,47555.684075,218.072658
trip_duration,13.1,5.9,1675.772949,40.936206
average_speed,16.3,7.7,,
fare_amount,12.6,5.6,361.109557,19.002883
tip_amount,3.4,1.8,16.998631,4.12294
total_amount,14.77,6.3,561.08871,23.687311


# 3. Ausreißer erkennen (Diagnose)

In [12]:
def tukey_fences(s, k=1.5):
    s = s.dropna()
    q1, q3 = s.quantile([.25,.75])
    I = q3 - q1
    return q1 - k * I, q3 + k * I

def modified_z(s):
    s = s.dropna()
    MAD = (s - s.median()).abs().median()
    if MAD == 0: return pd.Series(0.0, index=s.index)
    return 0.6745 * (s - s.median()) / MAD

rows = []
for v in vars_num:
    s = df[v].dropna()
    lo, hi = tukey_fences(s, 1.5)
    tukey_mask = (s < lo) | (s > hi)
    mz = modified_z(s).abs() > 3.5

    # Flags im DF (für Kap. 4)
    df[f"{v}__out_tukey"] = False
    df.loc[s.index, f"{v}__out_tukey"] = tukey_mask
    df[f"{v}__out_modz"]  = False
    df.loc[s.index, f"{v}__out_modz"]  = mz

    rows.append([v, s.size, int(tukey_mask.sum()), int(mz.sum()), lo, hi])

outlier_df = pd.DataFrame(rows, columns=["var","n","Tukey_1.5IQR","modZ>|3.5|","lo","hi"])
display(outlier_df.style.format({"lo":"{:.2f}","hi":"{:.2f}"}))

Unnamed: 0,var,n,Tukey_1.5IQR,modZ>|3.5|,lo,hi
0,trip_distance,999997,130715,136426,-4.0,11.2
1,trip_duration,999997,59672,49586,-12.05,40.35
2,average_speed,999677,68599,51124,-6.25,58.95
3,fare_amount,999997,108910,101557,-9.6,40.8
4,tip_amount,999997,76499,47659,-4.1,9.5
5,total_amount,999997,125095,120809,-6.21,52.88


# 4. Verteilungsform (Histogramm + KDE)

In [None]:
for v,u in [("trip_distance","miles"),
            ("trip_duration","min"),
            ("average_speed","mph"),
            ("fare_amount","USD"),
            ("tip_amount","USD"),
            ("total_amount","USD")]:
    hist_kde(df[v], title=v, unit=u, bw_adjust_list=(0.7, 1.0, 1.8))
    box_violin(df[v], title=v)
    plot_ecdf(df[v], title=v, unit=u)