# 0. Überblick und Setup
In diesem Notebook gehen wir auf das Vorlesungskapitel 2 mit den Inhalten **Lage + Streuung + Verteilungsform** ein.

In [7]:
import sys
from pathlib import Path

ROOT = Path.cwd().parent          # eine Ebene über notebooks/
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore, median_abs_deviation
from src.utils_stats import iqr, trimmed_mean, tukey_outliers, modified_z_score, ecdf, z_score, create_na_table
from src.utils_plots import hist_kde, box_violin, plot_ecdf, hist_with_fences, plot_qq


PATH = "../data/Taxi_final_1M.csv"
df = pd.read_csv(PATH)

print(80*"=")
print(f"Datensatz geladen: {df.shape[0]:,} Zeilen, {df.shape[1]} Spalten")
print(80*"-")
print("Erste Zeilen:", "\n", df.head(3))
print(80*"-")
print(df.describe())
print(80*"-")
print(df.info())
print(80*"=")

Datensatz geladen: 999,997 Zeilen, 17 Spalten
--------------------------------------------------------------------------------
Erste Zeilen: 
   tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  trip_distance  \
0  2023-02-06 18:31:28   2023-02-06 18:41:28              1.0            2.8   
1  2023-01-13 12:22:41   2023-01-13 12:54:42              1.0            0.0   
2  2023-01-24 12:53:51   2023-01-24 13:06:09              5.0            2.3   

   PULocationID  DOLocationID  payment_type  fare_amount  extra  tip_amount  \
0           162           113             1         11.4    2.5         2.0   
1            76            61             1         22.0    0.0         0.0   
2           142            48             2         12.1    0.0         0.0   

   tolls_amount  improvement_surcharge  total_amount  congestion_surcharge  \
0           0.0                    1.0          19.9                   2.5   
1           0.0                    1.0          23.5           

# 1. Lagekennzahlen

In [8]:
num_cols = df.select_dtypes(include="number")
lagekennzahlen_df = pd.DataFrame({
    "n": num_cols.count(),
    "Mean": num_cols.mean(),
    "Median": num_cols.median(),
    "Modus": num_cols.apply(lambda s: ", ".join(map(str, s.mode().tolist()))), # Wenn es mehrere Modi gibt diese als comma-separated Liste anzeigen
    "Trimmed Mean with 10%": num_cols.apply(lambda x: trimmed_mean(x)),
    "Min": num_cols.min(), 
    "Q1": num_cols.quantile(0.25),
    "Q3": num_cols.quantile(0.75),
    "Max": num_cols.max() 

})
format = {
    c: "{:.2f}"
    for c in lagekennzahlen_df.columns
    if pd.api.types.is_numeric_dtype(lagekennzahlen_df[c])
}

print(115*"=")
print("Überblick über fehlende Werte nach Spalte:")
display(create_na_table(num_cols))
print("\n")

print(115*"-")
print("Überblick über Lagekennzahlen nach Spalte:")
display(lagekennzahlen_df.style.format(format))
print(115*"=")

#plot_hist_with_trimmed_mean(s, alpha=0.10, bins=20,title="Getrimmter Mittelwert ist robuster gegen Ausreisser")

#plot_hist_with_outlier(s, outlier_value=100, bins=12, title="Histogramm mit Ausreisser, Mittelwert & Median")

#plot_hist_no_outlier(s, bins=20, title="Ohne Ausreisser")
#plot_hist_with_outlier_only(s, outlier_value=120, bins=20, title="Mit Ausreisser")

Überblick über fehlende Werte nach Spalte:


Unnamed: 0,column,n_missing,percent_missing
0,passenger_count,34408,3.4
1,trip_distance,0,0.0
2,PULocationID,0,0.0
3,DOLocationID,0,0.0
4,payment_type,0,0.0
5,fare_amount,0,0.0
6,extra,0,0.0
7,tip_amount,0,0.0
8,tolls_amount,0,0.0
9,improvement_surcharge,0,0.0




-------------------------------------------------------------------------------------------------------------------
Überblick über Lagekennzahlen nach Spalte:


Unnamed: 0,n,Mean,Median,Modus,Trimmed Mean with 10%,Min,Q1,Q3,Max
passenger_count,965589.0,1.37,1.0,1.0,1.17,0.0,1.0,1.0,8.0
trip_distance,999997.0,6.38,2.9,1.4,3.8,0.0,1.7,5.5,148560.1
PULocationID,999997.0,165.11,162.0,132.0,168.5,1.0,132.0,234.0,265.0
DOLocationID,999997.0,164.01,162.0,236.0,168.33,1.0,114.0,234.0,265.0
payment_type,999997.0,1.19,1.0,1.0,1.11,0.0,1.0,1.0,4.0
fare_amount,999997.0,19.46,13.5,9.3,15.89,-900.0,9.3,21.9,2449.5
extra,999997.0,1.55,1.0,0.0,1.25,-7.5,0.0,2.5,14.25
tip_amount,999997.0,3.51,2.8,0.0,2.76,-70.0,1.0,4.4,700.0
tolls_amount,999997.0,0.59,0.0,0.0,0.0,-36.05,0.0,0.0,85.0
improvement_surcharge,999997.0,0.98,1.0,1.0,1.0,-1.0,1.0,1.0,1.0




# 2. Streuungskennzahlen

In [9]:
# Klassische & robuste Streuungskennzahlen
streuungskennzahlen_df = pd.DataFrame({
    "SD": num_cols.std(),
    "Var": num_cols.var(),
    "IQR": num_cols.quantile(0.75) - num_cols.quantile(0.25),
    "MAD": num_cols.apply(lambda x: median_abs_deviation(x)),
    "Range": num_cols.max() - num_cols.min()
})
display(streuungskennzahlen_df.style.format("{:,.2f}"))

Unnamed: 0,SD,Var,IQR,MAD,Range
passenger_count,0.89,0.8,0.0,,8.0
trip_distance,218.07,47555.68,3.8,1.5,148560.1
PULocationID,63.99,4094.37,102.0,62.0,264.0
DOLocationID,69.77,4867.7,120.0,68.0,264.0
payment_type,0.56,0.31,0.0,0.0,4.0
fare_amount,19.0,361.11,12.6,5.6,3349.5
extra,1.84,3.39,2.5,1.0,21.75
tip_amount,4.12,17.0,3.4,1.8,770.0
tolls_amount,2.19,4.77,0.0,0.0,121.05
improvement_surcharge,0.2,0.04,0.0,0.0,2.0


# 3. Ausreißer erkennen (Diagnose)

In [10]:
def create_outlier_table(df):
    results = []

    for col in num_cols.columns:
        s = num_cols[col].astype(float)

        z_mask = z_score(s)

        modz_mask = modified_z_score(s)
        tukey_mask = tukey_outliers(s)

        results.append({
            "Spalte": col,
            "Z-Score": int(z_mask.sum()),
            "Modified Z-Score": int(modz_mask.sum()),
            "Tukey IQR": int(tukey_mask.sum())
        })

    outlier_table = pd.DataFrame(results).set_index("Spalte")
    display(outlier_table)

create_outlier_table(num_cols)

  x = asanyarray(arr - arrmean)


Unnamed: 0_level_0,Z-Score,Modified Z-Score,Tukey IQR
Spalte,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
passenger_count,21021,0,240076
trip_distance,24,136426,130715
PULocationID,0,0,0
DOLocationID,0,0,0
payment_type,19319,0,221690
fare_amount,12118,101557,108910
extra,16991,18729,18294
tip_amount,25905,47659,76499
tolls_amount,6187,0,81129
improvement_surcharge,11101,0,11101


# 4. Verteilungsform (Histogramm + KDE)

In [None]:
for v,u in [("trip_distance","miles"),
            ("trip_duration","min"),
            ("average_speed","mph"),
            ("fare_amount","USD"),
            ("tip_amount","USD"),
            ("total_amount","USD")]:
    hist_kde(df[v], title=v, unit=u, bw_adjust_list=(0.7, 1.0, 1.8))
    box_violin(df[v], title=v)
    plot_ecdf(df[v], title=v, unit=u)