# 01 - Dataset Overview (erweitert)
Ziel: Das Dataset umfassend verstehen, Feature-Engineering erklaeren und die Nutzbarkeit fuer das Modell sauber einordnen.


In [1]:
from pathlib import Path
import sys

# Resolve the parent directory as the project root and make both it and its
# "notebooks" folder importable. Entries are appended only when missing, so
# re-running the cell does not duplicate them.
ROOT = Path("..").resolve()
for _import_dir in (ROOT, (ROOT / "notebooks").resolve()):
    _import_entry = str(_import_dir)
    if _import_entry not in sys.path:
        sys.path.append(_import_entry)

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from _common import load_dataset

# Load the dataset frame and its accompanying metadata through the project
# helper imported from `_common`.
# NOTE(review): later cells call `metadata.get(...)`, so metadata is presumably dict-like — confirm.
df, metadata = load_dataset()
# Preview the first rows; as the cell's last expression it is rendered as output.
df.head()


Unnamed: 0,LapNumber,Stint,TyreLife,LapTimeLag1,LapTimeLag2,LapTimeLag3,RollingMean3,EstimatedFuelWeight,FuelEffect,TireDegradation,...,Team,Compound,TrackStatusFlag,Circuit,TireAgeCategory,LapTimeSeconds,Season,RoundNumber,EventName,SessionKey
0,2.0,1.0,2.0,,,,,96.0,0.96,0.07,...,Williams,SOFT,green,Sakhir,fresh,100.548,2022,1,Bahrain Grand Prix,2022_01
1,3.0,1.0,3.0,100.548,,,100.548,94.0,0.94,0.105,...,Williams,SOFT,green,Sakhir,fresh,100.664,2022,1,Bahrain Grand Prix,2022_01
2,4.0,1.0,4.0,100.664,100.548,,100.606,92.0,0.92,0.14,...,Williams,SOFT,green,Sakhir,fresh,101.126,2022,1,Bahrain Grand Prix,2022_01
3,5.0,1.0,5.0,101.126,100.664,100.548,100.779333,90.0,0.9,0.175,...,Williams,SOFT,green,Sakhir,fresh,102.303,2022,1,Bahrain Grand Prix,2022_01
4,6.0,1.0,6.0,102.303,101.126,100.664,101.364333,88.0,0.88,0.21,...,Williams,SOFT,green,Sakhir,fresh,101.708,2022,1,Bahrain Grand Prix,2022_01


## Schnelluebersicht
Kurze Fakten zu Umfang, Saisons und Datenstruktur.


In [2]:
# Number of rows (laps) per season, ordered by season value.
season_counts = (
    df['Season']
    .value_counts()
    .sort_index()
)

fig = px.bar(
    season_counts,
    title='Laps per Season',
    labels={'index': 'Season', 'value': 'Laps'},
)
fig


## Train/Val/Test-Split (Seasons/Rounds)
Klarer Ueberblick, welche Daten im Training, in der Validierung und im Test landen (gemaess `SplitConfig`).


In [3]:
from src.split import SplitConfig, split_by_season_round

# Build the split configuration and partition the dataset into train / validation / test frames.
# NOTE(review): test_rounds=None is passed explicitly; its exact meaning is defined in src.split — confirm.
split_cfg = SplitConfig(test_rounds=None)
train_df, val_df, test_df = split_by_season_round(df, config=split_cfg)

# One-row table documenting the configuration that produced the split.
split_cfg_df = pd.DataFrame({
    'train_seasons': [', '.join(map(str, split_cfg.train_seasons))],
    'val_seasons': [', '.join(map(str, split_cfg.val_seasons))],
    'test_season': [split_cfg.test_season],
    'test_rounds': [split_cfg.test_rounds],
})
# Jupyter only renders the LAST top-level expression of a cell. This table sits
# in the middle of the cell, so it must be displayed explicitly (`display` is
# provided by the IPython kernel).
display(split_cfg_df)

def _rounds_str(frame: pd.DataFrame) -> str:
    rounds = sorted(frame['RoundNumber'].unique().tolist())
    if not rounds:
        return ''
    if len(rounds) <= 12:
        return ', '.join(map(str, rounds))
    return f'{rounds[0]}-{rounds[-1]} ({len(rounds)} total)'

def summarize_split(name: str, frame: pd.DataFrame) -> dict:
    """Build one summary record for a data split.

    Args:
        name: Label stored under the ``'split'`` key.
        frame: Rows belonging to the split. Must contain ``Season`` and
            ``RoundNumber``, plus ``Driver`` or (as fallback) ``DriverNumber``.

    Returns:
        A dict with the split label, row count, number of distinct
        (Season, RoundNumber) pairs, number of distinct drivers, the seasons
        as a comma-separated string and the rounds as formatted by
        ``_rounds_str``.
    """
    season_values = sorted(frame['Season'].unique().tolist())
    # A "session" is counted as one distinct (Season, RoundNumber) combination.
    n_sessions = len(frame[['Season', 'RoundNumber']].drop_duplicates())
    # Prefer the Driver column; fall back to DriverNumber when it is absent.
    driver_col = 'Driver' if 'Driver' in frame.columns else 'DriverNumber'
    n_drivers = frame[driver_col].nunique()

    record = {'split': name, 'laps': len(frame)}
    record['sessions'] = n_sessions
    record['drivers'] = n_drivers
    record['seasons'] = ', '.join(str(season) for season in season_values)
    record['rounds'] = _rounds_str(frame)
    return record

# One summary row per split. Note the validation split is labelled 'eval' in these tables.
split_summary = pd.DataFrame([
    summarize_split('train', train_df),
    summarize_split('eval', val_df),
    summarize_split('test', test_df),
])
# Jupyter renders only the last top-level expression of a cell, so tables shown
# mid-cell need an explicit display() (provided by the IPython kernel).
display(split_summary)

# Lap counts per season within each split, stacked into one long table.
season_summary = pd.concat([
    train_df.groupby('Season').size().rename('laps').reset_index().assign(split='train'),
    val_df.groupby('Season').size().rename('laps').reset_index().assign(split='eval'),
    test_df.groupby('Season').size().rename('laps').reset_index().assign(split='test'),
])
season_summary = season_summary[['split', 'Season', 'laps']].sort_values(['split', 'Season'])
display(season_summary)

# Bar chart of the total lap count per split; as the last expression it is rendered automatically.
fig = px.bar(split_summary, x='split', y='laps', text='laps', title='Laps per Split')
fig.update_traces(textposition='outside')
fig.update_layout(xaxis_title='Split', yaxis_title='Laps', uniformtext_minsize=10, uniformtext_mode='hide')
fig


## 5. Methodik des Datensplits (Vermeidung von Betrug)
Ziel: Beantwortung der Frage: "Wie verhindern Sie, dass das Modell in die Zukunft schaut (Data Leakage)?"

**Chronologische Trennung:** Training nur bis Ende 2023, Validierung 2024, Test auf der unbekannten Saison 2025.


In [ ]:
def _split_label_for(season) -> str:
    """Return the split ('train', 'val', 'test' or 'unused') a season belongs to per ``split_cfg``."""
    if season in split_cfg.train_seasons:
        return "train"
    if season in split_cfg.val_seasons:
        return "val"
    if season == split_cfg.test_season:
        return "test"
    return "unused"


# All seasons present in the data, in ascending order, each tagged with its split.
seasons = sorted(df["Season"].dropna().unique().tolist())
split_labels = [_split_label_for(season) for season in seasons]

# "_y" is a constant bar height: the chart is a coloured timeline, not a quantity plot.
timeline_df = pd.DataFrame({"Season": seasons, "Split": split_labels, "_y": 1})
fig = px.bar(
    timeline_df,
    x="Season",
    y="_y",
    color="Split",
    text="Split",
    title="Zeitbasierter Split (Train/Val/Test)",
)
# Hide the meaningless y axis and keep the figure compact.
fig.update_layout(yaxis_visible=False, yaxis_showticklabels=False, height=260)
fig


## Feature-Gruppen (Engineering)
Uebersicht ueber die Feature-Gruppen und deren Zweck.

**Zeitliche Dynamik (Lag Features)**
- LapTimeLag1/2/3 und RollingMean3 geben kurzfristige Pace-Dynamik wieder.

**Physik-basierte Features**
- Fuel: EstimatedFuelWeight, FuelEffect (leichter = schneller).
- Reifen: TireDegradation, EstimatedGrip, TyreLifeLog/Sq/Cliff (nichtlinearer Drop-off).

**Track Evolution**
- FieldLapProgress, TrackEvolution: zusaetzlicher Grip durch Gummiabrieb.

**Wetter** (falls vorhanden)
- AirTemp, TrackTemp, Humidity, TempGripEffect.

**Target Encoding (train-only)**
- Driver__te, Team__te, Circuit__te etc. (vermeidet High-Dim One-Hot, reduziert Data Leakage).


In [None]:
# Categorical feature names come from the dataset metadata (empty list when absent).
categorical_cols = metadata.get('categorical_features', [])

# Feature groups and the column names expected for each of them.
groups = {
    'lags': ['LapTimeLag1','LapTimeLag2','LapTimeLag3','RollingMean3'],
    'fuel': ['EstimatedFuelWeight','FuelEffect'],
    'tire': ['TireDegradation','EstimatedGrip','TyreLifeLog','TyreLifeSq','TyreLifeCliff','TyreLife','TireAgeCategory'],
    'track': ['SessionProgress','CumulativeFieldLaps','FieldLapProgress','TrackEvolution'],
    'weather': ['AirTemp','TrackTemp','TempGripEffect','Humidity'],
    'categorical': categorical_cols,
    # Target-encoded columns are named "<categorical feature>__te".
    'target_encoding': [col + '__te' for col in categorical_cols],
}

# One record per (group, feature) pair, flagging whether the column exists in df.
rows = [
    {'group': group_name, 'feature': feature, 'present': feature in df.columns}
    for group_name, members in groups.items()
    for feature in members
]
summary = pd.DataFrame(rows)
summary.head(30)


## Zielvariable (LapTimeSeconds)
Verteilung der Zielvariable sowie Unterschiede je Saison.


In [None]:
# Box plot of the target per season; only suspected outliers are drawn as individual points.
fig = px.box(
    df,
    x='Season',
    y='LapTimeSeconds',
    points='suspectedoutliers',
    title='LapTimeSeconds by Season',
)
fig


## Reifen- und Stint-Effekte
Compound-Verteilung und Einfluss der Reifenlebensdauer.


In [None]:
# Scatter of tyre age against lap time on a reproducible random subsample.
fig = None
if 'TyreLife' in df.columns:
    tyre_laps = df[['TyreLife', 'LapTimeSeconds']].dropna()
    # Cap the sample size by the rows left AFTER dropping NaNs. Using len(df)
    # could request more rows than remain and raise a ValueError.
    sample = tyre_laps.sample(min(5000, len(tyre_laps)), random_state=42)
    fig = px.scatter(sample, x='TyreLife', y='LapTimeSeconds', opacity=0.3, title='TyreLife vs LapTimeSeconds')
else:
    print('TyreLife not available')
if fig is not None:
    # A bare `fig` inside an if-block is not a cell's last top-level expression
    # and would not be rendered, so show the figure explicitly.
    fig.show()


In [None]:
# Bar chart of how many rows (laps) exist per tyre compound.
fig = None
if 'Compound' in df.columns:
    compound_counts = df['Compound'].value_counts()
    fig = px.bar(compound_counts, title='Compound Distribution', labels={'index':'Compound','value':'Laps'})
else:
    print('Compound not available')
if fig is not None:
    # A bare `fig` inside an if-block is not rendered by Jupyter; show it explicitly.
    fig.show()


## Fuel-Effect (Gewicht)
Binning zeigt, wie Gewicht mit LapTime zusammenhaengt.


In [None]:
# Mean lap time per fuel-weight decile (fewer bins if quantile edges coincide).
fig = None
if 'EstimatedFuelWeight' in df.columns:
    tmp = df[['EstimatedFuelWeight','LapTimeSeconds']].dropna().copy()
    tmp['fuel_bin'] = pd.qcut(tmp['EstimatedFuelWeight'], 10, duplicates='drop')
    # observed=True keeps only bins that actually occur and avoids the pandas
    # FutureWarning about the changing default for categorical groupers.
    agg = tmp.groupby('fuel_bin', dropna=False, observed=True)['LapTimeSeconds'].mean().reset_index()
    # pandas Interval objects cannot be JSON-serialised by plotly; use their
    # string form as axis labels (row order, and thus bin order, is preserved).
    agg['fuel_bin'] = agg['fuel_bin'].astype(str)
    fig = px.bar(agg, x='fuel_bin', y='LapTimeSeconds', title='Avg LapTime by Fuel Bin')
else:
    print('EstimatedFuelWeight not available')
if fig is not None:
    # A bare `fig` inside an if-block is not rendered by Jupyter; show it explicitly.
    fig.show()


## Track Evolution (Grip)
TrackEvolution/FieldLapProgress als Proxy fuer zunehmenden Grip ueber die Session.


In [None]:
# Mean lap time per FieldLapProgress decile (fewer bins if quantile edges coincide).
fig = None
if 'FieldLapProgress' in df.columns:
    tmp = df[['FieldLapProgress','LapTimeSeconds']].dropna().copy()
    tmp['bin'] = pd.qcut(tmp['FieldLapProgress'], 10, duplicates='drop')
    # observed=True keeps only bins that actually occur and avoids the pandas
    # FutureWarning about the changing default for categorical groupers.
    agg = tmp.groupby('bin', dropna=False, observed=True)['LapTimeSeconds'].mean().reset_index()
    # pandas Interval objects cannot be JSON-serialised by plotly; use their
    # string form as axis labels (row order, and thus bin order, is preserved).
    agg['bin'] = agg['bin'].astype(str)
    fig = px.bar(agg, x='bin', y='LapTimeSeconds', title='Avg LapTime by FieldLapProgress')
else:
    print('FieldLapProgress not available')
if fig is not None:
    # A bare `fig` inside an if-block is not rendered by Jupyter; show it explicitly.
    fig.show()


## 4. Korrelationsanalyse und Feature-Relevanz
Ziel: Beantwortung der Frage: "Helfen diese Features dem Modell wirklich bei der Vorhersage?"

**Heatmap:** Korrelationsmatrix zwischen den engineered Features und der Zielvariable LapTimeSeconds.

**Ranking:** Balkendiagramm der staerksten statistischen Korrelationen mit der Rundenzeit.


In [None]:
# Numeric features named in the metadata that actually exist in the frame.
numeric_features = [f for f in metadata.get("numeric_features", []) if f in df.columns]
# De-duplicate while keeping order: if the target were also listed as a numeric
# feature, a repeated column label would turn corr["LapTimeSeconds"] into a
# DataFrame instead of a Series and break the ranking below.
corr_cols = list(dict.fromkeys(numeric_features + ["LapTimeSeconds"]))
corr = df[corr_cols].corr(numeric_only=True)

# Heatmap of the full correlation matrix on a fixed [-1, 1] colour scale.
fig = px.imshow(
    corr,
    title="Korrelationsmatrix: Engineered Features vs LapTimeSeconds",
    color_continuous_scale="RdBu",
    zmin=-1,
    zmax=1,
)
fig.update_layout(height=600)
# This figure sits mid-cell. Jupyter renders only the last top-level expression,
# so it has to be shown explicitly.
fig.show()

# Rank features by absolute correlation with the target and keep the top 20.
corr_target = corr["LapTimeSeconds"].drop("LapTimeSeconds")
corr_target = corr_target.sort_values(key=lambda s: s.abs(), ascending=False)
top = corr_target.head(20).reset_index()
top.columns = ["feature", "corr"]
fig2 = px.bar(
    top,
    x="corr",
    y="feature",
    orientation="h",
    title="Feature-Relevanz: Korrelation mit LapTimeSeconds (Top 20)",
)
fig2.update_layout(yaxis={"categoryorder": "total ascending"})
fig2


## Leakage-Vermeidung (train-only Engineering)
Einige Features werden **ausschliesslich** auf dem Trainingssplit berechnet (z.B. Target Encoding,
historische Pace pro Fahrer/Team). Dadurch wird verhindert, dass Informationen aus der Zukunft in
das Training einfliessen. In diesem Notebook werden nur die *Basis*-Features gezeigt.


## 6. Zusammenfassung der "Model-Readiness"
Ziel: Ein abschliessendes Statement zur Datenqualitaet.

**Letzter Check auf fehlende Werte (NaN-Handling)** und **Bestaetigung der finalen Feature-Liste**.


In [ ]:
# Final feature lists: names from the metadata that are present in the frame.
numeric_features = [f for f in metadata.get("numeric_features", []) if f in df.columns]
categorical_features = [f for f in metadata.get("categorical_features", []) if f in df.columns]

feature_list = pd.DataFrame({
    "feature": numeric_features + categorical_features,
    "type": ["numeric"] * len(numeric_features) + ["categorical"] * len(categorical_features),
})
# This table sits mid-cell. Jupyter renders only the last top-level expression,
# so display it explicitly (`display` is provided by the IPython kernel).
display(feature_list.head(30))

# Share of missing values (in percent) for the target and every feature, worst first.
check_cols = ["LapTimeSeconds"] + numeric_features + categorical_features
missing = (df[check_cols].isna().mean() * 100).sort_values(ascending=False)
missing_df = missing.reset_index()
missing_df.columns = ["feature", "missing_pct"]
missing_df.head(25)


## Beispiel-Zeilen (Feature Snapshot)
Kompakter Blick auf wichtige Features.


In [None]:
# Columns worth showing in the snapshot, in display order.
snapshot_candidates = (
    'Season', 'RoundNumber', 'Driver', 'Team', 'Circuit', 'Compound',
    'LapNumber', 'TyreLife', 'EstimatedFuelWeight', 'TireDegradation', 'TrackEvolution',
    'LapTimeSeconds',
)
# Keep only the candidates that exist in the frame so the cell never raises a KeyError.
cols = [name for name in snapshot_candidates if name in df.columns]
df.loc[:, cols].head(10)


## Erweiterte Analysen
Zusatzanalysen fuer Fahrer/Teams, Strecken, Zeitverlauf und Feature-Glossar.


### Fahrer/Team Summaries
Uebersicht der durchschnittlichen Rundenzeiten pro Fahrer/Team.


In [None]:
# Mean lap time per driver, fastest first; the chart shows the 20 lowest means.
fig = None
if 'Driver' in df.columns:
    # Despite the "_mae" suffix this holds a plain mean of LapTimeSeconds, not an error metric.
    driver_mae = df.groupby('Driver')['LapTimeSeconds'].mean().sort_values()
    fig = px.bar(driver_mae.head(20), title='Top 20 Fahrer (schnellste durchschnittliche LapTime)', labels={'index':'Driver','value':'LapTimeSeconds'})
else:
    print('Driver not available')
if fig is not None:
    # A bare `fig` inside an if-block is not rendered by Jupyter; show it explicitly.
    fig.show()


In [None]:
# Mean lap time per team, sorted ascending.
fig = None
if 'Team' in df.columns:
    # Despite the "_mae" suffix this holds a plain mean of LapTimeSeconds, not an error metric.
    team_mae = df.groupby('Team')['LapTimeSeconds'].mean().sort_values()
    fig = px.bar(team_mae, title='Durchschnittliche LapTime pro Team', labels={'index':'Team','value':'LapTimeSeconds'})
else:
    print('Team not available')
if fig is not None:
    # A bare `fig` inside an if-block is not rendered by Jupyter; show it explicitly.
    fig.show()


### Strecken-Schwierigkeit (Circuit Difficulty Proxy)
Durchschnittliche LapTime pro Circuit als grober Proxy fuer Streckencharakteristik.


In [None]:
# Mean lap time per circuit, sorted ascending.
fig = None
if 'Circuit' in df.columns:
    circuit_mean = df.groupby('Circuit')['LapTimeSeconds'].mean().sort_values()
    fig = px.bar(circuit_mean, title='Avg LapTime by Circuit', labels={'index':'Circuit','value':'LapTimeSeconds'})
else:
    print('Circuit not available')
if fig is not None:
    # A bare `fig` inside an if-block is not rendered by Jupyter; show it explicitly.
    fig.show()


### Zeitverlauf innerhalb eines Rennens
Beispielhafte Entwicklung der LapTime ueber Runden (Random Circuit/Driver Sample).


In [None]:
# Lap-time trace of one driver in one race, chosen via a reproducible random row.
fig = None
if all(c in df.columns for c in ['Season','RoundNumber','Driver','LapNumber','LapTimeSeconds']):
    # Pick a single row with complete identifiers; it determines season, round and driver.
    sample = df.dropna(subset=['Season','RoundNumber','Driver']).sample(1, random_state=42).iloc[0]
    mask = (df['Season'] == sample['Season']) & (df['RoundNumber'] == sample['RoundNumber']) & (df['Driver'] == sample['Driver'])
    sub = df.loc[mask].sort_values('LapNumber')
    title = 'Time Series: {} (Season {}, Round {})'.format(sample['Driver'], int(sample['Season']), int(sample['RoundNumber']))
    fig = px.line(sub, x='LapNumber', y='LapTimeSeconds', title=title)
else:
    print('Required columns not available')
if fig is not None:
    # A bare `fig` inside an if-block is not rendered by Jupyter; show it explicitly.
    fig.show()


### Feature Glossar (Kurzbeschreibung)
Kompakte Erklaerung der wichtigsten engineered Features.


In [None]:
# Feature name -> short description; insertion order defines the row order of the table.
_glossary_entries = {
    'LapTimeLag1/2/3': 'Vorherige Rundenzeiten als kurzfristige Pace-Info',
    'RollingMean3': 'Glaettung der Pace ueber die letzten 3 Runden',
    'EstimatedFuelWeight': 'Schaetzung verbleibendes Fuelgewicht (kg)',
    'FuelEffect': 'Normierter Fuel-Effekt (0-1)',
    'TireDegradation': 'Schaetzung kumulierte Reifenabnutzung',
    'EstimatedGrip': 'Proxy fuer verbleibenden Grip',
    'TyreLifeLog/Sq/Cliff': 'Nichtlineare Reifenalter-Features (Drop-off)',
    'TrackEvolution': 'Rubbering-In Effekt durch kumulierte Runden',
    'FieldLapProgress': 'Fortschritt der Session bezogen auf alle Fahrer',
    'Target Encodings': 'Zielwert-basierte Kodierung von Kategorien (train-only)',
}
# Expose the glossary as a list of (feature, description) tuples.
glossary = list(_glossary_entries.items())

glossary_table = pd.DataFrame(glossary, columns=['Feature', 'Nutzen'])
glossary_table.head(20)


In [None]:
# Overall distribution of the target variable, using 80 bins.
fig = px.histogram(
    df,
    x='LapTimeSeconds',
    nbins=80,
    title='LapTimeSeconds Distribution',
)
fig
