# 01 - Dataset Overview (erweitert)
Ziel: Das Dataset umfassend verstehen, Feature-Engineering erklaeren und die Nutzbarkeit fuer das Modell sauber einordnen.


In [ ]:
from pathlib import Path
import sys

# Put the parent directory and its notebooks/ folder on sys.path (once each)
# before the imports below; presumably this is what makes `_common` and the
# `src` package importable from here -- verify against the repo layout.
ROOT = Path("..").resolve()
for _import_dir in (ROOT, (ROOT / "notebooks").resolve()):
    if str(_import_dir) not in sys.path:
        sys.path.append(str(_import_dir))

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from _common import load_dataset

# load_dataset() yields the lap-level DataFrame and a metadata object that is
# queried with .get() further down in this notebook.
df, metadata = load_dataset()
# Last expression of the cell: the notebook renders the first rows.
df.head()


## Schnelluebersicht
Kurze Fakten zu Umfang, Saisons und Datenstruktur.


In [ ]:
# Count rows per season and order the result by season value.
season_counts = df['Season'].value_counts().sort_index()
# The labels mapping renames plotly's wide-form default axis titles
# ('index' / 'value') to readable ones.
fig = px.bar(season_counts, title='Laps per Season', labels={'index':'Season','value':'Laps'})
# Last expression of the cell, so the notebook renders the figure.
fig


## Train/Val/Test-Split (Seasons/Rounds)
Klarer Ueberblick, welche Daten im Training, in der Validierung und im Test landen (gemaess `SplitConfig`).


In [None]:
# Project-local import: the split configuration and the season/round splitter.
from src.split import SplitConfig, split_by_season_round

# SplitConfig is built without arguments, i.e. with whatever defaults it defines.
split_cfg = SplitConfig()
train_df, val_df, test_df = split_by_season_round(df, config=split_cfg)

# One-row table of the configured split boundaries; the train/val season
# collections are joined into comma-separated strings for compact display.
split_cfg_df = pd.DataFrame({
    'train_seasons': [', '.join(map(str, split_cfg.train_seasons))],
    'val_seasons': [', '.join(map(str, split_cfg.val_seasons))],
    'test_season': [split_cfg.test_season],
    'test_rounds': [split_cfg.test_rounds],
})
# NOTE(review): a bare expression that is not the last statement of a notebook
# cell is not rendered by Jupyter's default settings -- confirm whether this
# table was meant to live in its own cell.
split_cfg_df

def _rounds_str(frame: pd.DataFrame) -> str:
    """Return a compact, sorted description of the rounds found in *frame*.

    Up to 12 distinct ``RoundNumber`` values are listed comma-separated;
    longer lists collapse to ``'<first>-<last> (<count> total)'``. A frame
    without any rounds yields an empty string.
    """
    distinct_rounds = frame['RoundNumber'].unique().tolist()
    distinct_rounds.sort()
    count = len(distinct_rounds)
    if count == 0:
        return ''
    if count > 12:
        # Too many to enumerate: show only the range and the total.
        return f'{distinct_rounds[0]}-{distinct_rounds[-1]} ({count} total)'
    return ', '.join(str(round_no) for round_no in distinct_rounds)

def summarize_split(name: str, frame: pd.DataFrame) -> dict:
    """Summarise one data split as a flat dict (one row of the overview table).

    Args:
        name: Label stored under the ``'split'`` key.
        frame: Lap rows of the split; needs ``Season`` and ``RoundNumber``
            plus either ``Driver`` or ``DriverNumber``.

    Returns:
        Dict with the row count, the number of distinct (Season, RoundNumber)
        pairs, the number of distinct drivers, and the seasons and rounds
        rendered as strings.
    """
    season_values = frame['Season'].unique().tolist()
    season_values.sort()
    # A "session" here is one distinct (Season, RoundNumber) combination.
    session_pairs = frame[['Season', 'RoundNumber']].drop_duplicates()
    # Prefer the Driver column; fall back to DriverNumber when it is absent.
    if 'Driver' in frame.columns:
        driver_count = frame['Driver'].nunique()
    else:
        driver_count = frame['DriverNumber'].nunique()
    return {
        'split': name,
        'laps': len(frame),
        'sessions': session_pairs.shape[0],
        'drivers': driver_count,
        'seasons': ', '.join(str(season) for season in season_values),
        'rounds': _rounds_str(frame),
    }

# Pair every split label with its frame once so both tables below are built
# from the same sequence, in the same order (train, eval, test).
_split_frames = (('train', train_df), ('eval', val_df), ('test', test_df))

# One summary row per split.
split_summary = pd.DataFrame(
    [summarize_split(label, frame) for label, frame in _split_frames]
)
split_summary

# Lap counts per season, tagged with the split each season's rows belong to.
season_summary = pd.concat([
    frame.groupby('Season').size().rename('laps').reset_index().assign(split=label)
    for label, frame in _split_frames
])
season_summary = season_summary[['split', 'Season', 'laps']].sort_values(['split', 'Season'])
season_summary

# Bar chart of the total lap count per split, with the count printed above
# each bar.
fig = px.bar(split_summary, x='split', y='laps', text='laps', title='Laps per Split')
fig.update_traces(textposition='outside')
fig.update_layout(
    xaxis_title='Split',
    yaxis_title='Laps',
    uniformtext_minsize=10,
    uniformtext_mode='hide',
)
fig


## Datenqualitaet und Missing Values
Wir analysieren fehlende Werte je Feature, um saubere Feature-Auswahl zu ermoeglichen.


In [ ]:
# Share of missing values per column, highest first. This table was referenced
# but never built anywhere in the notebook, so the cell failed with NameError.
missing_df = (
    df.isna()
    .mean()
    .sort_values(ascending=False)
    .rename_axis('feature')
    .reset_index(name='missing_ratio')
)
# Plot only the 20 columns with the largest missing share.
top_missing = missing_df.head(20)
fig = px.bar(top_missing, x='missing_ratio', y='feature', orientation='h', title='Top 20 Missing Ratios')
# Last expression of the cell, so the notebook renders the figure.
fig


## Feature-Gruppen (Engineering)
Uebersicht ueber die Feature-Gruppen und deren Zweck.

**Zeitliche Dynamik (Lag Features)**
- LapTimeLag1/2/3 und RollingMean3 geben kurzfristige Pace-Dynamik wieder.

**Physik-basierte Features**
- Fuel: EstimatedFuelWeight, FuelEffect (leichter = schneller).
- Reifen: TireDegradation, EstimatedGrip, TyreLifeLog/Sq/Cliff (nichtlinearer Drop-off).

**Track Evolution**
- FieldLapProgress, TrackEvolution: zusaetzlicher Grip durch Gummiabrieb.

**Wetter** (falls vorhanden)
- AirTemp, TrackTemp, Humidity, TempGripEffect.

**Target Encoding (train-only)**
- Driver__te, Team__te, Circuit__te etc. (vermeidet High-Dim One-Hot, reduziert Leckage).


In [ ]:
groups = {
    'lags': ['LapTimeLag1','LapTimeLag2','LapTimeLag3','RollingMean3'],
    'fuel': ['EstimatedFuelWeight','FuelEffect'],
    'tire': ['TireDegradation','EstimatedGrip','TyreLifeLog','TyreLifeSq','TyreLifeCliff','TyreLife','TireAgeCategory'],
    'track': ['SessionProgress','CumulativeFieldLaps','FieldLapProgress','TrackEvolution'],
    'weather': ['AirTemp','TrackTemp','TempGripEffect','Humidity'],
    'categorical': metadata.get('categorical_features', []),
    'target_encoding': [c + '__te' for c in metadata.get('categorical_features', [])],
}
rows = []
for group, feats in groups.items():
    for f in feats:
        rows.append({'group': group, 'feature': f, 'present': f in df.columns})
summary = pd.DataFrame(rows)
summary.head(30)


## Zielvariable (LapTimeSeconds)
Verteilung der Zielvariable sowie Unterschiede je Saison.


In [ ]:
# One box per season for the target; only suspected outliers are drawn as
# individual points.
fig = px.box(df, x='Season', y='LapTimeSeconds', points='suspectedoutliers', title='LapTimeSeconds by Season')
# Last expression of the cell, so the notebook renders the figure.
fig


## Reifen- und Stint-Effekte
Compound-Verteilung und Einfluss der Reifenlebensdauer.


In [ ]:
# Scatter of tyre age against lap time on a reproducible random subsample.
fig = None
if 'TyreLife' in df.columns:
    tyre_laps = df[['TyreLife','LapTimeSeconds']].dropna()
    # Cap the sample size by the rows that survive dropna(), not by len(df):
    # asking for more rows than exist raises ValueError when sampling
    # without replacement.
    sample = tyre_laps.sample(min(5000, len(tyre_laps)), random_state=42)
    fig = px.scatter(sample, x='TyreLife', y='LapTimeSeconds', opacity=0.3, title='TyreLife vs LapTimeSeconds')
else:
    print('TyreLife not available')
# Keep `fig` as the last top-level expression: Jupyter only renders that one,
# an expression nested inside an `if` is silently dropped. When fig is None
# nothing is shown.
fig


In [None]:
# Bar chart of how many rows exist per tyre compound.
fig = None
if 'Compound' in df.columns:
    compound_counts = df['Compound'].value_counts()
    fig = px.bar(compound_counts, title='Compound Distribution', labels={'index':'Compound','value':'Laps'})
else:
    print('Compound not available')
# Keep `fig` as the last top-level expression: Jupyter only renders that one,
# an expression nested inside an `if` is silently dropped. When fig is None
# nothing is shown.
fig


## Fuel-Effect (Gewicht)
Binning zeigt, wie Gewicht mit LapTime zusammenhaengt.


In [ ]:
# Mean lap time per quantile bin of the estimated fuel weight.
fig = None
if 'EstimatedFuelWeight' in df.columns:
    tmp = df[['EstimatedFuelWeight','LapTimeSeconds']].dropna().copy()
    # Up to 10 quantile bins; duplicate bin edges are merged instead of raising.
    tmp['fuel_bin'] = pd.qcut(tmp['EstimatedFuelWeight'], 10, duplicates='drop')
    agg = tmp.groupby('fuel_bin', dropna=False)['LapTimeSeconds'].mean().reset_index()
    # qcut labels are pandas Interval objects, which plotly cannot serialise
    # to JSON; use their string form as axis categories instead.
    agg['fuel_bin'] = agg['fuel_bin'].astype(str)
    fig = px.bar(agg, x='fuel_bin', y='LapTimeSeconds', title='Avg LapTime by Fuel Bin')
else:
    print('EstimatedFuelWeight not available')
# Keep `fig` as the last top-level expression: Jupyter only renders that one,
# an expression nested inside an `if` is silently dropped. When fig is None
# nothing is shown.
fig


## Track Evolution (Grip)
TrackEvolution/FieldLapProgress als Proxy fuer zunehmenden Grip ueber die Session.


In [ ]:
# Mean lap time per quantile bin of FieldLapProgress.
fig = None
if 'FieldLapProgress' in df.columns:
    tmp = df[['FieldLapProgress','LapTimeSeconds']].dropna().copy()
    # Up to 10 quantile bins; duplicate bin edges are merged instead of raising.
    tmp['bin'] = pd.qcut(tmp['FieldLapProgress'], 10, duplicates='drop')
    agg = tmp.groupby('bin', dropna=False)['LapTimeSeconds'].mean().reset_index()
    # qcut labels are pandas Interval objects, which plotly cannot serialise
    # to JSON; use their string form as axis categories instead.
    agg['bin'] = agg['bin'].astype(str)
    fig = px.bar(agg, x='bin', y='LapTimeSeconds', title='Avg LapTime by FieldLapProgress')
else:
    print('FieldLapProgress not available')
# Keep `fig` as the last top-level expression: Jupyter only renders that one,
# an expression nested inside an `if` is silently dropped. When fig is None
# nothing is shown.
fig


## Feature-Korrelationen (Top)
Welche numerischen Features korrelieren am staerksten mit der Zielvariable?


In [ ]:
# Rank features by the absolute value of their correlation with the target.
# Object-typed columns are skipped up front; numeric_only=True drops any
# remaining non-numeric columns.
numeric_cols = [c for c in df.columns if df[c].dtype != 'object']
corr = (
    df[numeric_cols]
    .corr(numeric_only=True)['LapTimeSeconds']
    .drop('LapTimeSeconds')  # the target's correlation with itself is trivial
    .sort_values(key=lambda s: s.abs(), ascending=False)
)
# Keep the 15 strongest correlations as a two-column table for plotting.
top = corr.head(15).reset_index()
top.columns = ['feature','corr']
fig = px.bar(top, x='corr', y='feature', orientation='h', title='Top Correlations with LapTimeSeconds')
# Keep `fig` as the last top-level expression: Jupyter only renders that one,
# an expression nested inside an `if` is silently dropped. The former
# `fig is not None` guard could never be false here and is removed.
fig


## Leakage-Vermeidung (train-only Engineering)
Einige Features werden **ausschliesslich** auf dem Trainingssplit berechnet (z.B. Target Encoding,
historische Pace pro Fahrer/Team). Dadurch wird verhindert, dass Informationen aus der Zukunft in
das Training einfliessen. In diesem Notebook werden nur die *Basis*-Features gezeigt.


## Beispiel-Zeilen (Feature Snapshot)
Kompakter Blick auf wichtige Features.


In [ ]:
# Columns we would like to show, in display order.
_snapshot_wanted = (
    'Season', 'RoundNumber', 'Driver', 'Team', 'Circuit', 'Compound',
    'LapNumber', 'TyreLife', 'EstimatedFuelWeight', 'TireDegradation', 'TrackEvolution',
    'LapTimeSeconds',
)
# Keep only the ones that actually exist in the dataset, preserving order.
cols = [name for name in _snapshot_wanted if name in df.columns]
# Last expression of the cell: the notebook renders the first ten rows.
df[cols].head(10)


## Erweiterte Analysen
Zusatzanalysen fuer Fahrer/Teams, Strecken, Zeitverlauf und Feature-Glossar.


### Fahrer/Team Summaries
Uebersicht der durchschnittlichen Rundenzeiten pro Fahrer/Team.


In [None]:
# Mean lap time per driver, ascending; the 20 lowest means are plotted.
fig = None
if 'Driver' in df.columns:
    # Despite its name, this holds the mean lap time per driver, not an error metric.
    driver_mae = df.groupby('Driver')['LapTimeSeconds'].mean().sort_values()
    fig = px.bar(driver_mae.head(20), title='Top 20 Fahrer (schnellste durchschnittliche LapTime)', labels={'index':'Driver','value':'LapTimeSeconds'})
else:
    print('Driver not available')
# Keep `fig` as the last top-level expression: Jupyter only renders that one,
# an expression nested inside an `if` is silently dropped. When fig is None
# nothing is shown.
fig


In [ ]:
# Mean lap time per team, ascending.
fig = None
if 'Team' in df.columns:
    # Despite its name, this holds the mean lap time per team, not an error metric.
    team_mae = df.groupby('Team')['LapTimeSeconds'].mean().sort_values()
    fig = px.bar(team_mae, title='Durchschnittliche LapTime pro Team', labels={'index':'Team','value':'LapTimeSeconds'})
else:
    print('Team not available')
# Keep `fig` as the last top-level expression: Jupyter only renders that one,
# an expression nested inside an `if` is silently dropped. When fig is None
# nothing is shown.
fig


### Strecken-Schwierigkeit (Circuit Difficulty Proxy)
Durchschnittliche LapTime pro Circuit als grober Proxy fuer Streckencharakteristik.


In [ ]:
fig = None
if 'Circuit' in df.columns:
    circuit_mean = df.groupby('Circuit')['LapTimeSeconds'].mean().sort_values()
    fig = px.bar(circuit_mean, title='Avg LapTime by Circuit', labels={'index':'Circuit','value':'LapTimeSeconds'})
else:
    print('Circuit not available')
if fig is not None:
    fig


### Zeitverlauf innerhalb eines Rennens
Beispielhafte Entwicklung der LapTime ueber Runden (Random Circuit/Driver Sample).


In [ ]:
# Lap-time trace of one reproducibly chosen (season, round, driver) combination.
fig = None
if all(c in df.columns for c in ['Season','RoundNumber','Driver','LapNumber','LapTimeSeconds']):
    # Pick one row at random (fixed seed) and use its keys to select the
    # whole race of that driver.
    sample = df.dropna(subset=['Season','RoundNumber','Driver']).sample(1, random_state=42).iloc[0]
    mask = (df['Season'] == sample['Season']) & (df['RoundNumber'] == sample['RoundNumber']) & (df['Driver'] == sample['Driver'])
    sub = df.loc[mask].sort_values('LapNumber')
    title = 'Time Series: {} (Season {}, Round {})'.format(sample['Driver'], int(sample['Season']), int(sample['RoundNumber']))
    fig = px.line(sub, x='LapNumber', y='LapTimeSeconds', title=title)
else:
    print('Required columns not available')
# Keep `fig` as the last top-level expression: Jupyter only renders that one,
# an expression nested inside an `if` is silently dropped. When fig is None
# nothing is shown.
fig


### Feature Glossar (Kurzbeschreibung)
Kompakte Erklaerung der wichtigsten engineered Features.


In [ ]:
# (feature name, short description) pairs for the engineered features; the
# descriptions are user-facing text and stay in German.
glossary = [
    ('LapTimeLag1/2/3', 'Vorherige Rundenzeiten als kurzfristige Pace-Info'),
    ('RollingMean3', 'Glaettung der Pace ueber die letzten 3 Runden'),
    ('EstimatedFuelWeight', 'Schaetzung verbleibendes Fuelgewicht (kg)'),
    ('FuelEffect', 'Normierter Fuel-Effekt (0-1)'),
    ('TireDegradation', 'Schaetzung kumulierte Reifenabnutzung'),
    ('EstimatedGrip', 'Proxy fuer verbleibenden Grip'),
    ('TyreLifeLog/Sq/Cliff', 'Nichtlineare Reifenalter-Features (Drop-off)'),
    ('TrackEvolution', 'Rubbering-In Effekt durch kumulierte Runden'),
    ('FieldLapProgress', 'Fortschritt der Session bezogen auf alle Fahrer'),
    ('Target Encodings', 'Zielwert-basierte Kodierung von Kategorien (train-only)'),
]
# Render the pairs as a two-column table; head(20) is larger than the list,
# so every entry is shown.
pd.DataFrame(glossary, columns=['Feature', 'Nutzen']).head(20)


In [None]:
# Histogram of the target variable over the whole dataset, using 80 bins.
fig = px.histogram(df, x='LapTimeSeconds', nbins=80, title='LapTimeSeconds Distribution')
# Last expression of the cell, so the notebook renders the figure.
fig
