# Initial Exploratory Data Analysis (EDA)

This notebook performs initial exploration of the GR Cup racing datasets.

## Objectives:
1. Basic statistics per track
2. Driver performance distributions
3. Weather patterns across venues
4. Telemetry parameter availability


In [1]:
# Import libraries
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Add src to path
sys.path.insert(0, str(Path().absolute().parent.parent))

from src.data_processing.data_loader import DataLoader, validate_data_completeness

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Helper function to show Plotly figures with fallback
def show_plotly_fig(fig, save_html_name=None):
    """
    Show Plotly figure with fallback to matplotlib/seaborn if nbformat not available.
    
    Args:
        fig: Plotly figure object
        save_html_name: Optional filename to save HTML version
    """
    try:
        fig.show()
    except (ValueError, ImportError) as e:
        error_msg = str(e).lower()
        if "nbformat" in error_msg:
            print("⚠ Interactive Plotly requires nbformat in kernel environment.")
            print("   Fix: Restart kernel, then run: !pip install nbformat>=4.2.0")
            print("   Using matplotlib fallback...\n")
        
        # Save as HTML (works without nbformat, interactive in browser)
        if save_html_name:
            try:
                fig.write_html(save_html_name)
                print(f"   ✓ Saved interactive plot to: {save_html_name} (open in browser)")
            except Exception:
                pass
        
        # Note: For complex plots, we can't always convert to matplotlib
        # So we just save as HTML and inform user
        print("   Note: Complex Plotly visualizations saved as HTML.")
        print("   Install nbformat in kernel to see interactive plots inline.")

print("Libraries imported successfully!")
print("✓ Plotly helper function loaded")


Libraries imported successfully!
✓ Plotly helper function loaded


## 1. Initialize Data Loader


In [2]:
# Initialize loader with project root as base path
# Find project root by looking for config/config.yaml file
current_dir = Path().absolute()
project_root = current_dir

# Search up the directory tree for config/config.yaml
max_levels = 5  # Don't go too far up
for _ in range(max_levels):
    if (project_root / "config" / "config.yaml").exists():
        break
    if project_root == project_root.parent:  # Reached filesystem root
        break
    project_root = project_root.parent
else:
    # Fallback: assume we're in notebooks/exploration/ and go up 2 levels
    project_root = current_dir.parent.parent

loader = DataLoader(base_path=str(project_root))
print(f"Base path: {loader.base_path}")
print(f"Venues available: {', '.join(loader.venues)}")


Base path: C:\Users\apand\OneDrive\Desktop\Let's Hack
Venues available: barber, COTA, indianapolis, virginia-international-raceway


## 2. Data Completeness Validation


In [3]:
# Validate data completeness
validation_df = validate_data_completeness(loader)

print("Data Completeness Summary:")
print("=" * 80)
print(f"Total file checks: {len(validation_df)}")

# Check if DataFrame is empty or has no 'found' column
if validation_df.empty or 'found' not in validation_df.columns:
    print("No validation data available. Check if venues are configured correctly.")
    print(f"Venues found: {len(loader.venues)}")
    if len(loader.venues) == 0:
        print("WARNING: No venues detected. Please check your configuration.")
else:
    print(f"Files found: {validation_df['found'].sum()}")
    print(f"Files missing: {(~validation_df['found']).sum()}")
    print(f"Overall completeness: {validation_df['found'].mean() * 100:.1f}%")
    
    # Display breakdown by venue
    print("\nBreakdown by Venue:")
    print("-" * 80)
    for venue in loader.venues:
        venue_data = validation_df[validation_df['venue'] == venue]
        if len(venue_data) > 0:
            found = venue_data['found'].sum()
            total = len(venue_data)
            print(f"{venue:30s}: {found:2d}/{total:2d} files ({found/total*100:.1f}%)")
        else:
            print(f"{venue:30s}: No data available")


INFO:src.data_processing.data_loader:Loaded results: 22 rows from 03_Provisional Results_Race 1_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded results by class: 22 rows from 05_Provisional Results by Class_Race 1_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded weather: 43 rows from 26_Weather_Race 1_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded best laps: 22 rows from 99_Best 10 Laps By Driver_Race 1_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded analysis: 579 rows from 23_AnalysisEnduranceWithSections_Race 1_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded results: 22 rows from 03_Provisional Results_Race 2_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded results by class: 22 rows from 05_Provisional Results by Class_Race 2_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded weather: 44 rows from 26_Weather_Race 2_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded best laps: 22 rows from 99_Best 10 Laps

Data Completeness Summary:
Total file checks: 40
Files found: 40
Files missing: 0
Overall completeness: 100.0%

Breakdown by Venue:
--------------------------------------------------------------------------------
barber                        : 10/10 files (100.0%)
COTA                          : 10/10 files (100.0%)
indianapolis                  : 10/10 files (100.0%)
virginia-international-raceway: 10/10 files (100.0%)


In [4]:
# ============================================
# Visualization: Data Completeness Heatmap
# ============================================

pivot_data = validation_df.pivot_table(
    index='venue',
    columns='file_type',
    values='found',
    aggfunc='mean'
)

print("Data Completeness Heatmap Data:")
print("=" * 80)
print(pivot_data)
print()

import plotly.express as px

try:
    fig = px.imshow(
        pivot_data,
        labels=dict(x="File Type", y="Venue", color="Found"),
        title="Data Completeness Heatmap",
        color_continuous_scale="RdYlGn"
    )
    fig.update_layout(height=400)

    # Primary display
    show_plotly_fig(fig, "data_completeness_heatmap.html")

except Exception as e:
    error_msg = str(e).lower()

    if "nbformat" in error_msg:
        print("⚠ Interactive Plotly visualization requires nbformat in the environment.")
        print("   Fix: Restart kernel & run: pip install nbformat>=4.2.0")
        print("\nUsing matplotlib fallback...\n")

    elif "kaleido" in error_msg:
        print("⚠ Image export requires kaleido.")
        print("Using matplotlib fallback...\n")

    else:
        print("⚠ Plotly failed:", e)
        print("Using matplotlib fallback...\n")

    # Fallback to seaborn/matplotlib
    import matplotlib.pyplot as plt
    import seaborn as sns

    plt.figure(figsize=(12, 6))
    sns.heatmap(
        pivot_data,
        annot=True,
        fmt='.1f',
        cmap='RdYlGn',
        vmin=0,
        vmax=1,
        cbar_kws={'label': 'Found (1.0 = 100%)'},
        linewidths=0.5
    )
    plt.title("Data Completeness Heatmap", fontsize=14, fontweight='bold')
    plt.xlabel("File Type", fontsize=12)
    plt.ylabel("Venue", fontsize=12)
    plt.tight_layout()
    plt.show()

    # Also export Plotly HTML if possible
    try:
        print("\nAlso saving Plotly HTML version...")
        fig.write_html("data_completeness_heatmap.html")
        print("✓ Saved: data_completeness_heatmap.html")
    except Exception:
        pass


Data Completeness Heatmap Data:
file_type                       analysis  best_laps  results  \
venue                                                          
COTA                                 1.0        1.0      1.0   
barber                               1.0        1.0      1.0   
indianapolis                         1.0        1.0      1.0   
virginia-international-raceway       1.0        1.0      1.0   

file_type                       results_by_class  weather  
venue                                                      
COTA                                         1.0      1.0  
barber                                       1.0      1.0  
indianapolis                                 1.0      1.0  
virginia-international-raceway               1.0      1.0  



## 3. Basic Statistics Per Track


In [5]:
# Load results for all venues
all_results = {}
for venue in loader.venues:
    for race in ["Race 1", "Race 2"]:
        results = loader.load_results_file(venue, race, "provisional")
        if not results.empty:
            key = f"{venue}_{race}"
            all_results[key] = results

print(f"Loaded results from {len(all_results)} race(s)")


INFO:src.data_processing.data_loader:Loaded results: 22 rows from 03_Provisional Results_Race 1_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded results: 22 rows from 03_Provisional Results_Race 2_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded results: 31 rows from 03_Provisional Results_Race 1_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded results: 31 rows from 03_Provisional Results_ Race 2_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded results: 29 rows from 03_Provisional Results_Race 1.CSV
INFO:src.data_processing.data_loader:Loaded results: 29 rows from 03_Provisional Results_Race 2.CSV
INFO:src.data_processing.data_loader:Loaded results: 24 rows from 03_Provisional Results_Race 1_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded results: 24 rows from 03_Provisional Results_Race 2_Anonymized.CSV


Loaded results from 8 race(s)


In [6]:
# Combine all results
if all_results:
    combined_results = pd.concat(all_results.values(), ignore_index=True)
    
    print("Combined Results Statistics:")
    print("=" * 80)
    print(f"Total races: {combined_results.groupby(['venue', 'race']).ngroups}")
    print(f"Total drivers: {combined_results['NUMBER'].nunique() if 'NUMBER' in combined_results.columns else 'N/A'}")
    print(f"Total entries: {len(combined_results)}")
    
    # Basic statistics
    if 'LAPS' in combined_results.columns:
        print(f"\nLaps Statistics:")
        print(combined_results['LAPS'].describe())
    
    if 'FL_TIME_seconds' in combined_results.columns:
        print(f"\nFastest Lap Time Statistics (seconds):")
        print(combined_results['FL_TIME_seconds'].describe())
else:
    print("No results data found")


Combined Results Statistics:
Total races: 8
Total drivers: 36
Total entries: 212

Laps Statistics:
count    212.000000
mean      20.363208
std        6.253353
min        0.000000
25%       17.000000
50%       21.000000
75%       26.000000
max       28.000000
Name: LAPS, dtype: float64

Fastest Lap Time Statistics (seconds):
count    205.000000
mean     121.584688
std       22.000403
min       97.304000
25%      100.322000
50%      128.497000
75%      148.693000
max      153.790000
Name: FL_TIME_seconds, dtype: float64


In [7]:
# Visualization: Lap distribution by venue
if all_results and 'LAPS' in combined_results.columns:
    fig = px.box(
        combined_results,
        x='venue',
        y='LAPS',
        color='race',
        title="Laps Completed Distribution by Venue",
        labels={'LAPS': 'Number of Laps', 'venue': 'Venue'}
    )
    fig.update_layout(height=500)
    show_plotly_fig(fig, "laps_distribution_by_venue.html")


## 4. Driver Performance Distributions


In [8]:
# Load best laps data
all_best_laps = {}
for venue in loader.venues:
    for race in ["Race 1", "Race 2"]:
        best_laps = loader.load_best_laps(venue, race)
        if not best_laps.empty:
            key = f"{venue}_{race}"
            all_best_laps[key] = best_laps

if all_best_laps:
    combined_best_laps = pd.concat(all_best_laps.values(), ignore_index=True)
    print(f"Loaded best laps data from {len(all_best_laps)} race(s)")
    print(f"Total drivers: {combined_best_laps['NUMBER'].nunique() if 'NUMBER' in combined_best_laps.columns else 'N/A'}")
else:
    print("No best laps data found")


INFO:src.data_processing.data_loader:Loaded best laps: 22 rows from 99_Best 10 Laps By Driver_Race 1_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded best laps: 22 rows from 99_Best 10 Laps By Driver_Race 2_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded best laps: 31 rows from 99_Best 10 Laps By Driver_Race 1_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded best laps: 31 rows from 99_Best 10 Laps By Driver_ Race 2_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded best laps: 29 rows from 99_Best 10 Laps By Driver_Race 1.CSV
INFO:src.data_processing.data_loader:Loaded best laps: 29 rows from 99_Best 10 Laps By Driver_Race 2.CSV
INFO:src.data_processing.data_loader:Loaded best laps: 23 rows from 99_Best 10 Laps By Driver_Race 1_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded best laps: 21 rows from 99_Best 10 Laps By Driver_Race 2_Anonymized.CSV


Loaded best laps data from 8 race(s)
Total drivers: 36


In [9]:
# Visualization: Best lap times distribution
if all_best_laps and 'BESTLAP_1_seconds' in combined_best_laps.columns:
    fig = px.histogram(
        combined_best_laps,
        x='BESTLAP_1_seconds',
        color='venue',
        nbins=30,
        title="Distribution of Best Lap Times by Venue",
        labels={'BESTLAP_1_seconds': 'Best Lap Time (seconds)', 'count': 'Number of Drivers'}
    )
    fig.update_layout(height=500)
    show_plotly_fig(fig, "best_lap_times_distribution.html")


## 5. Weather Patterns Across Venues


In [10]:
# Load weather data for all venues
all_weather = {}
for venue in loader.venues:
    for race in ["Race 1", "Race 2"]:
        weather = loader.load_weather_data(venue, race)
        if not weather.empty:
            key = f"{venue}_{race}"
            all_weather[key] = weather

if all_weather:
    combined_weather = pd.concat(all_weather.values(), ignore_index=True)
    print(f"Loaded weather data from {len(all_weather)} race(s)")
    print(f"\nWeather Statistics:")
    print("=" * 80)
    
    if 'AIR_TEMP' in combined_weather.columns:
        print("\nAir Temperature:")
        print(combined_weather['AIR_TEMP'].describe())
    
    if 'TRACK_TEMP' in combined_weather.columns:
        print("\nTrack Temperature:")
        print(combined_weather['TRACK_TEMP'].describe())
    
    if 'HUMIDITY' in combined_weather.columns:
        print("\nHumidity:")
        print(combined_weather['HUMIDITY'].describe())
    
    if 'WIND_SPEED' in combined_weather.columns:
        print("\nWind Speed:")
        print(combined_weather['WIND_SPEED'].describe())
else:
    print("No weather data found")


INFO:src.data_processing.data_loader:Loaded weather: 43 rows from 26_Weather_Race 1_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded weather: 44 rows from 26_Weather_Race 2_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded weather: 44 rows from 26_Weather_Race 1_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded weather: 45 rows from 26_Weather_ Race 2_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded weather: 45 rows from 26_Weather_Race 1.CSV
INFO:src.data_processing.data_loader:Loaded weather: 43 rows from 26_Weather_Race 2.CSV
INFO:src.data_processing.data_loader:Loaded weather: 44 rows from 26_Weather_Race 1_Anonymized.CSV
INFO:src.data_processing.data_loader:Loaded weather: 43 rows from 26_Weather_Race 2_Anonymized.CSV


Loaded weather data from 8 race(s)

Weather Statistics:

Air Temperature:
count    351.000000
mean      25.832365
std        7.027824
min       11.520000
25%       21.230000
50%       28.510000
75%       30.905000
max       34.190000
Name: AIR_TEMP, dtype: float64

Track Temperature:
count    351.000000
mean      25.678348
std       18.687994
min        0.000000
25%       13.900000
50%       39.000000
75%       42.200000
max       50.800000
Name: TRACK_TEMP, dtype: float64

Humidity:
count    351.000000
mean      62.708604
std       10.295905
min       43.670000
25%       56.955000
50%       60.950000
75%       68.130000
max       83.150000
Name: HUMIDITY, dtype: float64

Wind Speed:
count    351.000000
mean       7.416410
std        4.666854
min        0.360000
25%        3.960000
50%        6.120000
75%       10.440000
max       23.760000
Name: WIND_SPEED, dtype: float64


In [11]:
# Visualization: Weather comparison across venues
if all_weather and 'AIR_TEMP' in combined_weather.columns:
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Air Temperature', 'Track Temperature', 'Humidity', 'Wind Speed'),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )
    
    # Air Temperature
    for venue in combined_weather['venue'].unique():
        venue_data = combined_weather[combined_weather['venue'] == venue]
        fig.add_trace(
            go.Box(y=venue_data['AIR_TEMP'], name=venue),
            row=1, col=1
        )
    
    # Track Temperature
    for venue in combined_weather['venue'].unique():
        venue_data = combined_weather[combined_weather['venue'] == venue]
        fig.add_trace(
            go.Box(y=venue_data['TRACK_TEMP'], name=venue, showlegend=False),
            row=1, col=2
        )
    
    # Humidity
    for venue in combined_weather['venue'].unique():
        venue_data = combined_weather[combined_weather['venue'] == venue]
        fig.add_trace(
            go.Box(y=venue_data['HUMIDITY'], name=venue, showlegend=False),
            row=2, col=1
        )
    
    # Wind Speed
    for venue in combined_weather['venue'].unique():
        venue_data = combined_weather[combined_weather['venue'] == venue]
        fig.add_trace(
            go.Box(y=venue_data['WIND_SPEED'], name=venue, showlegend=False),
            row=2, col=2
        )
    
    fig.update_layout(height=800, title_text="Weather Patterns Across Venues")
    show_plotly_fig(fig, "weather_patterns_across_venues.html")


## 6. Telemetry Parameter Availability


In [12]:
# Load telemetry samples
telemetry_samples = {}
for venue in loader.venues:
    for race in ["Race 1", "Race 2"]:
        try:
            telemetry = loader.load_telemetry(venue, race, sample_size=10000)
            if not telemetry.empty:
                key = f"{venue}_{race}"
                telemetry_samples[key] = telemetry
        except Exception as e:
            print(f"Error loading telemetry for {venue} {race}: {e}")

if telemetry_samples:
    print(f"Loaded telemetry samples from {len(telemetry_samples)} race(s)")
    
    # Get unique parameters
    all_params = set()
    for telemetry in telemetry_samples.values():
        if 'parameter_name' in telemetry.columns:
            all_params.update(telemetry['parameter_name'].unique())
    
    print(f"\nUnique telemetry parameters found: {len(all_params)}")
    print("\nParameters:")
    for param in sorted(all_params):
        print(f"  - {param}")
else:
    print("No telemetry data found")


INFO:src.data_processing.data_loader:Loaded telemetry sample: 10000 rows from R1_barber_telemetry_data.csv
INFO:src.data_processing.data_loader:Loaded telemetry sample: 10000 rows from R2_barber_telemetry_data.csv
INFO:src.data_processing.data_loader:Loaded telemetry sample: 10000 rows from R1_cota_telemetry_data.csv
INFO:src.data_processing.data_loader:Loaded telemetry sample: 10000 rows from R2_cota_telemetry_data.csv
INFO:src.data_processing.data_loader:Loaded telemetry sample: 10000 rows from R1_indianapolis_motor_speedway_telemetry.csv
INFO:src.data_processing.data_loader:Loaded telemetry sample: 10000 rows from R2_indianapolis_motor_speedway_telemetry.csv


INFO:src.data_processing.data_loader:Loaded telemetry sample: 10000 rows from R1_vir_telemetry_data.csv
INFO:src.data_processing.data_loader:Loaded telemetry sample: 10000 rows from R2_vir_telemetry_data.csv


Loaded telemetry samples from 8 race(s)

Unique telemetry parameters found: 0

Parameters:


In [13]:
# Visualization: Parameter frequency
if telemetry_samples and 'parameter_name' in list(telemetry_samples.values())[0].columns:
    param_counts = {}
    for telemetry in telemetry_samples.values():
        param_counts_race = telemetry['parameter_name'].value_counts()
        for param, count in param_counts_race.items():
            param_counts[param] = param_counts.get(param, 0) + count
    
    param_df = pd.DataFrame(list(param_counts.items()), columns=['Parameter', 'Count'])
    param_df = param_df.sort_values('Count', ascending=False)
    
    fig = px.bar(
        param_df.head(20),
        x='Parameter',
        y='Count',
        title="Top 20 Most Frequent Telemetry Parameters",
        labels={'Count': 'Number of Measurements'}
    )
    fig.update_xaxes(tickangle=45)
    fig.update_layout(height=600)
    show_plotly_fig(fig, "telemetry_parameter_frequency.html")


## 7. Summary and Key Insights


In [14]:
print("=" * 80)
print("EDA SUMMARY")
print("=" * 80)
print()
print("Key Findings:")
print("1. Data completeness across venues")
print("2. Driver performance distributions")
print("3. Weather patterns and variations")
print("4. Telemetry parameter availability")
print()
print("Next Steps:")
print("- Phase 1: Track DNA Extraction")
print("- Phase 2: Driver Transfer Learning")
print("- Phase 3: Track-Specific AI Coaches")


EDA SUMMARY

Key Findings:
1. Data completeness across venues
2. Driver performance distributions
3. Weather patterns and variations
4. Telemetry parameter availability

Next Steps:
- Phase 1: Track DNA Extraction
- Phase 2: Driver Transfer Learning
- Phase 3: Track-Specific AI Coaches
