# Exploratory Data Analysis - Part 1

Analyzing grid position advantage and race outcome patterns in F1 (2018-2024).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path

# Plot appearance: seaborn theme and palette first, then matplotlib defaults
sns.set_style("whitegrid")
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

%matplotlib inline

## Load Processed Data

In [None]:
# Load the cleaned dataset written by the data cleaning notebook.
data_path = Path('../data/processed/processed_race_data.csv')

# Fail fast: every later cell reads df_all, so a missing file should stop the
# run here with a clear message instead of surfacing later as a NameError.
if not data_path.exists():
    raise FileNotFoundError(
        f"ERROR: {data_path} not found. Run the data cleaning notebook first."
    )

df_all = pd.read_csv(data_path)
print(f"Loaded: {data_path}")
print(f"Total records: {len(df_all):,}")

In [None]:
# Keep an independent copy of the rows flagged as having completed the race
finished_mask = df_all['completed_race'].eq(True)
df_finished = df_all.loc[finished_mask].copy()

n_all = len(df_all)
n_finished = len(df_finished)
n_dnf = n_all - n_finished

print("Dataset Split:")
print(f"All records: {n_all:,}")
print(f"Finished races: {n_finished:,}")
print(f"DNFs: {n_dnf:,}")
print(f"DNF rate: {(n_dnf / n_all) * 100:.1f}%")

In [None]:
# Peek at the first rows, then list the column names
print("\nData overview:", df_finished.head(), sep="\n")
print("\nColumns available:", df_finished.columns.tolist(), sep="\n")

## Position Change Analysis

In [None]:
# The cleaning step is expected to provide position_change; derive it as
# grid slot minus finishing position only when the column is absent
if 'position_change' not in df_finished:
    df_finished['position_change'] = df_finished['GridPosition'] - df_finished['Position']

pos_change = df_finished['position_change']

# Summary statistics of the position change column
print("Position Change Statistics:")
print(f"Mean: {pos_change.mean():.2f}")
print(f"Median: {pos_change.median():.1f}")
print(f"Std Dev: {pos_change.std():.2f}")
print(f"\n95th percentile gain: {pos_change.quantile(0.95):.0f}")
print(f"5th percentile loss: {pos_change.quantile(0.05):.0f}")
print(f"\nMax gain: {pos_change.max():.0f}")
print(f"Max loss: {pos_change.min():.0f}")

In [None]:
# Count finishers by the sign of their position change
change = df_finished['position_change']
improved = change.gt(0).sum()
maintained = change.eq(0).sum()
declined = change.lt(0).sum()
total = len(df_finished)

print("Race Outcome Distribution:")
outcome_rows = (
    ("Improved position", improved),
    ("Maintained position", maintained),
    ("Lost position", declined),
)
for outcome_label, outcome_count in outcome_rows:
    print(f"{outcome_label}: {outcome_count} ({outcome_count/total*100:.1f}%)")

In [None]:
# Histogram of position changes.
# 40 bins over the range -20..20 give one bin per whole position; the x-axis
# label states the sign convention (Grid - Finish).
fig, ax = plt.subplots(figsize=(12, 6))

df_finished['position_change'].hist(bins=40, range=(-20, 20), 
                                     edgecolor='black', ax=ax)
# Vertical marker at zero separates the two signs of position change
ax.axvline(0, color='red', linestyle='--', linewidth=2, label='No change')
ax.set_xlabel('Position Change (Grid - Finish)', fontweight='bold')
ax.set_ylabel('Frequency', fontweight='bold')
ax.set_title('Distribution of Position Changes', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Position change by starting grid: one box per distinct GridPosition value
fig, ax = plt.subplots(figsize=(14, 7))

# Collect the position_change values for each grid slot, in ascending slot order
grid_positions = sorted(df_finished['GridPosition'].unique())
data_for_box = [df_finished[df_finished['GridPosition'] == p]['position_change'].values 
                for p in grid_positions]

# positions= places each box at its grid slot on the x-axis;
# patch_artist=True makes the boxes fillable so they can be coloured below
bp = ax.boxplot(data_for_box, positions=grid_positions, widths=0.6, 
                patch_artist=True, showfliers=True)

for patch in bp['boxes']:
    patch.set_facecolor('lightblue')
    patch.set_alpha(0.7)

ax.axhline(0, color='red', linestyle='--', alpha=0.5, label='No change')
ax.set_xlabel('Starting Grid Position', fontweight='bold')
ax.set_ylabel('Position Change', fontweight='bold')
ax.set_title('Position Change by Starting Grid Position', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# NOTE(review): this statement is a hard-coded string, not derived from the
# data in this cell -- confirm it against the plot before relying on it.
print("\nKey insight: Middle grid positions show highest variance.")

# Share of finishers whose absolute position change is at most 3.
# `total` is the finisher count defined in the outcome-distribution cell.
within_3 = (df_finished['position_change'].abs() <= 3).sum()
within_3_pct = (within_3 / total) * 100
print(f"\nDrivers finishing within ±3 positions: {within_3} ({within_3_pct:.1f}%)")

In [None]:
# Heatmap: grid position vs position change magnitude.
# Cross-tabulate grid slot against position change binned into unit-wide
# intervals with edges at -15..15.
# NOTE(review): values outside the bin edges get no bin from pd.cut and so
# should drop out of the table -- confirm this truncation is intended.
heatmap_data = pd.crosstab(
    df_finished['GridPosition'],
    pd.cut(df_finished['position_change'], bins=range(-15, 16, 1))
)

fig, ax = plt.subplots(figsize=(14, 8))
# Transposed so grid slot runs along x and the position-change bins along y
sns.heatmap(heatmap_data.T, cmap='RdYlGn', center=0, cbar_kws={'label': 'Frequency'}, ax=ax)
ax.set_xlabel('Grid Position', fontweight='bold')
ax.set_ylabel('Position Change', fontweight='bold')
ax.set_title('Heatmap: Grid Position vs Position Change', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## Grid Position Statistics

In [None]:
def calculate_grid_stats(df, max_grid_position=20):
    """
    Calculate comprehensive statistics for each grid position.

    Parameters
    ----------
    df : pandas.DataFrame
        Race results containing the columns 'GridPosition', 'Position'
        and 'position_change'.
    max_grid_position : int, default 20
        Highest grid slot to report. Slots 1..max_grid_position are scanned;
        rows whose GridPosition falls outside that range are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per grid slot that has at least one entry, with the columns
        grid_position, count, avg_finish, median_finish, std_finish,
        best_finish, worst_finish, win_rate, podium_rate, points_rate
        (rates in percent) and avg_pos_change. The columns are present even
        when no slot has data, so callers can index them unconditionally.
    """
    columns = [
        'grid_position', 'count', 'avg_finish', 'median_finish', 'std_finish',
        'best_finish', 'worst_finish', 'win_rate', 'podium_rate',
        'points_rate', 'avg_pos_change',
    ]
    rows = []

    for grid_pos in range(1, max_grid_position + 1):
        grid_data = df[df['GridPosition'] == grid_pos]

        # Slots nobody started from are skipped rather than reported as NaN
        if len(grid_data) == 0:
            continue

        finish = grid_data['Position']
        rows.append({
            'grid_position': grid_pos,
            'count': len(grid_data),
            'avg_finish': finish.mean(),
            'median_finish': finish.median(),
            'std_finish': finish.std(),
            'best_finish': finish.min(),
            'worst_finish': finish.max(),
            # Mean of a boolean mask is the fraction of True values
            'win_rate': (finish == 1).mean() * 100,
            'podium_rate': (finish <= 3).mean() * 100,
            'points_rate': (finish <= 10).mean() * 100,
            'avg_pos_change': grid_data['position_change'].mean(),
        })

    # Passing columns= keeps the schema stable for empty results
    return pd.DataFrame(rows, columns=columns)

# Build the per-grid-slot table from finished races and display it
grid_stats = calculate_grid_stats(df_finished)
print("Grid Position Statistics:", grid_stats.to_string(index=False), sep="\n")

In [None]:
# Correlation between starting slot and finishing position, plus its square
correlation = df_finished['GridPosition'].corr(df_finished['Position'])
r_squared = correlation ** 2

print(f"\nCorrelation (Grid vs Finish): {correlation:.3f}")
print(f"Interpretation: {r_squared:.1%} of finish position variance explained by grid position")

In [None]:
# Average finish by grid position, drawn against the y = x reference line
fig, ax = plt.subplots(figsize=(12, 7))

# Plot actual average finish
ax.plot(grid_stats['grid_position'], grid_stats['avg_finish'], 
        marker='o', linewidth=2, markersize=8, label='Average Finish')

# Reference line (no change): finishing where you started
ax.plot([1, 20], [1, 20], 'r--', alpha=0.5, linewidth=2, label='No Change (y=x)')

# Shaded band of +/- one standard error of the mean (std / sqrt(count))
stderr = grid_stats['std_finish'] / np.sqrt(grid_stats['count'])
ax.fill_between(grid_stats['grid_position'], 
                grid_stats['avg_finish'] - stderr,
                grid_stats['avg_finish'] + stderr,
                alpha=0.2)

ax.set_xlabel('Starting Grid Position', fontweight='bold')
ax.set_ylabel('Average Finish Position', fontweight='bold')
ax.set_title('Grid Position vs Average Finish', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
# Same limits on both axes so the y = x line runs corner to corner
ax.set_xlim(0, 21)
ax.set_ylim(0, 21)

plt.tight_layout()
plt.show()

In [None]:
# Four-panel overview of the per-grid-slot statistics
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_avg, ax_std), (ax_win, ax_pts) = axes
slots = grid_stats['grid_position']

# Top-left: mean finishing position with the y = x reference line
ax_avg.plot(slots, grid_stats['avg_finish'], marker='o', linewidth=2, markersize=6)
ax_avg.plot([1, 20], [1, 20], 'r--', alpha=0.5)
ax_avg.set_xlabel('Grid Position')
ax_avg.set_ylabel('Average Finish')
ax_avg.set_title('Average Finish by Grid Position', fontweight='bold')
ax_avg.grid(alpha=0.3)

# Top-right: spread of finishing positions per slot
ax_std.bar(slots, grid_stats['std_finish'], color='orange', edgecolor='black')
ax_std.set_xlabel('Grid Position')
ax_std.set_ylabel('Std Dev of Finish Position')
ax_std.set_title('Consistency by Grid Position', fontweight='bold')
ax_std.grid(alpha=0.3, axis='y')

# Bottom-left: win rate, coloured by slot band (1-3, 4-6, 7 and beyond)
colors = []
for slot in slots:
    if slot <= 3:
        colors.append('green')
    elif slot <= 6:
        colors.append('yellow')
    else:
        colors.append('red')
ax_win.bar(slots, grid_stats['win_rate'], color=colors, edgecolor='black')
ax_win.set_xlabel('Grid Position')
ax_win.set_ylabel('Win Rate (%)')
ax_win.set_title('Win Probability by Grid Position', fontweight='bold')
ax_win.grid(alpha=0.3, axis='y')

# Bottom-right: share of top-10 finishes per slot
ax_pts.bar(slots, grid_stats['points_rate'], color='purple', edgecolor='black', alpha=0.7)
ax_pts.set_xlabel('Grid Position')
ax_pts.set_ylabel('Points Rate (%)')
ax_pts.set_title('Points Finish Probability by Grid Position', fontweight='bold')
ax_pts.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## Win Probability Analysis

In [None]:
# Finishers who started from grid slot 1, and how many of them won
pole_starts = df_finished.loc[df_finished['GridPosition'].eq(1)]
pole_wins = pole_starts['Position'].eq(1).sum()
pole_win_rate = (pole_wins / len(pole_starts)) * 100

print("Pole Position Analysis:")
print(f"Pole starts: {len(pole_starts)}")
print(f"Wins from pole: {pole_wins}")
print(f"Pole-to-win conversion: {pole_win_rate:.1f}%")

In [None]:
# Pole-to-win record for each season present in the finished-race data
def _pole_record(season):
    """Return the pole starts, wins and win rate (percent) for one season."""
    season_poles = df_finished[(df_finished['year'] == season)
                               & (df_finished['GridPosition'] == 1)]
    n_poles = len(season_poles)
    n_wins = (season_poles['Position'] == 1).sum()
    # A season without any pole rows reports a rate of 0
    win_rate = (n_wins / n_poles * 100) if n_poles > 0 else 0
    return {'year': season, 'win_rate': win_rate, 'wins': n_wins, 'poles': n_poles}


pole_by_year = [_pole_record(season) for season in sorted(df_finished['year'].unique())]

pole_df = pd.DataFrame(pole_by_year)
print("\nPole-to-win rate by year:")
print(pole_df.to_string(index=False))

In [None]:
# Win distribution across grid positions.
# Wins and starts are both taken from the same groupby so the rows always
# line up, whatever GridPosition values occur (a table restricted to slots
# 1-20 can have a different number of rows than this one).
is_win = df_finished['Position'].eq(1).astype(int)
wins_grouped = is_win.groupby(df_finished['GridPosition'])

wins_by_grid = wins_grouped.sum().reset_index(name='wins')
starts_per_slot = wins_grouped.size().to_numpy()

total_wins = wins_by_grid['wins'].sum()
wins_by_grid['win_probability'] = (wins_by_grid['wins'] / starts_per_slot) * 100
wins_by_grid['cumulative_pct'] = (wins_by_grid['wins'].cumsum() / total_wins) * 100

print("\nWins by starting position:")
print(wins_by_grid.to_string(index=False))

# Select P1-P3 / P1-P5 by grid slot value rather than by row position, so a
# row for a slot below 1 cannot shift which slots are counted.
slot = wins_by_grid['GridPosition']
top3_wins = wins_by_grid.loc[slot.between(1, 3), 'wins'].sum()
top5_wins = wins_by_grid.loc[slot.between(1, 5), 'wins'].sum()

print("\nCumulative stats:")
print(f"P1-P3 account for: {top3_wins / total_wins * 100:.1f}% of wins")
print(f"P1-P5 account for: {top5_wins / total_wins * 100:.1f}% of wins")

In [None]:
# Wins taken from grid slot 6 or further back
won_race = df_finished['Position'] == 1
started_p6_or_lower = df_finished['GridPosition'] >= 6
exceptional_wins = df_finished[won_race & started_p6_or_lower]

print(f"\nExceptional Wins (from P6 or lower): {len(exceptional_wins)}")
if len(exceptional_wins):
    detail_columns = ['year', 'race_name', 'FullName', 'TeamName',
                      'GridPosition', 'Position']
    print("\nDetails:")
    print(exceptional_wins[detail_columns].to_string(index=False))

In [None]:
# Win vs Podium probability comparison: paired bars per grid slot
fig, ax = plt.subplots(figsize=(14, 7))

x = grid_stats['grid_position']
width = 0.35

# Offset each series by half a bar width so the pair sits side by side
# around the grid slot's x coordinate
ax.bar(x - width/2, grid_stats['win_rate'], width, 
       label='Win Rate', color='gold', edgecolor='black')
ax.bar(x + width/2, grid_stats['podium_rate'], width, 
       label='Podium Rate', color='silver', edgecolor='black')

ax.set_xlabel('Grid Position', fontweight='bold')
ax.set_ylabel('Probability (%)', fontweight='bold')
ax.set_title('Win vs Podium Probability by Grid Position', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## DNF Analysis

In [None]:
# DNF share across every entry in the full dataset
total_entries = len(df_all)
dnf_count = df_all['is_dnf'].eq(True).sum()
dnf_rate_overall = (dnf_count / total_entries) * 100

print("Overall DNF Statistics:")
print(f"Total entries: {total_entries:,}")
print(f"DNFs: {dnf_count:,}")
print(f"DNF rate: {dnf_rate_overall:.1f}%")

In [None]:
# DNF by grid position: per-slot sum and count of the is_dnf flag
dnf_by_grid = df_all.groupby('GridPosition').agg({
    'is_dnf': ['sum', 'count']
}).reset_index()
# Flatten the two-level column index produced by the multi-function agg
dnf_by_grid.columns = ['grid_position', 'dnf_count', 'total_starts']
dnf_by_grid['dnf_rate'] = (dnf_by_grid['dnf_count'] / dnf_by_grid['total_starts']) * 100

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(dnf_by_grid['grid_position'], dnf_by_grid['dnf_rate'], 
       color='crimson', edgecolor='black', alpha=0.7)
# Horizontal reference at the dataset-wide DNF rate (dnf_rate_overall is
# computed in the overall DNF statistics cell)
ax.axhline(dnf_rate_overall, color='blue', linestyle='--', 
           linewidth=2, label=f'Overall avg: {dnf_rate_overall:.1f}%')
ax.set_xlabel('Grid Position', fontweight='bold')
ax.set_ylabel('DNF Rate (%)', fontweight='bold')
ax.set_title('DNF Rate by Starting Grid Position', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

In [None]:
# Chi-square test for DNF uniformity across grid positions.
# Comparing a rate between groups needs the full 2 x k contingency table
# (DNF vs. non-DNF counts per grid position). A goodness-of-fit test on the
# DNF counts alone leaves out the non-DNF cells and understates the statistic.
dnf_counts = dnf_by_grid['dnf_count'].to_numpy(dtype=float)
non_dnf_counts = dnf_by_grid['total_starts'].to_numpy(dtype=float) - dnf_counts
observed = np.vstack([dnf_counts, non_dnf_counts])

# `expected` holds the cell counts implied by one common DNF rate for all slots
chi2, p_value, dof, expected = stats.chi2_contingency(observed, correction=False)

print("\nChi-square test for DNF uniformity:")
print(f"Chi-square statistic: {chi2:.2f}")
print(f"P-value: {p_value:.4f}")
if p_value < 0.05:
    print("Result: DNF rate varies significantly by grid position")
else:
    print("Result: DNF rate is roughly uniform across grid positions")

In [None]:
# DNF evolution over years: per-season sum and count of the is_dnf flag
dnf_by_year = df_all.groupby('year').agg({
    'is_dnf': ['sum', 'count']
}).reset_index()
# Flatten the two-level column index produced by the multi-function agg
dnf_by_year.columns = ['year', 'dnf_count', 'total']
dnf_by_year['dnf_rate'] = (dnf_by_year['dnf_count'] / dnf_by_year['total']) * 100

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(dnf_by_year['year'], dnf_by_year['dnf_rate'], 
        marker='o', linewidth=2, markersize=8)

# Add trend line: degree-1 least-squares fit of DNF rate against year
z = np.polyfit(dnf_by_year['year'], dnf_by_year['dnf_rate'], 1)
p = np.poly1d(z)
ax.plot(dnf_by_year['year'], p(dnf_by_year['year']), 
        "r--", alpha=0.5, linewidth=2, label='Trend')

# Annotate regulation change with a vertical marker at 2022; the label is
# placed at 95% of the current upper y-limit
ax.axvline(2022, color='green', linestyle=':', linewidth=2, alpha=0.7)
ax.text(2022, ax.get_ylim()[1] * 0.95, 'New regs', 
        ha='center', fontweight='bold', color='green')

ax.set_xlabel('Year', fontweight='bold')
ax.set_ylabel('DNF Rate (%)', fontweight='bold')
ax.set_title('DNF Rate Evolution (2018-2024)', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("\nDNF rate by year:")
print(dnf_by_year.to_string(index=False))

In [None]:
# DNF by circuit: per-circuit sum and count of the is_dnf flag
dnf_by_circuit = df_all.groupby('circuit').agg({
    'is_dnf': ['sum', 'count']
}).reset_index()
# Flatten the two-level column index produced by the multi-function agg
dnf_by_circuit.columns = ['circuit', 'dnf_count', 'total']
dnf_by_circuit['dnf_rate'] = (dnf_by_circuit['dnf_count'] / dnf_by_circuit['total']) * 100
# Ascending sort: lowest rates come first, highest last. The head/tail
# selections below and the summary cell's iloc[0] / iloc[-1] rely on this order.
dnf_by_circuit = dnf_by_circuit.sort_values('dnf_rate', ascending=True)

# Show top and bottom 10
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Highest DNF circuits (last 10 rows of the ascending sort)
top_dnf = dnf_by_circuit.tail(10)
axes[0].barh(top_dnf['circuit'], top_dnf['dnf_rate'], color='red', edgecolor='black', alpha=0.7)
axes[0].set_xlabel('DNF Rate (%)', fontweight='bold')
axes[0].set_title('Top 10 Highest DNF Circuits', fontweight='bold')
axes[0].grid(alpha=0.3, axis='x')

# Lowest DNF circuits (first 10 rows of the ascending sort)
bottom_dnf = dnf_by_circuit.head(10)
axes[1].barh(bottom_dnf['circuit'], bottom_dnf['dnf_rate'], color='green', edgecolor='black', alpha=0.7)
axes[1].set_xlabel('DNF Rate (%)', fontweight='bold')
axes[1].set_title('Top 10 Safest Circuits', fontweight='bold')
axes[1].grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## Key Findings Summary

In [None]:
# Recap of the headline numbers computed in the cells above.
print("="*70)
print("EXPLORATORY ANALYSIS - KEY FINDINGS")
print("="*70)

print("\n1. POSITION CHANGE:")
print(f"   - Mean change: {df_finished['position_change'].mean():.2f} positions")
print(f"   - {improved/total*100:.1f}% improve, {declined/total*100:.1f}% decline")
print(f"   - Typical volatility: ±{df_finished['position_change'].std():.1f} positions")

print("\n2. GRID ADVANTAGE:")
print(f"   - Grid-finish correlation: {correlation:.3f}")
print(f"   - {correlation**2*100:.1f}% of variance explained by grid position")
print(f"   - Pole-to-win rate: {pole_win_rate:.1f}%")

print("\n3. WIN DISTRIBUTION:")
# Pick P1-P3 by grid slot value, not row position: the first three rows of
# wins_by_grid are only P1-P3 when no smaller GridPosition value is present.
p1_p3_wins = wins_by_grid.loc[wins_by_grid['GridPosition'].between(1, 3), 'wins'].sum()
print(f"   - P1-P3 wins: {p1_p3_wins / total_wins * 100:.1f}%")
print(f"   - Exceptional wins (P6+): {len(exceptional_wins)}")

print("\n4. DNF PATTERNS:")
print(f"   - Overall DNF rate: {dnf_rate_overall:.1f}%")
# dnf_by_circuit is sorted by ascending dnf_rate in the circuit cell, so the
# last row is the highest-rate circuit and the first row the lowest.
highest_dnf = dnf_by_circuit.iloc[-1]
lowest_dnf = dnf_by_circuit.iloc[0]
print(f"   - Highest DNF circuit: {highest_dnf['circuit']} ({highest_dnf['dnf_rate']:.1f}%)")
print(f"   - Safest circuit: {lowest_dnf['circuit']} ({lowest_dnf['dnf_rate']:.1f}%)")

print("\n" + "="*70)