# Circuit Analysis - Grid Position Impact by Track

This notebook analyzes how individual circuits affect overtaking opportunities and the advantage conferred by grid position.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path

# Notebook-wide plot styling: seaborn grid style and palette, then the
# default figure size and font size for every matplotlib figure.
sns.set_style("whitegrid")
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

%matplotlib inline

## Load Data

In [None]:
# Read the processed race data and keep a separate copy of the rows whose
# `completed_race` flag is true; both frames are used by the cells below.
data_path = Path('../data/processed/processed_race_data.csv')

df_all = pd.read_csv(data_path)
finished_mask = df_all['completed_race'].eq(True)
df_finished = df_all.loc[finished_mask].copy()

# Quick sanity summary of what was loaded.
summary_lines = [
    f"Total records: {len(df_all):,}",
    f"Finished races: {len(df_finished):,}",
    f"Unique circuits: {df_all['circuit'].nunique()}",
    f"Years: {sorted(df_all['year'].unique())}",
]
print("\n".join(summary_lines))

## Circuit-Level Statistics

In [None]:
def _circuit_summary(circuit, entries, finishers):
    """Build one row of per-circuit statistics.

    Args:
        circuit: Circuit identifier; stored unchanged in the ``circuit`` field.
        entries: Every row of the full race data for this circuit.
        finishers: The rows of the finished-only data for this circuit.

    Returns:
        dict: One key per output column of the circuit statistics table.
        Metrics that cannot be computed because the relevant subset is empty
        are reported as 0.
    """
    # Basic stats.
    # A race event is a distinct (year, race_name) pair. Counting distinct
    # race names alone collapses every edition of a recurring event into a
    # single race whenever the name does not embed the year, which inflates
    # the per-race rates below (top3_win_rate could exceed 100%).
    num_races = len(entries[['year', 'race_name']].dropna().drop_duplicates())
    total_entries = len(entries)
    years_active = len(entries['year'].unique())

    # Overtaking metrics and grid/finish correlation (finishers only).
    # NOTE(review): a positive position_change is counted as "improved" --
    # confirm the sign convention against the preprocessing step.
    if len(finishers) > 0:
        pos_change = finishers['position_change']
        avg_pos_change = pos_change.abs().mean()
        std_pos_change = pos_change.std()
        variance_pos_change = pos_change.var()
        improved_pct = (pos_change > 0).mean() * 100
        correlation = finishers['GridPosition'].corr(finishers['Position'])
    else:
        avg_pos_change = std_pos_change = variance_pos_change = improved_pct = 0
        correlation = 0

    # Grid position importance: how often the pole sitter converts to a win.
    pole_data = finishers[finishers['GridPosition'] == 1]
    if len(pole_data) > 0:
        pole_wins = (pole_data['Position'] == 1).sum()
        pole_win_rate = pole_wins / len(pole_data) * 100
        avg_finish_from_pole = pole_data['Position'].mean()
    else:
        pole_win_rate = 0
        avg_finish_from_pole = 0

    # Share of races won from one of the first three grid slots.
    top3_grid = finishers[finishers['GridPosition'] <= 3]
    top3_wins = (top3_grid['Position'] == 1).sum()
    top3_win_rate = (top3_wins / num_races * 100) if num_races > 0 else 0

    # Race characteristics: DNFs are counted over all entries, not finishers.
    dnf_count = (entries['is_dnf'] == True).sum()
    dnf_rate = (dnf_count / total_entries * 100) if total_entries > 0 else 0
    avg_dnfs_per_race = dnf_count / num_races if num_races > 0 else 0

    # Top-10 finishes among finishers who started outside the top 10.
    outside_top10 = finishers[finishers['GridPosition'] > 10]
    if len(outside_top10) > 0:
        points_rate = (outside_top10['Position'] <= 10).mean() * 100
    else:
        points_rate = 0

    # Exceptional results: wins from grid 6 or lower, podiums from grid 10 or lower.
    won = finishers['Position'] == 1
    on_podium = finishers['Position'] <= 3
    exceptional_wins = len(finishers[won & (finishers['GridPosition'] >= 6)])
    exceptional_podiums = len(finishers[on_podium & (finishers['GridPosition'] >= 10)])

    return {
        'circuit': circuit,
        'num_races': num_races,
        'total_entries': total_entries,
        'years_active': years_active,
        'avg_pos_change': avg_pos_change,
        'std_pos_change': std_pos_change,
        'variance_pos_change': variance_pos_change,
        'improved_pct': improved_pct,
        'pole_win_rate': pole_win_rate,
        'grid_finish_correlation': correlation,
        'avg_finish_from_pole': avg_finish_from_pole,
        'top3_win_rate': top3_win_rate,
        'avg_dnfs_per_race': avg_dnfs_per_race,
        'dnf_rate': dnf_rate,
        'points_from_p11plus': points_rate,
        'exceptional_wins': exceptional_wins,
        'exceptional_podiums': exceptional_podiums,
    }


# One summary row per circuit, in alphabetical circuit order.
circuit_stats = [
    _circuit_summary(
        circuit,
        df_all[df_all['circuit'] == circuit],
        df_finished[df_finished['circuit'] == circuit],
    )
    for circuit in sorted(df_all['circuit'].unique())
]

circuit_df = pd.DataFrame(circuit_stats)

print("Circuit Statistics Summary:")
print(circuit_df.to_string(index=False))

In [None]:
# Save circuit statistics
# Writes the per-circuit summary table to CSV; index=False omits the
# DataFrame's row index from the file. The path is relative to the
# notebook's working directory.
output_path = Path('../data/processed/circuit_statistics.csv')
circuit_df.to_csv(output_path, index=False)
print(f"\nSaved to: {output_path}")

In [None]:
# Identify extremes
# Prints the circuits at the extremes of the correlation, variance and
# pole-win-rate columns of circuit_df.
banner = "=" * 70
print("\n" + banner)
print("CIRCUIT EXTREMES")
print(banner)

most_processional = circuit_df.loc[circuit_df['grid_finish_correlation'].idxmax()]
print("\nMost processional (highest correlation):")
print(f"  {most_processional['circuit']}: {most_processional['grid_finish_correlation']:.3f}")

most_overtaking = circuit_df.loc[circuit_df['variance_pos_change'].idxmax()]
print("\nMost overtaking (highest variance):")
print(f"  {most_overtaking['circuit']}: variance={most_overtaking['variance_pos_change']:.2f}")

highest_pole_win = circuit_df.loc[circuit_df['pole_win_rate'].idxmax()]
print("\nHighest pole win rate:")
print(f"  {highest_pole_win['circuit']}: {highest_pole_win['pole_win_rate']:.1f}%")

# Circuits at 0% are excluded from the "lowest" lookup: the statistics cell
# also uses 0 as the placeholder when a circuit has no pole-sitter data.
# The filter is applied once, and an empty result is reported instead of
# letting idxmin() raise on an empty Series.
with_pole_wins = circuit_df[circuit_df['pole_win_rate'] > 0]
print("\nLowest pole win rate:")
if with_pole_wins.empty:
    lowest_pole_win = None
    print("  n/a (no circuit has a pole win rate above 0%)")
else:
    lowest_pole_win = with_pole_wins.loc[with_pole_wins['pole_win_rate'].idxmin()]
    print(f"  {lowest_pole_win['circuit']}: {lowest_pole_win['pole_win_rate']:.1f}%")

print("\n" + banner)

## Circuit Comparison Visualizations

In [None]:
# Pole win rate comparison
# Horizontal bar chart of pole-to-win conversion for circuits with a rate
# above 0, plus a reference line at the mean rate over all circuits.
def _pole_rate_color(rate):
    """Map a pole-to-win percentage to its bar color (bands: <40, <70, rest)."""
    if rate < 40:
        return 'green'
    if rate < 70:
        return 'yellow'
    return 'red'


has_pole_win = circuit_df['pole_win_rate'] > 0
sorted_circuits = circuit_df[has_pole_win].sort_values('pole_win_rate', ascending=True)
colors = [_pole_rate_color(rate) for rate in sorted_circuits['pole_win_rate']]

# The reference line averages every circuit, including those not plotted.
overall_avg = circuit_df['pole_win_rate'].mean()
average_label = f'Average: {overall_avg:.1f}%'

fig, ax = plt.subplots(figsize=(10, 12))

ax.barh(
    sorted_circuits['circuit'],
    sorted_circuits['pole_win_rate'],
    color=colors,
    edgecolor='black',
    alpha=0.7,
)
ax.axvline(overall_avg, color='blue', linestyle='--', linewidth=2, label=average_label)

ax.set_xlabel('Pole-to-Win Conversion Rate (%)', fontweight='bold')
ax.set_ylabel('Circuit', fontweight='bold')
ax.set_title('Pole Position Advantage by Circuit', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

In [None]:
# Overtaking Difficulty Index
# Normalize metrics to 0-100 scale
def normalize_metric(series, higher_is_harder=True):
    """Min-max scale a metric onto a 0-100 score.

    Args:
        series: pandas Series of raw metric values.
        higher_is_harder: If True, the largest raw value maps to 100 and the
            smallest to 0. If False, the scale is flipped, so the largest
            raw value maps to 0.

    Returns:
        pandas Series of floats on the same index as ``series``. Missing
        values stay missing.
    """
    min_val = series.min()
    max_val = series.max()
    span = max_val - min_val

    if span == 0:
        # Every value is identical (or there is only one), so the metric
        # cannot rank anything. Dividing by the zero range would turn the
        # whole column into NaN. Use the midpoint instead: it is neutral and
        # unchanged by the direction flip below. Arithmetic on the input
        # keeps its index and any missing values.
        normalized = series * 0.0 + 50.0
    else:
        normalized = (series - min_val) / span * 100

    if not higher_is_harder:
        normalized = 100 - normalized
    return normalized

# Calculate components
# Each entry: (score column to create, source metric, higher_is_harder flag).
# Variance is the only metric where a higher raw value means easier overtaking.
score_specs = [
    ('correlation_score', 'grid_finish_correlation', True),
    ('variance_score', 'variance_pos_change', False),
    ('pole_score', 'pole_win_rate', True),
]
for score_col, source_col, harder in score_specs:
    circuit_df[score_col] = normalize_metric(circuit_df[source_col], harder)

# Combined index: unweighted mean of the three component scores.
component_total = (
    circuit_df['correlation_score']
    + circuit_df['variance_score']
    + circuit_df['pole_score']
)
circuit_df['overtaking_difficulty'] = component_total / 3

sorted_difficulty = circuit_df.sort_values('overtaking_difficulty', ascending=True)


def _difficulty_color(score):
    """Map a difficulty score to its bar color (bands: <40, <60, rest)."""
    if score < 40:
        return 'green'
    if score < 60:
        return 'yellow'
    return 'red'


colors_difficulty = [
    _difficulty_color(score) for score in sorted_difficulty['overtaking_difficulty']
]

fig, ax = plt.subplots(figsize=(10, 12))

ax.barh(
    sorted_difficulty['circuit'],
    sorted_difficulty['overtaking_difficulty'],
    color=colors_difficulty,
    edgecolor='black',
    alpha=0.7,
)

ax.set_xlabel('Overtaking Difficulty Index (0=Easy, 100=Hard)', fontweight='bold')
ax.set_ylabel('Circuit', fontweight='bold')
ax.set_title('Overtaking Difficulty Index by Circuit', fontsize=14, fontweight='bold')
ax.grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

# The frame is sorted ascending, so the first rows are the easiest circuits
# and the last rows the hardest.
ranking = sorted_difficulty[['circuit', 'overtaking_difficulty']]

print("\nTop 5 Easiest Circuits for Overtaking:")
print(ranking.iloc[:5].to_string(index=False))

print("\nTop 5 Hardest Circuits for Overtaking:")
print(ranking.iloc[-5:].to_string(index=False))

In [None]:
# Circuit characteristics scatter plot
# Pole-to-win rate (x) against position-change variance (y), one marker per
# circuit, with marker area proportional to the circuit's race count.
fig, ax = plt.subplots(figsize=(14, 10))

scatter = ax.scatter(circuit_df['pole_win_rate'],
                     circuit_df['variance_pos_change'],
                     s=circuit_df['num_races'] * 50,
                     alpha=0.6, edgecolors='black', linewidths=1.5)

# Label circuits
for _, row in circuit_df.iterrows():
    ax.annotate(row['circuit'],
                (row['pole_win_rate'], row['variance_pos_change']),
                fontsize=9, ha='center', va='bottom')

# Add quadrant lines at the medians of both metrics
median_pole = circuit_df['pole_win_rate'].median()
median_variance = circuit_df['variance_pos_change'].median()

ax.axvline(median_pole, color='gray', linestyle='--', alpha=0.5)
ax.axhline(median_variance, color='gray', linestyle='--', alpha=0.5)

# Quadrant labels
# Positioned in axes-fraction coordinates (0-1 across the plot area).
# Scaling the data limits instead (e.g. xlim[0] * 1.05) only works when the
# limits are positive and away from zero; with a lower limit at or below
# zero the label ends up on or outside the axes edge.
# The lower-left quadrant has no label.
quadrant_labels = [
    (0.95, 0.95, 'Fast but raceable', 'right', 'top'),
    (0.05, 0.95, 'Chaotic', 'left', 'top'),
    (0.95, 0.05, 'Processional', 'right', 'bottom'),
]
for x_frac, y_frac, label, h_align, v_align in quadrant_labels:
    ax.text(x_frac, y_frac, label, transform=ax.transAxes,
            ha=h_align, va=v_align, fontsize=10, style='italic')

ax.set_xlabel('Pole-to-Win Rate (%)', fontweight='bold')
ax.set_ylabel('Position Change Variance', fontweight='bold')
ax.set_title('Circuit Racing Characteristics', fontsize=14, fontweight='bold')
ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# DNF rate by circuit
# Horizontal bar chart of each circuit's DNF percentage, with a reference
# line at the DNF percentage computed over every entry in df_all.
overall_dnf = df_all['is_dnf'].mean() * 100
average_label = f'Average: {overall_dnf:.1f}%'

sorted_dnf = circuit_df.sort_values(by='dnf_rate', ascending=True)

fig, ax = plt.subplots(figsize=(10, 12))

bar_style = {'color': 'crimson', 'edgecolor': 'black', 'alpha': 0.7}
ax.barh(sorted_dnf['circuit'], sorted_dnf['dnf_rate'], **bar_style)

ax.axvline(
    overall_dnf,
    color='blue',
    linestyle='--',
    linewidth=2,
    label=average_label,
)

ax.set_xlabel('DNF Rate (%)', fontweight='bold')
ax.set_ylabel('Circuit', fontweight='bold')
ax.set_title('Circuit Reliability - DNF Rates', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.show()