# Advanced EDA & Statistical Testing

Deep statistical analysis of grid position effects, team performance, and validation testing.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_rel, ttest_ind, pearsonr, spearmanr
from scipy.stats import f_oneway, chi2_contingency, kstest, levene
from pathlib import Path

sns.set_style("whitegrid")
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11

%matplotlib inline

In [None]:
# Load the processed race data and split off the rows flagged as completed.
data_path = Path('../data/processed/processed_race_data.csv')
df_all = pd.read_csv(data_path)

# Explicit comparison against True, then an independent copy so that columns
# added to df_finished later never write through to df_all.
finished_mask = df_all['completed_race'].eq(True)
df_finished = df_all.loc[finished_mask].copy()

seasons = sorted(df_all['year'].unique())
print(f"Total records: {len(df_all):,}")
print(f"Finished races: {len(df_finished):,}")
print(f"Years: {seasons}")

## Grid Side Analysis (Clean vs Dirty)

Odd positions (P1, P3, P5...) are on the racing line (clean). Even positions (P2, P4, P6...) are off-line (dirty).

In [None]:
# Add grid side classification.
# Only real grid slots (GridPosition >= 1) are assigned a side: odd -> 'clean',
# even -> 'dirty'. A GridPosition of 0 or NaN is not a slot on the grid
# (presumably a pit-lane start or missing data -- confirm against the data
# source). A plain `x % 2 == 1` test would file those rows under 'dirty' and
# skew the dirty-side statistics, so they are left as NaN instead and drop out
# of every 'clean' / 'dirty' filter below.
grid_pos = df_finished['GridPosition']
on_grid = grid_pos.notna() & (grid_pos >= 1)
side_labels = pd.Series(
    np.where(grid_pos % 2 == 1, 'clean', 'dirty'),
    index=df_finished.index,
    dtype=object,
)
df_finished['grid_side'] = side_labels.where(on_grid)

print("Grid side distribution:")
print(df_finished['grid_side'].value_counts())

# Make the exclusion visible rather than silent.
off_grid_count = int((~on_grid).sum())
if off_grid_count:
    print(f"Excluded (no valid grid slot): {off_grid_count}")

# Overall comparison
clean_stats = df_finished.loc[df_finished['grid_side'] == 'clean', 'position_change'].describe()
dirty_stats = df_finished.loc[df_finished['grid_side'] == 'dirty', 'position_change'].describe()

print("\nPosition change by grid side:")
print("\nClean side (odd positions):")
print(clean_stats)
print("\nDirty side (even positions):")
print(dirty_stats)

In [None]:
# Paired comparison: for each grid row, compare the odd (clean) slot with the
# even (dirty) slot next to it.
paired_analysis = []

for row in range(1, 11):  # rows 1-10 -> (P1, P2), (P3, P4), ..., (P19, P20)
    dirty_pos = 2 * row        # even slot of this row
    clean_pos = dirty_pos - 1  # odd slot of this row

    clean_data = df_finished.loc[df_finished['GridPosition'] == clean_pos]
    dirty_data = df_finished.loc[df_finished['GridPosition'] == dirty_pos]

    # A row is only reported when both of its slots have data.
    if clean_data.empty or dirty_data.empty:
        continue

    clean_avg_change = clean_data['position_change'].mean()
    dirty_avg_change = dirty_data['position_change'].mean()

    # NOTE(review): "dirty minus clean" is a clean-side advantage only if a
    # larger position_change means positions lost. The sign convention of
    # position_change is not visible in this notebook -- confirm.
    paired_analysis.append({
        'row': row,
        'clean_pos': clean_pos,
        'dirty_pos': dirty_pos,
        'clean_avg_change': clean_avg_change,
        'dirty_avg_change': dirty_avg_change,
        'clean_advantage': dirty_avg_change - clean_avg_change,
        'clean_avg_finish': clean_data['Position'].mean(),
        'dirty_avg_finish': dirty_data['Position'].mean(),
    })

paired_df = pd.DataFrame(paired_analysis)

print("\nPaired grid row analysis:")
print(paired_df.to_string(index=False))

overall_advantage = paired_df['clean_advantage'].mean()
print(f"\nOverall clean side advantage: {overall_advantage:.3f} positions")

In [None]:
# Two-sample t-test on position_change: clean side vs dirty side
clean_changes = df_finished[df_finished['grid_side'] == 'clean']['position_change'].values
dirty_changes = df_finished[df_finished['grid_side'] == 'dirty']['position_change'].values

# The two groups are different rows with different sample sizes, so this is an
# independent (not paired) comparison. Welch's variant (equal_var=False) is
# used because nothing guarantees the two sides share a common variance, which
# the default Student's test assumes.
t_stat, p_value = ttest_ind(clean_changes, dirty_changes, equal_var=False)

print(f"\nStatistical test (clean vs dirty):")
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.4f}")

# The test is two-sided: a small p-value says the sides differ, not which one
# is better. Report the direction separately instead of asserting a
# "clean side advantage".
mean_diff = clean_changes.mean() - dirty_changes.mean()

if p_value < 0.05:
    print("Result: Difference between grid sides is statistically significant")
    print(f"Mean position_change (clean - dirty): {mean_diff:+.3f}")
else:
    print("Result: No statistically significant difference")

In [None]:
# Grid side effect per circuit: mean position_change of each side, their
# difference, and a horizontal bar chart sorted by that difference.
circuit_grid_side = []

for circuit in df_finished['circuit'].unique():
    circuit_data = df_finished.loc[df_finished['circuit'] == circuit]
    side_of_row = circuit_data['grid_side']
    changes = circuit_data['position_change']

    # A side with no rows at this circuit yields NaN here.
    clean = changes[side_of_row == 'clean'].mean()
    dirty = changes[side_of_row == 'dirty'].mean()

    circuit_grid_side.append({
        'circuit': circuit,
        'clean_avg': clean,
        'dirty_avg': dirty,
        'clean_advantage': dirty - clean,
    })

grid_side_df = pd.DataFrame(circuit_grid_side)
grid_side_df = grid_side_df.sort_values('clean_advantage', ascending=False)

# Green for a strictly positive value, red otherwise (zero and NaN included).
colors = (
    grid_side_df['clean_advantage']
    .gt(0)
    .map({True: 'green', False: 'red'})
    .tolist()
)

fig, ax = plt.subplots(figsize=(10, 12))
ax.barh(
    grid_side_df['circuit'],
    grid_side_df['clean_advantage'],
    color=colors,
    edgecolor='black',
    alpha=0.7,
)
ax.axvline(0, color='black', linestyle='-', linewidth=1)

ax.set_title('Grid Side Effect by Circuit', fontsize=14, fontweight='bold')
ax.set_xlabel('Clean Side Advantage (positions)', fontweight='bold')
ax.set_ylabel('Circuit', fontweight='bold')
ax.grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

print("\nTop 5 circuits with strongest clean side advantage:")
top_circuits = grid_side_df[['circuit', 'clean_advantage']].head(5)
print(top_circuits.to_string(index=False))

## Team Performance Analysis

In [None]:
# Team performance metrics.
# Averages and spread come from finished rows (df_finished); the entry count
# and DNF rate come from every row of the team (df_all). Win/podium/points
# rates are finished-row counts divided by that total entry count.
team_stats = []

for team in df_finished['TeamName'].unique():
    team_data = df_finished.loc[df_finished['TeamName'] == team]
    team_all = df_all.loc[df_all['TeamName'] == team]

    num_races = len(team_all)
    finish_pos = team_data['Position']
    avg_grid = team_data['GridPosition'].mean()
    avg_finish = finish_pos.mean()

    result_counts = {
        'win_rate': (finish_pos == 1).sum(),
        'podium_rate': (finish_pos <= 3).sum(),
        'points_rate': (finish_pos <= 10).sum(),
    }
    rates = {
        name: (count / num_races * 100) if num_races > 0 else 0
        for name, count in result_counts.items()
    }

    team_stats.append({
        'team': team,
        'races': num_races,
        'avg_grid': avg_grid,
        'avg_finish': avg_finish,
        'performance_delta': avg_grid - avg_finish,  # Positive = gaining positions
        'consistency': finish_pos.std(),
        'dnf_rate': team_all['is_dnf'].sum() / len(team_all) * 100,
        **rates,
    })

team_df = pd.DataFrame(team_stats)
team_df = team_df.sort_values('performance_delta', ascending=False)

print("Team Performance Analysis:")
print(team_df.to_string(index=False))

In [None]:
# Team performance scatter plot: create the 14x10 figure and the single axes
# that the rest of this cell draws on.
fig, ax = plt.subplots(figsize=(14, 10))

# Color code by performance tier
def get_tier_color(avg_finish):
    """Return the marker colour for an average finishing position.

    Values up to and including 5 map to ``'gold'``, values up to and
    including 10 map to ``'silver'``, and everything else maps to the bronze
    hex code ``'#cd7f32'``. NaN also maps to bronze, because both
    comparisons are false for it.
    """
    tiers = ((5, 'gold'), (10, 'silver'))
    for upper_bound, colour in tiers:
        if avg_finish <= upper_bound:
            return colour
    return '#cd7f32'

# One colour per team, taken from its average finishing position.
colors = [get_tier_color(x) for x in team_df['avg_finish']]

# x = average grid position, y = average finish position; marker area scales
# with the team's number of entries.
scatter = ax.scatter(team_df['avg_grid'], team_df['avg_finish'],
                     s=team_df['races'] * 2, alpha=0.6,
                     c=colors, edgecolors='black', linewidths=1.5)

# Reference line (no change): finish position equals grid position
ax.plot([0, 20], [0, 20], 'r--', alpha=0.5, linewidth=2, label='No change')

# Label teams
for _, row in team_df.iterrows():
    ax.annotate(row['team'], (row['avg_grid'], row['avg_finish']),
                fontsize=9, ha='center', va='bottom')

ax.set_xlabel('Average Grid Position', fontweight='bold')
ax.set_ylabel('Average Finish Position', fontweight='bold')
ax.set_title('Team Performance: Grid vs Finish', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)

# Region labels. Lower numbers are better on both axes, so a point below the
# diagonal (finish < grid, e.g. grid 18 -> finish 2) gained places and a
# point above it (finish > grid, e.g. grid 2 -> finish 18) lost places.
ax.text(18, 2, 'Over-performing', fontsize=11, style='italic', color='green')
ax.text(2, 18, 'Under-performing', fontsize=11, style='italic', color='red')

plt.tight_layout()
plt.show()

In [None]:
# Position change by team: horizontal bars of performance_delta, sorted
# ascending by that value.
sorted_teams = team_df.sort_values('performance_delta')

# Green for a strictly positive delta, red otherwise.
colors_perf = (
    sorted_teams['performance_delta']
    .gt(0)
    .map({True: 'green', False: 'red'})
    .tolist()
)

fig, ax = plt.subplots(figsize=(10, 12))
ax.barh(
    sorted_teams['team'],
    sorted_teams['performance_delta'],
    color=colors_perf,
    edgecolor='black',
    alpha=0.7,
)
ax.axvline(0, color='black', linestyle='-', linewidth=1)

ax.set_title('Team Performance Efficiency', fontsize=14, fontweight='bold')
ax.set_xlabel('Average Position Change', fontweight='bold')
ax.set_ylabel('Team', fontweight='bold')
ax.grid(alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

print("\nTop 5 over-performing teams:")
top_teams = team_df[['team', 'performance_delta', 'avg_grid', 'avg_finish']].head(5)
print(top_teams.to_string(index=False))

In [None]:
# Team performance over time (major teams): one line per team showing its
# mean finishing position in each year it has finished rows.
major_teams = ['Mercedes', 'Red Bull Racing', 'Ferrari', 'McLaren', 'Alpine']
teams_in_data = df_finished['TeamName'].values
available_major = [name for name in major_teams if name in teams_in_data]

fig, ax = plt.subplots(figsize=(14, 7))

for team in available_major:
    team_rows = df_finished.loc[df_finished['TeamName'] == team]
    # groupby sorts the years ascending and only yields years with rows.
    yearly_avg_finish = team_rows.groupby('year')['Position'].mean()

    if yearly_avg_finish.empty:
        continue

    ax.plot(yearly_avg_finish.index, yearly_avg_finish.values,
            marker='o', linewidth=2, markersize=6, label=team)

# NOTE(review): the year range in the title is hard-coded and is not derived
# from the 'year' column -- confirm it matches the loaded data.
ax.set_title('Team Performance Evolution (2018-2024)', fontsize=14, fontweight='bold')
ax.set_xlabel('Year', fontweight='bold')
ax.set_ylabel('Average Finish Position', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
ax.invert_yaxis()  # Lower is better

plt.tight_layout()
plt.show()

## Statistical Significance Tests

In [None]:
# Comprehensive statistical testing.
# This cell starts the shared `results` list; each test cell appends one dict
# with the keys test / statistic / p_value / significant.
banner = "=" * 70
print(banner)
print("STATISTICAL SIGNIFICANCE TESTS")
print(banner)

results = []

# 1. Correlation tests: grid position vs finish position
print("\n1. CORRELATION TESTS")
print("-" * 70)

grid_positions = df_finished['GridPosition']
finish_positions = df_finished['Position']

# Linear association
pearson_r, pearson_p = pearsonr(grid_positions, finish_positions)
r_squared = pearson_r ** 2
print(f"\nPearson correlation:")
print(f"  r = {pearson_r:.4f}")
print(f"  p-value = {pearson_p:.6f}")
print(f"  R² = {r_squared:.4f} ({r_squared * 100:.1f}% variance explained)")

results.append(dict(
    test='Pearson Correlation',
    statistic=pearson_r,
    p_value=pearson_p,
    significant=pearson_p < 0.05,
))

# Rank-based association
spearman_r, spearman_p = spearmanr(grid_positions, finish_positions)
print(f"\nSpearman rank correlation:")
print(f"  ρ = {spearman_r:.4f}")
print(f"  p-value = {spearman_p:.6f}")

results.append(dict(
    test='Spearman Correlation',
    statistic=spearman_r,
    p_value=spearman_p,
    significant=spearman_p < 0.05,
))

In [None]:
# 2. ANOVA - do finish positions differ across grid positions?
print("\n2. ANOVA TEST")
print("-" * 70)

# One sample of finish positions per grid slot P1..P20; slots with no rows
# are left out.
grid_groups = []
for slot in range(1, 21):
    slot_finishes = df_finished.loc[df_finished['GridPosition'] == slot, 'Position'].values
    if len(slot_finishes) > 0:
        grid_groups.append(slot_finishes)

f_stat, anova_p = f_oneway(*grid_groups)
anova_significant = anova_p < 0.05

print(f"\nOne-way ANOVA (finish position by grid position):")
print(f"  F-statistic = {f_stat:.4f}")
print(f"  p-value = {anova_p:.6f}")

anova_verdict = (
    "  Result: Finish positions significantly differ across grid positions"
    if anova_significant
    else "  Result: No significant difference"
)
print(anova_verdict)

results.append(dict(
    test='ANOVA',
    statistic=f_stat,
    p_value=anova_p,
    significant=anova_p < 0.05,
))

In [None]:
# 3. Chi-square tests
print("\n3. CHI-SQUARE TESTS")
print("-" * 70)

# Pole position advantage: observed wins from P1 vs the count expected if the
# winner were drawn uniformly from 20 starters.
# NOTE(review): pole_data comes from df_finished, so pole-sitters without a
# finished row are not counted here -- confirm that is intended.
pole_data = df_finished[df_finished['GridPosition'] == 1]
n_pole = len(pole_data)
pole_wins = (pole_data['Position'] == 1).sum()
pole_non_wins = n_pole - pole_wins

# Expected if random (1/20 chance)
# NOTE(review): the chi-square approximation is usually considered reliable
# only when every expected count is at least about 5, i.e. roughly 100 or
# more pole starts here.
expected_wins = n_pole / 20
expected_non_wins = n_pole - expected_wins

chi2_pole, p_pole = stats.chisquare([pole_wins, pole_non_wins],
                                     [expected_wins, expected_non_wins])

print(f"\nPole position advantage vs random chance:")
print(f"  Observed wins: {pole_wins}")
print(f"  Expected wins (random): {expected_wins:.1f}")
print(f"  χ² = {chi2_pole:.4f}")
print(f"  p-value = {p_pole:.6f}")

# Always print a verdict, as the other test cells do; without the else branch
# a non-significant result would print nothing.
if p_pole < 0.05:
    print(f"  Result: Pole advantage is statistically significant")
else:
    print(f"  Result: No significant difference from random chance")

results.append({
    'test': 'Chi-Square (Pole)',
    'statistic': chi2_pole,
    'p_value': p_pole,
    'significant': p_pole < 0.05
})

In [None]:
# 4. Normality test on position change
print("\n4. NORMALITY TEST")
print("-" * 70)

# Compare position_change against a normal distribution whose mean and
# standard deviation are taken from the sample itself.
# NOTE(review): estimating the parameters from the same sample is generally
# understood to make the plain K-S p-value too lenient towards normality
# (the Lilliefors situation) -- confirm this test is the intended one.
changes = df_finished['position_change']
sample_mean = changes.mean()
sample_std = changes.std()

ks_stat, ks_p = kstest(changes, 'norm', args=(sample_mean, sample_std))
rejects_normality = ks_p < 0.05

print(f"\nKolmogorov-Smirnov test (position change normality):")
print(f"  KS-statistic = {ks_stat:.4f}")
print(f"  p-value = {ks_p:.6f}")

ks_verdict = (
    "  Result: Position change is NOT normally distributed"
    if rejects_normality
    else "  Result: Position change follows normal distribution"
)
print(ks_verdict)

results.append(dict(
    test='K-S Normality',
    statistic=ks_stat,
    p_value=ks_p,
    significant=ks_p < 0.05,
))

In [None]:
# 5. Variance equality test
print("\n5. VARIANCE EQUALITY TEST")
print("-" * 70)

# One sample of position_change per grid slot P1..P5. Unlike the ANOVA cell,
# slots with no rows are not filtered out here.
front_slots = range(1, 6)
variance_groups = [
    df_finished.loc[df_finished['GridPosition'] == slot, 'position_change'].values
    for slot in front_slots
]

lev_stat, lev_p = levene(*variance_groups)
variances_differ = lev_p < 0.05

print(f"\nLevene's test (variance equality across P1-P5):")
print(f"  Statistic = {lev_stat:.4f}")
print(f"  p-value = {lev_p:.6f}")

levene_verdict = (
    "  Result: Variances are NOT equal across grid positions"
    if variances_differ
    else "  Result: Variances are equal"
)
print(levene_verdict)

results.append(dict(
    test="Levene's Test",
    statistic=lev_stat,
    p_value=lev_p,
    significant=lev_p < 0.05,
))

In [None]:
# Summary table: one row per dict collected in `results` by the test cells.
banner = "=" * 70
print("\n" + banner)
print("STATISTICAL TEST SUMMARY")
print(banner + "\n")

results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

# Number of tests whose p-value fell below 0.05, out of all tests run.
n_significant = results_df['significant'].sum()
n_tests = len(results_df)
print(f"\nSignificant results: {n_significant}/{n_tests}")