In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime, date
import time
import logging
import json
from typing import Dict, Optional, Tuple
import warnings
# Suppress every warning category for the whole process.
# NOTE(review): this also hides deprecation warnings raised by the imported
# libraries — confirm that a blanket 'ignore' is really intended.
warnings.filterwarnings('ignore')

# Set up logging: INFO and above, formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the scraper class and main() below.
logger = logging.getLogger(__name__)

class F1DriverAgeScraper:
    def __init__(self):
        """Create the shared HTTP session and the empty per-driver date caches.

        The session sends the browser-like headers below with every request.
        ``driver_birth_dates`` and ``driver_death_dates`` map a driver id to a
        ``date`` and are filled on demand by ``get_driver_dates``.
        """
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
        }
        http_session = requests.Session()
        http_session.headers.update(browser_headers)

        self.headers = browser_headers
        self.session = http_session
        # Lookup caches keyed by driver id.
        self.driver_birth_dates = {}
        self.driver_death_dates = {}
        
    def normalize_driver_name(self, name: str) -> str:
        """Return a lower-case, punctuation-free, single-spaced driver name.

        Word characters, whitespace and hyphens are kept; every other
        character is dropped.  Runs of whitespace collapse to one space and
        leading/trailing whitespace is removed.
        """
        lowered = name.lower()
        without_punctuation = re.sub(r'[^\w\s-]', '', lowered)
        single_spaced = re.sub(r'\s+', ' ', without_punctuation)
        return single_spaced.strip()
    
    def get_driver_info_wikipedia(self, driver_name: str) -> Tuple[Optional[date], Optional[date]]:
        """Try to get birth and death dates from a driver's Wikipedia infobox.

        Candidate article URLs are built from ``driver_name`` (each word
        capitalised, joined with underscores): first the plain title, then the
        ``_(racing_driver)`` and ``_(race_car_driver)`` variants.  The first
        page whose infobox yields a birth date wins.

        Args:
            driver_name: Human-readable driver name, e.g. ``"Ayrton Senna"``.

        Returns:
            ``(birth_date, death_date)``.  ``death_date`` is ``None`` when the
            page shows no death date; both are ``None`` when no candidate page
            yields a birth date or when an error occurs.
        """
        def parse_iso_date(text) -> Optional[date]:
            """Parse text that is exactly a YYYY-MM-DD date, optionally in parentheses."""
            found = re.fullmatch(r'\(?(\d{4}-\d{2}-\d{2})\)?', (text or '').strip())
            if not found:
                return None
            try:
                # Parse only the matched date, never the surrounding text.
                return datetime.strptime(found.group(1), '%Y-%m-%d').date()
            except ValueError:
                # Right shape but not a real calendar date (e.g. month 13).
                return None

        try:
            # Format name for Wikipedia URL (capitalize each word, replace spaces with underscores)
            wiki_name = '_'.join(word.capitalize() for word in driver_name.split())

            # Try different URL patterns
            urls_to_try = [
                f"https://en.wikipedia.org/wiki/{wiki_name}",
                f"https://en.wikipedia.org/wiki/{wiki_name}_(racing_driver)",
                f"https://en.wikipedia.org/wiki/{wiki_name}_(race_car_driver)",
            ]

            for url in urls_to_try:
                try:
                    response = self.session.get(url, timeout=10)
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.content, 'html.parser')

                        birth_date = None
                        death_date = None

                        # Look for birth/death dates in infobox
                        infobox = soup.find('table', {'class': 'infobox'})
                        if infobox:
                            bday_span = infobox.find('span', {'class': 'bday'})
                            if bday_span:
                                birth_date = parse_iso_date(bday_span.text)

                            death_span = infobox.find('span', {'class': 'dday'})
                            if death_span:
                                death_date = parse_iso_date(death_span.text)
                            elif birth_date:
                                # Fallback: a hidden span holding a date later than
                                # the birth date is taken to be the death date.
                                # NOTE(review): assumes the death-date template renders
                                # its machine-readable date as hidden text such as
                                # "(1994-05-01)" — confirm against live markup.
                                for span in infobox.find_all('span'):
                                    style = (span.get('style') or '').replace(' ', '')
                                    if 'display:none' not in style:
                                        continue
                                    # A hidden wrapper around a "bday" span carries a
                                    # birth/start date, never a death date.
                                    if span.find('span', {'class': 'bday'}):
                                        continue
                                    candidate = parse_iso_date(span.text)
                                    if candidate and candidate > birth_date:
                                        death_date = candidate
                                        break

                        if birth_date:
                            return birth_date, death_date
                except Exception as e:
                    # Best effort: record why this candidate failed, then try the next.
                    logger.debug(f"Wikipedia lookup failed for {url}: {str(e)}")

                time.sleep(0.5)  # Be respectful to Wikipedia, also after a failure

            return None, None
        except Exception as e:
            logger.debug(f"Wikipedia error for {driver_name}: {str(e)}")
            return None, None
    
    def get_driver_info_statsf1(self, driver_name: str) -> Tuple[Optional[date], Optional[date]]:
        """Look up a driver's birth and death dates on StatsF1.com.

        The page URL is derived from ``driver_name`` (lower-cased, spaces
        turned into hyphens, apostrophes removed).  Rows of the ``infos``
        table are scanned for a birth label and a death label, and a
        day/month/year date is read from the neighbouring cell.

        Returns:
            ``(birth_date, death_date)``; either may be ``None``.  Both are
            ``None`` when the page does not answer with HTTP 200 or when any
            error occurs.
        """
        def extract_date(cell_text):
            # Pull a D/M/YYYY or D-M-YYYY date out of the cell text, if any.
            found = re.search(r'(\d{1,2})[/-](\d{1,2})[/-](\d{4})', cell_text)
            if found is None:
                return None
            day, month, year = found.groups()
            return date(int(year), int(month), int(day))

        try:
            # Format name for StatsF1 URL
            formatted_name = driver_name.lower().replace(' ', '-').replace("'", "")
            url = f"https://www.statsf1.com/en/pilote/{formatted_name}.aspx"

            response = self.session.get(url, timeout=10)
            if response.status_code != 200:
                time.sleep(0.5)
                return None, None

            soup = BeautifulSoup(response.content, 'html.parser')
            born = None
            died = None

            # Dates live in the driver info table, one labelled row each.
            info_table = soup.find('table', {'class': 'infos'})
            table_rows = info_table.find_all('tr') if info_table else []
            for table_row in table_rows:
                cells = table_row.find_all('td')
                if len(cells) < 2:
                    continue

                label = cells[0].text.strip().lower()
                if 'birth' in label or 'né' in label:
                    parsed = extract_date(cells[1].text.strip())
                    # A row without a parsable date keeps any earlier value.
                    if parsed is not None:
                        born = parsed
                elif 'death' in label or 'mort' in label or 'décès' in label:
                    parsed = extract_date(cells[1].text.strip())
                    if parsed is not None:
                        died = parsed

            return born, died
        except Exception as e:
            logger.debug(f"StatsF1 error for {driver_name}: {str(e)}")
            return None, None
    
    def load_known_driver_data(self) -> Tuple[Dict[str, date], Dict[str, date]]:
        """Load dictionaries of known F1 driver birth and death dates.

        Both dictionaries are keyed by a lower-case, hyphen-separated driver
        id such as ``'lewis-hamilton'``.  They are built fresh on every call,
        so callers may mutate the result freely.

        Returns:
            ``(known_births, known_deaths)``.  The death table also contains
            drivers that have no entry in the birth table.
        """
        # Known birth dates
        known_births = {
            'michael-schumacher': date(1969, 1, 3),
            'lewis-hamilton': date(1985, 1, 7),
            'sebastian-vettel': date(1987, 7, 3),
            'fernando-alonso': date(1981, 7, 29),
            'kimi-raikkonen': date(1979, 10, 17),
            'max-verstappen': date(1997, 9, 30),
            'charles-leclerc': date(1997, 10, 16),
            'carlos-sainz-jr': date(1994, 9, 1),
            'daniel-ricciardo': date(1989, 7, 1),
            'lando-norris': date(1999, 11, 13),
            'sergio-perez': date(1990, 1, 26),
            'valtteri-bottas': date(1989, 8, 28),
            'pierre-gasly': date(1996, 2, 7),
            'esteban-ocon': date(1996, 9, 17),
            'lance-stroll': date(1998, 10, 29),
            'george-russell': date(1998, 2, 15),
            'nicholas-latifi': date(1995, 6, 29),
            'alexander-albon': date(1996, 3, 23),
            'antonio-giovinazzi': date(1993, 12, 14),
            'kevin-magnussen': date(1992, 10, 5),
            'romain-grosjean': date(1986, 4, 17),
            'nico-hulkenberg': date(1987, 8, 19),
            'daniil-kvyat': date(1994, 4, 26),
            'robert-kubica': date(1984, 12, 7),
            'felipe-massa': date(1981, 4, 25),
            'nico-rosberg': date(1985, 6, 27),
            'jenson-button': date(1980, 1, 19),
            'mark-webber': date(1976, 8, 27),
            'rubens-barrichello': date(1972, 5, 23),
            'david-coulthard': date(1971, 3, 27),
            'mika-hakkinen': date(1968, 9, 28),
            'jacques-villeneuve': date(1971, 4, 9),
            'damon-hill': date(1960, 9, 17),
            'nigel-mansell': date(1953, 8, 8),
            'alain-prost': date(1955, 2, 24),
            'ayrton-senna': date(1960, 3, 21),
            'nelson-piquet': date(1952, 8, 17),
            'niki-lauda': date(1949, 2, 22),
            'james-hunt': date(1947, 8, 29),
            'mario-andretti': date(1940, 2, 28),
            'jackie-stewart': date(1939, 6, 11),
            'graham-hill': date(1929, 2, 15),
            'jim-clark': date(1936, 3, 4),
            'stirling-moss': date(1929, 9, 17),
            'juan-manuel-fangio': date(1911, 6, 24),
            'alberto-ascari': date(1918, 7, 13),
            'pastor-maldonado': date(1985, 3, 9),
            'paul-di-resta': date(1986, 4, 16),
            'adrian-sutil': date(1983, 1, 11),
            'kamui-kobayashi': date(1986, 9, 13),
            'jean-eric-vergne': date(1990, 4, 25),
            'jules-bianchi': date(1989, 8, 3),
            'will-stevens': date(1991, 6, 28),
            'roberto-merhi': date(1991, 3, 22),
            'marcus-ericsson': date(1990, 9, 2),
            'pascal-wehrlein': date(1994, 10, 18),
            'stoffel-vandoorne': date(1992, 3, 26),
            'jolyon-palmer': date(1991, 1, 20),
            'rio-haryanto': date(1993, 1, 22),
            'nikita-mazepin': date(1999, 3, 2),
            'mick-schumacher': date(1999, 3, 22),
            'yuki-tsunoda': date(2000, 5, 11),
            'zhou-guanyu': date(1999, 5, 30),
            'nyck-de-vries': date(1995, 2, 6),
            'logan-sargeant': date(2000, 12, 31),
            'oscar-piastri': date(2001, 4, 6),
            'liam-lawson': date(2002, 2, 11),
        }

        # Known death dates for deceased drivers
        known_deaths = {
            'ayrton-senna': date(1994, 5, 1),
            'jules-bianchi': date(2015, 7, 17),
            'niki-lauda': date(2019, 5, 20),
            'james-hunt': date(1993, 6, 15),
            'graham-hill': date(1975, 11, 29),
            'jim-clark': date(1968, 4, 7),
            'stirling-moss': date(2020, 4, 12),
            'juan-manuel-fangio': date(1995, 7, 17),
            'alberto-ascari': date(1955, 5, 26),
            'gilles-villeneuve': date(1982, 5, 8),
            'ronnie-peterson': date(1978, 9, 11),
            'francois-cevert': date(1973, 10, 6),
            'jochen-rindt': date(1970, 9, 5),
            'lorenzo-bandini': date(1967, 5, 10),
            'wolfgang-von-trips': date(1961, 9, 10),
            'peter-collins': date(1958, 8, 3),
            'mike-hawthorn': date(1959, 1, 22),
            'luigi-musso': date(1958, 7, 6),
            'stuart-lewis-evans': date(1958, 10, 25),
            'eugenio-castellotti': date(1957, 3, 14),
            'tom-pryce': date(1977, 3, 5),
            'roger-williamson': date(1973, 7, 29),
            'piers-courage': date(1970, 6, 21),
            'jo-siffert': date(1971, 10, 24),
            'pedro-rodriguez': date(1971, 7, 11),
            'bruce-mclaren': date(1970, 6, 2),
            # Died during the 1968 French Grand Prix, held on 7 July.
            'jo-schlesser': date(1968, 7, 7),
            'ludovico-scarfiotti': date(1968, 6, 8),
            'chris-bristow': date(1960, 6, 19),
            'alan-stacey': date(1960, 6, 19),
            'harry-schell': date(1960, 5, 13),
            'jean-behra': date(1959, 8, 1),
            'pat-oconnor': date(1958, 5, 30),
            'bill-vukovich': date(1955, 5, 30),
            'onofre-marimon': date(1954, 7, 31),
        }

        return known_births, known_deaths
    
    def get_driver_dates(self, driver_id: str, driver_name: str = None) -> Tuple[Optional[date], Optional[date]]:
        """Get birth and death dates for a driver using multiple sources"""
        # Check cache first
        birth_date = self.driver_birth_dates.get(driver_id)
        death_date = self.driver_death_dates.get(driver_id)
        
        if birth_date:
            return birth_date, death_date
        
        # Check known dates
        known_births, known_deaths = self.load_known_driver_data()
        
        if driver_id in known_births:
            self.driver_birth_dates[driver_id] = known_births[driver_id]
            birth_date = known_births[driver_id]
            
            if driver_id in known_deaths:
                self.driver_death_dates[driver_id] = known_deaths[driver_id]
                death_date = known_deaths[driver_id]
            
            return birth_date, death_date
        
        # If we have a driver name, try to scrape
        if driver_name:
            # Try Wikipedia first
            birth_date, death_date = self.get_driver_info_wikipedia(driver_name)
            if birth_date:
                self.driver_birth_dates[driver_id] = birth_date
                if death_date:
                    self.driver_death_dates[driver_id] = death_date
                logger.info(f"Found dates for {driver_name}: Born {birth_date}, Died {death_date if death_date else 'Still alive'}")
                return birth_date, death_date
            
            # Try StatsF1
            birth_date, death_date = self.get_driver_info_statsf1(driver_name)
            if birth_date:
                self.driver_birth_dates[driver_id] = birth_date
                if death_date:
                    self.driver_death_dates[driver_id] = death_date
                logger.info(f"Found dates for {driver_name}: Born {birth_date}, Died {death_date if death_date else 'Still alive'}")
                return birth_date, death_date
        
        return None, None
    
    def calculate_age(self, birth_date: date, reference_date: date) -> Optional[int]:
        """Return the number of completed years from ``birth_date`` to ``reference_date``.

        Returns ``None`` when either argument is missing.
        """
        if birth_date and reference_date:
            year_difference = reference_date.year - birth_date.year
            # The birthday has not yet come round in the reference year when
            # (month, day) still sorts before the birth (month, day).
            birthday_pending = (
                (reference_date.month, reference_date.day)
                < (birth_date.month, birth_date.day)
            )
            return year_difference - 1 if birthday_pending else year_difference
        return None
    
    def process_race_data(self, race_file: str, output_file: str):
        """Process race data and add age information.

        Reads ``race_file`` as CSV, adds an ``age_during_race`` column and
        writes the result to ``output_file``.  The race date is approximated
        as the 15th of the month whose number equals the ``round`` value
        (clamped to 1..12) in the row's ``year``.

        Rows with a missing ``driverId`` or ``year``, an unparsable
        year/round, or a driver without a known birth date get an empty age
        instead of aborting the run.

        Args:
            race_file: Path of the input CSV; the ``driverId``, ``year`` and
                ``round`` columns are read.
            output_file: Path of the CSV to write.

        Returns:
            The DataFrame that was written.

        Raises:
            Exception: Any error from reading, processing or writing is
                logged and re-raised unchanged.
        """
        logger.info(f"Processing race data from {race_file}")

        def approximate_race_date(year, round_num):
            """Return the mid-month date estimated for a round, or None if unusable."""
            if pd.isna(year):
                return None
            try:
                year_number = int(year)
                # A missing round falls back to 1, like a missing column does.
                month = 1 if pd.isna(round_num) else int(round_num)
                # Rounds beyond 12 map to December; rounds below 1 map to January.
                return date(year_number, max(1, min(12, month)), 15)
            except (TypeError, ValueError):
                # Non-numeric cell, or a year outside the range date supports.
                return None

        try:
            # Read the race data
            df_races = pd.read_csv(race_file)
            total_records = len(df_races)
            logger.info(f"Loaded {total_records} race records")

            ages = []
            # enumerate() gives the row position, so progress logging does not
            # depend on the frame's index labels.
            for position, (_, row) in enumerate(df_races.iterrows()):
                if position % 100 == 0:
                    logger.info(f"Processing race record {position}/{total_records}")

                age = None
                driver_id = row.get('driverId', '')
                # pd.isna() filters NaN cells, which are truthy floats.
                if not pd.isna(driver_id) and driver_id:
                    race_date = approximate_race_date(row.get('year', None), row.get('round', 1))
                    if race_date is not None:
                        birth_date, _ = self.get_driver_dates(driver_id)
                        # calculate_age returns None for an unknown birth date.
                        age = self.calculate_age(birth_date, race_date)
                ages.append(age)

            df_races['age_during_race'] = ages

            # Save to new CSV
            df_races.to_csv(output_file, index=False)
            logger.info(f"Saved race data with ages to {output_file}")

            # Print statistics
            valid_ages = [a for a in ages if a is not None]
            logger.info(f"Successfully calculated ages for {len(valid_ages)}/{len(df_races)} records")
            if valid_ages:
                logger.info(f"Age range: {min(valid_ages)} to {max(valid_ages)} years")

            return df_races

        except Exception as e:
            logger.error(f"Error processing race data: {str(e)}")
            raise
    
    def process_driver_data(self, driver_file: str, driver_season_file: str,
                           output_driver_file: str, output_season_file: str,
                           active_since_year: int = 2024):
        """Process driver data and add DOB, DOD, current age and F1 entry/retirement ages.

        Driver file: adds ``date_of_birth``, ``date_of_death``,
        ``current_age``, ``age_f1_entry`` and ``age_retirement``.
        Season file: adds ``age_during_season`` (age on 1 July of the season
        year) and ``current_age``.

        ``current_age`` is the age today for a driver without a death date
        and the age at death otherwise.  Entry age is taken on 1 March of the
        driver's first season.  Retirement age is taken on 30 November of the
        last season, and only for drivers whose last season is before
        ``active_since_year``.

        Args:
            driver_file: Input CSV with one row per driver.
            driver_season_file: Input CSV with one row per driver and season.
            output_driver_file: Path of the enriched driver CSV to write.
            output_season_file: Path of the enriched season CSV to write.
            active_since_year: A driver whose last season is this year or
                later counts as still active and gets no retirement age.

        Returns:
            ``(df_drivers, df_seasons)`` as written to disk.

        Raises:
            Exception: Any error from reading, processing or writing is
                logged and re-raised unchanged.
        """
        logger.info(f"Processing driver data from {driver_file} and {driver_season_file}")

        def has_value(cell):
            """True for a cell that is neither missing (None/NaN) nor falsy."""
            return not pd.isna(cell) and bool(cell)

        def age_now_or_at_death(birth_date, death_date):
            """Age today for the living, age at death otherwise."""
            return self.calculate_age(birth_date, death_date or date.today())

        try:
            # Read the data
            df_drivers = pd.read_csv(driver_file)
            df_seasons = pd.read_csv(driver_season_file)

            logger.info(f"Loaded {len(df_drivers)} drivers and {len(df_seasons)} season records")

            # First and last season per driver, computed once instead of
            # re-filtering the whole season table for every driver.
            season_span = df_seasons.groupby('driverId')['year'].agg(['min', 'max'])

            # Process driver dataset
            dobs = []
            dods = []
            current_ages = []
            f1_entry_ages = []
            retirement_ages = []

            total_drivers = len(df_drivers)
            for position, (_, row) in enumerate(df_drivers.iterrows()):
                if position % 50 == 0:
                    logger.info(f"Processing driver {position}/{total_drivers}")

                # NOTE(review): the driver file is read through 'driverid'
                # (lower-case) while the season file uses 'driverId' — confirm
                # both spellings match the real CSV headers.
                driver_id = row.get('driverid', '')
                driver_name = row.get('name', '')

                dob = dod = current_age = entry_age = retirement_age = None
                birth_date = death_date = None

                if has_value(driver_id):
                    # A missing name cell is NaN; pass None so no web lookup is tried.
                    lookup_name = driver_name if isinstance(driver_name, str) else None
                    birth_date, death_date = self.get_driver_dates(driver_id, lookup_name)

                if birth_date:
                    # Format dates for CSV
                    dob = birth_date.strftime('%Y-%m-%d')
                    dod = death_date.strftime('%Y-%m-%d') if death_date else None
                    current_age = age_now_or_at_death(birth_date, death_date)

                    if driver_id in season_span.index:
                        first_year = season_span.at[driver_id, 'min']
                        last_year = season_span.at[driver_id, 'max']

                        if not pd.isna(first_year):
                            # Approximate season start
                            entry_age = self.calculate_age(birth_date, date(int(first_year), 3, 1))

                        if not pd.isna(last_year) and last_year < active_since_year:
                            # Approximate season end
                            retirement_age = self.calculate_age(birth_date, date(int(last_year), 11, 30))

                dobs.append(dob)
                dods.append(dod)
                current_ages.append(current_age)
                f1_entry_ages.append(entry_age)
                retirement_ages.append(retirement_age)

            # Add columns to driver dataset
            df_drivers['date_of_birth'] = dobs
            df_drivers['date_of_death'] = dods
            df_drivers['current_age'] = current_ages
            df_drivers['age_f1_entry'] = f1_entry_ages
            df_drivers['age_retirement'] = retirement_ages

            # Save driver data
            df_drivers.to_csv(output_driver_file, index=False)
            logger.info(f"Saved driver data with ages to {output_driver_file}")

            # Process driver season data - add current age and age during season
            season_ages = []
            current_ages_season = []

            total_seasons = len(df_seasons)
            for position, (_, row) in enumerate(df_seasons.iterrows()):
                if position % 100 == 0:
                    logger.info(f"Processing season record {position}/{total_seasons}")

                driver_id = row.get('driverId', '')
                year = row.get('year', None)

                season_age = current_age = None
                if has_value(driver_id) and has_value(year):
                    # No name is passed here, so only cached and built-in
                    # dates are used and no web request is made.
                    birth_date, death_date = self.get_driver_dates(driver_id)
                    if birth_date:
                        # Age at mid-season
                        season_age = self.calculate_age(birth_date, date(int(year), 7, 1))
                        current_age = age_now_or_at_death(birth_date, death_date)

                season_ages.append(season_age)
                current_ages_season.append(current_age)

            df_seasons['age_during_season'] = season_ages
            df_seasons['current_age'] = current_ages_season

            # Save season data
            df_seasons.to_csv(output_season_file, index=False)
            logger.info(f"Saved season data with ages to {output_season_file}")

            # Print statistics
            valid_dobs = [d for d in dobs if d is not None]
            valid_dods = [d for d in dods if d is not None]
            valid_current = [a for a in current_ages if a is not None]
            valid_entry = [a for a in f1_entry_ages if a is not None]
            valid_retirement = [a for a in retirement_ages if a is not None]

            logger.info(f"\nStatistics:")
            logger.info(f"Birth dates found: {len(valid_dobs)}/{len(df_drivers)}")
            logger.info(f"Death dates found: {len(valid_dods)}/{len(df_drivers)}")
            logger.info(f"Current ages calculated: {len(valid_current)}/{len(df_drivers)}")
            logger.info(f"F1 entry ages calculated: {len(valid_entry)}/{len(df_drivers)}")
            logger.info(f"Retirement ages calculated: {len(valid_retirement)}/{len(df_drivers)}")

            if valid_entry:
                logger.info(f"F1 entry age range: {min(valid_entry)} to {max(valid_entry)} years")
            if valid_retirement:
                logger.info(f"Retirement age range: {min(valid_retirement)} to {max(valid_retirement)} years")

            return df_drivers, df_seasons

        except Exception as e:
            logger.error(f"Error processing driver data: {str(e)}")
            raise

# Main execution
def main():
    """Run the whole pipeline: enrich the race, driver and season CSV files with ages.

    Input and output file names are fixed and resolved against the current
    working directory.  Any failure is logged and then re-raised.
    """
    scraper = F1DriverAgeScraper()

    race_job = {
        'race_file': 'Race Details.csv',
        'output_file': 'Race_Details_with_Ages.csv',
    }
    driver_job = {
        'driver_file': 'Driver Dataset.csv',
        'driver_season_file': 'Driver_Season.csv',
        'output_driver_file': 'Driver_Dataset_with_Ages.csv',
        'output_season_file': 'Driver_Season_with_Ages.csv',
    }
    completion_messages = (
        "✅ Successfully completed all data processing!",
        "Output files created:",
        "  - Race_Details_with_Ages.csv",
        "  - Driver_Dataset_with_Ages.csv (with DOB and DOD)",
        "  - Driver_Season_with_Ages.csv (with current age)",
    )

    try:
        logger.info("Starting F1 driver age data collection...")

        # Race details first, then the driver and season tables.
        scraper.process_race_data(**race_job)
        scraper.process_driver_data(**driver_job)

        for message in completion_messages:
            logger.info(message)

    except Exception as e:
        logger.error(f"❌ Failed to process data: {str(e)}")
        raise

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
    

2025-11-03 23:37:44,366 - INFO - Starting F1 driver age data collection...
2025-11-03 23:37:44,367 - INFO - Processing race data from Race Details.csv
2025-11-03 23:37:44,376 - INFO - Loaded 7909 race records
2025-11-03 23:37:44,378 - INFO - Processing race record 0/7909
2025-11-03 23:37:44,382 - INFO - Processing race record 100/7909
2025-11-03 23:37:44,384 - INFO - Processing race record 200/7909
2025-11-03 23:37:44,386 - INFO - Processing race record 300/7909
2025-11-03 23:37:44,388 - INFO - Processing race record 400/7909
2025-11-03 23:37:44,390 - INFO - Processing race record 500/7909
2025-11-03 23:37:44,393 - INFO - Processing race record 600/7909
2025-11-03 23:37:44,395 - INFO - Processing race record 700/7909
2025-11-03 23:37:44,397 - INFO - Processing race record 800/7909
2025-11-03 23:37:44,399 - INFO - Processing race record 900/7909
2025-11-03 23:37:44,402 - INFO - Processing race record 1000/7909
2025-11-03 23:37:44,404 - INFO - Processing race record 1100/7909
2025-11-03 