# Motorsport.com F1 News Crawler

This notebook crawls F1 news articles from motorsport.com based on specific filters (Year, Team, Content Type). It was refactored to take a direct list of pre-filtered listing URLs and to use improved extraction logic.

In [9]:
import os
import time
import traceback
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options

In [10]:
# Config
SOURCE = "motorsport_com"                       # Short source tag embedded in the output folder name
OUTPUT_DIR = f"(ENG)F1_{SOURCE}"                # Folder (relative to the working directory) for per-team text files
BASE_URL = "https://www.motorsport.com/f1/news/"

# Ensure the output directory exists. exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of a separate existence test;
# it still raises if a regular file already occupies the path.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Crawler Configuration
# CRAWL_LIST: List of (TeamName, Year, URL)
# Each URL is a motorsport.com news listing pre-filtered by championship
# (season) and team id. Entries are crawled in the order listed here.
# TeamName is resolved to an output file name through TEAMS_MAPPING.
CRAWL_LIST = [
    ("Sauber", "2020", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=2871&filters%5Bteam%5D%5B%5D=17888"),
    ("Alpine", "2022", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4021&filters%5Bteam%5D%5B%5D=20186"),
    ("Aston Martin", "2022", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4021&filters%5Bteam%5D%5B%5D=97"),
    ("Ferrari", "2022", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4021&filters%5Bteam%5D%5B%5D=3"),
    ("Scuderia Ferrari", "2022", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4021&filters%5Bteam%5D%5B%5D=40043"),
    ("HAAS", "2022", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4021&filters%5Bteam%5D%5B%5D=471"),
    ("Sauber", "2022", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4021&filters%5Bteam%5D%5B%5D=9"),
    ("McLaren", "2022", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4021&filters%5Bteam%5D%5B%5D=10"),
    ("Mercedes", "2022", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4021&filters%5Bteam%5D%5B%5D=5"),
    ("Red Bull Racing", "2022", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4021&filters%5Bteam%5D%5B%5D=4"),
    ("Racing Bulls", "2022", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4021&filters%5Bteam%5D%5B%5D=12"),
    ("Alfa Romeo", "2022", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4021&filters%5Bteam%5D%5B%5D=17888"),
    ("Williams", "2022", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4021&filters%5Bteam%5D%5B%5D=13"),

    ("Alpine", "2023", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4446&filters%5Bteam%5D%5B%5D=20186"),
    ("Aston Martin", "2023", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4446&filters%5Bteam%5D%5B%5D=97"),
    ("Ferrari", "2023", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4446&filters%5Bteam%5D%5B%5D=3"),
    ("Scuderia Ferrari", "2023", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4446&filters%5Bteam%5D%5B%5D=40043"),
    ("HAAS", "2023", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4446&filters%5Bteam%5D%5B%5D=471"),
    ("Kick Sauber", "2023", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4446&filters%5Bteam%5D%5B%5D=9"),
    ("McLaren", "2023", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4446&filters%5Bteam%5D%5B%5D=10"),
    ("Mercedes", "2023", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4446&filters%5Bteam%5D%5B%5D=5"),
    ("Red Bull Racing", "2023", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4446&filters%5Bteam%5D%5B%5D=4"),
    ("Alfa Romeo", "2023", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4446&filters%5Bteam%5D%5B%5D=17888"),
    ("Racing Bulls", "2023", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4446&filters%5Bteam%5D%5B%5D=12"),
    ("Williams", "2023", "https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=4446&filters%5Bteam%5D%5B%5D=13"),

    ("Alpine", "2024", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4660&filters%5Bteam%5D%5B%5D=20186"),
    ("Aston Martin", "2024", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4660&filters%5Bteam%5D%5B%5D=97"),
    ("Ferrari", "2024", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4660&filters%5Bteam%5D%5B%5D=3"),
    ("Scuderia Ferrari", "2024", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4660&filters%5Bteam%5D%5B%5D=40043"),
    ("HAAS", "2024", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4660&filters%5Bteam%5D%5B%5D=471"),
    ("Kick Sauber", "2024", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4660&filters%5Bteam%5D%5B%5D=9"),
    ("McLaren", "2024", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4660&filters%5Bteam%5D%5B%5D=10"),
    ("Mercedes", "2024", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4660&filters%5Bteam%5D%5B%5D=5"),
    ("Red Bull Racing", "2024", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4660&filters%5Bteam%5D%5B%5D=4"),
    ("RB Racing", "2024", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4660&filters%5Bteam%5D%5B%5D=5255"),
    ("Racing Bulls", "2024", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4660&filters%5Bteam%5D%5B%5D=12"),
    # Williams uses team id 13 in every other season; this entry must be the
    # filtered news listing (not a single live-text page) so that the crawler
    # only collects Williams articles for the 2024 championship.
    ("Williams", "2024", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4660&filters%5Bteam%5D%5B%5D=13"),

    ("Alpine", "2025", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4855&filters%5Bteam%5D%5B%5D=20186"),
    ("Aston Martin", "2025", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4855&filters%5Bteam%5D%5B%5D=97"),
    ("Ferrari", "2025", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4855&filters%5Bteam%5D%5B%5D=3"),
    ("Scuderia Ferrari", "2025", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4855&filters%5Bteam%5D%5B%5D=40043"),
    ("HAAS", "2025", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4855&filters%5Bteam%5D%5B%5D=471"),
    ("Kick Sauber", "2025", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4855&filters%5Bteam%5D%5B%5D=9"),
    ("McLaren", "2025", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4855&filters%5Bteam%5D%5B%5D=10"),
    ("Mercedes", "2025", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4855&filters%5Bteam%5D%5B%5D=5"),
    ("Red Bull Racing", "2025", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4855&filters%5Bteam%5D%5B%5D=4"),
    ("Alfa Romeo", "2025", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4855&filters%5Bteam%5D%5B%5D=17888"),
    ("Racing Bulls", "2025", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4855&filters%5Bteam%5D%5B%5D=12"),
    ("Williams", "2025", "https://www.motorsport.com/f1/news/?filters%5Bchampionship%5D=4855&filters%5Bteam%5D%5B%5D=13")
]

# Team Mapping for filename resolution (Team Name -> Filename Key)
# Several historical / alternative team names share one key so that their
# articles are appended to the same "<key>.txt" file. Names missing from this
# mapping fall back to the team name with spaces replaced by underscores.
TEAMS_MAPPING = {
    "Mercedes": "Mercedes",
    "Red Bull Racing": "Red_Bull",
    # NOTE(review): "RB Racing" is crawled with its own team id, distinct from
    # "Red Bull Racing" and "Racing Bulls" -- confirm it really belongs in the
    # Red_Bull file rather than the RB file.
    "RB Racing": "Red_Bull",
    "Red Bull": "Red_Bull",
    "Scuderia Ferrari": "Ferrari",
    "Ferrari": "Ferrari",
    "McLaren": "McLaren",
    "Alpine": "Alpine",
    "Renault": "Alpine",
    "Lotus": "Alpine",
    "Toleman": "Alpine",
    "Aston Martin": "Aston_Martin",
    "Racing Point": "Aston_Martin",
    "Williams": "Williams",
    "Haas F1 Team": "Haas",
    "Haas": "Haas",
    # The crawl list spells the team "HAAS"; without this alias those articles
    # would be written to a separate "HAAS.txt" instead of "Haas.txt".
    "HAAS": "Haas",
    "Alfa Romeo": "Sauber",
    "Kick Sauber": "Sauber",
    "Sauber": "Sauber",
    "AlphaTauri": "RB",
    "Visa Cash App RB": "RB",
    "Racing Bulls": "RB",
    "RB": "RB"
}

In [11]:
def setup_driver(headless=False, page_load_timeout=120):
    """
    Create a Chrome WebDriver configured for crawling.

    Args:
        headless: When True, Chrome is started with ``--headless`` (no visible
            window). Defaults to False, which keeps the visible-browser
            behaviour.
        page_load_timeout: Maximum number of seconds a page load may take
            before Selenium raises a timeout. Defaults to 120.

    Returns:
        The configured ``webdriver.Chrome`` instance. The caller is
        responsible for calling ``quit()`` on it.
    """
    options = Options()
    if headless:
        options.add_argument("--headless")
    for flag in (
        "--start-maximized",
        "--disable-notifications",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        options.add_argument(flag)

    # IMPORTANT: Eager strategy waits for DOMContentLoaded but not all images/stylesheets
    # This prevents the 'Read timed out' errors when ads/trackers take too long.
    options.page_load_strategy = 'eager'

    driver = webdriver.Chrome(options=options)
    driver.set_page_load_timeout(page_load_timeout)
    return driver

def handle_popups(driver):
    """
    Best-effort dismissal of cookie banners and ad/modal overlays.

    Looks for visible consent buttons (text containing 'Accept all' or
    'Agree') and for buttons whose class contains 'close', and clicks each
    one. Failures are never raised: a button that cannot be inspected or
    clicked is skipped so the remaining buttons still get a chance.

    Args:
        driver: Selenium WebDriver positioned on the page to clean up.

    Returns:
        None.
    """
    # (XPath, log message, seconds to pause after a successful click)
    # Example selectors - adjust based on actual site structure
    # (TrustArc, OneTrust, or custom buttons).
    popup_targets = (
        ("//button[contains(text(), 'Accept all') or contains(text(), 'Agree')]",
         "INFO: Cookie popup closed.", 1),
        ("//button[contains(@class, 'close')]",
         "INFO: Ad/Modal closed.", 0),
    )

    for xpath, message, pause in popup_targets:
        try:
            buttons = driver.find_elements(By.XPATH, xpath)
        except Exception:
            # Lookup itself failed; nothing to click for this group.
            continue

        for btn in buttons:
            try:
                if btn.is_displayed():
                    btn.click()
                    print(message)
                    if pause:
                        time.sleep(pause)
            except Exception:
                # Clicking one popup can detach or cover another; skip the
                # failing button instead of abandoning the rest of the group.
                continue

In [12]:
def extract_article_body(driver):
    """
    Return the current page's article text with links and widgets stripped.

    Waits up to 10 seconds for a <p> element, then runs a JavaScript snippet
    in the page. The snippet picks the first candidate container that holds
    paragraphs, removes <a>, script, style, iframe and widget nodes from a
    clone of each paragraph, and keeps paragraphs whose cleaned text is longer
    than 20 characters.

    Args:
        driver: Selenium WebDriver already positioned on an article page.

    Returns:
        The kept paragraphs joined by blank lines, or "" when nothing was
        found or any step failed (a warning is printed in the failure case).
    """
    # In-page extraction logic; it returns a JS array of cleaned paragraph strings.
    script = """
        // List of candidate selectors for the article body container
        var candidates = [
            '.ms-article-content', // Found in 2024 HTML (single dash)
            '.msnt-styled-content', // Another class on the same div
            '.ms-article__body',   // Parent wrapper (double underscore)
            '.ms-article__content', // Legacy/Fallback
            '.ms-article-body',    // Legacy/Fallback
            '.ms-content-body',
            'article', 
            '.ms-article',
            'main',
            '.ms-grid-article'
        ];
        
        var contentDiv = null;
        for (var i = 0; i < candidates.length; i++) {
            contentDiv = document.querySelector(candidates[i]);
            if (contentDiv) {
                // Verify it has paragraphs
                if (contentDiv.querySelectorAll('p').length > 0) {
                    break;
                }
                // If it has no Ps, it's probably not the right container or empty
                contentDiv = null;
            }
        }
        
        if (!contentDiv) return [];
        
        var paragraphs = contentDiv.querySelectorAll('p');
        var results = [];
        
        paragraphs.forEach(function(p) {
            // Clone the node to avoid modifying the visual page (optional, but good practice)
            var clone = p.cloneNode(true);
            
            // Remove unwanted elements from the clone
            var badSelectors = 'a, script, style, iframe, .widget, .social-embed, .ms-related-articles';
            var junk = clone.querySelectorAll(badSelectors);
            junk.forEach(el => el.remove());
            
            // Get text content
            var text = clone.textContent.trim();
            // Basic filtering for empty or 'read more' type lines
            if (text.length > 20) {
                results.push(text);
            }
        });
        return results;
        """

    try:
        # Do not run the script until at least one paragraph is in the DOM.
        paragraph_locator = (By.TAG_NAME, "p")
        paragraph_present = EC.presence_of_element_located(paragraph_locator)
        WebDriverWait(driver, 10).until(paragraph_present)

        cleaned_paragraphs = driver.execute_script(script)
        return "\n\n".join(cleaned_paragraphs)
    except Exception as e:
        print(f"WARNING: Failed to extract content. {e}")
        return ""


In [13]:
def crawl_list_item(team_name, year, url, output_dir, teams_mapping):
    """
    Crawl one (Team, Year, URL) listing and append its articles to the team file.

    The listing page is opened in its own browser session to collect the
    article links; the articles are then visited in a second session that is
    restarted every 5 articles and after any per-article failure.

    Args:
        team_name: Team label; resolved through ``teams_mapping`` to pick the
            output file name (falls back to the label with spaces replaced by
            underscores).
        year: Season label written into each saved article's ``DATE:`` line.
        url: Listing page whose ``a.ms-item`` links are crawled.
        output_dir: Directory that holds the per-team ``.txt`` files.
        teams_mapping: Dict mapping team labels to file-name keys.

    Returns:
        True when the listing was loaded and every collected link was
        attempted (individual article failures are logged and skipped).
        False when a listing-level error aborted the item, including a
        failure to start or restart the browser.
    """
    driver = None

    def restart_driver(old_driver=None):
        """Quit ``old_driver`` (ignoring errors) and return a fresh driver."""
        if old_driver:
            try:
                old_driver.quit()
            except Exception:
                # The old browser may already be dead; nothing left to clean up.
                pass
        return setup_driver()

    # The destination file only depends on the team, so resolve it once.
    file_key = teams_mapping.get(team_name, team_name.replace(" ", "_"))
    filename = f"{file_key}.txt"
    filepath = os.path.join(output_dir, filename)

    try:
        print(f"\nINFO: Processing {team_name} - {year}")
        print(f"      URL: {url}")

        driver = restart_driver()

        driver.get(url)
        handle_popups(driver)

        # Wait for grid to load
        wait = WebDriverWait(driver, 10)

        try:
            grid_container = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, ".ms-content__main")
            ))
        except Exception:
            print("WARNING: Could not find .ms-content__main, trying fallback...")
            grid_container = driver.find_element(By.TAG_NAME, "body")

        # Find article links; read each href once, drop empties, and
        # de-duplicate while preserving page order.
        articles = grid_container.find_elements(By.CSS_SELECTOR, "a.ms-item")
        hrefs = (a.get_attribute('href') for a in articles)
        article_links = list(dict.fromkeys(href for href in hrefs if href))

        print(f"INFO: Found {len(article_links)} articles.")

        # Close list page driver to save resources while iterating deeply
        driver.quit()
        driver = None

        # Process articles with periodic restarts
        driver = restart_driver()

        for i, link in enumerate(article_links):
            # Restart driver every 5 articles to prevent memory leaks/zombie hangs
            if i > 0 and i % 5 == 0:
                print("INFO: Periodic driver restart to ensure stability...")
                driver = restart_driver(driver)

            try:
                driver.get(link)

                # Force stop loading once we have the bare minimum:
                # as soon as a <p> exists, halt the page so ads/trackers
                # cannot keep the load hanging.
                try:
                    WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "p")))
                    driver.execute_script("window.stop();")
                except Exception:
                    # Timeout waiting for P handled by extract_article_body returning empty.
                    # Only Exception is caught so a kernel interrupt still stops the crawl.
                    pass

                title = driver.title
                content = extract_article_body(driver)

                if content:
                    text_block = f"======\nSOURCE: {link}\nDATE: {year}\n\n{content}\n"

                    with open(filepath, "a", encoding="utf-8") as f:
                        f.write(text_block)

                    print(f"SUCCESS: Saved '{title}' to {filename}")
                else:
                    print(f"WARNING: No content extracted for '{title}'")

                # No need to go back since we have the list of links and just .get(next_link)
                time.sleep(1)

            except Exception as e:
                print(f"ERROR: Failed processing article {link}: {e}")
                # If a specific article crashed the driver, we MUST restart it for the next one
                print("INFO: Restarting driver due to error...")
                driver = restart_driver(driver)

        return True

    except Exception as e:
        print(f"ERROR: Failed processing list item {team_name}/{year}: {e}")
        return False
    finally:
        # Always release the browser, whatever path led here.
        if driver:
            try:
                driver.quit()
            except Exception:
                pass

def crawl_main():
    """Crawl every CRAWL_LIST entry in order, pausing 2 seconds after each."""
    for entry in CRAWL_LIST:
        # entry is (team_name, year, url); the output dir and mapping follow.
        crawl_list_item(*entry, OUTPUT_DIR, TEAMS_MAPPING)
        # Cooldown between teams
        time.sleep(2)
    print("Done.")

In [14]:
# Entry point: start the full crawl when this code runs as the main module.
if __name__ == "__main__":
    crawl_main()


INFO: Processing Sauber - 2020
      URL: https://www.motorsport.com/f1/news/?filters%5Bdiscovery_type%5D%5B%5D=1&filters%5Bchampionship%5D=2871&filters%5Bteam%5D%5B%5D=17888
INFO: Found 9 articles.
SUCCESS: Saved 'Kubica gets another Alfa Romeo F1 practice outing in Bahrain' to Sauber.txt
SUCCESS: Saved 'F4 videos helped Giovinazzi pull off charging Imola F1 start' to Sauber.txt
SUCCESS: Saved 'FIA to review "concerning" Giovinazzi loose wheel incident' to Sauber.txt
SUCCESS: Saved 'Russell thankful for F1 halo after "scary" Giovinazzi crash' to Sauber.txt
SUCCESS: Saved 'F1 News: Kubica returns to Alfa Romeo cockpit for Silverstone FP1' to Sauber.txt
INFO: Periodic driver restart to ensure stability...
SUCCESS: Saved 'Kubica to drive Raikkonen’s Alfa in Hungarian GP practice' to Sauber.txt
SUCCESS: Saved 'Alfa Romeo fined for Raikkonen's wheel loss' to Sauber.txt
SUCCESS: Saved 'Alfa Romeo: It would've been "unfair" to race without McLaren' to Sauber.txt
SUCCESS: Saved 'Kubica ends 