In [1]:
pip install selenium webdriver-manager pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
#!/usr/bin/env python
# coding: utf-8

# **Overview**
# Scrapes upcoming UFC fights from `ufc.com/events`, mainly for missing fight matchups.

# **Imports**
import pandas as pd
import asyncio
import aiohttp
import nest_asyncio
import os
import yaml
import re  # Added for regex extraction
from datetime import datetime
from bs4 import BeautifulSoup
import scrape_ufc_stats_library as LIB  # ✅ Import the library functions
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

BASE_URL = "https://www.ufc.com"
EVENTS_URL = f"{BASE_URL}/events#events-list-upcoming"

In [3]:
# Initialize ChromeDriver
def init_driver():
    options = webdriver.ChromeOptions()
    options.headless = True  # Set to False if you want to see the browser
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

In [4]:
# Function to fetch upcoming UFC events
def fetch_upcoming_events():
    print("🔍 Fetching upcoming UFC events...")

    EVENTS_URL = "https://www.ufc.com/events#events-list-upcoming"
    driver = init_driver()
    driver.get(EVENTS_URL)

    try:
        # Wait until event elements are loaded
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.c-card-event--result"))
        )

        event_cards = driver.find_elements(By.CSS_SELECTOR, "div.c-card-event--result")

        if not event_cards:
            print("❌ No event cards found! UFC may have changed the structure.")
            return pd.DataFrame()

        events = []
        for event in event_cards:
            try:
                event_name = event.find_element(By.CSS_SELECTOR, "h3.c-card-event--result__headline").text.strip()
                event_date = event.find_element(By.CSS_SELECTOR, "div.c-card-event--result__date").text.strip()
                event_location = event.find_element(By.CSS_SELECTOR, "div.c-card-event--result__location").text.strip()
                event_url = event.find_element(By.CSS_SELECTOR, "a.c-card-event--result__logo-link").get_attribute("href")

                # Skip "TBD vs TBD" events
                if "TBD vs TBD" in event_name:
                    continue

                # Check if it's a PPV or Fight Night
                if "UFC Fight Night" in event_url:
                    formatted_name = f"UFC Fight Night: {event_name}"
                else:
                    formatted_name = f"UFC {event_name}"

                events.append({
                    "EVENT": formatted_name,
                    "DATE": event_date,
                    "LOCATION": event_location,
                    "URL": event_url
                })

            except Exception as e:
                print(f"⚠️ Error parsing event: {e}")

        driver.quit()
        
        if events:
            print(f"✅ Successfully extracted {len(events)} upcoming UFC events!")
        else:
            print("❌ No upcoming events found!")
        
        return pd.DataFrame(events)

    except Exception as e:
        print(f"❌ Error fetching events: {e}")
        driver.quit()
        return pd.DataFrame()

In [5]:
# ✅ Function to Fetch Upcoming Fights for Each Event
def fetch_upcoming_fights(events_df):
    print("🔍 Fetching upcoming UFC fights...")
    driver = init_driver()

    all_fights = []

    try:
        for _, event in events_df.iterrows():
            event_name = event["EVENT"]
            event_date = event["DATE"]
            event_location = event["LOCATION"]
            event_url = event["URL"]

            print(f"🔍 Scraping fights for {event_name}...")

            driver.get(event_url)
            time.sleep(5)

            soup = BeautifulSoup(driver.page_source, "html.parser")
            fight_cards = soup.find_all("div", class_="c-listing-fight")

            if not fight_cards:
                print(f"⚠️ No fights found for {event_name}. Skipping...")
                continue

            for fight in fight_cards:
                fight_id = fight.get("data-fmid", "N/A")
                fight_url = f"{event_url}#{fight_id}"

                weight_class_tag = fight.find("div", class_="c-listing-fight__class-text")
                weight_class = weight_class_tag.text.strip() if weight_class_tag else "N/A"

                red_fighter_tag = fight.find("div", class_="c-listing-fight__corner-name--red")
                red_fighter = red_fighter_tag.text.strip() if red_fighter_tag else "N/A"

                blue_fighter_tag = fight.find("div", class_="c-listing-fight__corner-name--blue")
                blue_fighter = blue_fighter_tag.text.strip() if blue_fighter_tag else "N/A"

                red_profile_tag = fight.find("a", class_="c-listing-fight__corner-name--red")
                red_profile_url = f"https://www.ufc.com{red_profile_tag['href']}" if red_profile_tag else "N/A"

                blue_profile_tag = fight.find("a", class_="c-listing-fight__corner-name--blue")
                blue_profile_url = f"https://www.ufc.com{blue_profile_tag['href']}" if blue_profile_tag else "N/A"

                all_fights.append({
                    "Event": event_name,
                    "Date": event_date,
                    "Location": event_location,
                    "Bout": f"{red_fighter} vs {blue_fighter}",
                    "Red Fighter": red_fighter,
                    "Blue Fighter": blue_fighter,
                    "Weight Class": weight_class,
                    "Fight URL": fight_url,
                    "Red URL": red_profile_url,
                    "Blue URL": blue_profile_url
                })

        print(f"✅ Successfully extracted {len(all_fights)} upcoming fights!")
        return pd.DataFrame(all_fights)

    finally:
        driver.quit()

In [6]:
# ✅ Function to Fetch Fighter Stats
def fetch_fighter_stats(fighter_url):
    print(f"🔍 Fetching stats for {fighter_url}...")
    driver = init_driver()
    driver.get(fighter_url)
    time.sleep(3)  # Wait for JavaScript to load
    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    stats = {
        "Age": "N/A", "Record": "N/A", "Country": "N/A", "Height": "N/A", "Weight": "N/A",
        "Reach": "N/A", "Leg Reach": "N/A", "KO/TKO %": "N/A", "Submission %": "N/A",
        "Decision %": "N/A", "Average Fight Time": "N/A", "Knockdown Avg/15 Min": "N/A",
        "Significant Strikes Landed Per Min": "N/A", "Significant Strike Accuracy": "N/A",
        "Significant Strike Absorbed Per Min": "N/A", "Significant Strike Defense": "N/A",
        "Takedown Avg/15 Min": "N/A", "Takedown Accuracy": "N/A", "Takedown Defense": "N/A",
        "Submission Avg/15 Min": "N/A"
    }

    stats_table = soup.find_all("div", class_="c-stat-compare__group")
    for stat in stats_table:
        label = stat.find("div", class_="c-stat-compare__label").text.strip()
        value = stat.find("div", class_="c-stat-compare__number").text.strip()
        if label in stats:
            stats[label] = value

    return stats

In [7]:
# ✅ Running the Scraper
events_df = fetch_upcoming_events()
fights_df = fetch_upcoming_fights(events_df)

🔍 Fetching upcoming UFC events...
❌ Error fetching events: name 'WebDriverWait' is not defined
🔍 Fetching upcoming UFC fights...
✅ Successfully extracted 0 upcoming fights!


In [8]:
events_df

In [9]:
fights_df

In [10]:
# Save CSV files
events_df.to_csv("datasets/upcoming_events_ufc.csv", index=False)
fights_df.to_csv("datasets/upcoming_fights_ufc.csv", index=False)

In [11]:
print("✅ Data saved to CSV files!")

✅ Data saved to CSV files!


In [12]:
# Helper function to fetch and parse a page
def get_soup(url):
    """Fetches a webpage and returns a BeautifulSoup object."""
    response = requests.get(url)
    return BeautifulSoup(response.content, "html.parser")

async def get_soup_async(url, session):
    """Asynchronous function to fetch a webpage and return a BeautifulSoup object."""
    async with session.get(url) as response:
        if response.status == 200:
            return BeautifulSoup(await response.text(), "html.parser")
        return None

In [13]:
# Function to determine event type (PPV or Fight Night)
def determine_event_type(event_url, bout_name):
    if "fight-night" in event_url:
        return f"UFC Fight Night: {bout_name}"
    ppv_match = re.search(r"ufc-(\d+)", event_url)
    if ppv_match:
        return f"UFC {ppv_match.group(1)}: {bout_name}"
    return f"UFC Event: {bout_name}"

In [14]:
def fetch_upcoming_events():
    """Fetches upcoming UFC events using Selenium (for JavaScript-rendered content)."""
    print("🔍 Fetching upcoming UFC events...")

    # Set up Selenium with headless Chrome
    options = Options()
    options.headless = True  # Change to False if you want to see the browser
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        EVENTS_URL = "https://www.ufc.com/events#events-list-upcoming"
        driver.get(EVENTS_URL)
        time.sleep(5)  # Wait for JavaScript to load content

        # Get page source after JavaScript execution
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Find event cards
        event_cards = soup.find_all("div", class_="c-card-event--result")

        if not event_cards:
            print("❌ No upcoming events found! UFC structure may have changed.")
            return pd.DataFrame()

        events = []
        for event in event_cards:
            event_name_tag = event.find("h3", class_="c-card-event--result__headline")
            event_name = event_name_tag.text.strip() if event_name_tag else "Unknown Event"

            event_date_tag = event.find("div", class_="c-card-event--result__date")
            event_date = event_date_tag.text.strip() if event_date_tag else "Unknown Date"

            event_location_tag = event.find("div", class_="c-card-event--result__location")
            event_location = event_location_tag.text.strip() if event_location_tag else "Unknown Location"

            event_link_tag = event.find("a", class_="c-card-event--result__logo-link")
            event_url = f"https://www.ufc.com{event_link_tag['href']}" if event_link_tag else "Unknown URL"

            # Exclude "TBD vs TBD" and past events
            if "TBD vs TBD" not in event_name:
                events.append({
                    "EVENT": event_name,
                    "DATE": event_date,
                    "LOCATION": event_location,
                    "URL": event_url
                })

        print(f"✅ Successfully extracted {len(events)} upcoming UFC events!")
        return pd.DataFrame(events)

    finally:
        driver.quit()  # Close the browser

In [15]:
# Function to extract fight details for an event
def extract_fight_urls(event_url):
    print(f"🔍 Extracting fights from {event_url}")
    soup = get_soup(event_url)
    fight_containers = soup.find_all("div", class_="c-listing-fight")
    
    fights = []
    for fight in fight_containers:
        fight_id = fight.get("data-fmid")
        if not fight_id:
            continue
        fight_url = f"{event_url}#{fight_id}"
        fights.append(fight_url)
    
    return fights

In [16]:
def fetch_upcoming_fights(events_df):
    """Fetches upcoming fights for each event."""
    print("🔍 Fetching upcoming UFC fights...")

    options = Options()
    options.headless = True
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    all_fights = []

    try:
        for _, event in events_df.iterrows():
            event_name = event["EVENT"]
            event_date = event["DATE"]
            event_location = event["LOCATION"]
            event_url = event["URL"]

            print(f"🔍 Scraping fights for {event_name}...")

            driver.get(event_url)
            time.sleep(5)  # Wait for JavaScript to load

            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Find all fights
            fight_cards = soup.find_all("div", class_="c-listing-fight")

            if not fight_cards:
                print(f"⚠️ No fights found for {event_name}. Skipping...")
                continue

            for fight in fight_cards:
                fight_id = fight.get("data-fmid", "N/A")
                fight_url = f"{event_url}#{fight_id}"

                weight_class_tag = fight.find("div", class_="c-listing-fight__class-text")
                weight_class = weight_class_tag.text.strip() if weight_class_tag else "N/A"

                red_fighter_tag = fight.find("div", class_="c-listing-fight__corner-name--red")
                red_fighter = red_fighter_tag.text.strip() if red_fighter_tag else "N/A"

                blue_fighter_tag = fight.find("div", class_="c-listing-fight__corner-name--blue")
                blue_fighter = blue_fighter_tag.text.strip() if blue_fighter_tag else "N/A"

                red_profile_tag = fight.find("a", class_="c-listing-fight__corner-name--red")
                red_profile_url = f"https://www.ufc.com{red_profile_tag['href']}" if red_profile_tag else "N/A"

                blue_profile_tag = fight.find("a", class_="c-listing-fight__corner-name--blue")
                blue_profile_url = f"https://www.ufc.com{blue_profile_tag['href']}" if blue_profile_tag else "N/A"

                all_fights.append({
                    "Event": event_name,
                    "Date": event_date,
                    "Location": event_location,
                    "Bout": f"{red_fighter} vs {blue_fighter}",
                    "Red Fighter": red_fighter,
                    "Blue Fighter": blue_fighter,
                    "Weight Class": weight_class,
                    "Fight URL": fight_url,
                    "Red URL": red_profile_url,
                    "Blue URL": blue_profile_url
                })

        print(f"✅ Successfully extracted {len(all_fights)} upcoming fights!")
        return pd.DataFrame(all_fights)

    finally:
        driver.quit()  # Close the browser

In [17]:
def fetch_fighter_stats(fighter_url):
    """Fetches detailed fighter stats from UFC profile page."""
    print(f"🔍 Fetching stats for {fighter_url}...")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=Options())
    driver.get(fighter_url)
    time.sleep(3)  # Wait for JavaScript to load

    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    stats = {
        "Age": "N/A", "Record": "N/A", "Country": "N/A", "Height": "N/A", "Weight": "N/A",
        "Reach": "N/A", "Leg Reach": "N/A", "KO/TKO %": "N/A", "Submission %": "N/A",
        "Decision %": "N/A", "Average Fight Time": "N/A", "Knockdown Avg/15 Min": "N/A",
        "Significant Strikes Landed Per Min": "N/A", "Significant Strike Accuracy": "N/A",
        "Significant Strike Absorbed Per Min": "N/A", "Significant Strike Defense": "N/A",
        "Takedown Avg/15 Min": "N/A", "Takedown Accuracy": "N/A", "Takedown Defense": "N/A",
        "Submission Avg/15 Min": "N/A"
    }

    # Extract fighter stats from the page
    stats_table = soup.find_all("div", class_="c-stat-compare__group")
    for stat in stats_table:
        label = stat.find("div", class_="c-stat-compare__label").text.strip()
        value = stat.find("div", class_="c-stat-compare__number").text.strip()
        if label in stats:
            stats[label] = value

    return stats

In [18]:
# Running the scraper
upcoming_events_df = fetch_upcoming_events()
if not upcoming_events_df.empty:
    upcoming_events_df.to_csv("upcoming_events_ufc.csv", index=False)
    print("✅ Saved upcoming events.")
    upcoming_fights_df = asyncio.run(fetch_upcoming_fights(upcoming_events_df))
    if not upcoming_fights_df.empty:
        upcoming_fights_df.to_csv("upcoming_fights_ufc.csv", index=False)
        print("✅ Saved upcoming fights.")
else:
    print("❌ No events to scrape fights from.")


🔍 Fetching upcoming UFC events...
❌ No upcoming events found! UFC structure may have changed.
❌ No events to scrape fights from.


In [19]:
# Load config
config = yaml.safe_load(open("scrape_ufc_stats_config.yaml"))

In [20]:
async def fetch_ufc_events():
    """Fetch UFC events using API instead of scraping the HTML page."""
    url = "https://www.ufc.com/api/events"  # Replace this with the actual API URL
    
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"❌ API Request Failed: {response.status}")
                return pd.DataFrame()

            data = await response.json()
    
    # Debug: Print the first event to see structure
    print("🔍 First Event:", data[0] if data else "No data returned!")

    # Extract relevant event details (modify based on actual API response structure)
    events = []
    for event in data:
        events.append({
            "EVENT": event.get("title", "Unknown"),
            "DATE": event.get("date", "Unknown"),
            "LOCATION": event.get("location", "Unknown"),
            "URL": event.get("url", "Unknown"),
        })

    return pd.DataFrame(events)

# Run the function
asyncio.run(fetch_ufc_events())

RuntimeError: asyncio.run() cannot be called from a running event loop

In [None]:
async def fetch_event_fights(event, session):
    """Fetches fight details from a single event page."""
    event_url = event["URL"]
    fight_data = []

    soup = await LIB.get_soup_async(event_url, session)
    if soup is None:
        print(f"❌ Failed to fetch event page: {event_url}")
        return []

    # ✅ Extract Fight URLs
    fight_urls = LIB.extract_fight_urls_ufc_com(soup, event_url)

    tasks = []  # Store async fight fetching tasks
    for fight in fight_urls:
        fight_url = fight["fight_url"]
        if fight_url == "N/A" or not fight_url:
            print(f"⚠️ Skipping fight with missing URL in {event['EVENT']}")
            continue

        tasks.append(fetch_fight_info(fight_url, event, session))

    fight_results = await asyncio.gather(*tasks)  # Run concurrently
    for fight_info in fight_results:
        if fight_info:
            fight_data.append(fight_info)

    return fight_data

In [None]:
async def fetch_fight_details(events):
    """Fetches fight details from UFC event pages."""
    fight_data = []

    async with aiohttp.ClientSession() as session:
        tasks = [fetch_event_fights(event, session) for event in events if event["URL"] and event["URL"] != "N/A"]
        all_fights = await asyncio.gather(*tasks)

        for fight_list in all_fights:
            fight_data.extend(fight_list)

    return pd.DataFrame(fight_data)

In [None]:
async def fetch_event_fights(event, session):
    """Fetches fight details from a single event page."""
    event_url = event["URL"]
    fight_data = []

    soup = await LIB.get_soup_async(event_url, session)
    if soup is None:
        print(f"❌ Failed to fetch event page: {event_url}")
        return []

    fight_urls = LIB.extract_fight_urls_ufc_com(soup, event_url)

    tasks = [fetch_fight_info(fight["fight_url"], event, session) for fight in fight_urls if fight["fight_url"] and fight["fight_url"] != "N/A"]
    fight_results = await asyncio.gather(*tasks)

    return [fight for fight in fight_results if fight]

In [None]:
async def fetch_fight_info(fight_url, event, session):
    """Fetches detailed fight info from a fight URL."""
    print(f"🔍 Fetching fight details: {fight_url}")

    fight_soup = await LIB.get_soup_async(fight_url, session)
    if fight_soup is None:
        print(f"❌ Failed to fetch fight page: {fight_url}")
        return None

    fight_info = LIB.extract_fight_data_ufc_com(fight_soup)
    if not fight_info or fight_info["BOUT"] == "N/A":
        print(f"⚠️ No valid fight details found for {fight_url}")
        return None

    # ✅ Extract weight class and title fight status
    weight_class_raw = fight_info["WEIGHT CLASS"]
    weight_class = re.sub(r" Title Bout", "", weight_class_raw)  # ✅ Remove "Title Bout"
    is_title_fight = "Yes" if "Title Bout" in weight_class_raw else "No"

    # ✅ Extract number of rounds (default to 3)
    num_rounds = 5 if is_title_fight == "Yes" else 3

    # ✅ Extract fighter profile URLs
    base_url = "https://www.ufc.com"
    red_fighter_profile = fight_info.get("RED FIGHTER PROFILE", "N/A")
    blue_fighter_profile = fight_info.get("BLUE FIGHTER PROFILE", "N/A")

    red_fighter_profile = f"{base_url}{red_fighter_profile}" if red_fighter_profile and not red_fighter_profile.startswith(base_url) else red_fighter_profile
    blue_fighter_profile = f"{base_url}{blue_fighter_profile}" if blue_fighter_profile and not blue_fighter_profile.startswith(base_url) else blue_fighter_profile

    # ✅ Fetch fighter stats concurrently
    red_stats_task = LIB.extract_fighter_stats(red_fighter_profile, session) if red_fighter_profile != "N/A" else None
    blue_stats_task = LIB.extract_fighter_stats(blue_fighter_profile, session) if blue_fighter_profile != "N/A" else None

    red_fighter_stats, blue_fighter_stats = await asyncio.gather(
        red_stats_task, blue_stats_task
    ) if red_stats_task and blue_stats_task else ({}, {})

    # ✅ Calculate Fighter Ages
    fight_date = datetime.strptime(event["DATE"], "%B %d, %Y")
    red_age = calculate_age(red_fighter_stats.get("DOB", "N/A"), fight_date)
    blue_age = calculate_age(blue_fighter_stats.get("DOB", "N/A"), fight_date)

    return {
        "Event": event["EVENT"],
        "Date": event["DATE"],
        "Location": event["LOCATION"],
        "Weight Class": weight_class,
        "Title Fight": is_title_fight,
        "Rounds": num_rounds,
        "Bout": fight_info["BOUT"],

        "Red Fighter": fight_info.get("RED FIGHTER", "N/A"),
        "Red Age": red_age,
        "Blue Fighter": fight_info.get("BLUE FIGHTER", "N/A"),
        "Blue Age": blue_age,

        "Red Record": red_fighter_stats.get("RECORD", "N/A"),
        "Blue Record": blue_fighter_stats.get("RECORD", "N/A"),
        "Red Country": fight_info.get("RED COUNTRY", "N/A"),
        "Blue Country": fight_info.get("BLUE COUNTRY", "N/A"),

        "Red Odds": fight_info.get("RED ODDS", "N/A"),
        "Blue Odds": fight_info.get("BLUE ODDS", "N/A"),

        "ODDS": fight_info.get("ODDS", "N/A"),
        "ODDS TO WIN BY KO": fight_info.get("ODDS TO WIN BY KO", "N/A"),
        "ODDS TO WIN BY SUBMISSION": fight_info.get("ODDS TO WIN BY SUBMISSION", "N/A"),
        "ODDS TO WIN BY DECISION": fight_info.get("ODDS TO WIN BY DECISION", "N/A"),

        "Red Money Line": fight_info.get("RED MONEY LINE", "N/A"),
        "Blue Money Line": fight_info.get("BLUE MONEY LINE", "N/A"),
    }

In [None]:
def calculate_age(dob, fight_date):
    """Calculates fighter's age at fight date from DOB."""
    if dob == "N/A":
        return "N/A"
    try:
        birth_date = datetime.strptime(dob, "%B %d, %Y")
        age = fight_date.year - birth_date.year - ((fight_date.month, fight_date.day) < (birth_date.month, birth_date.day))
        return age
    except:
        return "N/A"

In [None]:
async def main():
    print("🔍 Fetching upcoming UFC events from UFC.com...")
    event_df = await fetch_ufc_events()
    if event_df is None or event_df.empty:
        print("❌ No upcoming events found.")
        return

    event_df.to_csv(config["upcoming_events_ufc_file"], index=False)
    print("🔍 Fetching fight details...")
    fight_details = await fetch_fight_details(event_df.to_dict("records"))
    LIB.save_fight_data(fight_details, config["upcoming_fights_ufc_file"])

In [None]:
asyncio.run(main())