In [1]:
#!/usr/bin/env python
# coding: utf-8

# **Overview**
# Scrapes upcoming UFC fights from `ufcstats.com`, including fight matchups and fighter statistics.

# Imports
import pandas as pd
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import os
import yaml
import scrape_ufc_stats_library as LIB  # Custom library for parsing functions
from datetime import datetime

In [2]:
# Load configuration
config = yaml.safe_load(open("scrape_ufc_stats_config.yaml"))

# Ensure event folders exist
os.makedirs("datasets", exist_ok=True)

In [3]:
# Load existing events
existing_events_file = config['upcoming_events_ufcstats_file']
if os.path.exists(existing_events_file):
    existing_events_df = pd.read_csv(existing_events_file)
    existing_event_urls = set(existing_events_df['URL'])
else:
    existing_event_urls = set()

# Load scraped progress file
progress_file = config['scraped_progress_file']
if os.path.exists(progress_file):
    progress_df = pd.read_csv(progress_file)
    completed_events = set(progress_df[progress_df["TYPE"] == "event"]["URL"])
else:
    completed_events = set()

In [4]:
# **1. Define URL for Upcoming Events**
ufcstats_upcoming_url = "http://ufcstats.com/statistics/events/upcoming"

In [5]:
# **2. Async Function to Fetch and Parse Events**
# ✅ Fetch Upcoming Events (with session)
async def fetch_upcoming_events(session):
    """Scrape upcoming UFC events from UFCStats."""
    soup = await LIB.get_soup_async(config['upcoming_events_ufcstats_url'], session)

    if soup is None:
        print("❌ Failed to fetch upcoming UFC events.")
        return pd.DataFrame()  # Return empty DataFrame if failed

    event_df = LIB.parse_upcoming_events(soup)
    new_events_df = event_df[~event_df['URL'].isin(existing_event_urls | completed_events)]

    for event_url in new_events_df['URL']:
        LIB.save_progress("event", event_url, config)

    return new_events_df

In [6]:
# **3. Async Function to Fetch and Parse Fight Details**
# ✅ Fetch Fight Details (with session)
async def fetch_fight_details(event_urls, session):
    """Scrape fight matchups and fighter statistics for upcoming events."""
    soups = await LIB.fetch_all_soups(event_urls, session)
    fight_details = []

    for url, soup in zip(event_urls, soups):
        if soup is None:
            print(f"⚠️ Skipping event due to failed fetch: {url}")
            continue

        try:
            event_name = soup.find('h2', class_='b-content__title').text.strip()
            fight_blocks = soup.find_all('tr', class_='b-fight-details__table-row')

            for fight in fight_blocks:
                fighters = fight.find_all('a', class_='b-link b-link_style_black')
                if len(fighters) == 2:
                    fighter1 = fighters[0].text.strip()
                    fighter2 = fighters[1].text.strip()

                    weight_class = fight.find('td', class_='b-fight-details__table-col').text.strip()

                    # ✅ Fetch Fighter Stats Asynchronously
                    fighter1_stats = await LIB.fetch_fighter_stats(fighters[0]['href'], session)
                    fighter2_stats = await LIB.fetch_fighter_stats(fighters[1]['href'], session)

                    fight_details.append({
                        "EVENT": event_name,
                        "FIGHTER 1": fighter1,
                        "FIGHTER 2": fighter2,
                        "WEIGHTCLASS": weight_class,
                        **fighter1_stats,
                        **fighter2_stats,
                        "URL": url
                    })

        except Exception as e:
            print(f"❌ Error parsing fight details for {url}: {e}")

    return pd.DataFrame(fight_details)

In [7]:
# **4. Execute Async Scraper**
async def main():
    """Main function to scrape and save upcoming UFC fights from UFCStats."""
    print("🔍 Fetching upcoming UFC events from UFCStats...")

    # ✅ Create a single session for efficiency
    async with aiohttp.ClientSession() as session:
        event_df = await fetch_upcoming_events(session)

        if event_df.empty:
            print("⚠️ No upcoming events found.")
            return

        print(f"✅ Found {len(event_df)} upcoming events.")
        print("🔍 Fetching fight details...")

        fight_df = await fetch_fight_details(event_df["URL"].tolist(), session)

        # Merge event details with fight matchups
        final_df = pd.merge(event_df, fight_df, on="EVENT", how="left")

        # ✅ Save to CSV
        final_df.to_csv("datasets/upcoming_fights_ufcstats.csv", index=False)
        print("✅ UFCStats upcoming fights data saved!")

In [8]:
# **5. Run Scraper**
# ✅ Run Scraper in Jupyter Notebook
import nest_asyncio
nest_asyncio.apply()

# ✅ FIX: Run async function properly inside Jupyter
await main()

🔍 Fetching upcoming UFC events from UFCStats...
✅ Found 10 upcoming events.
🔍 Fetching fight details...
Error fetching URL: http://ufcstats.com/event-details/e6015889f50075d2, attempt 1
Error fetching URL: http://ufcstats.com/event-details/e6015889f50075d2, attempt 2
Error fetching URL: http://ufcstats.com/event-details/e6015889f50075d2, attempt 3
Error fetching URL: http://ufcstats.com/event-details/e6015889f50075d2, attempt 1
Error fetching URL: http://ufcstats.com/event-details/e6015889f50075d2, attempt 2
Error fetching URL: http://ufcstats.com/event-details/e6015889f50075d2, attempt 3
Error fetching URL: http://ufcstats.com/event-details/e6015889f50075d2, attempt 1
Error fetching URL: http://ufcstats.com/event-details/e6015889f50075d2, attempt 2
Error fetching URL: http://ufcstats.com/event-details/e6015889f50075d2, attempt 3
❌ Failed to fetch event page: http://ufcstats.com/event-details/e6015889f50075d2
Error fetching URL: http://ufcstats.com/event-details/ce7871949b0ed2bf, attem

KeyError: 'EVENT'