In [1]:
#!/usr/bin/env python
# coding: utf-8

# imports
import pandas as pd
from tqdm.notebook import tqdm_notebook
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import scrape_ufc_stats_library as LIB
import yaml
import nest_asyncio
import time
import random
import os
from scrape_ufc_stats_library import parse_fight_results_and_stats_async

In [2]:
# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

In [3]:
# Load configuration
config = yaml.safe_load(open('scrape_ufc_stats_config.yaml'))

In [4]:
# Load existing fight details
existing_fight_file = config['fight_details_file_name']
if os.path.exists(existing_fight_file):
    existing_fights_df = pd.read_csv(existing_fight_file)
    existing_fight_urls = set(existing_fights_df['URL'])
else:
    existing_fight_urls = set()

# Load scraped progress file
progress_file = config['scraped_progress_file']
if os.path.exists(progress_file):
    progress_df = pd.read_csv(progress_file)
    completed_fights = set(progress_df[progress_df["TYPE"] == "fight"]["URL"])
else:
    completed_fights = set()

# Get fight URLs
# Ensure all URLs are valid strings and not placeholders like '...'
all_fight_urls = [...]  # Populate dynamically

new_fight_urls = [
    url for url in all_fight_urls
    if isinstance(url, str) and url.startswith("http") and url not in existing_fight_urls and url not in completed_fights
]

# Print a few URLs to verify
print(f"✅ Valid fight URLs to scrape: {new_fight_urls[:5]}")


new_fight_urls = [url for url in all_fight_urls if url not in existing_fight_urls and url not in completed_fights]

async def scrape_fights():
    soups = await LIB.fetch_all_soups(new_fight_urls)
    for url, soup in zip(new_fight_urls, soups):
        if soup:
            # Process fight data (replace with actual parsing function)
            save_progress("fight", url, config)

asyncio.run(scrape_fights())

✅ Valid fight URLs to scrape: []
Error fetching URL: Ellipsis, attempt 1
Error fetching URL: Ellipsis, attempt 2
Error fetching URL: Ellipsis, attempt 3
Error fetching URL: Ellipsis, attempt 1
Error fetching URL: Ellipsis, attempt 2
Error fetching URL: Ellipsis, attempt 3
Error fetching URL: Ellipsis, attempt 1
Error fetching URL: Ellipsis, attempt 2
Error fetching URL: Ellipsis, attempt 3
❌ Failed to fetch event page: Ellipsis


In [5]:
# Asynchronous get_soup replacement
async def get_soup_async(url, session, retries=3):
    for attempt in range(retries):
        try:
            async with session.get(url) as response:
                if response.status == 200:
                    html = await response.text()
                    return BeautifulSoup(html, 'html.parser')
                elif response.status == 429:  # Too Many Requests
                    print(f"Rate-limited on {url}. Retrying in {2 ** attempt} seconds...")
                    await asyncio.sleep(2 ** attempt)  # Exponential backoff
        except Exception as e:
            print(f"Error fetching URL: {url}, attempt {attempt + 1}")
    print(f"Failed to fetch URL after {retries} attempts: {url}")
    return None

In [6]:
# Fetch multiple soups concurrently
async def fetch_all_soups(urls):
    connector = aiohttp.TCPConnector(limit=5)  # Limit simultaneous requests
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
        soups = []
        for url in urls:
            soup = await get_soup_async(url, session)
            soups.append(soup)
            # Add a random delay between 1 and 3 seconds
            await asyncio.sleep(random.uniform(1, 3))
        return soups

In [7]:
# Fallback mechanism for missing data
def use_backup_scraper(url, column_names):
    # Replace this with the backup scraper logic for the relevant data
    # For now, return a placeholder DataFrame
    backup_data = pd.DataFrame(columns=column_names)
    # Example: fetch data from Sherdog, UFC Athletes, or other sources
    backup_data.loc[0] = ["Backup Value"] * len(column_names)
    return backup_data

In [8]:
# Ensure an event loop is available
try:
    loop = asyncio.get_running_loop()
except RuntimeError:
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

In [9]:
# ## Parse Event Details
# Includes: Event, URL, Date, Location

# Define URL to parse
events_url = config['completed_events_all_url']

# Fetch and parse event details
soup = LIB.get_soup(events_url)
raw_event_details_df = LIB.parse_event_details(soup)

# Load existing event details
existing_event_file = config['event_details_file_name']
if os.path.exists(existing_event_file):
    existing_event_details_df = pd.read_csv(existing_event_file)
    existing_event_urls = set(existing_event_details_df['URL'])
else:
    existing_event_details_df = pd.DataFrame(columns=raw_event_details_df.columns)
    existing_event_urls = set()

# Identify new events
new_events_df = raw_event_details_df[~raw_event_details_df['URL'].isin(existing_event_urls)]

# Append new events to the dataset (newest events first)
if not new_events_df.empty:
    full_event_details_df = pd.concat([new_events_df, existing_event_details_df], ignore_index=True)
    full_event_details_df.to_csv(existing_event_file, index=False)
else:
    full_event_details_df = existing_event_details_df

# **Always Display All Event Data**
display(full_event_details_df)

Unnamed: 0,EVENT,URL,DATE,LOCATION
0,UFC Fight Night: Adesanya vs. Imavov,http://ufcstats.com/event-details/80dbeb1dd5b5...,"February 01, 2025","Riyadh, Riyadh, Saudi Arabia"
1,UFC 311: Makhachev vs. Moicano,http://ufcstats.com/event-details/39f68882def7...,"January 18, 2025","Inglewood, California, USA"
2,UFC Fight Night: Dern vs. Ribas 2,http://ufcstats.com/event-details/81ddc98fceb3...,"January 11, 2025","Las Vegas, Nevada, USA"
3,UFC Fight Night: Covington vs. Buckley,http://ufcstats.com/event-details/72c9c2eadfc3...,"December 14, 2024","Tampa, Florida, USA"
4,UFC 310: Pantoja vs. Asakura,http://ufcstats.com/event-details/ad23903ef3af...,"December 07, 2024","Las Vegas, Nevada, USA"
...,...,...,...,...
713,UFC 6: Clash of the Titans,http://ufcstats.com/event-details/1c3f5e85b59e...,"July 14, 1995","Casper, Wyoming, USA"
714,UFC 5: The Return of the Beast,http://ufcstats.com/event-details/dedc3bb440d0...,"April 07, 1995","Charlotte, North Carolina, USA"
715,UFC 4: Revenge of the Warriors,http://ufcstats.com/event-details/b60391da771d...,"December 16, 1994","Tulsa, Oklahoma, USA"
716,UFC 3: The American Dream,http://ufcstats.com/event-details/1a49e0670dfa...,"September 09, 1994","Charlotte, North Carolina, USA"


In [10]:
# Ensure "data" folder exists before doing anything
os.makedirs("data", exist_ok=True)

# ✅ Step 1: Run Event Scraper First
events_url = config['completed_events_all_url']
soup = LIB.get_soup(events_url)
raw_event_details_df = LIB.parse_event_details(soup)

# ✅ Step 2: Load Existing Events (For Incremental Updates)
existing_event_file = os.path.join("data", config['event_details_file_name'])

if os.path.exists(existing_event_file):
    existing_event_details_df = pd.read_csv(existing_event_file)
    existing_event_urls = set(existing_event_details_df['URL'])
else:
    existing_event_details_df = pd.DataFrame(columns=raw_event_details_df.columns)
    existing_event_urls = set()

# ✅ Step 3: Identify New Events
new_events_df = raw_event_details_df[~raw_event_details_df['URL'].isin(existing_event_urls)]

if not new_events_df.empty:
    full_event_details_df = pd.concat([new_events_df, existing_event_details_df], ignore_index=True)
    full_event_details_df.to_csv(existing_event_file, index=False)
    print(f"✅ {len(new_events_df)} new events added.")
else:
    full_event_details_df = existing_event_details_df
    print("⚠️ No new events detected. Skipping event updates.")

# ✅ Step 4: Extract Fight URLs From Events
list_of_events_urls = list(full_event_details_df['URL'])

# ✅ Step 5: Load Existing Fight Details Before Scraping
existing_fight_details_file = os.path.join("data", config['fight_details_file_name'])

if os.path.exists(existing_fight_details_file):
    existing_fight_details_df = pd.read_csv(existing_fight_details_file)
    existing_fight_urls = set(existing_fight_details_df['URL'])
else:
    existing_fight_details_df = pd.DataFrame(columns=config['fight_details_column_names'])
    existing_fight_urls = set()

# ✅ Step 6: Fetch Fight URLs from New Events
all_fight_details_df = asyncio.run(
    LIB.parse_fight_details_async(list_of_events_urls, existing_fight_urls, config)
)

# ✅ Step 7: Extract New Fight URLs
all_fight_urls = list(all_fight_details_df['URL'])
new_fight_urls = [
    url for url in all_fight_urls
    if isinstance(url, str) and url.startswith("http") and url not in existing_fight_urls
]

if not new_fight_urls:
    print("✅ No new fights to scrape. Exiting early.")
else:
    print(f"🆕 {len(new_fight_urls)} new fights detected. Starting scraping process.")


✅ 718 new events added.
🆕 8012 new fights detected. Starting scraping process.


In [11]:
# ## Parse Fight Details
# Includes: EVENT, BOUT, URL

# Define list of event URLs to parse
list_of_events_urls = list(full_event_details_df['URL'])

# Load existing fight details BEFORE running the async function
existing_fight_details_file = config['fight_details_file_name']
if os.path.exists(existing_fight_details_file):
    existing_fight_details_df = pd.read_csv(existing_fight_details_file)
    existing_fight_urls = set(existing_fight_details_df['URL'])  # ✅ Defined before calling function
else:
    existing_fight_details_df = pd.DataFrame(columns=config['fight_details_column_names'])
    existing_fight_urls = set()  # ✅ Empty set to prevent errors

# Fetch fight details asynchronously with all required arguments
all_fight_details_df = loop.run_until_complete(
    LIB.parse_fight_details_async(list_of_events_urls, existing_fight_urls, config)
)

# Identify new fight details
new_fight_details_df = all_fight_details_df[~all_fight_details_df['URL'].isin(existing_fight_urls)]

# Append new fight details to the dataset (newest first)
if not new_fight_details_df.empty:
    full_fight_details_df = pd.concat([new_fight_details_df, existing_fight_details_df], ignore_index=True)
    full_fight_details_df.to_csv(existing_fight_details_file, index=False)
else:
    full_fight_details_df = existing_fight_details_df

# **Always Display All Fight Details Data**
display(full_fight_details_df)

Unnamed: 0,EVENT,BOUT,URL
0,UFC Fight Night: Adesanya vs. Imavov,Nassourdine Imavov vs. Israel Adesanya,http://ufcstats.com/fight-details/85e79748b75e...
1,UFC Fight Night: Adesanya vs. Imavov,Michael Page vs. Shara Magomedov,http://ufcstats.com/fight-details/426e7a3de681...
2,UFC Fight Night: Adesanya vs. Imavov,Sergei Pavlovich vs. Jairzinho Rozenstruik,http://ufcstats.com/fight-details/3c00f34f20c8...
3,UFC Fight Night: Adesanya vs. Imavov,Vinicius Oliveira vs. Said Nurmagomedov,http://ufcstats.com/fight-details/70b8dca3addc...
4,UFC Fight Night: Adesanya vs. Imavov,Fares Ziam vs. Mike Davis,http://ufcstats.com/fight-details/775788423f5e...
...,...,...,...
8007,UFC 2: No Way Out,Orlando Wiet vs. Robert Lucarelli,http://ufcstats.com/fight-details/3b020d4914b4...
8008,UFC 2: No Way Out,Frank Hamaker vs. Thaddeus Luster,http://ufcstats.com/fight-details/d917c8c7461b...
8009,UFC 2: No Way Out,Johnny Rhodes vs. David Levicki,http://ufcstats.com/fight-details/ccee020be2e8...
8010,UFC 2: No Way Out,Patrick Smith vs. Ray Wizard,http://ufcstats.com/fight-details/4b9ae533ccb3...


In [12]:
# ## Parse Fight Results and Stats
# Includes Fight Results and Fight Stats

# Define list of fight URLs to parse
list_of_fight_details_urls = list(full_fight_details_df['URL'])

# Fetch fight results and stats asynchronously
all_fight_results_df, all_fight_stats_df = loop.run_until_complete(
    parse_fight_results_and_stats_async(list_of_fight_details_urls, config)
)

# Load existing fight results
existing_fight_results_file = config['fight_results_file_name']
if os.path.exists(existing_fight_results_file):
    existing_fight_results_df = pd.read_csv(existing_fight_results_file)
    existing_fight_results_urls = set(existing_fight_results_df['URL'])
else:
    existing_fight_results_df = pd.DataFrame(columns=all_fight_results_df.columns)
    existing_fight_results_urls = set()

# Identify new fight results
new_fight_results_df = all_fight_results_df[~all_fight_results_df['URL'].isin(existing_fight_results_urls)]

# Append new fight results to the dataset (newest first)
if not new_fight_results_df.empty:
    full_fight_results_df = pd.concat([new_fight_results_df, existing_fight_results_df], ignore_index=True)
    full_fight_results_df.to_csv(existing_fight_results_file, index=False)
else:
    full_fight_results_df = existing_fight_results_df

# **Always Display All Fight Results Data**
display(full_fight_results_df)

# Load existing fight stats
existing_fight_stats_file = config['fight_stats_file_name']
if os.path.exists(existing_fight_stats_file):
    existing_fight_stats_df = pd.read_csv(existing_fight_stats_file)
else:
    existing_fight_stats_df = pd.DataFrame(columns=all_fight_stats_df.columns)

# Append new fight stats to the dataset (newest first)
full_fight_stats_df = pd.concat([all_fight_stats_df, existing_fight_stats_df], ignore_index=True)
full_fight_stats_df.to_csv(existing_fight_stats_file, index=False)

# **Always Display All Fight Stats Data**
display(full_fight_stats_df)


Error fetching URL: http://ufcstats.com/fight-details/2554b31a07f34e1a, attempt 1
Error fetching URL: http://ufcstats.com/fight-details/2554b31a07f34e1a, attempt 2
Error fetching URL: http://ufcstats.com/fight-details/2554b31a07f34e1a, attempt 3
Error fetching URL: http://ufcstats.com/fight-details/2554b31a07f34e1a, attempt 1
Error fetching URL: http://ufcstats.com/fight-details/2554b31a07f34e1a, attempt 2
Error fetching URL: http://ufcstats.com/fight-details/2554b31a07f34e1a, attempt 3
Error fetching URL: http://ufcstats.com/fight-details/2554b31a07f34e1a, attempt 1
Error fetching URL: http://ufcstats.com/fight-details/2554b31a07f34e1a, attempt 2
Error fetching URL: http://ufcstats.com/fight-details/2554b31a07f34e1a, attempt 3
❌ Failed to fetch event page: http://ufcstats.com/fight-details/2554b31a07f34e1a
Error fetching URL: http://ufcstats.com/fight-details/f1d3eec12662d8d7, attempt 1
Error fetching URL: http://ufcstats.com/fight-details/f1d3eec12662d8d7, attempt 2
Error fetching UR

Unnamed: 0,EVENT,BOUT,OUTCOME,WEIGHTCLASS,METHOD,ROUND,TIME,TIME FORMAT,REFEREE,DETAILS,URL
0,UFC Fight Night: Tsarukyan vs. Gamrot,Shayilan Nuerdanbieke vs. TJ Brown,W/L,Featherweight Bout,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Chris Tognoni,Sal D'amato 28 - 29.Ron McCarthy 28 - 29.Adala...,http://ufcstats.com/fight-details/21f79cd40513...
1,UFC Fight Night: Tsarukyan vs. Gamrot,Raulian Paiva vs. Sergey Morozov,L/W,Bantamweight Bout,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Jason Herzog,Chris Lee 28 - 29.Junichiro Kamijo 28 - 29.Ton...,http://ufcstats.com/fight-details/a457331c27bc...
2,UFC Fight Night: Tsarukyan vs. Gamrot,JP Buys vs. Cody Durden,L/W,Flyweight Bout,KO/TKO,1,1:08,3 Rnd (5-5-5),Mark Smith,Punch to Head At Distance,http://ufcstats.com/fight-details/8cc43d79b61e...
3,UFC Fight Night: Tsarukyan vs. Gamrot,Brian Kelleher vs. Mario Bautista,L/W,Bantamweight Bout,Submission,1,2:27,3 Rnd (5-5-5),Herb Dean,Rear Naked Choke,http://ufcstats.com/fight-details/24a9fc95a311...
4,UFC Fight Night: Tsarukyan vs. Gamrot,Vanessa Demopoulos vs. Jinh Yu Frey,W/L,Women's Strawweight Bout,Decision - Split,3,5:00,3 Rnd (5-5-5),Chris Tognoni,Ron McCarthy 28 - 29.Sal D'amato 29 - 28.Jerin...,http://ufcstats.com/fight-details/2c73bf152247...
...,...,...,...,...,...,...,...,...,...,...,...
8007,UFC - Ultimate Brazil,Tsuyoshi Kohsaka vs. Pete Williams,W/L,Heavyweight Bout,Decision - Unanimous,2,3:00,1 Rnd + OT (12-3),John McCarthy,,http://ufcstats.com/fight-details/dbdad3eb1577...
8008,UFC - Ultimate Brazil,Ebenezer Fontes Braga vs. Jeremy Horn,W/L,Middleweight Bout,Submission,1,3:27,1 Rnd + OT (12-3),John McCarthy,Guillotine Choke Standing,http://ufcstats.com/fight-details/b6de61e312db...
8009,UFC - Ultimate Brazil,Tulio Palhares vs. Adriano Santos,W/L,Middleweight Bout,KO/TKO,1,9:00,1 Rnd + OT (12-3),John McCarthy,Punches to Head From Mount,http://ufcstats.com/fight-details/35b871e0bdc1...
8010,UFC - Ultimate Japan,Kazushi Sakuraba vs. Marcus Silveira,W/L,Ultimate Japan Heavyweight Tournament Title Bout,Submission,1,3:44,1 Rnd + OT (12-3),John McCarthy,Armbar From Side Control,http://ufcstats.com/fight-details/ec1bda9a4c2a...


Unnamed: 0,EVENT,BOUT,ROUND,FIGHTER,KD,SIG.STR.,SIG.STR. %,TOTAL STR.,TD,TD %,SUB.ATT,REV.,CTRL,HEAD,BODY,LEG,DISTANCE,CLINCH,GROUND
0,UFC Fight Night: Adesanya vs. Imavov,Israel Adesanya vs. Nassourdine Imavov,Round 1,Israel Adesanya,0,20 of 44,45%,26 of 50,0 of 1,0%,0,0,0:00,8 of 27,3 of 5,9 of 12,20 of 44,0 of 0,0 of 0
1,UFC Fight Night: Adesanya vs. Imavov,Israel Adesanya vs. Nassourdine Imavov,Round 2,Israel Adesanya,0,6 of 6,100%,6 of 6,0 of 0,---,0,0,0:00,1 of 1,3 of 3,2 of 2,6 of 6,0 of 0,0 of 0
2,UFC Fight Night: Adesanya vs. Imavov,Israel Adesanya vs. Nassourdine Imavov,Round 1,Nassourdine Imavov,0,8 of 21,38%,10 of 23,0 of 3,0%,0,0,0:40,3 of 13,0 of 2,5 of 6,8 of 21,0 of 0,0 of 0
3,UFC Fight Night: Adesanya vs. Imavov,Israel Adesanya vs. Nassourdine Imavov,Round 2,Nassourdine Imavov,1,7 of 10,70%,8 of 11,0 of 0,---,0,0,0:04,6 of 8,0 of 1,1 of 1,2 of 4,0 of 0,5 of 6
4,UFC Fight Night: Adesanya vs. Imavov,Shara Magomedov vs. Michael Page,Round 1,Shara Magomedov,0,9 of 25,36%,9 of 25,0 of 0,---,0,0,0:00,1 of 10,2 of 3,6 of 12,9 of 24,0 of 1,0 of 0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132115,UFC - Ultimate Brazil,Tulio Palhares vs. Adriano Santos,Round 1,Adriano Santos,0.0,11 of 36,30%,21 of 47,2 of 3,66%,0.0,1.0,--,8 of 30,2 of 3,1 of 3,4 of 24,0 of 0,7 of 12
132116,UFC - Ultimate Japan,Kazushi Sakuraba vs. Marcus Silveira,Round 1,Kazushi Sakuraba,0.0,1 of 2,50%,2 of 3,1 of 1,100%,2.0,0.0,--,1 of 2,0 of 0,0 of 0,0 of 1,0 of 0,1 of 1
132117,UFC - Ultimate Japan,Kazushi Sakuraba vs. Marcus Silveira,Round 1,Marcus Silveira,0.0,1 of 2,50%,11 of 13,1 of 1,100%,1.0,0.0,--,1 of 2,0 of 0,0 of 0,0 of 1,1 of 1,0 of 0
132118,UFC - Ultimate Japan,Vitor Belfort vs. Joe Charles,Round 1,Vitor Belfort,0.0,0 of 0,---,0 of 0,2 of 2,100%,3.0,0.0,--,0 of 0,0 of 0,0 of 0,0 of 0,0 of 0,0 of 0


In [13]:
# Ensure "data" folder exists
os.makedirs("data", exist_ok=True)

# Define paths for saving all CSVs
#fight_events_path = os.path.join("data", config['fight_event_file_name'])
fight_details_path = os.path.join("data", config['fight_details_file_name'])
fight_results_path = os.path.join("data", config['fight_results_file_name'])
fight_stats_path = os.path.join("data", config['fight_stats_file_name'])

# Save fight details
full_fight_details_df.to_csv(fight_details_path, index=False)
print(f"📁 Saved fight details to {fight_details_path}")

# Save fight results
full_fight_results_df.to_csv(fight_results_path, index=False)
print(f"📁 Saved fight results to {fight_results_path}")

# Save fight stats
full_fight_stats_df.to_csv(fight_stats_path, index=False)
print(f"📁 Saved fight stats to {fight_stats_path}")


📁 Saved fight details to data\ufc_fight_details.csv


PermissionError: [Errno 13] Permission denied: 'data\\ufc_fight_results.csv'