In [6]:
import logging
import random
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
import yaml
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from tqdm.notebook import tqdm_notebook

In [7]:
# Logging setup: timestamped, level-tagged messages at INFO and above.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Load configuration. The functions below read config["retries"],
# config["headers"] and config["sleep_range"].
try:
    with open("config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    raise FileNotFoundError("The 'config.yaml' file is missing. Please ensure it exists in the directory.")

# yaml.safe_load returns None for an empty document and a list/scalar for a
# non-mapping one. Fail here with a clear message instead of letting the first
# config[...] lookup raise an unexplained TypeError.
if not isinstance(config, dict):
    raise TypeError(
        f"'config.yaml' must contain a top-level mapping, got {type(config).__name__}."
    )

# Warn (rather than fail) about absent keys: a key is only needed if the
# function that reads it is actually called.
_missing_config_keys = [key for key in ("retries", "headers", "sleep_range") if key not in config]
if _missing_config_keys:
    logging.warning(f"config.yaml is missing expected keys: {_missing_config_keys}")


In [8]:
# Randomized sleep to mimic human behavior
def random_sleep():
    """Pause for a random duration drawn from ``config["sleep_range"]``.

    The configured sequence is unpacked as the arguments of
    ``random.uniform`` and the resulting number of seconds is slept.
    """
    bounds = config["sleep_range"]
    pause_seconds = random.uniform(*bounds)
    time.sleep(pause_seconds)

In [9]:
# Fetch URL with retry logic and exponential backoff
def fetch_with_retry(url, max_retries=None):
    """GET ``url``, retrying failed attempts with jittered exponential backoff.

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts. Defaults to
            ``config["retries"]``, looked up at call time so that re-running
            the configuration cell takes effect without redefining this
            function.

    Returns:
        The response object when the server answers HTTP 200; ``None`` on a
        404 or once every attempt has failed.
    """
    if max_retries is None:
        max_retries = config["retries"]

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=config["headers"], timeout=10)
            if response.status_code == 200:
                return response
            if response.status_code == 404:
                # A missing page will not appear on retry; give up immediately.
                logging.warning(f"404 Not Found: {url}")
                return None
            # Any other status is retried, but no longer silently.
            logging.warning(
                f"Retrying ({attempt + 1}/{max_retries}) for {url}: HTTP {response.status_code}"
            )
        except requests.RequestException as e:
            logging.warning(f"Retrying ({attempt + 1}/{max_retries}) for {url}: {e}")

        # Back off only when another attempt follows. The randomized 2-5 s
        # base delay doubles with every failed attempt.
        if attempt < max_retries - 1:
            time.sleep(random.uniform(2, 5) * 2 ** attempt)

    logging.error(f"Failed to fetch {url} after {max_retries} retries")
    return None

In [10]:
### STEP 1: SCRAPE FIGHTER LISTS ###

# UFC Stats: Generate fighter list URLs
def generate_ufcstats_urls():
    """Discover every populated UFC Stats fighter-listing page.

    For each letter a-z, pages are requested in ascending order starting at 1.
    A page is kept while it can be fetched and its body does not contain the
    text "No fighter"; the first page failing either test ends that letter.

    Returns:
        The list of listing-page URLs that passed the check, in crawl order.
    """
    template = "http://ufcstats.com/statistics/fighters?char={}&page={}"
    found = []
    for letter in "abcdefghijklmnopqrstuvwxyz":
        page_number = 1
        while True:
            candidate = template.format(letter, page_number)
            listing = fetch_with_retry(candidate)
            if not listing or "No fighter" in listing.text:
                break
            found.append(candidate)
            page_number += 1
    return found

# UFC Stats: Scrape fighter names and URLs
def scrape_ufcstats_fighter_list(url):
    """Scrape fighter names and profile URLs from one UFC Stats listing page.

    Anchors that point at the same profile URL are merged into a single
    record, so each fighter appears once per page.

    Args:
        url: Listing-page URL to fetch.

    Returns:
        A list of ``{"FIGHTER": name, "UFC_URL": href}`` dicts, one per
        distinct profile URL in page order; ``[]`` when the page could not be
        fetched.
    """
    response = fetch_with_retry(url)
    if not response:
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    # href=True skips anchors without a target, which would otherwise raise
    # KeyError on link["href"] below.
    fighter_links = soup.find_all("a", class_="b-link b-link_style_black", href=True)

    # NOTE(review): listing rows appear to link first name, last name and
    # nickname separately to the same profile URL - confirm against the live
    # page. Group the link texts by URL so a fighter is not emitted as several
    # records each carrying only a fragment of the name.
    texts_by_url = {}
    for link in fighter_links:
        texts_by_url.setdefault(link["href"], []).append(link.text.strip())

    fighters = []
    for href, texts in texts_by_url.items():
        # The first two link texts form the name; later ones (presumably the
        # nickname) are left out. A single link per profile is kept as-is.
        name = " ".join(part for part in texts[:2] if part)
        fighters.append({"FIGHTER": name, "UFC_URL": href})
    return fighters


In [11]:
# UFC Official: Generate fighter list
def scrape_ufc_fighter_list():
    """Scrape fighter names and profile URLs from the UFC athletes index page.

    Returns:
        A list of ``{"FIGHTER": name, "UFC_URL": absolute_url}`` dicts, one
        per athlete link found; ``[]`` when the page could not be fetched.
    """
    base_url = "https://www.ufc.com/athletes/all"
    response = fetch_with_retry(base_url)
    if not response:
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    # href=True skips anchors without a target instead of raising KeyError.
    fighter_links = soup.find_all("a", class_="c-card-athlete__link", href=True)
    # urljoin resolves root-relative hrefs against the page URL and leaves
    # already-absolute hrefs untouched; plain concatenation corrupted the
    # latter by prefixing the host a second time.
    return [
        {"FIGHTER": link.text.strip(), "UFC_URL": urljoin(base_url, link["href"])}
        for link in fighter_links
    ]

In [12]:
# MMA Decisions: Generate fighter list URLs
def generate_mmadecisions_urls():
    """Build the MMA Decisions per-letter fighter index URLs.

    Returns:
        A list of 26 URLs, one for each letter a-z in alphabetical order.
    """
    template = "https://mmadecisions.com/decisions-by-fighter/{}"
    letters = "abcdefghijklmnopqrstuvwxyz"
    return list(map(template.format, letters))

In [14]:
# MMA Decisions: Scrape fighter names and URLs
def scrape_mmadecisions_fighter_list(url):
    """Collect fighter names and profile URLs from one MMA Decisions index page.

    Every anchor carrying an ``href`` that contains "/fighter/" is treated as
    a fighter link; all other anchors are ignored.

    Args:
        url: Index-page URL to fetch.

    Returns:
        A list of ``{"FIGHTER": name, "MMADECISIONS_URL": url}`` dicts in page
        order; ``[]`` when the page could not be fetched.
    """
    page = fetch_with_retry(url)
    if not page:
        return []

    anchors = BeautifulSoup(page.text, "html.parser").find_all("a", href=True)
    return [
        {
            "FIGHTER": anchor.text.strip(),
            "MMADECISIONS_URL": f"https://mmadecisions.com{anchor['href']}",
        }
        for anchor in anchors
        if "/fighter/" in anchor["href"]
    ]

In [15]:
# Combine all fighter lists
def scrape_all_fighter_lists():
    """Scrape all three sources and merge them into one row per fighter name.

    Records are gathered in the order UFC Stats, UFC Official, MMA Decisions.
    Rows sharing a ``FIGHTER`` name are collapsed into one, taking for each
    column the first non-missing value in that order. A fighter found on
    several sites therefore keeps both ``UFC_URL`` and ``MMADECISIONS_URL``
    rather than only the columns of whichever source listed them first.

    Returns:
        A DataFrame with a fresh 0..n-1 index, one row per distinct name in
        order of first appearance. It is empty when nothing was scraped.
    """
    logging.info("Scraping fighter lists from UFC Stats, UFC Official, and MMA Decisions...")
    fighter_list = []

    # UFC Stats
    for url in generate_ufcstats_urls():
        fighter_list.extend(scrape_ufcstats_fighter_list(url))

    # UFC Official
    fighter_list.extend(scrape_ufc_fighter_list())

    # MMA Decisions
    for url in generate_mmadecisions_urls():
        fighter_list.extend(scrape_mmadecisions_fighter_list(url))

    fighter_df = pd.DataFrame(fighter_list)
    if fighter_df.empty:
        # Nothing scraped: there is no FIGHTER column to group on.
        return fighter_df

    # Deduplicate by fighter name. GroupBy.first() skips missing values, so
    # URLs contributed by later sources are merged in rather than dropped.
    return fighter_df.groupby("FIGHTER", sort=False, as_index=False).first()

In [16]:
### STEP 2: SCRAPE FIGHTER DETAILS ###

# UFC Stats: Scrape fighter details
def _ufcstats_table_pairs(stats_table):
    """Return (label, value) text pairs from the two-cell rows of a table."""
    pairs = []
    for row in stats_table.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) == 2:
            pairs.append((cells[0].text.strip(), cells[1].text.strip()))
    return pairs


def _ufcstats_list_pairs(info_box):
    """Return (label, value) text pairs from list items titled by an <i> tag."""
    pairs = []
    for item in info_box.find_all("li"):
        title = item.find("i")
        if title is None:
            continue
        # The value is whatever text remains in the item once its title is removed.
        value = item.text.replace(title.text, "", 1).strip()
        pairs.append((title.text.strip(), value))
    return pairs


def scrape_ufcstats_details(url):
    """Scrape physical attributes from a UFC Stats fighter page.

    Args:
        url: Fighter profile URL.

    Returns:
        A dict holding whichever of ``HEIGHT``, ``WEIGHT``, ``REACH``,
        ``STANCE`` and ``DOB`` were found. ``{}`` when the page could not be
        fetched or has no info box. Parsing errors are logged and the fields
        collected so far are returned.
    """
    # fetch_with_retry returns None for 404s and exhausted retries, so a
    # separate status-code check is unnecessary.
    response = fetch_with_retry(url)
    if not response:
        return {}

    field_by_label = {
        "Height:": "HEIGHT",
        "Weight:": "WEIGHT",
        "Reach:": "REACH",
        "Stance:": "STANCE",
        "DOB:": "DOB",
    }

    soup = BeautifulSoup(response.text, "html.parser")
    details = {}
    try:
        stats_table = soup.find("table", class_="b-list__info-box")
        if stats_table:
            pairs = _ufcstats_table_pairs(stats_table)
        else:
            # NOTE(review): the live page appears to render these stats as
            # <li> items inside a non-table "b-list__info-box" element -
            # confirm. This fallback keeps the table path unchanged and
            # additionally covers that layout.
            info_box = soup.find(class_="b-list__info-box")
            if not info_box:
                return {}
            pairs = _ufcstats_list_pairs(info_box)

        for label, value in pairs:
            field = field_by_label.get(label)
            if field:
                details[field] = value
    except Exception as e:
        logging.error(f"Error parsing UFC Stats: {url} - {e}")
    return details

In [17]:
# UFC Official: Scrape fighter details
def scrape_ufc_details(fighter_name):
    """Scrape height, weight and reach from a fighter's UFC athlete page.

    The page URL is derived from the name by replacing spaces with hyphens
    and lower-casing it.

    Args:
        fighter_name: Fighter's display name.

    Returns:
        A dict holding whichever of ``HEIGHT``, ``WEIGHT`` and ``REACH`` were
        found. ``{}`` when the page could not be fetched. Parsing errors are
        logged and the fields collected so far are returned.
    """
    base_url = f"https://www.ufc.com/athlete/{fighter_name.replace(' ', '-').lower()}"
    # fetch_with_retry returns None for 404s and exhausted retries, so a
    # separate status-code check is unnecessary.
    response = fetch_with_retry(base_url)
    if not response:
        return {}

    field_by_label = {"Height": "HEIGHT", "Weight": "WEIGHT", "Reach": "REACH"}

    soup = BeautifulSoup(response.text, "html.parser")
    details = {}
    try:
        for section in soup.find_all("div", class_="c-bio__text"):
            label_tag = section.find_previous("div", class_="c-bio__label")
            if label_tag is None:
                # A value with no preceding label is skipped on its own;
                # previously the resulting AttributeError aborted parsing of
                # every remaining section.
                continue
            field = field_by_label.get(label_tag.text.strip())
            if field:
                details[field] = section.text.strip()
    except Exception as e:
        logging.warning(f"Error parsing UFC Official page: {base_url} - {e}")
    return details

In [19]:
# MMA Decisions: Scrape fighter details
def scrape_mmadecisions_details(fighter_name):
    """Query the MMA Decisions search page for a fighter (placeholder).

    The search URL is built by replacing spaces in the name with "+". The
    response is parsed, but no fields are extracted from it yet, so the
    result is an empty dict in every case.

    Args:
        fighter_name: Fighter's display name.

    Returns:
        ``{}``.
    """
    query = fighter_name.replace(' ', '+')
    base_url = f"https://mmadecisions.com/search.php?search_query={query}"
    page = fetch_with_retry(base_url)
    if page:
        # Parsed for future use; nothing is read from it at present.
        soup = BeautifulSoup(page.text, "html.parser")
    return {}

In [20]:
# Aggregate fighter details from all sources
def aggregate_fighter_details(fighter):
    """Combine the details scraped for one fighter from every source.

    Args:
        fighter: Mapping with a ``FIGHTER`` name and, optionally, a
            ``UFC_URL``.

    Returns:
        A dict starting with ``FIGHTER``, updated first with the UFC Stats
        fields and then with the UFC Official fields (which therefore win on
        overlapping keys).
    """
    details = {"FIGHTER": fighter["FIGHTER"]}

    # Records built from DataFrame rows always carry a UFC_URL key; fighters
    # that came from another source hold NaN there. Only a real, non-empty
    # string is worth requesting - anything else would just burn a full retry
    # cycle on an invalid URL.
    ufc_url = fighter.get("UFC_URL")
    if isinstance(ufc_url, str) and ufc_url:
        details.update(scrape_ufcstats_details(ufc_url))

    details.update(scrape_ufc_details(fighter["FIGHTER"]))
    return details

In [21]:
# Parallel processing of fighter details
def process_fighters_parallel(fighter_list):
    """Scrape details for every fighter using a pool of ten worker threads.

    Args:
        fighter_list: DataFrame of fighters; each row is handed to
            ``aggregate_fighter_details`` as a dict.

    Returns:
        A list with one details dict per row, in row order.
    """
    with ThreadPoolExecutor(max_workers=10) as pool:
        records = fighter_list.to_dict("records")
        detail_iter = pool.map(aggregate_fighter_details, records)
        return list(detail_iter)

In [None]:
### MAIN EXECUTION ###
if __name__ == "__main__":
    # Step 1: build the de-duplicated fighter list from all sources and
    # persist it before the slower per-fighter scraping starts.
    fighter_list_df = scrape_all_fighter_lists()
    fighter_list_df.to_csv("fighter_list.csv", index=False)
    logging.info(f"Fighter list saved with {len(fighter_list_df)} fighters.")

    # Step 2: scrape the details of every listed fighter in parallel and
    # write the combined records to a second CSV.
    fighter_details = process_fighters_parallel(fighter_list_df)
    fighter_details_df = pd.DataFrame(fighter_details)
    fighter_details_df.to_csv("fighter_details.csv", index=False)
    logging.info("Fighter details saved.")

2025-01-27 15:16:55,442 - INFO - Scraping fighter lists from UFC Stats, UFC Official, and MMA Decisions...
