In [1]:
import logging
import random
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import quote_plus, urljoin

import numpy as np
import pandas as pd
import requests
import yaml
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from tqdm.notebook import tqdm_notebook

In [2]:
# Logging setup
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Load configuration
# The rest of the notebook reads the keys "sleep_range", "retries" and "headers"
# from this mapping.
# The encoding is given explicitly so the file is decoded the same way on every
# platform instead of using the locale's default encoding.
with open("config.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

In [3]:
# Randomized sleep to mimic human behavior
def random_sleep():
    """Pause for a random duration drawn from the configured ``sleep_range``.

    ``config["sleep_range"]`` is unpacked as the positional arguments of
    ``random.uniform``.
    """
    bounds = config["sleep_range"]
    pause_seconds = random.uniform(*bounds)
    time.sleep(pause_seconds)

In [4]:
# Fetch URL with retry logic and exponential backoff
def fetch_with_retry(url, max_retries=config["retries"]):
    """GET ``url`` and return the response, retrying on failures.

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts. Defaults to ``config["retries"]``,
            read once when this function is defined.

    Returns:
        The ``requests`` response when the server answers with HTTP 200.
        ``None`` when the server answers 404 (no retry), or when every attempt
        failed with a request exception or another status code.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=config["headers"], timeout=10)
            if response.status_code == 200:
                return response
            if response.status_code == 404:
                # A missing page will not appear on retry; give up immediately.
                logging.warning(f"404 Not Found: {url}")
                return None
            # Any other status is retried; log it so the failure is visible.
            logging.warning(
                f"Retrying ({attempt + 1}/{max_retries}) for {url}: HTTP {response.status_code}"
            )
        except requests.RequestException as e:
            logging.warning(f"Retrying ({attempt + 1}/{max_retries}) for {url}: {e}")

        # Back off only when another attempt follows; sleeping after the last
        # attempt just delays the failure report.
        if attempt < max_retries - 1:
            # Randomized base delay doubled on every attempt, capped at 60 seconds.
            delay = min(60.0, random.uniform(2, 5) * (2 ** attempt))
            time.sleep(delay)

    logging.error(f"Failed to fetch {url} after {max_retries} retries")
    return None

In [5]:
### STEP 1: SCRAPE FIGHTER LISTS ###

# UFC Stats: Generate fighter list URLs
def generate_ufcstats_urls():
    """Collect UFC Stats fighter-listing URLs for every letter and page.

    For each lowercase letter, pages are requested starting at 1. A page URL is
    kept while the fetch succeeds and the body does not contain the text
    "No fighter"; the first page that fails either check ends that letter.

    Returns:
        List of listing-page URLs, in letter order and then page order.
    """
    template = "http://ufcstats.com/statistics/fighters?char={}&page={}"
    collected = []
    for letter in "abcdefghijklmnopqrstuvwxyz":
        page_number = 1
        while True:
            candidate = template.format(letter, page_number)
            reply = fetch_with_retry(candidate)
            # Stop paging on a failed fetch or on the "No fighter" marker.
            if not reply or "No fighter" in reply.text:
                break
            collected.append(candidate)
            page_number += 1
    return collected

# Find UFC Stats URL dynamically
def find_ufcstats_url(fighter_name):
    """Look up a fighter's profile link on the UFC Stats alphabetical listing.

    The listing page is requested for the initial of each word in the name,
    starting with the last word and ending with the first, and every distinct
    initial is tried at most once. On each page the text of all matching
    anchors that share one ``href`` is joined before comparison, so a name
    that is split over several links to the same profile is still found.

    NOTE(review): the listing looks to be indexed by surname initial (the run
    log for this notebook only shows fighters whose surnames start with "A"),
    which is why the last word is tried first -- confirm against the site.
    NOTE(review): only the first page of each letter is searched, as before;
    later pages of the listing are not visited.

    Args:
        fighter_name: Full name of the fighter.

    Returns:
        The ``href`` of the first matching profile link, or ``None`` when the
        name is empty / not a string or no link matches.
    """
    if not isinstance(fighter_name, str) or not fighter_name.strip():
        logging.warning(f"Could not find UFC Stats URL for {fighter_name}")
        return None

    name_tokens = fighter_name.lower().split()
    target = " ".join(name_tokens)  # lower-cased, whitespace-normalized name

    # Candidate index letters: last word first, duplicates removed.
    initials = []
    for token in reversed(name_tokens):
        first_char = token[0]
        if first_char.isalnum() and first_char not in initials:
            initials.append(first_char)

    for initial in initials:
        search_url = f"http://ufcstats.com/statistics/fighters?char={initial}"
        response = fetch_with_retry(search_url)
        if not response:
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        # Group anchor texts by target URL, keeping document order.
        text_by_href = {}
        for link in soup.find_all("a", class_="b-link b-link_style_black"):
            href = link.get("href")
            if href:
                text_by_href.setdefault(href, []).append(link.text)

        for href, texts in text_by_href.items():
            combined = " ".join(" ".join(texts).lower().split())
            if target in combined:
                return href

    logging.warning(f"Could not find UFC Stats URL for {fighter_name}")
    return None

# Scrape UFC Stats details
# NOTE: a function with this same name is defined again in a later cell
# (STEP 2); after a full run the later definition is the one in effect.
def scrape_ufcstats_details(url):
    """Read physical attributes from a UFC Stats fighter page.

    Looks for a ``table`` with class ``b-list__info-box`` and, for every row
    with exactly two cells, maps the first cell's label to an output key.

    Args:
        url: Address of the fighter page.

    Returns:
        Dict with any of HEIGHT, WEIGHT, REACH, STANCE, DOB that were found.
        Empty when the fetch fails or the table is absent. Parsing errors are
        logged and whatever was collected so far is returned.
    """
    field_for_label = {
        "Height:": "HEIGHT",
        "Weight:": "WEIGHT",
        "Reach:": "REACH",
        "Stance:": "STANCE",
        "DOB:": "DOB",
    }

    page = fetch_with_retry(url)
    if not page or page.status_code == 404:
        return {}

    parsed = BeautifulSoup(page.text, "html.parser")
    collected = {}

    try:
        info_table = parsed.find("table", class_="b-list__info-box")
        if not info_table:
            return {}

        for table_row in info_table.find_all("tr"):
            row_cells = table_row.find_all("td")
            if len(row_cells) != 2:
                continue
            heading = row_cells[0].text.strip()
            content = row_cells[1].text.strip()
            field = field_for_label.get(heading)
            if field is not None:
                collected[field] = content
    except Exception as e:
        logging.error(f"Error parsing UFC Stats: {url} - {e}")
    return collected

In [6]:
# Scrape UFC Official details
def scrape_ufc_details(fighter_name):
    """Read height, weight and reach from a fighter's ufc.com athlete page.

    The page address is built from the lower-cased name with whitespace runs
    replaced by single hyphens. Every ``div.c-bio__text`` is paired with the
    nearest preceding ``div.c-bio__label``; a text block with no preceding
    label is skipped instead of aborting the whole parse, and blank values are
    not recorded.

    Args:
        fighter_name: Full name of the fighter.

    Returns:
        Dict with any of HEIGHT, WEIGHT, REACH that were found. Empty when the
        fetch fails. Unexpected parsing errors are logged and whatever was
        collected so far is returned.
    """
    slug = "-".join(fighter_name.lower().split())
    base_url = f"https://www.ufc.com/athlete/{slug}"
    response = fetch_with_retry(base_url)
    if not response or response.status_code == 404:
        return {}

    soup = BeautifulSoup(response.text, "html.parser")
    key_for_label = {"Height": "HEIGHT", "Weight": "WEIGHT", "Reach": "REACH"}
    details = {}
    try:
        for section in soup.find_all("div", class_="c-bio__text"):
            label_tag = section.find_previous("div", class_="c-bio__label")
            if label_tag is None:
                # No label precedes this text block; nothing to map it to.
                continue
            key = key_for_label.get(label_tag.text.strip())
            value = section.text.strip()
            if key and value:
                details[key] = value
    except Exception as e:
        logging.warning(f"Error parsing UFC Official page: {base_url} - {e}")
    return details

In [7]:
# MMA Decisions: Generate fighter list URLs
def generate_mmadecisions_urls():
    """Return one MMA Decisions fighter-index URL per lowercase letter a-z."""
    template = "https://mmadecisions.com/decisions-by-fighter/{}"
    index_urls = []
    for letter in "abcdefghijklmnopqrstuvwxyz":
        index_urls.append(template.format(letter))
    return index_urls

# Scrape MMA Decisions details
def scrape_mmadecisions_details(fighter_name):
    """Query the MMA Decisions search page for a fighter (extraction is a stub).

    The name is URL-encoded with ``quote_plus`` so characters such as ``&``,
    ``#`` or ``'`` cannot corrupt the query string; spaces still become ``+``.

    NOTE: a function with this same name is defined again in a later cell.

    Args:
        fighter_name: Full name of the fighter.

    Returns:
        An empty dict. No fields are extracted from the response yet.
    """
    base_url = f"https://mmadecisions.com/search.php?search_query={quote_plus(fighter_name)}"
    response = fetch_with_retry(base_url)
    if not response:
        return {}

    # TODO: parse response.text and fill in HEIGHT / WEIGHT / REACH once the
    # page structure is known; until then nothing is extracted.
    details = {}
    return details

In [8]:
# MMA Decisions: Scrape fighter names and URLs
def scrape_mmadecisions_fighter_list(url):
    """Extract fighter names and profile links from an MMA Decisions index page.

    Every anchor whose ``href`` contains ``/fighter/`` is considered. Links are
    resolved against ``url`` with ``urljoin`` so root-relative, relative and
    absolute hrefs all yield a valid address. Anchors without visible text and
    repeated links to the same profile are skipped.

    Args:
        url: Address of the index page to scrape.

    Returns:
        List of ``{"FIGHTER": name, "MMADECISIONS_URL": url}`` dicts in page
        order; empty when the fetch fails.
    """
    response = fetch_with_retry(url)
    if not response:
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    fighters = []
    seen_urls = set()
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"].strip()
        if "/fighter/" not in href:
            continue

        # Collapse internal whitespace/newlines in the link text.
        name = " ".join(anchor.text.split())
        if not name:
            continue

        fighter_url = urljoin(url, href)
        if fighter_url in seen_urls:
            continue
        seen_urls.add(fighter_url)
        fighters.append({"FIGHTER": name, "MMADECISIONS_URL": fighter_url})
    return fighters

In [9]:
# Combine all fighter lists
def scrape_all_fighter_lists():
    """Gather fighter records from all three sources and de-duplicate by name.

    NOTE(review): ``scrape_ufcstats_fighter_list`` and ``scrape_ufc_fighter_list``
    are not defined in the visible part of this notebook -- confirm they exist
    before calling this function.

    Returns:
        DataFrame with one row per distinct ``FIGHTER`` value (first occurrence
        kept). When no source yields any record, an empty DataFrame with a
        ``FIGHTER`` column is returned.
    """
    logging.info("Scraping fighter lists from UFC Stats, UFC Official, and MMA Decisions...")
    fighter_list = []

    # UFC Stats
    for url in generate_ufcstats_urls():
        fighter_list.extend(scrape_ufcstats_fighter_list(url))

    # UFC Official
    fighter_list.extend(scrape_ufc_fighter_list())

    # MMA Decisions
    for url in generate_mmadecisions_urls():
        fighter_list.extend(scrape_mmadecisions_fighter_list(url))

    # An empty record list produces a frame without a FIGHTER column, and
    # drop_duplicates(subset=["FIGHTER"]) would then raise KeyError.
    if not fighter_list:
        logging.warning("No fighters were scraped from any source")
        return pd.DataFrame(columns=["FIGHTER"])

    # Deduplicate by fighter name
    fighter_df = pd.DataFrame(fighter_list).drop_duplicates(subset=["FIGHTER"]).reset_index(drop=True)
    return fighter_df

In [10]:
### STEP 2: SCRAPE FIGHTER DETAILS ###

# UFC Stats: Scrape fighter details
# NOTE: this redefines scrape_ufcstats_details from an earlier cell; once this
# cell has run, this definition is the one in effect.
def scrape_ufcstats_details(url):
    """Pull the labelled attributes out of a UFC Stats fighter page.

    Only rows of the ``b-list__info-box`` table that have exactly two cells
    are inspected; the first cell is the label and the second the value.

    Args:
        url: Fighter page address.

    Returns:
        Dict containing whichever of HEIGHT, WEIGHT, REACH, STANCE and DOB
        were present. ``{}`` on fetch failure or when the table is missing.
        A parsing exception is logged and the partial result is returned.
    """
    label_key_pairs = (
        ("Height:", "HEIGHT"),
        ("Weight:", "WEIGHT"),
        ("Reach:", "REACH"),
        ("Stance:", "STANCE"),
        ("DOB:", "DOB"),
    )

    reply = fetch_with_retry(url)
    if not reply or reply.status_code == 404:
        return {}

    document = BeautifulSoup(reply.text, "html.parser")
    result = {}
    try:
        info_box = document.find("table", class_="b-list__info-box")
        if not info_box:
            return {}

        for tr in info_box.find_all("tr"):
            tds = tr.find_all("td")
            if len(tds) == 2:
                label_text = tds[0].text.strip()
                value_text = tds[1].text.strip()
                for expected_label, key in label_key_pairs:
                    if label_text == expected_label:
                        result[key] = value_text
                        break
    except Exception as e:
        logging.error(f"Error parsing UFC Stats: {url} - {e}")
    return result

In [11]:
# UFC Official: Scrape fighter details
# NOTE: this redefines scrape_ufc_details from an earlier cell; once this cell
# has run, this definition is the one in effect.
def scrape_ufc_details(fighter_name):
    """Read height, weight and reach from a fighter's ufc.com athlete page.

    The page address is built from the lower-cased name with whitespace runs
    replaced by single hyphens. Every ``div.c-bio__text`` is paired with the
    nearest preceding ``div.c-bio__label``; a text block with no preceding
    label is skipped instead of aborting the whole parse, and blank values are
    not recorded.

    Args:
        fighter_name: Full name of the fighter.

    Returns:
        Dict with any of HEIGHT, WEIGHT, REACH that were found. Empty when the
        fetch fails. Unexpected parsing errors are logged and whatever was
        collected so far is returned.
    """
    slug = "-".join(fighter_name.lower().split())
    base_url = f"https://www.ufc.com/athlete/{slug}"
    response = fetch_with_retry(base_url)
    if not response or response.status_code == 404:
        return {}

    soup = BeautifulSoup(response.text, "html.parser")
    key_for_label = {"Height": "HEIGHT", "Weight": "WEIGHT", "Reach": "REACH"}
    details = {}
    try:
        for section in soup.find_all("div", class_="c-bio__text"):
            label_tag = section.find_previous("div", class_="c-bio__label")
            if label_tag is None:
                # No label precedes this text block; nothing to map it to.
                continue
            key = key_for_label.get(label_tag.text.strip())
            value = section.text.strip()
            if key and value:
                details[key] = value
    except Exception as e:
        logging.warning(f"Error parsing UFC Official page: {base_url} - {e}")
    return details

In [12]:
# MMA Decisions: Scrape fighter details
# NOTE: this redefines scrape_mmadecisions_details from an earlier cell; once
# this cell has run, this definition is the one in effect.
def scrape_mmadecisions_details(fighter_name):
    """Query the MMA Decisions search page for a fighter (extraction is a stub).

    The name is URL-encoded with ``quote_plus`` so characters such as ``&``,
    ``#`` or ``'`` cannot corrupt the query string; spaces still become ``+``.

    Args:
        fighter_name: Full name of the fighter.

    Returns:
        An empty dict. No fields are extracted from the response yet.
    """
    base_url = f"https://mmadecisions.com/search.php?search_query={quote_plus(fighter_name)}"
    response = fetch_with_retry(base_url)
    if not response:
        return {}

    # TODO: parse response.text and fill in HEIGHT / WEIGHT / REACH once the
    # page structure is known; until then nothing is extracted.
    details = {}
    return details

In [13]:
# Helper: decide whether a scraped value counts as "not available"
def _is_missing(value):
    """Return True for None/NaN and for empty or whitespace-only strings."""
    if isinstance(value, str):
        return not value.strip()
    return bool(pd.isna(value))


# Helper: copy fallback values into slots that are still empty
def _fill_missing(details, fallback):
    """Copy each usable ``fallback`` value into ``details`` where it is absent or missing."""
    for key, value in fallback.items():
        if _is_missing(details.get(key)) and not _is_missing(value):
            details[key] = value


# Aggregate fighter details from all sources
def aggregate_fighter_details(fighter_name):
    """Build one record for a fighter, consulting sources in priority order.

    UFC Stats is tried first. UFC Official and then MMA Decisions are consulted
    only while HEIGHT, WEIGHT or REACH is still missing, and they only fill
    gaps: a value already present is never overwritten. Keys that are absent
    from the record count as missing, so a fallback source can supply them.

    Args:
        fighter_name: Full name of the fighter.

    Returns:
        Dict with ``FIGHTER`` plus whatever attributes were found; ``UFC_URL``
        is included when a UFC Stats profile link was located.
    """
    required_keys = ("HEIGHT", "WEIGHT", "REACH")
    details = {"FIGHTER": fighter_name}

    def needs_more():
        return any(_is_missing(details.get(key)) for key in required_keys)

    # Step 1: Try UFC Stats
    ufcstats_url = find_ufcstats_url(fighter_name)
    if ufcstats_url:
        details.update(scrape_ufcstats_details(ufcstats_url))
        details["UFC_URL"] = ufcstats_url

    # Step 2: Try UFC Official if missing data
    if needs_more():
        _fill_missing(details, scrape_ufc_details(fighter_name))

    # Step 3: Try MMA Decisions if still missing data
    if needs_more():
        _fill_missing(details, scrape_mmadecisions_details(fighter_name))

    return details

In [14]:
# Scrape fighter details in parallel
def process_fighters_parallel(fighter_names, max_workers=10):
    """Aggregate details for many fighters concurrently on a thread pool.

    A failure while processing one fighter is logged with its traceback and
    replaced by a record holding only the name, so a single bad entry does not
    discard the results of the whole batch.

    Args:
        fighter_names: Iterable of fighter names.
        max_workers: Size of the thread pool (default 10, as before).

    Returns:
        List of detail dicts in the same order as ``fighter_names``.
    """
    fighter_names = list(fighter_names)
    if not fighter_names:
        return []

    def _aggregate_safely(name):
        try:
            return aggregate_fighter_details(name)
        except Exception:
            logging.exception(f"Failed to aggregate details for {name}")
            return {"FIGHTER": name}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in input order.
        results = list(executor.map(_aggregate_safely, fighter_names))
    return results

In [15]:
### MAIN EXECUTION ###
if __name__ == "__main__":
    # Load fighter names from CSV.
    # Missing cells are dropped and names are stripped before use: the scrapers
    # call string methods on each name, so a NaN or blank entry would fail.
    name_column = pd.read_csv("data/fighter_names.csv")["Name"]
    fighter_names = [
        name for name in name_column.dropna().astype(str).str.strip() if name
    ]

    logging.info(f"Starting scraping for {len(fighter_names)} fighters...")
    fighter_details = process_fighters_parallel(fighter_names)
    fighter_details_df = pd.DataFrame(fighter_details)
    fighter_details_df.to_csv("data/fighter_details.csv", index=False)
    logging.info("Fighter details saved to data/fighter_details.csv.")

2025-01-27 15:27:59,005 - INFO - Starting scraping for 347 fighters...
2025-01-27 15:28:11,870 - ERROR - Failed to fetch https://mmadecisions.com/search.php?search_query=Sam+Adkins after 3 retries
2025-01-27 15:28:12,418 - ERROR - Failed to fetch https://mmadecisions.com/search.php?search_query=Ricardo+Abreu after 3 retries
2025-01-27 15:28:12,508 - ERROR - Failed to fetch https://mmadecisions.com/search.php?search_query=Razak+Al-Hassan after 3 retries
2025-01-27 15:28:12,518 - ERROR - Failed to fetch https://mmadecisions.com/search.php?search_query=Nick+Agallar after 3 retries
2025-01-27 15:28:12,947 - ERROR - Failed to fetch https://mmadecisions.com/search.php?search_query=Wes+Albritton after 3 retries
2025-01-27 15:28:13,293 - ERROR - Failed to fetch https://mmadecisions.com/search.php?search_query=Marcelo+Aguiar after 3 retries
2025-01-27 15:28:14,674 - ERROR - Failed to fetch https://mmadecisions.com/search.php?search_query=Scott+Adams after 3 retries
2025-01-27 15:28:15,002 - ERR