In [1]:
pip install fuzzywuzzy[speedup]

Note: you may need to restart the kernel to use updated packages.


In [2]:
import csv
import io
import logging
import random
import time
from urllib.parse import quote, quote_plus, urljoin

import pandas as pd
import requests
import yaml
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from tqdm.notebook import tqdm_notebook

In [3]:
# Root logger: INFO and above, each record prefixed with a timestamp and its level
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Scraper settings shared by the helpers below:
#   headers     - HTTP headers sent with every request
#   retries     - number of attempts per URL
#   sleep_range - [low, high] bounds in seconds for the randomized pause
config = dict(
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0)"},
    retries=3,
    sleep_range=[1, 3],
)

In [4]:
# Turn a fighter name into a lowercase, hyphen-separated slug
def normalize_name(fighter_name):
    """Return ``fighter_name`` lowercased with every space character replaced by a hyphen."""
    lowered = fighter_name.lower()
    # Splitting on a single literal space and re-joining with "-" swaps each space one-for-one
    return "-".join(lowered.split(" "))

In [5]:
# Pause for a random interval so requests are not evenly spaced
def random_sleep():
    """Block for a duration drawn uniformly from the bounds in ``config["sleep_range"]`` (seconds)."""
    pause_seconds = random.uniform(*config["sleep_range"])
    time.sleep(pause_seconds)

In [6]:
# Fetch URL with retry logic and exponential backoff
def fetch_with_retry(url, max_retries=config["retries"]):
    """GET ``url``, retrying failed attempts with exponential backoff.

    An attempt counts as failed when ``requests`` raises a ``RequestException`` or the server
    answers with any status other than 200. Both kinds of failure are logged as warnings.

    Args:
        url: Address to request.
        max_retries: Total number of attempts. The default is read from ``config["retries"]``
            once, when this function is defined.

    Returns:
        The ``requests.Response`` of the first attempt that returned HTTP 200, or ``None`` when
        every attempt failed (an error is logged in that case).
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=config["headers"], timeout=10)
            if response.status_code == 200:
                return response
            # Report non-200 answers as well; otherwise a run of 4xx/5xx responses would only
            # show up as the final "Failed to fetch" error with no hint of the cause.
            logging.warning(
                f"Retrying ({attempt + 1}/{max_retries}) for {url}: HTTP {response.status_code}"
            )
        except requests.RequestException as e:
            logging.warning(f"Retrying ({attempt + 1}/{max_retries}) for {url}: {e}")
        # Back off 1s, 2s, 4s, ... between attempts; there is nothing to wait for after the last one
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)
    logging.error(f"Failed to fetch {url} after {max_retries} retries")
    return None

In [7]:
# Height pass-through used by the scrapers
def normalize_height(height):
    """Return ``height`` after the feet/inches quote handling below.

    Falsy values (``None``, ``""``) and values without an apostrophe are returned as they are.
    For values that contain an apostrophe, the double-quote character is replaced by itself, so
    the returned text equals the input text.
    """
    if not height or "'" not in height:
        return height
    # NOTE(review): '"' and "\"" are the same one-character string, so this replacement
    # leaves the text unchanged — confirm whether real escaping was intended.
    return height.replace('"', '"')

In [8]:
# Function to find UFC Stats URL for a fighter
def find_ufcstats_url(fighter_name):
    """Find a fighter's UFC Stats profile URL by fuzzy-matching the site's fighter directory.

    The directory is requested one initial letter at a time (the ``char`` query parameter).
    The letter is derived from the name instead of always being ``a``: every token after the
    first is treated as a possible surname start, so "Jon Jones" queries ``char=j`` and
    "Rafael Dos Anjos" queries ``char=d`` and then ``char=a``. A single-token name uses its own
    initial, and a name that yields no a-z initial falls back to ``char=a``.

    Args:
        fighter_name: Fighter's full name as free text.

    Returns:
        The ``href`` of the best-scoring directory entry whose ``fuzz.partial_ratio`` score is
        above 80, or ``None`` when no directory page could be fetched or nothing scored high
        enough.
    """
    target = fighter_name.lower()
    tokens = target.split()
    surname_tokens = tokens[1:] or tokens
    # NOTE(review): presumably the directory is indexed by surname initial — confirm on the site.
    # dict.fromkeys removes duplicate letters while keeping their order.
    letters = list(dict.fromkeys(t[0] for t in surname_tokens if "a" <= t[0] <= "z")) or ["a"]

    best_match = None
    best_score = 0
    fetched_any = False

    for letter in letters:
        directory_url = f"http://ufcstats.com/statistics/fighters?char={letter}&page=all"
        response = fetch_with_retry(directory_url)
        if not response:
            logging.warning(f"Failed to fetch UFC Stats directory for {fighter_name}")
            continue
        fetched_any = True

        soup = BeautifulSoup(response.text, "html.parser")

        # Collect the text of every link per profile URL before scoring.
        # NOTE(review): the directory appears to render first name, last name and nickname as
        # separate links sharing one href — confirm against the live page. If each fighter has
        # a single link instead, this grouping scores exactly that link's text.
        names_by_href = {}
        for link in soup.find_all("a", class_="b-link b-link_style_black"):
            href = link.get("href")  # skip anchors without a target instead of raising KeyError
            if href:
                names_by_href.setdefault(href, []).append(link.text.strip())

        for href, parts in names_by_href.items():
            candidate = " ".join(part for part in parts if part).lower()
            match_score = fuzz.partial_ratio(target, candidate)
            if match_score > best_score and match_score > 80:  # Threshold for matching
                best_match = href
                best_score = match_score

        # A perfect score cannot be beaten (comparison is strict), so skip remaining pages
        if best_score == 100:
            break

    if best_match:
        logging.info(f"Found UFC Stats URL for {fighter_name}: {best_match}")
        return best_match

    if fetched_any:
        logging.warning(f"Could not find UFC Stats URL for {fighter_name}")
    return None


In [9]:
# Function to find Tapology URL for a fighter
def find_tapology_url(fighter_name):
    """Search Tapology for a fighter and return the URL of the best fuzzy match.

    Args:
        fighter_name: Fighter's full name as free text.

    Returns:
        Absolute URL of the search result whose ``<h4>`` text scores above 80 with
        ``fuzz.partial_ratio`` (highest score wins, first result wins ties), or ``None`` when
        the search page could not be fetched or no result scored high enough.
    """
    # quote() percent-encodes every character that is unsafe in a path segment. Spaces still
    # become %20; apostrophes, accents, "/" and similar no longer leak into the URL raw.
    search_url = f"https://www.tapology.com/search/mma-fighters/{quote(fighter_name, safe='')}"
    response = fetch_with_retry(search_url)
    if not response:
        logging.warning(f"Failed to fetch Tapology search results for {fighter_name}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    target = fighter_name.lower()

    best_match = None
    best_score = 0

    for result in soup.find_all("a", class_="search-result-link"):
        href = result.get("href")
        if not href:
            continue  # nothing to link to; avoids a KeyError on anchors without href
        heading = result.find("h4")
        name = heading.text.strip() if heading else ""
        match_score = fuzz.partial_ratio(target, name.lower())
        if match_score > best_score and match_score > 80:  # Threshold for matching
            # urljoin handles site-relative and already-absolute hrefs alike
            best_match = urljoin("https://www.tapology.com", href)
            best_score = match_score

    if best_match:
        logging.info(f"Found Tapology URL for {fighter_name}: {best_match}")
        return best_match

    logging.warning(f"Could not find Tapology URL for {fighter_name}")
    return None

In [10]:
# Function to find MMA Decisions URL for a fighter
def find_mmadecisions_url(fighter_name):
    """Search MMA Decisions for a fighter and return the URL of the best fuzzy match.

    Only links whose ``href`` contains ``/fighter/`` are considered.

    Args:
        fighter_name: Fighter's full name as free text.

    Returns:
        Absolute URL of the fighter link whose text scores above 80 with
        ``fuzz.partial_ratio`` (highest score wins, first link wins ties), or ``None`` when the
        search page could not be fetched or no link scored high enough.
    """
    # quote_plus() encodes the name for a query string: spaces still become "+", while
    # characters such as "&", "#" or accents are escaped instead of corrupting the query.
    search_url = f"https://mmadecisions.com/search.php?search_query={quote_plus(fighter_name)}"
    response = fetch_with_retry(search_url)
    if not response:
        logging.warning(f"Failed to fetch MMA Decisions search results for {fighter_name}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    target = fighter_name.lower()

    best_match = None
    best_score = 0

    for result in soup.find_all("a", href=True):
        href = result["href"]
        if "/fighter/" not in href:
            continue
        match_score = fuzz.partial_ratio(target, result.text.strip().lower())
        if match_score > best_score and match_score > 80:  # Threshold for matching
            # urljoin keeps an already-absolute href intact instead of prefixing the host twice
            best_match = urljoin("https://mmadecisions.com", href)
            best_score = match_score

    if best_match:
        logging.info(f"Found MMA Decisions URL for {fighter_name}: {best_match}")
        return best_match

    logging.warning(f"Could not find MMA Decisions URL for {fighter_name}")
    return None

In [11]:
# Scrape details from UFC Stats
def scrape_ufcstats_details(url):
    """Scrape height, weight, reach, stance and date of birth from a UFC Stats page.

    Each field is looked up on its own, so a label that is absent only drops that field
    rather than every field after it.

    Args:
        url: Address of the fighter page to fetch.

    Returns:
        Dict containing whichever of ``HEIGHT``, ``WEIGHT``, ``REACH``, ``STANCE`` and ``DOB``
        were found; ``{}`` when the page could not be fetched. A warning is logged when at
        least one field is missing.
    """
    response = fetch_with_retry(url)
    if not response:
        return {}

    soup = BeautifulSoup(response.text, "html.parser")
    labels = {
        "HEIGHT": "Height:",
        "WEIGHT": "Weight:",
        "REACH": "Reach:",
        "STANCE": "Stance:",
        "DOB": "DOB:",
    }
    details = {}
    missing = []

    for field, label in labels.items():
        # Compare stripped text so a label padded with spaces or newlines in the markup still matches
        label_node = soup.find(
            string=lambda text, label=label: text is not None and text.strip() == label
        )
        # NOTE(review): the value is assumed to sit in the next <td> after the label — confirm
        # against the live markup.
        value_cell = label_node.find_next("td") if label_node else None
        if value_cell is None:
            missing.append(field)
            continue
        # get_text copes with nested tags, where .string would be None
        value = value_cell.get_text(" ", strip=True)
        details[field] = normalize_height(value) if field == "HEIGHT" else value

    if missing:
        logging.warning(f"Some details missing for UFC Stats: {url}")

    return details

In [12]:
# Scrape details from Tapology
def scrape_tapology_details(url):
    """Scrape height, weight, reach, stance and date of birth from a Tapology page.

    Each field is looked up on its own, so a label that is absent only drops that field
    rather than every field after it.

    Args:
        url: Address of the fighter page to fetch.

    Returns:
        Dict containing whichever of ``HEIGHT``, ``WEIGHT``, ``REACH``, ``STANCE`` and ``DOB``
        were found; ``{}`` when the page could not be fetched. A warning is logged when at
        least one field is missing.
    """
    response = fetch_with_retry(url)
    if not response:
        return {}

    soup = BeautifulSoup(response.text, "html.parser")
    labels = {
        "HEIGHT": "Height:",
        "WEIGHT": "Weight:",
        "REACH": "Reach:",
        "STANCE": "Stance:",
        "DOB": "DOB:",
    }
    details = {}
    missing = []

    for field, label in labels.items():
        # Compare stripped text so a label padded with spaces or newlines in the markup still matches
        label_node = soup.find(
            string=lambda text, label=label: text is not None and text.strip() == label
        )
        # NOTE(review): the value is assumed to sit in the next <td> after the label — confirm
        # against the live markup.
        value_cell = label_node.find_next("td") if label_node else None
        if value_cell is None:
            missing.append(field)
            continue
        # get_text copes with nested tags, where .string would be None
        value = value_cell.get_text(" ", strip=True)
        details[field] = normalize_height(value) if field == "HEIGHT" else value

    if missing:
        logging.warning(f"Some details missing for Tapology: {url}")

    return details

In [13]:
# Scrape details from MMA Decisions
def scrape_mmadecisions_details(url):
    """Scrape height, weight, reach, stance and date of birth from an MMA Decisions page.

    Each field is looked up on its own, so a label that is absent only drops that field
    rather than every field after it.

    Args:
        url: Address of the fighter page to fetch.

    Returns:
        Dict containing whichever of ``HEIGHT``, ``WEIGHT``, ``REACH``, ``STANCE`` and ``DOB``
        were found; ``{}`` when the page could not be fetched. A warning is logged when at
        least one field is missing.
    """
    response = fetch_with_retry(url)
    if not response:
        return {}

    soup = BeautifulSoup(response.text, "html.parser")
    labels = {
        "HEIGHT": "Height:",
        "WEIGHT": "Weight:",
        "REACH": "Reach:",
        "STANCE": "Stance:",
        "DOB": "DOB:",
    }
    details = {}
    missing = []

    for field, label in labels.items():
        # Compare stripped text so a label padded with spaces or newlines in the markup still matches
        label_node = soup.find(
            string=lambda text, label=label: text is not None and text.strip() == label
        )
        # NOTE(review): the value is assumed to sit in the next <td> after the label — confirm
        # against the live markup.
        value_cell = label_node.find_next("td") if label_node else None
        if value_cell is None:
            missing.append(field)
            continue
        # get_text copes with nested tags, where .string would be None
        value = value_cell.get_text(" ", strip=True)
        details[field] = normalize_height(value) if field == "HEIGHT" else value

    if missing:
        logging.warning(f"Some details missing for MMA Decisions: {url}")

    return details

In [14]:
# Aggregate details from all sources
def aggregate_fighter_details(fighter_name):
    """Collect one fighter's profile URLs and physical details from all three sources.

    Sources are consulted in priority order: UFC Stats, Tapology, MMA Decisions. Every
    source's URL is always looked up and recorded. A source's page is scraped only while at
    least one detail field is still empty, and a scraped value only fills a field that has no
    value yet, so a lower-priority source never overwrites data from a higher-priority one.

    Args:
        fighter_name: Fighter's full name as free text.

    Returns:
        Dict with ``FIGHTER``, ``UFC_URL``, ``TAPOLOGY_URL`` and ``MMADECISIONS_URL`` (URLs are
        ``None`` when not found) plus whichever of ``HEIGHT``, ``WEIGHT``, ``REACH``,
        ``STANCE`` and ``DOB`` the scrapers returned.
    """
    detail_fields = ("HEIGHT", "WEIGHT", "REACH", "STANCE", "DOB")
    # (output key for the URL, URL finder, page scraper) in priority order
    sources = (
        ("UFC_URL", find_ufcstats_url, scrape_ufcstats_details),
        ("TAPOLOGY_URL", find_tapology_url, scrape_tapology_details),
        ("MMADECISIONS_URL", find_mmadecisions_url, scrape_mmadecisions_details),
    )

    final_details = {"FIGHTER": fighter_name}

    for url_key, find_url, scrape_details in sources:
        source_url = find_url(fighter_name)
        final_details[url_key] = source_url

        is_complete = all(final_details.get(col) for col in detail_fields)
        if not source_url or is_complete:
            continue

        for field, value in scrape_details(source_url).items():
            # Fill gaps only; keep anything an earlier source already provided
            if not final_details.get(field):
                final_details[field] = value

    return final_details

In [15]:
# Main function to parse all fighters
def parse_all_fighters(fighter_names):
    """Aggregate details for every fighter and format each one as a CSV row.

    Rows are produced with the ``csv`` module, so values containing commas or double quotes
    are quoted properly, and missing values (``None`` or absent keys) become empty fields
    rather than the text ``None``.

    Args:
        fighter_names: Iterable of fighter names.

    Returns:
        List of CSV-formatted strings without a trailing newline, one per fighter, with the
        columns FIGHTER, HEIGHT, WEIGHT, REACH, STANCE, DOB, UFC_URL, TAPOLOGY_URL,
        MMADECISIONS_URL.
    """
    columns = (
        "FIGHTER",
        "HEIGHT",
        "WEIGHT",
        "REACH",
        "STANCE",
        "DOB",
        "UFC_URL",
        "TAPOLOGY_URL",
        "MMADECISIONS_URL",
    )

    formatted_fighters_details = []
    for fighter_name in tqdm_notebook(fighter_names, desc="Processing fighters"):
        fighter = aggregate_fighter_details(fighter_name)

        # Format the row through csv.writer into an in-memory buffer. lineterminator="" keeps
        # the row free of a trailing newline because the caller appends its own; the writer
        # emits None as an empty field.
        buffer = io.StringIO()
        csv.writer(buffer, lineterminator="").writerow(
            [fighter.get(column, "") for column in columns]
        )
        formatted_fighters_details.append(buffer.getvalue())

    return formatted_fighters_details

In [None]:
# Example usage
if __name__ == "__main__":
    # Fighter names come from the "Name" column of the input CSV. Blank cells are dropped:
    # pandas reads them as NaN (a float), which the name-matching helpers cannot lowercase.
    fighter_names = pd.read_csv("data/fighter_names.csv")["Name"].dropna().tolist()

    # Parse all fighter details
    all_fighter_details = parse_all_fighters(fighter_names)

    # Save results to CSV format. The encoding is explicit so names with non-ASCII characters
    # are written the same way on every platform instead of depending on the locale default.
    with open("all_fighter_details.csv", "w", encoding="utf-8") as f:
        f.write("FIGHTER,HEIGHT,WEIGHT,REACH,STANCE,DOB,UFC_URL,TAPOLOGY_URL,MMADECISIONS_URL\n")
        f.writelines(row + "\n" for row in all_fighter_details)

    logging.info("Scraping complete. Data saved to all_fighter_details.csv.")

Processing fighters:   0%|          | 0/347 [00:00<?, ?it/s]

2025-01-26 23:12:07,942 - INFO - Found UFC Stats URL for Danny Abbadi: http://ufcstats.com/fighter-details/15df64c02b6b0fde
2025-01-26 23:12:19,992 - ERROR - Failed to fetch https://mmadecisions.com/search.php?search_query=Danny+Abbadi after 3 retries
2025-01-26 23:12:21,228 - INFO - Found UFC Stats URL for David Abbott: http://ufcstats.com/fighter-details/b361180739bed4b0
2025-01-26 23:12:33,344 - ERROR - Failed to fetch https://mmadecisions.com/search.php?search_query=David+Abbott after 3 retries
2025-01-26 23:12:34,462 - INFO - Found UFC Stats URL for Papy Abedi: http://ufcstats.com/fighter-details/c0ed7b208197e8de
2025-01-26 23:12:46,496 - ERROR - Failed to fetch https://mmadecisions.com/search.php?search_query=Papy+Abedi after 3 retries
2025-01-26 23:12:47,657 - INFO - Found UFC Stats URL for Ricardo Abreu: http://ufcstats.com/fighter-details/aa6e591c2a2cdecd
2025-01-26 23:12:59,807 - ERROR - Failed to fetch https://mmadecisions.com/search.php?search_query=Ricardo+Abreu after 3 re