In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import logging
from multiprocessing.pool import ThreadPool

In [2]:
# Configure the root logger: INFO and above, each record prefixed with a timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Columns the scrapers extract for each fighter: FIGHTER, HEIGHT, WEIGHT, REACH, STANCE, DOB

# HTTP request headers passed to every requests.get call below (a desktop Chrome User-Agent).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

In [3]:
def scrape_ufcstats(fighter_name):
    """Fetch a fighter's HEIGHT, WEIGHT, REACH, STANCE and DOB from UFC Stats.

    The page URL is built from the name by trimming it, replacing spaces with
    hyphens and lower-casing it. The request is attempted up to three times,
    pausing two seconds between attempts; both non-200 responses and
    ``requests`` exceptions (timeouts, connection errors) count as a failed
    attempt.

    Args:
        fighter_name (str): Fighter name as it appears in the input dataset.

    Returns:
        dict: Always contains ``"FIGHTER"``.
            * All five fields found: the five field keys are added.
            * A label or its following ``<td>`` is missing: the fields parsed
              before the gap plus ``"Error": "Incomplete data"``.
            * Page could not be fetched: ``"Source"`` and
              ``"Error": "Failed to fetch details"``.
            * Name is not a non-blank string: ``"Source"`` and
              ``"Error": "Invalid fighter name"``.
    """
    max_attempts = 3
    retry_delay_s = 2
    request_timeout_s = 10  # without a timeout a stalled server blocks the worker forever
    field_labels = (
        ("HEIGHT", "Height:"),
        ("WEIGHT", "Weight:"),
        ("REACH", "Reach:"),
        ("STANCE", "Stance:"),
        ("DOB", "DOB:"),
    )

    # pandas yields NaN (a float) for blank cells; reject it before calling str methods.
    if not isinstance(fighter_name, str) or not fighter_name.strip():
        logging.warning(f"Skipping invalid fighter name {fighter_name!r} for UFC Stats")
        return {"FIGHTER": fighter_name, "Source": "UFC Stats", "Error": "Invalid fighter name"}

    # NOTE(review): an unused listing URL (.../statistics/fighters?char=a&page=all) was
    # removed from here; confirm that fighter-details pages really resolve by name slug.
    fighter_name_query = fighter_name.strip().replace(" ", "-").lower()
    search_url = f"http://ufcstats.com/fighter-details/{fighter_name_query}"

    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            candidate = requests.get(search_url, headers=headers, timeout=request_timeout_s)
        except requests.RequestException as exc:
            # Network-level failures are retried exactly like a bad status code.
            logging.warning(
                f"Request error for {fighter_name} from UFC Stats "
                f"(attempt {attempt}/{max_attempts}): {exc}"
            )
        else:
            if candidate.status_code == 200:
                response = candidate
                break
        if attempt < max_attempts:  # no point sleeping after the last attempt
            time.sleep(retry_delay_s)

    if response is None:
        logging.warning(f"Failed to fetch details for {fighter_name} from UFC Stats")
        return {"FIGHTER": fighter_name, "Source": "UFC Stats", "Error": "Failed to fetch details"}

    soup = BeautifulSoup(response.text, "html.parser")
    details = {"FIGHTER": fighter_name}

    try:
        for column, label in field_labels:
            # find() returns None when the label is absent, which surfaces as AttributeError.
            details[column] = soup.find(string=label).find_next("td").text.strip()
    except AttributeError:
        details["Error"] = "Incomplete data"

    return details

In [4]:
def scrape_mmadecisions(fighter_name):
    """Fetch a fighter's HEIGHT, WEIGHT, REACH, STANCE and DOB from MMA Decisions.

    The page URL is the site's fighter base URL followed by the name trimmed,
    with spaces replaced by hyphens and lower-cased. The request is attempted
    up to three times, pausing two seconds between attempts; both non-200
    responses and ``requests`` exceptions (timeouts, connection errors) count
    as a failed attempt.

    Args:
        fighter_name (str): Fighter name as it appears in the input dataset.

    Returns:
        dict: Always contains ``"FIGHTER"``.
            * All five fields found: the five field keys are added.
            * A label or its following ``<td>`` is missing: the fields parsed
              before the gap plus ``"Error": "Incomplete data"``.
            * Page could not be fetched: ``"Source"`` and
              ``"Error": "Failed to fetch details"``.
            * Name is not a non-blank string: ``"Source"`` and
              ``"Error": "Invalid fighter name"``.
    """
    max_attempts = 3
    retry_delay_s = 2
    request_timeout_s = 10  # without a timeout a stalled server blocks the worker forever
    field_labels = (
        ("HEIGHT", "Height:"),
        ("WEIGHT", "Weight:"),
        ("REACH", "Reach:"),
        ("STANCE", "Stance:"),
        ("DOB", "DOB:"),
    )

    # pandas yields NaN (a float) for blank cells; reject it before calling str methods.
    if not isinstance(fighter_name, str) or not fighter_name.strip():
        logging.warning(f"Skipping invalid fighter name {fighter_name!r} for MMA Decisions")
        return {"FIGHTER": fighter_name, "Source": "MMA Decisions", "Error": "Invalid fighter name"}

    base_url = "https://mmadecisions.com/fighter/"
    fighter_name_query = fighter_name.strip().replace(" ", "-").lower()
    search_url = f"{base_url}{fighter_name_query}"

    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            candidate = requests.get(search_url, headers=headers, timeout=request_timeout_s)
        except requests.RequestException as exc:
            # Network-level failures are retried exactly like a bad status code.
            logging.warning(
                f"Request error for {fighter_name} from MMA Decisions "
                f"(attempt {attempt}/{max_attempts}): {exc}"
            )
        else:
            if candidate.status_code == 200:
                response = candidate
                break
        if attempt < max_attempts:  # no point sleeping after the last attempt
            time.sleep(retry_delay_s)

    if response is None:
        logging.warning(f"Failed to fetch details for {fighter_name} from MMA Decisions")
        return {"FIGHTER": fighter_name, "Source": "MMA Decisions", "Error": "Failed to fetch details"}

    soup = BeautifulSoup(response.text, "html.parser")
    details = {"FIGHTER": fighter_name}

    try:
        for column, label in field_labels:
            # find() returns None when the label is absent, which surfaces as AttributeError.
            details[column] = soup.find(string=label).find_next("td").text.strip()
    except AttributeError:
        details["Error"] = "Incomplete data"

    return details

In [5]:
def scrape_tapology(fighter_name):
    """Fetch a fighter's HEIGHT, WEIGHT, REACH, STANCE and DOB from Tapology.

    The page URL is the site's fighter base URL followed by the name trimmed,
    with spaces replaced by hyphens and lower-cased. The request is attempted
    up to three times, pausing two seconds between attempts; both non-200
    responses and ``requests`` exceptions (timeouts, connection errors) count
    as a failed attempt.

    Args:
        fighter_name (str): Fighter name as it appears in the input dataset.

    Returns:
        dict: Always contains ``"FIGHTER"``.
            * All five fields found: the five field keys are added.
            * A label or its following ``<td>`` is missing: the fields parsed
              before the gap plus ``"Error": "Incomplete data"``.
            * Page could not be fetched: ``"Source"`` and
              ``"Error": "Failed to fetch details"``.
            * Name is not a non-blank string: ``"Source"`` and
              ``"Error": "Invalid fighter name"``.
    """
    max_attempts = 3
    retry_delay_s = 2
    request_timeout_s = 10  # without a timeout a stalled server blocks the worker forever
    field_labels = (
        ("HEIGHT", "Height:"),
        ("WEIGHT", "Weight:"),
        ("REACH", "Reach:"),
        ("STANCE", "Stance:"),
        ("DOB", "DOB:"),
    )

    # pandas yields NaN (a float) for blank cells; reject it before calling str methods.
    if not isinstance(fighter_name, str) or not fighter_name.strip():
        logging.warning(f"Skipping invalid fighter name {fighter_name!r} for Tapology")
        return {"FIGHTER": fighter_name, "Source": "Tapology", "Error": "Invalid fighter name"}

    base_url = "https://www.tapology.com/fightcenter/fighters/"
    fighter_name_query = fighter_name.strip().replace(" ", "-").lower()
    search_url = f"{base_url}{fighter_name_query}"

    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            candidate = requests.get(search_url, headers=headers, timeout=request_timeout_s)
        except requests.RequestException as exc:
            # Network-level failures are retried exactly like a bad status code.
            logging.warning(
                f"Request error for {fighter_name} from Tapology "
                f"(attempt {attempt}/{max_attempts}): {exc}"
            )
        else:
            if candidate.status_code == 200:
                response = candidate
                break
        if attempt < max_attempts:  # no point sleeping after the last attempt
            time.sleep(retry_delay_s)

    if response is None:
        logging.warning(f"Failed to fetch details for {fighter_name} from Tapology")
        return {"FIGHTER": fighter_name, "Source": "Tapology", "Error": "Failed to fetch details"}

    soup = BeautifulSoup(response.text, "html.parser")
    details = {"FIGHTER": fighter_name}

    try:
        for column, label in field_labels:
            # find() returns None when the label is absent, which surfaces as AttributeError.
            details[column] = soup.find(string=label).find_next("td").text.strip()
    except AttributeError:
        details["Error"] = "Incomplete data"

    return details

In [6]:
def scrape_fighter_details(fighter_name):
    """Scrape one fighter's details, falling back across sources in order.

    Sources are tried in the order UFC Stats, MMA Decisions, Tapology. The
    first result without an ``"Error"`` key is returned immediately. A
    ``requests`` exception raised by a source is logged and treated as a
    failure of that source only, so the remaining sources are still tried.

    Args:
        fighter_name (str): Fighter name passed unchanged to each scraper.

    Returns:
        dict: The first error-free result, or, if every source failed, the
        result of the last source tried (which then contains ``"Error"``).
    """
    logging.info(f"Starting to scrape details for {fighter_name}")

    # Ordered by preference; later entries are only consulted when earlier ones fail.
    sources = (
        ("UFC Stats", scrape_ufcstats),
        ("MMA Decisions", scrape_mmadecisions),
        ("Tapology", scrape_tapology),
    )

    details = None
    for source_name, scraper in sources:
        try:
            details = scraper(fighter_name)
        except requests.RequestException as exc:
            # A network error from one site must not abort the fallback chain.
            logging.warning(f"Request to {source_name} failed for {fighter_name}: {exc}")
            details = {"FIGHTER": fighter_name, "Source": source_name, "Error": "Failed to fetch details"}
            continue

        if "Error" not in details:
            logging.info(f"Successfully scraped details for {fighter_name} from {source_name}")
            return details

    logging.warning(f"No source returned complete details for {fighter_name}")
    return details

In [7]:
def scrape_from_dataset_parallel(dataset_path, threads=4, output_path="scraped_fighter_details.csv"):
    """Scrape details for every fighter named in a CSV, using a thread pool.

    Args:
        dataset_path: Path (or anything ``pd.read_csv`` accepts) of a CSV with
            a ``Name`` column.
        threads (int): Number of worker threads in the pool.
        output_path: Where the results CSV is written. Defaults to
            ``"scraped_fighter_details.csv"`` in the working directory.

    Returns:
        pandas.DataFrame: One row per scraped name, in input order, built from
        the dicts returned by ``scrape_fighter_details``. The same data is
        written to ``output_path``.

    Raises:
        ValueError: If the dataset has no ``Name`` column.
    """
    df = pd.read_csv(dataset_path)
    if 'Name' not in df.columns:
        raise ValueError("Dataset must contain a 'Name' column")

    # Blank cells are read as NaN (a float), which the scrapers cannot turn into a
    # URL slug; drop them and trim stray whitespace before dispatching work.
    cleaned_names = df['Name'].dropna().astype(str).str.strip()
    fighter_names = cleaned_names[cleaned_names != ""].tolist()

    skipped = len(df) - len(fighter_names)
    if skipped:
        logging.warning(f"Skipped {skipped} row(s) with a missing or blank name")

    # pool.map preserves input order, so result rows line up with fighter_names.
    with ThreadPool(threads) as pool:
        results = pool.map(scrape_fighter_details, fighter_names)

    result_df = pd.DataFrame(results)
    result_df.to_csv(output_path, index=False)
    logging.info(f"Scraping complete. Data saved to {output_path}.")
    return result_df


In [None]:
# Example usage
# Uncomment the line below to scrape a dataset of names in parallel
# scrape_from_dataset_parallel("fighter_names.csv", threads=4)