In [59]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy
import math
import csv
from unidecode import unidecode

In [72]:
# Listing page that shows all UFC athletes and the total athlete count.
url = 'https://www.ufc.com/athletes/all'
# Browser-like User-Agent sent with every request in this notebook.
# HTTP header names are exact: the standard header is 'User-Agent' (with a
# hyphen). A key such as 'UserAgent' is transmitted as an unrelated custom
# header and leaves the library's default user agent in place.
headers = { 'User-Agent' : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"}

In [65]:
# Download the athlete listing page and parse it.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here with a clear HTTP error instead of letting a
# later cell fail while parsing an error page.
site = requests.get(url, headers=headers, timeout=30)
site.raise_for_status()
soup = BeautifulSoup(site.content, 'html.parser')

In [68]:
# Read the total number of fighters from the listing page.
# The element's text starts with the count followed by other words; only the
# first whitespace-separated token is used.
# NOTE(review): 'althelete-total' is the class name this notebook has always
# matched (the recorded run printed 3021) - presumably the site's own
# spelling, so do not "correct" it without checking the page.
total_div = soup.find('div', class_='althelete-total')
if total_div is None:
    raise ValueError("Could not find the 'althelete-total' element on the listing page")

total_tokens = total_div.get_text().split()
if not total_tokens:
    raise ValueError("The 'althelete-total' element is empty")

# Drop thousands separators so a value such as '3,021' still parses.
num_fighters = int(total_tokens[0].replace(',', ''))
print(f'Total of fighters = {num_fighters}')

Total of fighters = 3021


In [70]:
# Walk every listing page and collect the fighter names.
#
# Results:
#   fighters_names - one list of name <span> tags per page that was fetched
#   all_fighters   - stripped name strings, duplicates removed, order kept
#
# The loop uses its own page_* variables so it does not overwrite the
# notebook-level `url`, `site` and `soup` that earlier cells read.
FIGHTERS_PER_PAGE = 11  # each listing page has 11 fighters
LISTING_TIMEOUT = 30    # seconds to wait for each listing page

fighters_names = []

for page in range(math.ceil(num_fighters / FIGHTERS_PER_PAGE)):
    page_url = f'https://www.ufc.com/athletes/all?gender=All&search=&page={page}'
    try:
        page_response = requests.get(page_url, headers=headers, timeout=LISTING_TIMEOUT)
        page_response.raise_for_status()
    except requests.RequestException as error:
        # Report the failed page instead of silently losing its fighters.
        print(f'Could not fetch {page_url}: {error}')
        continue

    page_soup = BeautifulSoup(page_response.content, 'html.parser')
    fighters_names.append(page_soup.find_all('span', class_=re.compile('c-listing-athlete__name')))

# Flatten the per-page tag lists into clean name strings.
all_fighters = [tag.text.strip() for page_tags in fighters_names for tag in page_tags]

# dict.fromkeys removes repeated names while keeping first-seen order.
# NOTE(review): fighters that share the exact same name collapse into one
# entry here (the recorded run found 3015 names for a total of 3021).
all_fighters = list(dict.fromkeys(all_fighters))

print(f'{len(all_fighters)} fighters found')

3015 fighters found


In [71]:
# Save the de-duplicated fighter names to fighters.csv: a single
# 'Fighter Name' header row followed by one name per row.
with open('fighters.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['Fighter Name'])
    writer.writerows([fighter_name] for fighter_name in all_fighters)

In [76]:

# Scrape every fighter's profile page and save one row per fighter to ufc.csv.
#
# For each name in all_fighters the profile URL is rebuilt from the name, the
# page is downloaded and the statistics below are read from it. Every value is
# stored as the stripped text found on the page (the recorded output shows
# that some values, e.g. strikes_defense, still contain inner line breaks);
# anything that is missing stays None. A fighter whose page cannot be fetched
# still gets a row, with only the name filled in.

PROFILE_TIMEOUT = 30  # seconds to wait for each profile page

# Column order of ufc.csv and of every row in fighters_statistics.
head = ['name', 'strike_accuracy', 'takedown_accuracy', 'connected_strikes_per_minute', 'absorved_strikes_per_minute', 'strikes_defense',
        'takedown_defense', 'takedown_avg', 'submission_avg', 'knockdown_avg', 'avg_fight_time', 'weight', 'height', 'reach', 'division', 'age', 'status', 'place_of_birth', 'record',
        'win_by_ko_tko', 'win_by_dec', 'win_by_sub', 'debut']

# Label text as it appears on the page -> column it fills.
_ACCURACY_LABELS = {
    "Striking accuracy": 'strike_accuracy',
    "Takedown Accuracy": 'takedown_accuracy',
}
_COMPARE_LABELS = {
    "Sig. Str. Landed": 'connected_strikes_per_minute',
    "Sig. Str. Absorbed": 'absorved_strikes_per_minute',
    "Sig. Str. Defense": 'strikes_defense',
    "Takedown Defense": 'takedown_defense',
    "Takedown avg": 'takedown_avg',
    "Submission avg": 'submission_avg',
    "Knockdown Avg": 'knockdown_avg',
    "Average fight time": 'avg_fight_time',
}
_BIO_LABELS = {
    "Age": 'age',
    "Height": 'height',
    "Weight": 'weight',
    "Reach": 'reach',
    "Status": 'status',
    "Place of Birth": 'place_of_birth',
    "Octagon Debut": 'debut',
}
_WIN_METHOD_LABELS = {
    "KO/TKO": 'win_by_ko_tko',
    "DEC": 'win_by_dec',
    "SUB": 'win_by_sub',
}


def _clean_text(tag):
    """Return the stripped text of a tag, or None when the tag was not found."""
    return tag.get_text().strip() if tag is not None else None


def _profile_slug(name):
    """Build the profile URL slug for a fighter name.

    Hyphens become spaces, the name is transliterated to ASCII and
    lower-cased, every character other than a-z/0-9 is removed from each
    space-separated part, and the non-empty parts are joined with '-'.
    """
    ascii_name = unidecode(name.replace('-', ' ')).lower()
    parts = (re.sub(r"[^a-zA-Z0-9]+", '', part) for part in ascii_name.split(' '))
    return '-'.join(part for part in parts if part)


def _parse_profile(soup):
    """Read the stats from one parsed profile page.

    Returns a dict {column name: text}; columns whose element is not present
    on the page are simply left out.
    """
    found = {}

    # Accuracy charts: the percentage is read from the same
    # 'stats-records-inner' div that holds the matching title.
    for div in soup.find_all('div', class_='stats-records-inner'):
        for title in div.find_all('h2', class_=re.compile('e-t3')):
            column = _ACCURACY_LABELS.get(title.get_text().strip())
            if column is None:
                continue
            percent = _clean_text(div.find('text', class_=re.compile('e-chart-circle__percent')))
            if percent is not None:
                found[column] = percent

    # Compare stats: the number sits next to its label under the same parent.
    # These are searched on the whole page, so the scan runs once per page
    # and does not depend on any accuracy chart being present.
    for label_div in soup.find_all('div', class_='c-stat-compare__label'):
        column = _COMPARE_LABELS.get(label_div.get_text().strip())
        if column is None:
            continue
        number = _clean_text(label_div.parent.find('div', class_='c-stat-compare__number'))
        if number is not None:
            found[column] = number

    # Bio information (age, weight, height, reach, etc.): the value is the
    # next 'c-bio__text' div after the label. A label without a value is
    # skipped on its own, so it cannot hide the remaining bio fields.
    for label_div in soup.find_all('div', class_='c-bio__label'):
        column = _BIO_LABELS.get(label_div.get_text().strip())
        if column is None:
            continue
        value = _clean_text(label_div.find_next('div', class_='c-bio__text'))
        if value is not None:
            found[column] = value

    # Division and fight record (W-L-D) from the profile header.
    division = _clean_text(soup.find("p", class_="hero-profile__division-title"))
    if division is not None:
        found['division'] = division
    record = _clean_text(soup.find("p", class_="hero-profile__division-body"))
    if record is not None:
        found['record'] = record

    # Win by method: labels and values are paired in page order inside the
    # div that encloses the "Win by Method" title.
    method_title = soup.find("h2", class_="c-stat-3bar__title", string="Win by Method")
    method_section = method_title.find_parent('div') if method_title is not None else None
    if method_section is not None:
        labels = method_section.find_all('div', class_='c-stat-3bar__label')
        values = method_section.find_all('div', class_='c-stat-3bar__value')
        for label, value in zip(labels, values):
            column = _WIN_METHOD_LABELS.get(label.get_text().strip())
            if column is not None:
                found[column] = value.get_text().strip()

    return found


fighters_statistics = []

for real_name in all_fighters:
    print(f'Scraping {real_name}...')

    # site request
    url = f'https://www.ufc.com/athlete/{_profile_slug(real_name)}'
    print(url)

    # Start with every column empty so a failed page still yields a full row.
    stats = dict.fromkeys(head)
    stats['name'] = real_name

    try:
        site = requests.get(url, headers=headers, timeout=PROFILE_TIMEOUT)
        site.raise_for_status()
    except requests.RequestException as e:
        # One unreachable or missing profile must not abort the whole run.
        print(f"Could not fetch {url}: {e}")
    else:
        soup = BeautifulSoup(site.content, 'html.parser')
        stats.update(_parse_profile(soup))

    # Append the fighter's stats to the list, in the column order of `head`
    fighters_statistics.append([stats[column] for column in head])

# Creating .csv file
with open('ufc.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # Write the header
    writer.writerow(head)

    # Write multiple rows
    writer.writerows(fighters_statistics)


Scraping Danny Abbadi...
https://www.ufc.com/athlete/danny-abbadi
Scraping Nariman Abbassov...
https://www.ufc.com/athlete/nariman-abbassov
Scraping Tank Abbott...
https://www.ufc.com/athlete/tank-abbott
Scraping Hamdy Abdelwahab...
https://www.ufc.com/athlete/hamdy-abdelwahab
Scraping Mansur Abdul-Malik...
https://www.ufc.com/athlete/mansur-abdul-malik
Scraping Shamil Abdurakhimov...
https://www.ufc.com/athlete/shamil-abdurakhimov
Scraping Daichi Abe...
https://www.ufc.com/athlete/daichi-abe
Scraping Papy Abedi...
https://www.ufc.com/athlete/papy-abedi
Scraping Ricardo Abreu...
https://www.ufc.com/athlete/ricardo-abreu
Scraping Klidson Abreu...
https://www.ufc.com/athlete/klidson-abreu
Scraping John Adajar...
https://www.ufc.com/athlete/john-adajar
Scraping Juan Adams...
https://www.ufc.com/athlete/juan-adams
Scraping Scott Adams...
https://www.ufc.com/athlete/scott-adams
Scraping Anthony Adams...
https://www.ufc.com/athlete/anthony-adams
Scraping Zarrukh Adashev...
https://www.ufc.co

In [77]:
# Load the scraped stats back from ufc.csv into a DataFrame
data = pd.read_csv('ufc.csv')


In [78]:
# Show the loaded table (a bare last expression is rendered as the cell output)
data

Unnamed: 0,name,strike_accuracy,takedown_accuracy,connected_strikes_per_minute,absorved_strikes_per_minute,strikes_defense,takedown_defense,takedown_avg,submission_avg,knockdown_avg,...,reach,division,age,status,place_of_birth,record,win_by_ko_tko,win_by_dec,win_by_sub,debut
0,Danny Abbadi,38%,,3.29,4.41,58\n %,78\n %,0.00,0.00,0.00,...,,Lightweight Division,39.0,Not Fighting,"Orlando, United States",2-2-0 (W-L-D),0 (0 %),0 (0 %),0 (0 %),"Jun. 24, 2006"
1,Nariman Abbassov,20%,0%,3.00,5.67,46\n %,67\n %,0.00,0.00,0.00,...,,Lightweight Division,29.0,Active,Kazakhstan,0-1-0 (W-L-D),0 (0 %),0 (0 %),0 (0 %),"Feb. 21, 2025"
2,Tank Abbott,39%,,2.41,10.03,38\n %,67\n %,0.00,0.00,0.00,...,,Heavyweight Division,,Not Fighting,"Huntington Beach, United States",8-10-0 (W-L-D),0 (0 %),0 (0 %),0 (0 %),"Jul. 14, 1995"
3,Hamdy Abdelwahab,52%,80%,3.40,3.87,51\n %,100\n %,2.00,0.00,0.50,...,72.0,Heavyweight Division,32.0,Active,Egypt,6-0-0 (W-L-D),5 (100%),0 (0%),0 (0%),"Jul. 30, 2022"
4,Mansur Abdul-Malik,55%,,6.61,4.21,54\n %,75\n %,0.00,0.00,1.29,...,79.5,Middleweight Division,27.0,Active,"Pittsburgh, United States",7-0-0 (W-L-D),6 (86%),0 (0%),1 (14%),"Nov. 9, 2024"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3010,Farès Ziam,52%,41%,2.85,1.62,65\n %,70\n %,1.62,0.25,0.12,...,75.0,Lightweight Division,27.0,Active,"Vénissieux, Auvergne-Rhône-Alpes, France",17-4-0 (W-L-D),6 (35%),7 (41%),4 (24%),"Sep. 7, 2019"
3011,James Zikic,44%,0%,1.93,3.20,44\n %,74\n %,0.00,1.00,0.00,...,,Light Heavyweight Division,46.0,Not Fighting,"Watford, United Kingdom",16-3-0 (W-L-D),0 (0 %),0 (0 %),0 (0 %),"Jul. 13, 2002"
3012,Cat Zingano,61%,65%,2.57,1.63,47\n %,43\n %,2.77,0.85,0.00,...,68.0,Women's Featherweight Division,40.0,Not Fighting,"Winona, United States",10-4-0 (W-L-D),5 (50%),2 (20%),3 (30%),"Apr. 14, 2013"
3013,Igor Zinoviev,,,,,,,,,,...,,Middleweight Division,,Not Fighting,United States,0-0-0 (W-L-D),,,,"Mar. 13, 1998"
