In [5]:
import requests as req
from lxml import html
import datetime as dt
import math

url = "https://www.sherdog.com/fighter/Jon-Jones-27944"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}
fight_list = []
htm = req.get(url, headers = headers)
xml = html.document_fromstring(htm.content)


In [8]:
import ufc

In [12]:
ufc.get_event('ufc 280')

{'name': 'UFC 280: Oliveira vs. Makhachev',
 'date': '2022-10-22',
 'location': 'Yas Island/Yas West United Arab Emirates',
 'venue': 'Etihad Arena',
 'fights': [{'weightclass': 'Lightweight Title',
   'red corner': {'name': 'Charles Oliveira',
    'ranking': 'Unranked',
    'odds': '+165',
    'link': 'https://www.ufc.com/athlete/charles-oliveira',
    'result': 'Loss'},
   'blue corner': {'name': 'Islam Makhachev',
    'ranking': 'Unranked',
    'odds': '-195',
    'link': 'https://www.ufc.com/athlete/islam-makhachev',
    'result': 'Win'},
   'round': '2',
   'time': '3:16',
   'method': 'Submission'},
  {'weightclass': 'Bantamweight Title',
   'red corner': {'name': 'Aljamain Sterling',
    'ranking': 'Unranked',
    'odds': '-175',
    'link': 'https://www.ufc.com/athlete/aljamain-sterling',
    'result': 'Win'},
   'blue corner': {'name': 'TJ Dillashaw',
    'ranking': 'Unranked',
    'odds': '+150',
    'link': 'https://www.ufc.com/athlete/tj-dillashaw',
    'result': 'Loss'},
 

# UFC API

UFC API is a lightweight web crawler built in Python to retrieve data on UFC fighters and events.

# Installation

You can install UFC API using pip:

```
pip install ufc_api
```

# Usage

Usage is simple. To get stats on a particular fighter returned as a json:

```
>>> from ufc_api import get_fighter

>>> get_fighter('Jon Jones')

>>> {'name': 'Jon Jones',
 'nickname': 'Bones',
 'nationality': 'United States',
 'birthplace': 'Rochester, New York',
 'birthdate': 'Jul 19, 1987',
 'age': '35',
 'height': '6\'4"',
 'weight': '248 lbs',
 'association': 'Jackson-Wink MMA',
 'weight_class': 'Heavyweight',
 'wins': {'total': '27',
  'ko/tko': '10',
  'submissions': '7',
  'decisions': '10',
  'others': '0'},
 'losses': {'total': '1',
  'ko/tko': '0',
  'submissions': '0',
  'decisions': '0',
  'others': '1'},
 'fights': [{'name': 'UFC 285 - Jones vs. Gane',
   'date': 'Mar / 04 / 2023',
   'url': 'https://www.sherdog.com/events/UFC-285-Jones-vs-Gane-95232',
   'result': 'win',
   'method': 'Submission (Guillotine Choke)',
   'referee': 'Marc Goddard',
   'round': '1',
   'time': '2:04',
   'opponent': 'Ciryl Gane'},
   ...
   
```

To get data on an event, the usage is similar:

```
```

In [28]:
def parse_sherdog_fighter(url):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}
    htm = req.get(url, headers = headers)
    xml = html.document_fromstring(htm.content)
    
    
    wins_detailed = xml.xpath("//div[@class='wins']/div[@class='meter']/div[1]/text()")
    losses_detailed = xml.xpath("//div[@class='loses']/div[@class='meter']/div[1]/text()")
    bio = xml.xpath("//div[@class='fighter-info']")[0]
    
    try:
        other_wins = wins_detailed[3]
        other_losses = losses_detailed[3]
    except IndexError:
        other_wins = '0'
        other_losses = '0'

    fighter = {
        'name' : xml.xpath("//span[@class='fn']/text()")[0],
        'nickname' : bio.xpath("//span[@class='nickname']/em/text()")[0],
        'nationality' : bio.xpath("//strong[@itemprop='nationality']/text()")[0],
        'birthplace' : xml.xpath("//span[@class='locality']/text()")[0],
        'birthdate' : xml.xpath("//span[@itemprop='birthDate']/text()")[0],
        'age' : xml.xpath("//span[@itemprop='birthDate']/preceding-sibling::b/text()")[0],
        'height' : xml.xpath("//b[@itemprop='height']/text()")[0],
        'weight' : xml.xpath("//b[@itemprop='weight']/text()")[0],
        'association' : xml.xpath("//span[@itemprop='memberOf']/a/span/text()")[0],
        'weight_class' : xml.xpath("//div[@class='association-class']/a/text()")[0],

        'wins' : {
            'total': xml.xpath("//div[@class='winloses win']/span[2]/text()")[0],
            'ko/tko': wins_detailed[0],
            'submissions':wins_detailed[1],
            'decisions':wins_detailed[2],
            'others': other_wins
                },
        'losses' : {
            'total': xml.xpath("//div[@class='winloses lose']/span[2]/text()")[0],
            'ko/tko': losses_detailed[0],
            'submissions':losses_detailed[1],
            'decisions':losses_detailed[2],
            'others':other_losses
                },

        'fights' : []
    }

    fight_rows = xml.xpath("//table[@class='new_table fighter']/tr[not(@class='table_head')]")

    for row in fight_rows:
        try:
            referee =  row.xpath("td[4]/span/a/text()")[0]
        except IndexError:
            referee = ""

        fight = {
            'name': row.xpath("td[3]/a/descendant-or-self::*/text()")[0],
            'date': row.xpath("td[3]/span/text()")[0],
            'url': "https://www.sherdog.com" + row.xpath("td[3]/a/@href")[0],
            'result': row.xpath("td[1]/span/text()")[0],
            'method': row.xpath("td[4]/b/text()")[0],
            'referee': referee,
            'round': row.xpath("td[5]/text()")[0],
            'time': row.xpath("td[6]/text()")[0],
            'opponent': row.xpath("td[2]/a/text()")[0]
        }
        fighter['fights'].append(fight)
    return fighter

    


In [None]:
parse_sherdog_fighter("https://www.sherdog.com/fighter/Brandon-Vera-4886")

In [None]:
bad.xpath("td[3]/a/descendant-or-self::*/text()")

In [None]:
xml.xpath("//span[@itemprop='birthDate']/preceding-sibling::b/text()")

In [None]:
dt.datetime.strptime(birthdate, "%b %d, %Y")

In [None]:
delt

In [None]:
xml.xpath("//span[@itemprop='memberOf']/a/span/text()")[0]

In [None]:
row = fight_rows[0]

fight = {
    'name': row.xpath("td[3]/a/span/text()")[0],
    'date': row.xpath("td[3]/span/text()")[0],
    'url': "https://www.sherdog.com" + row.xpath("td[3]/a/@href")[0],
    'result': row.xpath("td[1]/span/text()")[0],
    'method': row.xpath("td[4]/b/text()")[0],
    'referee': row.xpath("td[4]/span/a/text()")[0],
    'round': row.xpath("td[5]/text()")[0],
    'time': row.xpath("td[6]/text()")[0],
    'opponent': row.xpath("td[2]/a/text()")[0]
}

In [None]:
url = "https://www.sherdog.com/sitemap.xml"
htm = req.get(url, headers = headers)
xml = etree.HTML(htm.content)

In [None]:
xml.xpath("//sitemap/loc[contains(text(), 'fighter')]/text()")

In [189]:
from bs4 import BeautifulSoup as bs
url = "https://www.sherdog.com/sitemap-fighters.xml"
htm = req.get(url, headers = headers)

In [190]:
def bs_test():
    soup = bs(htm.content)
    [x.text for x in soup.find_all('loc')]

def lxml_test():
    xml = etree.HTML(htm.content)
    xml.xpath("//loc/text()")
    
def regex_test():
    re.findall("(?<=<loc>)(.*)(?=</loc>)", htm.text)
    

In [191]:
%timeit bs_test()

1.21 s ± 93.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [192]:
%timeit lxml_test()

149 ms ± 598 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [193]:
%timeit regex_test()

29.5 ms ± 3.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
import re
dummy_text = "<loc>this is a link</loc>"
re.findall("(?<=<loc>)(.*)(?=</loc>)", htm.text)


In [213]:
import os
import asyncio
import aiohttp
import re

async def update_links():
    
    #Remove the current url mapping if it exists
    try:
        os.remove("url_mapping.json")
    except FileNotFoundError:
        pass
    
    #Pull all the links to sub site maps for fighter pages
    url = "https://www.sherdog.com/sitemap.xml"
    htm = req.get(url, headers = headers)
    xml = etree.HTML(htm.content)
    fighter_maps = xml.xpath("//sitemap/loc[contains(text(), 'fighter')]/text()")

    final_dict = {}
    
    #Pull all the links and names from the sub site maps for each of the fighter pages
    res = await asyncio.gather(*(get_link_sitemap(url) for url in fighter_maps))
    
    #Update dictionary with all the name:links 
    [final_dict.update(x) for x in res]
    
    #write everything to a json file
    with open("url_mapping.json", "w") as f:
        json.dump(final_dict, f)
        
        
async def get_link_sitemap(url):
    connector = aiohttp.TCPConnector(limit=60)
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.get(url, headers=headers) as response:
            htm = await response.text()              
            names = {" ".join(l.split("/")[-1].split("-")[-1]): {'name': " ".join(l.split("/")[-1].split("-")[;-1]), 'link': l} for l in re.findall("(?<=<loc>)(.*)(?=</loc>)", htm)}
            return names

In [214]:
stuff = await update_links()

In [1]:
from googlesearch import search
search("Jon Jones Sherdog", num_results = 5)



<generator object search at 0x000001B0647BEAB0>

https://www.sherdog.com/fighter/John-Jones-45686
https://www.sherdog.com/fighter/John-Jones-3942
https://www.sherdog.com/fighter/Felipe-Boaventura-147201
https://www.sherdog.com/fighter/news/8/Jon-Jones-27944
https://www.sherdog.com/
https://www.sherdog.com/news/news/Jon-Jones-Stipe-Miocic-Likely-Last-Fight-Unless-Francis-Ngannou-Returns-to-UFC-189891
https://www.tapology.com/fightcenter/fighters/jon-jones-bones
https://en.wikipedia.org/wiki/Jon_Jones
https://www.ufc.com/athlete/jon-jones
https://www.espn.com/mma/fighter/_/id/2335639/jon


In [None]:
[" ".join(l.split("/")[-1].split("-")[:-1]) for l in xml.xpath("//loc/text()")]

In [None]:
l = fighter_urls[0]

In [None]:
names = {" ".join(l.split("/")[-1].split("-")[:-1]): l for l in xml.xpath("//loc/text()")}

In [None]:
names

In [None]:
import json
with open("url_mapping.json", "w") as f:
    json.dump(names, f)


In [215]:
with open("url_mapping.json", "r") as f:
    new = json.load(f)

In [None]:
len(new)

In [135]:
import requests as req
from lxml import html
import datetime as dt
import math

def search(query):
    url = 'https://www.google.com/search?q=' + query
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}
    htm = req.get(url, headers = headers)
    xml = html.document_fromstring(htm.content)
    return xml.xpath("//h3/parent::a/@href")


In [65]:
def get_sherdog_link(query):
    possible_urls = search(query+" Sherdog")
    for url in possible_urls:
        if ("sherdog.com/fighter/" in url) and (not "/news/" in url):
            return url
    raise BaseException("Sherdog link not found !")
    
def get_ufc_link(query):
    possible_urls = search(query+" UFC")
    for url in possible_urls:
        if ("ufc.com/athlete/" in url):
            return url
    raise BaseException("UFC link not found !")

In [32]:
get_ufc_link("Jon Jones")

'https://www.ufc.com/athlete/jon-jones'

In [67]:
def get_fighter(query):
    sherdog_link = get_sherdog_link(query)
    ufc_link = get_ufc_link(query)
    
    fighter = parse_sherdog_fighter(sherdog_link)
    fighter.update(get_ufc_stats(ufc_link))
    return fighter

In [68]:
get_fighter('Jo')

https://www.sherdog.com/fighter/Joanna-Jedrzejczyk-101411


BaseException: UFC link not found !

In [62]:
def get_ufc_stats(url):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}
    htm = req.get(url, headers = headers)
    xml = html.document_fromstring(htm.content)

    str_tds = xml.xpath("//dd/text()")
    distance = xml.xpath("//div[@class='c-stat-3bar__value']/text()")
    stats = xml.xpath("//div[@class='c-stat-compare__number']/text()")

    fighter = {
        'strikes': {
            'attempted': str_tds[1],
            'landed': str_tds[0],
            'standing': distance[0].split(" ")[0],
            'clinch': distance[1].split(" ")[0],
            'ground': distance[2].split(" ")[0],
            'striking defense': stats[4].strip(),
            'strikes per minute': stats[0].strip()
        },
        'takedowns': {
            'attempted': str_tds[3],
            'landed': str_tds[2],
            'takedown defense':stats[5].strip(),
            'subs per 15min': stats[3].strip()
        }
    }
    return fighter

In [61]:
get_ufc_stats('https://www.ufc.com/athlete/jon-jones')

In [43]:
xml.xpath("//div[@class='c-stat-compare__number']/text()")

['4.29\n              ',
 '2.22\n          ',
 '1.93\n              ',
 '0.48\n          ',
 '64\n                  ',
 '95\n                      ',
 '0.22\n              ',
 '14:53\n          ']

In [44]:
xml.xpath("//div[@class='c-stat-compare__label']/text()")

['Sig. Str. Landed',
 'Sig. Str. Absorbed',
 'Takedown avg',
 'Submission avg',
 'Sig. Str. Defense',
 'Takedown Defense',
 'Knockdown Avg',
 'Average fight time']

In [55]:
fighter

{'strikes': {'attempted': '2536',
  'landed': '1468',
  'standing': '958',
  'clinch': '248',
  'ground': '262',
  'striking defense': '64'},
 'takedowns': {'attempted': '97',
  'landed': '36',
  'takedown defense': '95',
  'subs per 15min': '0.48'}}

In [45]:
1468/4.29

342.1911421911422

In [48]:
36/(1.93/15)

279.79274611398966

In [1]:
from fighter import get_fighter



In [None]:
get_fighter('Corey Sandhagen')

In [24]:
def get_upcoming_event_links():
    url = 'https://www.ufc.com/events'
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}
    htm = req.get(url, headers = headers)
    xml = html.document_fromstring(htm.content)
    return ["https://www.ufc.com/"+x for x in xml.xpath("//details[@id='events-list-upcoming']/div/div/div/div/div/section/ul/li/article/div[1]/div/a/@href")]

In [25]:
get_upcoming_event_links()

['https://www.ufc.com//event/ufc-fight-night-july-15-2023',
 'https://www.ufc.com//event/ufc-fight-night-july-22-2023',
 'https://www.ufc.com//event/ufc-291',
 'https://www.ufc.com//event/ufc-fight-night-august-05-2023',
 'https://www.ufc.com//event/ufc-fight-night-august-12-2023',
 'https://www.ufc.com//event/ufc-292',
 'https://www.ufc.com//event/ufc-fight-night-august-26-2023',
 'https://www.ufc.com//event/ufc-fight-night-september-02-2023']

In [142]:
url = 'https://www.ufc.com/event/ufc-280'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}
htm = req.get(url, headers = headers)
xml = html.document_fromstring(htm.content)

In [30]:
fights_html = xml.xpath("//div[@class='fight-card']/div/div/section/ul/li")

In [169]:
def get_upcoming_event_links():
    url = 'https://www.ufc.com/events'
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}
    htm = req.get(url, headers = headers)
    xml = html.document_fromstring(htm.content)
    return ["https://www.ufc.com/"+x for x in xml.xpath("//details[@id='events-list-upcoming']/div/div/div/div/div/section/ul/li/article/div[1]/div/a/@href")]

def get_ufc_link(query):
    possible_urls = search(query+" UFC")
    for url in possible_urls:
        if ("ufc.com/event/" in url):
            return url
    raise BaseException("UFC link not found !")
    
def get_ranking(fight, corner):
    if corner == 'red':
        path = "div/div/div/div[2]/div[2]/div[2]/div[1]/span/text()"
    else:
        path = "div/div/div/div[2]/div[2]/div[2]/div[2]/span/text()"
        
    try:
        return fight.xpath(path)[0][1:]
    except IndexError:
        return "Unranked"
    
def get_name(fight, corner):
    if corner == 'red':
        path = "div/div/div/div[2]/div[2]/div[5]/div[1]/a/span/text()"
    else:
        path = "div/div/div/div[2]/div[2]/div[5]/div[3]/a/span/text()"
        
    name = " ".join(fight.xpath(path))
    
    if name == '':
        path = path.replace("/span", "")
        name = " ".join(fight.xpath(path)).strip()
    
    return name

def parse_event(url, past=True):
    
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}
    htm = req.get(url, headers = headers)
    xml = html.document_fromstring(htm.content)
    fights_html = xml.xpath("//div[@class='fight-card']/div/div/section/ul/li")
    
    prefix = xml.xpath("//div[@class='c-hero__header']/div[1]/div/h1/text()")[0].strip()
    names = xml.xpath("//div[@class='c-hero__header']/div[2]/span/span/text()")

    name = f"{prefix}: {names[0].strip()} vs. {names[-1].strip()}"

    date = dt.datetime.fromtimestamp(int(xml.xpath("//div[@class='c-hero__bottom-text']/div[1]/@data-timestamp")[0]))
    date = date.strftime("%Y-%m-%d")
    location = xml.xpath("//div[@class='c-hero__bottom-text']/div[2]/div/text()")[0].split(",")

    event = {
        'name': name,
        'date': date,
        'location': location[1].strip(),
        'venue': location[0].strip(),
        'fights': []
    }
    for fight in fights_html:
        this_fight = {
                'weightclass': fight.xpath("div/div/div/div[2]/div[2]/div[1]/div[2]/text()")[0][:-5],
                'red corner': {
                    'name': get_name(fight, 'red'),
                    'ranking': get_ranking(fight, 'red'),
                    'odds': fight.xpath("div/div/div/div[4]/div[2]/span[1]/span/text()")[0],
                    'link': fight.xpath("div/div/div/div[2]/div[2]/div[5]/div[1]/a/@href")[0]
                },
                'blue corner': {
                    'name': get_name(fight, 'blue'),
                    'ranking': get_ranking(fight, 'blue'),
                    'odds': fight.xpath("div/div/div/div[4]/div[2]/span[3]/span/text()")[0],
                    'link': fight.xpath("div/div/div/div[2]/div[2]/div[5]/div[3]/a/@href")[0]            
                }
            }
        if past:
            result = fight.xpath("div/div/div/div[2]//div[@class='c-listing-fight__outcome-wrapper']/div/text()")
            method = fight.xpath("div//div[@class='c-listing-fight__result-text method']/text()")
            
            finished_round = fight.xpath("div//div[@class='c-listing-fight__result-text round']/text()")
            finished_time = fight.xpath("div//div[@class='c-listing-fight__result-text time']/text()")
            
            this_fight['round'] = finished_round[0]
            this_fight['time'] = finished_time[0]
            this_fight['method'] = method[0]
            this_fight['red corner']['result'] = result[0].strip()
            this_fight['blue corner']['result'] = result[1].strip()
        event['fights'].append(this_fight)
    return event

def get_upcoming_events():
    links = get_upcoming_event_links()
    
    results = {}
    
    for url in links:
        event = parse_event(url, False)
        results[event['name']] = event
    return results


In [137]:
def get_ufc_link(query):
    possible_urls = search(query+" UFC")
    for url in possible_urls:
        if ("ufc.com/event/" in url):
            return url
    raise BaseException("UFC link not found !")

In [140]:
def get_event(query):
    link = get_ufc_link(query)
    return parse_event(link)

In [166]:
fight.xpath("div//div[@class='c-listing-fight__result-text method']/text()")

['Submission', 'Submission']

In [None]:
import fighter as ft
ft.get_fighter("Frankie Edgar")

In [4]:
ft.get_ufc_link("Frankie Edgar")

BaseException: UFC link not found !

In [2]:
ft.get_upcoming_events()

{'UFC Fight Night: Holm vs. Bueno Silva': {'name': 'UFC Fight Night: Holm vs. Bueno Silva',
  'date': '2023-07-15',
  'location': 'Las Vegas United States',
  'venue': 'UFC APEX',
  'fights': [{'weightclass': "Women's Bantamweight",
    'red corner': {'name': 'Holly Holm',
     'ranking': '3',
     'odds': '-170',
     'link': 'https://www.ufc.com/athlete/holly-holm'},
    'blue corner': {'name': 'Mayra Bueno Silva',
     'ranking': '10',
     'odds': '+145',
     'link': 'https://www.ufc.com/athlete/mayra-bueno-silva'}},
   {'weightclass': 'Welterweight',
    'red corner': {'name': 'Jack Della Maddalena',
     'ranking': '14',
     'odds': '-625',
     'link': 'https://www.ufc.com/athlete/sara-elpar-0'},
    'blue corner': {'name': 'Bassil Hafez',
     'ranking': 'Unranked',
     'odds': '+440',
     'link': 'https://www.ufc.com/athlete/bassil-hafez'}},
   {'weightclass': 'Lightweight',
    'red corner': {'name': 'Ottman Azaitar',
     'ranking': 'Unranked',
     'odds': '-105',
     

In [93]:
xml.xpath("//div[@class='c-hero__bottom-text']/div[1]/@data-timestamp")

['1689472800']

In [98]:
dt.datetime.fromtimestamp(int(xml.xpath("//div[@class='c-hero__bottom-text']/div[1]/@data-timestamp")[0])).strftime("%Y-%m-%d")

'2023-07-15'

In [109]:
xml.xpath("//div[@class='c-hero__header']/div[1]/div/h1/text()")[0].strip()

'UFC Fight Night'

In [113]:
xml.xpath("//div[@class='c-hero__header']/div[2]/span/span/text()")

['Holm', '\n    ', '\n    ', '\n    ', 'Bueno Silva']

In [119]:
fight.xpath("div/div/div/div[2]/div[2]/div[2]/div[2]/span/text()")

[]

In [120]:
fight.xpath("div/div/div/div[2]/div[2]/div[2]/div[1]/span/text()")[0][1:]

'14'

In [133]:
req.get('https://www.ufc.com/views/ajax?_wrapper_format=drupal_ajax')

<Response [404]>

In [148]:
results = xml.xpath("//div[@class='c-listing-fight__outcome-wrapper']/div/text()")

In [153]:
fight.xpath("//div[@class='c-listing-fight__outcome-wrapper']/div/text()")

[]

In [162]:
fight.xpath("div/div/div/div[2]//div[@class='c-listing-fight__outcome-wrapper']/div/text()")

['\n                  Loss\n                ',
 '\n                  Win\n                ']