In [13]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import json
import spacy

In [14]:
spacy_nlp = spacy.load('en_core_web_sm')

In [70]:
def entity_lst_from_wiki_url(meta_url):
    response = requests.get(meta_url)

    soup = BeautifulSoup(response.content, "html.parser")

    href_set = set()
    for link in soup.find_all("a"):
        href = link.get("href")
        if href and "/wiki/" in href:
            #print(href)
            href_set.add(href.replace("/wiki/", ""))
    href_set_lst = list(href_set)
    print(f"length of entities in the list: %d"%(len(href_set_lst)))

    print('running space nlp to filter out location type entities')

    href_set_person_lst = []
    for entity in href_set_lst:
        if entity[:5]!='https':
            loaded = spacy_nlp(entity.replace("_", " "))
            if loaded.ents:
                #print(loaded.ents)
                #if loaded.ents[0].label_=='PERSON':
                if loaded.ents[0].label_!='GPE':
                    url = "https://en.wikipedia.org" + "/wiki/" + entity.replace(" ", "_")
                    href_set_person_lst.append(url)

    return href_set_person_lst

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_UFC_fighters"

response = requests.get(url)

soup = BeautifulSoup(response.content, "html.parser")

fighter_links = []
for link in soup.find_all("a"):
    href = link.get("href")
    if href and "/wiki/" in href and "UFC_fighter" in href:
        fighter_links.append("https://en.wikipedia.org" + href)

In [10]:
href_set = set()
for link in soup.find_all("a"):
    href = link.get("href")
    if href and "/wiki/" in href:
        #print(href)
        href_set.add(href.replace("/wiki/", ""))

In [42]:
href_set_lst = list(href_set)
len(href_set_lst)

647

### filter entities extracted from the wiki page, assume any entity that is not a geographic location(GPE) is a UFC fighter; we go this approach because sometimes spacy won't recognize the fighter as person(PERSON) correctly.


In [59]:
href_set_person_lst = []
for entity in href_set_lst:
    if entity[:5]!='https':
        loaded = spacy_nlp(entity.replace("_", " "))
        if loaded.ents:
            #print(loaded.ents)
            #if loaded.ents[0].label_=='PERSON':
            if loaded.ents[0].label_!='GPE':
                url = "https://en.wikipedia.org" + "/wiki/" + entity.replace(" ", "_")
                href_set_person_lst.append(url)

In [60]:
len(href_set_person_lst)

474

In [64]:
with open("data/ufc_wiki_urls_raw.txt", "w") as f:
    for item in href_set_person_lst:
        f.write(f"{item}\n")

### manually remove links that are not UFC fighters

In [None]:
### e.g. https://en.wikipedia.org/wiki/Strawweight_(MMA), https://en.wikipedia.org/wiki/UFC_290, https://en.wikipedia.org/wiki/Category:Articles_with_hCards

In [66]:
with open("data/ufc_wiki_urls_raw.txt", "r") as f:
    lines = f.readlines()
ufc_fighters_lst = [line.strip() for line in lines]

In [67]:
len(href_set_person_lst)

474

In [71]:
href_current_person_lst = entity_lst_from_wiki_url("https://en.wikipedia.org/wiki/List_of_current_UFC_fighters")

length of entities in the list: 644
running space nlp to filter out location type entities


In [72]:
href_UFC_rankings_person_lst = entity_lst_from_wiki_url("https://en.wikipedia.org/wiki/UFC_rankings")

length of entities in the list: 676
running space nlp to filter out location type entities


In [None]:
href_UFC_hof_person_lst = entity_lst_from_wiki_url("https://en.wikipedia.org/wiki/UFC_Hall_of_Fame")

In [75]:
href_combined_person_lst = list(set(href_set_person_lst + href_current_person_lst + href_UFC_rankings_person_lst))

In [76]:
len(href_combined_person_lst)

638

In [81]:
href_combined_person_lst = [url for url in href_combined_person_lst if 'UFC' not in url]

In [82]:
len(href_combined_person_lst)

486

In [78]:
### https://en.wikipedia.org/wiki/Ultimate_Fighting_Championship
### The organization had 578 fighters contracted as of January 2023.

In [83]:
with open("data/ufc_wiki_urls_raw_v2.txt", "w") as f:
    for item in href_combined_person_lst:
        f.write(f"{item}\n")

In [84]:
added_persons_lst = [ent for ent in href_combined_person_lst if ent not in href_set_person_lst]

In [86]:
len(added_persons_lst)

64

In [87]:
with open("data/ufc_wiki_urls_raw_v2_check.txt", "w") as f:
    for item in added_persons_lst:
        f.write(f"{item}\n")

In [90]:
# manually review added_persons_lst
added_url_lst = [
    "https://en.wikipedia.org/wiki/Ariane_Lipski",
    "https://en.wikipedia.org/wiki/Muhammad_Mokaev",
    "https://en.wikipedia.org/wiki/Francis_Ngannou",
    "https://en.wikipedia.org/wiki/Michelle_Waterson-Gomez"
]

In [97]:
with open('data/ufc_wiki_urls.txt', 'r') as file:
    urls_orig = file.readlines()
urls_orig = [url.strip() for url in urls_orig]

In [98]:
href_set_person_lst_v2 = list(set(urls_orig + added_url_lst))

In [99]:
len(href_set_person_lst_v2)

389

In [100]:
with open("data/ufc_wiki_urls_v2.txt", "w") as f:
    for item in href_set_person_lst_v2:
        f.write(f"{item}\n")