# Determine list of UFC fighters and their URLs

Let's go to https://en.wikipedia.org/wiki/List_of_male_mixed_martial_artists and extract every fighter name (and later its URL) that has 'UFC' listed next to it.

In [1]:
from pathlib import Path

import pandas as pd

#scraping
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia roster page. The timeout keeps a stalled connection
# from hanging the kernel, and raise_for_status() stops the notebook on an
# HTTP error response instead of silently parsing an error page.
fighters_list_page = requests.get(
    'https://en.wikipedia.org/wiki/List_of_male_mixed_martial_artists',
    timeout=30,
)
fighters_list_page.raise_for_status()

# Parse the raw response bytes into a navigable tree.
soup = BeautifulSoup(fighters_list_page.content, 'html.parser')

In [3]:
# Build the parse tree from the downloaded page bytes.
# NOTE(review): this repeats the parse performed right after the download;
# one of the two assignments can likely be dropped.
soup = BeautifulSoup(markup=fighters_list_page.content, features='html.parser')

In [4]:
# Raw list-item texts that clean_name looks up verbatim, mapped to the name
# that should be returned for them.
name_exceptions = {
    'Tim Hague (†)- (KOTC, UFC, MFC, WSOF)': 'Tim Hague',
    'Kevin Randleman (†) - (PRIDE, UFC, Sengoku, Strikeforce)': 'Kevin Randleman',
}

def remove_nick_name(name):
    """Remove the first double-quoted nickname from ``name``.

    ``'David "Tank" Abbott'`` becomes ``'David Abbott'``.

    Parameters
    ----------
    name : str
        A fighter name that may contain a nickname wrapped in double quotes.

    Returns
    -------
    str
        ``name`` without the quoted nickname. The string is returned
        unchanged when it contains no double quote, or only a single
        unbalanced one.
    """
    before, opening, rest = name.partition('"')
    if not opening:
        return name

    _nickname, closing, after = rest.partition('"')
    if not closing:
        # Unbalanced quote: there is no complete nickname to cut out, so
        # leave the text alone rather than raising.
        return name

    # `before` normally already ends with a space, so drop the one that
    # follows the closing quote -- but only if it really is a space, so a
    # letter placed directly after the quote is not lost.
    if after.startswith(' '):
        after = after[1:]
    return before + after

def clean_name(name):
    """Reduce the text of one list item to a bare fighter name.

    Leading whitespace is stripped first. If the stripped text is a key of
    ``name_exceptions``, the mapped value is returned. Otherwise the text is
    cut at the first ``' - '`` (or at the first ``'['`` when that comes
    earlier), any quoted nickname is removed with ``remove_nick_name``, and
    trailing whitespace is stripped.

    Raises
    ------
    ValueError
        If the text is not in ``name_exceptions`` and contains no ``' - '``.
    """
    stripped = name.lstrip()
    if name_exceptions.__contains__(stripped):
        return name_exceptions[stripped]

    # str.index raises ValueError when the separator is absent.
    cut = stripped.index(' - ')
    bracket = stripped.find('[')
    if bracket != -1:
        # A bracket ahead of the separator ends the name earlier.
        cut = min(cut, bracket)

    without_nick = remove_nick_name(stripped[:cut])
    return without_nick.rstrip()

# Index of the first <li> that is examined; everything before it is skipped.
# NOTE(review): presumably the earlier items are navigation/contents entries
# ahead of the fighter roster -- the offset is tied to the page layout, so
# confirm it against the live page.
FIRST_ROSTER_LI = 26

# Keep every list item whose text mentions 'UFC', reduced to the bare name.
# find_all is the supported spelling of the legacy findAll alias, and the
# text extracted for the check is reused instead of being computed twice.
ufc_names = []

for li in soup.find_all('li')[FIRST_ROSTER_LI:]:
    txt = li.get_text()
    if 'UFC' in txt:
        ufc_names.append(clean_name(txt))

ufc_names

['David Abbott',
 'Papy Abedi',
 'Sam Adkins',
 'Omari Akhmedov',
 'Yoshihiro Akiyama',
 'Mostapha Al-Turk',
 'John Albert',
 'Ildemar Alcântara',
 'Iuri Alcântara',
 'José Aldo',
 'John Alessio',
 'Houston Alexander',
 'Royce Alger',
 'Sultan Aliev',
 'Ben Alloway',
 'Ricardo Almeida',
 'Thomas Almeida',
 'Eddie Alvarez',
 'Sean Alvarez',
 'Thiago Alves',
 'Warlley Alves',
 'Sam Alvey',
 'Adlan Amagov',
 'Makwan Amirkhani',
 'Corey Anderson',
 'Alex Andrade',
 'Viscardi Andrade',
 'Dylan Andrews',
 'Reese Andy',
 'Yoji Anjo',
 'Gadzhimurad Antigulov',
 'Felipe Arantes',
 'Andrei Arlovski',
 'Matt Arroyo',
 'Junior Assunção',
 'Raphael Assunção',
 'Rich Attonito',
 'Olivier Aubin-Mercier',
 'Marcus Aurélio',
 'Niklas Bäckström',
 'Seth Baczynski',
 'Ryan Bader',
 'Ali Bagautinov',
 'Siyar Bahadurzada',
 'Shamar Bailey',
 'Antonio Banuelos',
 'Renan Barão',
 'Bryan Barberena',
 'Edson Barboza',
 'Luke Barnatt',
 'Josh Barnett',
 'David Baron',
 'Phil Baroni',
 'Carlos Barreto',
 'Franci

In [5]:
# Fighter names mapped to a hand-picked Wikipedia URL, used by
# form_fighter_url as a lookup table keyed on the cleaned name.
url_exceptions = {
    'Mostapha Al-Turk': 'https://en.wikipedia.org/wiki/Mostapha_al-Turk',
    'Ryan Janes': 'https://en.wikipedia.org/wiki/Ryan_Janes_(fighter)',
    'Lipeng Zhang': 'https://en.wikipedia.org/wiki/Zhang_Lipeng',
}

def form_fighter_url(name):
    """Return a Wikipedia article URL for ``name``, or ``None`` if none is found.

    Lookup order:

    1. ``url_exceptions`` -- a hand-curated entry is returned directly, so it
       cannot be shadowed by a guessed page that merely happens to exist,
       and it costs no network round trip.
    2. ``https://en.wikipedia.org/wiki/<Name>_(fighter)``
    3. ``https://en.wikipedia.org/wiki/<Name>``

    Guessed URLs are accepted when a HEAD request answers with status 200.

    Parameters
    ----------
    name : str
        Fighter name; spaces are replaced by underscores to form the title.

    Returns
    -------
    str or None
        The resolved URL. When nothing matches, a ``'No URL: ...'`` line is
        printed and ``None`` is returned.
    """
    if name in url_exceptions:
        return url_exceptions[name]

    url_guess = 'https://en.wikipedia.org/wiki/' + name.replace(' ', '_')

    # The '_(fighter)' variant is probed first, so it wins when both exist.
    for candidate in (url_guess + '_(fighter)', url_guess):
        # The timeout keeps one stalled request from blocking the whole run.
        if requests.head(candidate, timeout=10).status_code == 200:
            return candidate

    print('No URL: {0}, {1}'.format(name,url_guess))
    return None
    
# Map each fighter name to its resolved URL; names for which
# form_fighter_url returns a falsy value are left out.
urls = {}

for name in ufc_names:
    url = form_fighter_url(name)
    if not url:
        continue
    urls[name] = url

In [8]:
# One row per fighter: the dict keys become the 'Name' column and the
# values the 'URL' column.
urls_df = (
    pd.DataFrame(pd.Series(urls), columns=['URL'])
    .reset_index()
    .rename(columns={'index': 'Name'})
)

# Build the output path from components. A backslash-separated string
# literal depends on invalid '\D' / '\F' escape sequences and only points
# inside the Data directory on Windows.
csv_path = Path('Data') / 'FighterURLs.csv'
# to_csv does not create missing directories, so make sure the target exists.
csv_path.parent.mkdir(parents=True, exist_ok=True)
urls_df.to_csv(csv_path, index=False)
urls_df.head(20)

Unnamed: 0,Name,URL
0,Aaron Riley,https://en.wikipedia.org/wiki/Aaron_Riley
1,Aaron Rosa,https://en.wikipedia.org/wiki/Aaron_Rosa
2,Aaron Simpson,https://en.wikipedia.org/wiki/Aaron_Simpson
3,Abel Trujillo,https://en.wikipedia.org/wiki/Abel_Trujillo
4,Adlan Amagov,https://en.wikipedia.org/wiki/Adlan_Amagov
5,Adriano Martins,https://en.wikipedia.org/wiki/Adriano_Martins
6,Akihiro Gono,https://en.wikipedia.org/wiki/Akihiro_Gono
7,Akira Corassani,https://en.wikipedia.org/wiki/Akira_Corassani
8,Al Iaquinta,https://en.wikipedia.org/wiki/Al_Iaquinta
9,Alan Belcher,https://en.wikipedia.org/wiki/Alan_Belcher
