# 01 Scraping Fighter Info

## Imports

In [94]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import math
import re
from matplotlib import pyplot as plt
import time
import datetime

## Get List of Urls for Each Page of Fighters

In [85]:
url = 'http://www.ufc.com/fighter/Weight_Class/filterFighters?offset={offset}&fighterFilter=All'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'lxml')
number_of_fighters = int(soup.find('div', {'class':'paginate-results'}).find_all('span', {'class':'row-count'})[1].text)
fighters_per_page = 20
pages = math.ceil(number_of_fighters/fighters_per_page)

urls = []
for i in range(pages):
    offset = i*20
    url = f'http://www.ufc.com/fighter/Weight_Class/filterFighters?offset={offset}&fighterFilter=All'
    urls.append(url)

In [86]:
list_of_fighter_dict = []

for page_num, url in enumerate(urls):

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    
    # Get the list of fighters object from the page
    fighters_html = soup.find('table', {'class':'fighter-listing'})
    list_of_fighter_htmls = fighters_html.find_all('tr', {'class':'fighter'})
    
    if page_num%5==0:
        print(f"Scrapping Page {page_num } of {len(urls)}...")
    for fighter_html in list_of_fighter_htmls:
        fighter_dict = {}
        fighter_dict['fighter_slug'] = fighter_html.find('div', {'class':'fighter-info'}).find('a').attrs['href'][9:]
        fighter_dict['fighter_name'] = re.sub('\n', '',fighter_html.find('div', {'class':'fighter-info'}).find('a').text.strip())
        
        if len(fighter_html.find_all('div', {'class':'main-txt'})) == 3:
            # Get wins, losses and ties
            win_loss_html = fighter_html.find_all('div', {'class':'main-txt'})[0]
            if win_loss_html.text:
                fighter_dict['wins'] = int(re.match("(\d*)-(\d*)-(\d*)", win_loss_html.text).group(1))
                fighter_dict['losses'] = int(re.match("(\d*)-(\d*)-(\d*)", win_loss_html.text).group(2))
                fighter_dict['ties'] = int(re.match("(\d*)-(\d*)-(\d*)", win_loss_html.text).group(3))

            # Get Height - problem - sometimes no height

            height_html = fighter_html.find_all('div', {'class':'main-txt'})[1]

            feet = int(re.match("(\d*)' (\d*)", height_html.text).group(1))
            inches = int(re.match("(\d*)' (\d*)", height_html.text).group(2))
            total_height = feet*12 + inches
            fighter_dict['height'] = total_height

            # Get Weight
            weight_html = fighter_html.find_all('div', {'class':'main-txt'})[2]
            fighter_dict['weight'] = int(re.match("(\d*) lbs", weight_html.text).group(1))

        list_of_fighter_dict.append(fighter_dict)
        time.sleep(1)
    print("Done!")

Scrapping Page 0 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 5 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 10 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 15 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 20 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 25 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 30 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 35 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 40 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 45 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 50 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 55 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 60 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 65 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 70 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 75 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 80 of 110...
Done!
Done!
Done!
Done!
Done!
Scrapping Page 8

In [133]:
df = pd.DataFrame(list_of_fighter_dict)

In [134]:
len(list_of_fighter_dict)

2183

## Export Fighter Slug Data

In [90]:
df.to_csv('../../02_Data/01_Raw_Scraped_Data/Fighters_Slug.csv')

## For each fighter in the list, go to their UFC page and pull down their info

In [147]:
def get_all_fighter_info(list_fighter_slugs):
    list_fighter_info_dict = []
    
    for key, fighter_slug in enumerate(list_fighter_slugs):
        if key%25==0:
            print(f'{key}: Scraping {fighter_slug} at {datetime.datetime.now().strftime("%Y-%m-%d %H:%M")}')
        fighter_url = f'http://www.ufc.com/fighter/{fighter_slug}'
        try:
            response = requests.get(fighter_url)
            soup = BeautifulSoup(response.content, 'lxml')
            fighter_info_dict = get_fighter_info(soup, fighter_slug)
            list_fighter_info_dict.append(fighter_info_dict)

        except:
            log_scraping_fights(fighter_slug)
            
        time.sleep(1)
        
    return list_fighter_info_dict

In [122]:
def log_scraping_fights(fighter_slug):
    print(f'error with {fighter_slug} at {datetime.datetime.now().strftime("%Y-%m-%d %H:%M")}')
    with open('fighter_info_error_log.csv', 'a+', newline='') as f:
        f.write(f'\nerror scraping {fighter_slug} at {datetime.datetime.now().strftime("%Y-%m-%d %H:%M")}')


In [123]:
def get_fighter_info(soup, fighter_slug):
    fighter_info_dict = {}
    for tr in soup.find('div', {'class':'fighter-info'}).find('table').find_all('tr'):
        label = tr.find('td', {'class':'label'}).text
        value = tr.find('td', {'class':'value'}).text

        # Try to clean up the value if exists
        try:
            value = re.sub('\n','', value)
            value = re.sub('\t','', value)
        except:
            print('something here??')
            pass
        fighter_info_dict[label] = value
    fighter_info_dict['fighter_slug'] = fighter_slug
    return fighter_info_dict

In [148]:
list_fighter_info_dict = get_all_fighter_info(df.fighter_slug)

0: Scraping danny-Abbadi at 2018-07-10 22:43
25: Scraping alfonso-Alcarez at 2018-07-10 22:44
50: Scraping adlan-Amagov at 2018-07-10 22:45
75: Scraping igor-Araujo at 2018-07-10 22:45
100: Scraping mehdi-Baghdad at 2018-07-10 22:46
125: Scraping shayna-Baszler at 2018-07-10 22:47
150: Scraping matt-Bessette at 2018-07-10 22:48
175: Scraping steve-Bosse at 2018-07-10 22:49
200: Scraping damien-brown at 2018-07-10 22:50
225: Scraping kevin-Burns at 2018-07-10 22:50
250: Scraping carlos-Candelario at 2018-07-10 22:51
275: Scraping duane-Cason at 2018-07-10 22:52
300: Scraping john-Cholish at 2018-07-10 22:53
325: Scraping mark-Coleman at 2018-07-10 22:54
350: Scraping jc-cottrell at 2018-07-10 22:55
375: Scraping patrick-Cummins at 2018-07-10 22:56
error with w.wec.tv/lcdavis at 2018-07-10 22:57
400: Scraping marcus-Davis at 2018-07-10 22:57
425: Scraping royden-Demotta at 2018-07-10 22:57
450: Scraping russell-Doane at 2018-07-10 22:58
475: Scraping alexis-Dufresne at 2018-07-10 22:59
5

In [149]:
df = pd.DataFrame(list_fighter_info_dict)

## Export

In [153]:
df.to_csv('../../02_Data/01_Raw_Scraped_Data/Fighter_Info2181.csv')