### Scrape fight records

Here, we go to each fighter's URL and pull out their fight record. Each record is saved as a separate pickle file.

In [1]:
import numpy as np
import pandas as pd
from warnings import warn
import os

#scraping
import requests
from bs4 import BeautifulSoup

In [2]:
# Load the fighter name -> Wikipedia URL table, indexed by fighter name.
# The path is built with os.path.join so it resolves on every OS; the previous
# '.\Data\FighterURLs.csv' literal only worked on Windows and relied on the
# invalid escape sequences '\D' and '\F'.
fighter_urls = pd.read_csv(os.path.join('Data', 'FighterURLs.csv'))
fighter_urls = fighter_urls.set_index('Name')

drop = ['André Pederneiras',
        'Christian Morecraft',
        'Colin Fletcher',
        'David Baron',
        'Mark Schultz']

#drop some people where we have difficulty with their URLs:
fighter_urls = fighter_urls.drop(drop)

# Manually override URLs for these fighters.
fighter_urls.loc['Daniel Kelly','URL'] = 'https://en.wikipedia.org/wiki/Dan_Kelly_(fighter)'
fighter_urls.loc['Maurice Smith','URL'] = 'https://en.wikipedia.org/wiki/Maurice_Smith_(kickboxer)'
fighter_urls.loc['Ryan Hall','URL'] = 'https://en.wikipedia.org/wiki/Ryan_Hall_(grappler)'

fighter_urls.head(10)

Unnamed: 0_level_0,URL
Name,Unnamed: 1_level_1
Aaron Riley,https://en.wikipedia.org/wiki/Aaron_Riley
Aaron Rosa,https://en.wikipedia.org/wiki/Aaron_Rosa
Aaron Simpson,https://en.wikipedia.org/wiki/Aaron_Simpson_(f...
Abel Trujillo,https://en.wikipedia.org/wiki/Abel_Trujillo
Adlan Amagov,https://en.wikipedia.org/wiki/Adlan_Amagov
Adriano Martins,https://en.wikipedia.org/wiki/Adriano_Martins_...
Akihiro Gono,https://en.wikipedia.org/wiki/Akihiro_Gono
Akira Corassani,https://en.wikipedia.org/wiki/Akira_Corassani
Al Iaquinta,https://en.wikipedia.org/wiki/Al_Iaquinta
Alan Belcher,https://en.wikipedia.org/wiki/Alan_Belcher


In [6]:
#These are rows that I'm manually correcting.
# Maps the exact text of a scraped table row (as returned by row.get_text())
# to a corrected version of that text. table_from_URL looks each row up here
# and substitutes the replacement before splitting the row on '\n'.
# The replacements differ from their keys only in newline placement;
# presumably this makes each row split into the same number of fields as the
# table header -- TODO confirm.
# NOTE: keys must match the scraped text byte-for-byte, so do not reformat
# these literals.
row_text_exceptions = {}
# Location was split across two lines ('United\nStates') with an extra blank line after it.
row_text_exceptions['\nWin\n9–0\nAntônio Silva\nKO (punches)\nStrikeforce: Barnett vs. Kharitonov\n000000002011-09-10-0000September 10, 2011\n1\n3:56\nCincinnati, Ohio, United\nStates\n\nStrikeforce Heavyweight Grand Prix semifinal.\n'] = \
'\nWin\n9–0\nAntônio Silva\nKO (punches)\nStrikeforce: Barnett vs. Kharitonov\n000000002011-09-10-0000September 10, 2011\n1\n3:56\nCincinnati, Ohio, United States\nStrikeforce Heavyweight Grand Prix semifinal.\n'
# Extra trailing newline after the notes.
row_text_exceptions['\nLoss\n10–6\nRiley Dutro\nTKO (punches)\nPacific Xtreme Combat 55\n000000002016-11-18-0000November 18, 2016\n1\n2:46\nMangilao, Guam\nFor the vacant Pacific Xtreme Combat (PXC) Flyweight title.\n\n']= '\nLoss\n10–6\nRiley Dutro\nTKO (punches)\nPacific Xtreme Combat 55\n000000002016-11-18-0000November 18, 2016\n1\n2:46\nMangilao, Guam\nFor the vacant Pacific Xtreme Combat (PXC) Flyweight title.\n'
# Notes were spread over two lines; merge them into one.
row_text_exceptions['\nLoss\n11–4\nJan Błachowicz\nDecision (unanimous)\nKSW 22\n000000002013-03-16-0000March 16, 2013\n3\n5:00\nWarsaw, Poland\nFor the KSW Light Heavyweight Championship.\nFight of the Night.\n'] = '\nLoss\n11–4\nJan Błachowicz\nDecision (unanimous)\nKSW 22\n000000002013-03-16-0000March 16, 2013\n3\n5:00\nWarsaw, Poland\nFor the KSW Light Heavyweight Championship.Fight of the Night.\n'
# Extra blank line between the method and the event name.
row_text_exceptions['\nWin\n13–3–2\nDave Rivas\nKO (Punches and Knees)\n\nDB 11 - DesertBrawl 11\n000000002004-07-17-0000July 17, 2004\n3\nN/A\nBend, Oregon, United States\n\n'] = '\nWin\n13–3–2\nDave Rivas\nKO (Punches and Knees)\nDB 11 - DesertBrawl 11\n000000002004-07-17-0000July 17, 2004\n3\nN/A\nBend, Oregon, United States\n\n'
# One trailing newline too many.
row_text_exceptions['\nLoss\n16–13\nAlexandre Pantoja\nSubmission (rear-naked choke)\nUFC Fight Night: Nelson vs. Ponzinibbio\n000000002017-07-16-000016 July 2017\n3\n2:31\nGlasgow, Scotland\n\n\n'] = '\nLoss\n16–13\nAlexandre Pantoja\nSubmission (rear-naked choke)\nUFC Fight Night: Nelson vs. Ponzinibbio\n000000002017-07-16-000016 July 2017\n3\n2:31\nGlasgow, Scotland\n\n'
# Extra trailing newline after the notes.
row_text_exceptions['\nLoss\n15–7\nOwen Roddy\nDecision (split)\nCage Contender 21\n000000002012-07-22-0000July 22, 2012\n3\n5:00\nDublin, Ireland\nFeatherweight debut; for Cage Contender Featherweight Championship.\n\n'] = '\nLoss\n15–7\nOwen Roddy\nDecision (split)\nCage Contender 21\n000000002012-07-22-0000July 22, 2012\n3\n5:00\nDublin, Ireland\nFeatherweight debut; for Cage Contender Featherweight Championship.\n'

def correct_table(table):
    """
    Given a table, this should tell us if this is the MMA fight record table.

    A table counts as the fight record when one of the newline-separated
    cells of its first row is exactly 'Opponent'.

    Parameters
    ----------
    table : object exposing ``find('tr')`` (e.g. a BeautifulSoup table tag)

    Returns
    -------
    bool
        True if the first row has an 'Opponent' cell. False otherwise,
        including when the table has no rows at all (this used to raise
        IndexError and abort the scan of the remaining tables).
    """
    # find('tr') returns the first row, or None when there is none.
    header_row = table.find('tr')
    if header_row is None:
        return False
    return 'Opponent' in header_row.get_text().split('\n')


def table_from_URL(url,print_rows=False,timeout=30):
    """
    Download a fighter's page and return their MMA fight record as a DataFrame.

    The record table is the first 'wikitable' after the 'Mixed martial arts
    record' headline whose header row has an 'Opponent' cell. Each table row
    is converted to text and split on newlines; the first row provides the
    column names. A 'birthday' column is added to every row.

    Parameters
    ----------
    url : str
        URL of the fighter's page.
    print_rows : bool, default False
        If True, print each row's field count, fields and raw text (debugging aid).
    timeout : float, default 30
        Seconds to wait for the HTTP response, so a stalled connection cannot
        hang the scrape indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        The fight record, or None (after a warning) when the page does not
        return HTTP status 200.

    Raises
    ------
    ValueError
        If the record headline is missing, or no table after it looks like a
        fight record.
    """
    fighter_page = requests.get(url, timeout=timeout)

    if fighter_page.status_code != 200:
        warn('Failed on: '+url)
        return None

    fighter_soup = BeautifulSoup(fighter_page.content, 'html.parser')

    # The headline id is capitalised differently on some pages; try both.
    mma_record_headline = None
    for headline_id in ('Mixed_martial_arts_record', 'Mixed_Martial_Arts_Record'):
        mma_record_headline = fighter_soup.find(class_="mw-headline",id=headline_id)
        if mma_record_headline is not None:
            break

    if mma_record_headline is None:
        raise ValueError('mma_record_headline is None for: ' + url)

    tables = mma_record_headline.find_all_next('table', class_='wikitable')

    # Take the first table that looks like a fight record. Previously, when no
    # table matched, `table` was never bound and the code below crashed with
    # an UnboundLocalError.
    table = next((tb for tb in tables if correct_table(tb)), None)
    if table is None:
        raise ValueError('No fight record table found for: ' + url)

    # One known page has two fights fused into a single row; it is split back
    # into the two rows below.
    fused_row = '\nLoss\n24–8–2\nRyan Schultz\nDecision (split)\nIFL - Las Vegas\n000000002008-02-29-0000February 29, 2008\n3\n5:00\nLas Vegas, Nevada, United States\nLoss\n24–7–2\nWagnney Fabiano\nSubmission (guillotine choke)\nIFL - World Grand Prix Semifinals\n000000002007-11-03-0000November 3, 2007\n2\n1:53\nChicago, Illinois, United States\n\n'

    raw_rows = []
    for row in table.find_all('tr'):
        row_text = row.get_text()

        # Apply manual corrections for rows known to be malformed.
        if row_text in row_text_exceptions:
            row_text = row_text_exceptions[row_text]

        # Page-specific fix: rows that split into 13 fields on this page lose
        # their last two characters.
        if url == 'https://en.wikipedia.org/wiki/Evan_Tanner' and len(row_text.split('\n')) == 13:
            row_text = row_text[:-2]

        if row_text == fused_row:
            r1 = '\nLoss\n24–8–2\nRyan Schultz\nDecision (split)\nIFL - Las Vegas\n000000002008-02-29-0000February 29, 2008\n3\n5:00\nLas Vegas, Nevada, United States\n\n'
            r2 = '\nLoss\n24–7–2\nWagnney Fabiano\nSubmission (guillotine choke)\nIFL - World Grand Prix Semifinals\n000000002007-11-03-0000November 3, 2007\n2\n1:53\nChicago, Illinois, United States\n\n'
            raw_rows.append(r1.split('\n'))
            raw_rows.append(r2.split('\n'))
            continue

        fields = row_text.split('\n')

        if print_rows:
            print('-----------')
            print(len(fields))
            print(fields)
            print(repr(row_text))

        raw_rows.append(fields)

    # get birthday
    bday_find = fighter_soup.find(class_="bday")
    if bday_find is not None:
        bday = bday_find.get_text()
    else:
        print('No bday found for: '+ url)
        bday = 'NA'

    # The first row is the table header; the remaining rows are fights.
    df_out = pd.DataFrame(raw_rows[1:],columns=raw_rows[0])
    df_out['birthday'] = bday

    return df_out

In [7]:
# Scrape every fighter whose record has not been saved yet and pickle it.
# Fighters that already have a file are skipped, so the cell can be re-run
# after an interruption without re-downloading everything.
raw_table_dir = os.path.join('Data', 'Raw Tables')
os.makedirs(raw_table_dir, exist_ok=True)

for i, (name, url) in enumerate(zip(fighter_urls.index, fighter_urls['URL'])):

    filename = name.lower().replace(' ','_') +'.pkl'
    out_path = os.path.join(raw_table_dir, filename)

    # Check for this one file instead of listing the whole directory on
    # every iteration.
    if os.path.exists(out_path):
        continue

    print('{0}, {1}\n{2}\n------------'.format(name, str(i),url))
    record = table_from_URL(url, False)

    # table_from_URL returns None (after warning) when the page could not be
    # fetched; skip this fighter instead of crashing on `None.iloc`.
    if record is None:
        continue

    print(record.iloc[:3,:3])

    record.to_pickle(out_path)


Alexandre Ferreira, 27
https://en.wikipedia.org/wiki/Alexandre_Ferreira_(fighter)
------------
No bdya found for: https://en.wikipedia.org/wiki/Alexandre_Ferreira_(fighter)
     Res. Record
0    Loss   18–6
1     Win   18–5
2     Win   17–5
Alexey Oleinik, 28
https://en.wikipedia.org/wiki/Alexey_Oleinik
------------
     Res.   Record
0    Loss  52–11–1
1     Win  52–10–1
2     Win  51–10–1
Ali Bagautinov, 29
https://en.wikipedia.org/wiki/Ali_Bagautinov
------------
    Res. Record
0    Win   17–6
1    Win   16–6
2    Win   15–6
Alistair Overeem, 30
https://en.wikipedia.org/wiki/Alistair_Overeem
------------
     Res.     Record
0    Loss  43–16 (1)
1     Win  43–15 (1)
2     Win  42–15 (1)
Aljamain Sterling, 31
https://en.wikipedia.org/wiki/Aljamain_Sterling
------------
     Res. Record
0     Win   15–3
1    Loss   14–3
2     Win   14–2
Allan Goes, 32
https://en.wikipedia.org/wiki/Allan_Goes
------------
     Res.  Record
0    Loss  10–5–2
1     Win  10–4–2
2     Win   9–4–2
Alptekin