In [1]:
import pandas as pd
import numpy as np
import altair as alt
import requests
from bs4 import BeautifulSoup
import json

In [2]:
from true_skill_through_time import *

In [3]:
# data/ufc_wiki_urls.txt contains the Wikipedia URLs of a large list of UFC
# fighters, one URL per line.
with open('data/ufc_wiki_urls.txt', 'r') as file:
    # Strip surrounding whitespace (including the trailing newline) and drop
    # blank lines so an empty string is never treated as a URL downstream.
    urls = [line.strip() for line in file if line.strip()]

In [4]:
def extract_ufc_record(url, timeout=30):
    """
    Given a UFC fighter's Wikipedia URL, extract the first table that follows the
    "Mixed martial arts record" heading and do some light cleanup.

    Parameters
    ----------
    url : str
        Address of the fighter's Wikipedia page.
    timeout : float, optional
        Seconds to wait on the HTTP request before giving up, so that a single
        stalled connection cannot hang the whole scrape.

    Returns
    -------
    pandas.DataFrame or None
        One row per table row, every cell reduced to its stripped text. The
        'Date' column is renamed to 'Date raw' and 'Res.' to 'Result'.
        None is returned (after printing a message) when the heading is not
        found or no table with at least 4 columns follows it.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the heading that introduces the MMA record section.
    section = None
    for header in soup.find_all(['h2', 'h3', 'h4']):
        if 'Mixed martial arts record' in header.get_text():
            section = header
            break

    if section is not None:
        for table in section.find_all_next('table'):
            table_rows = table.find_all('tr')
            if not table_rows:
                # A table without any rows cannot be the record table.
                continue

            first_cells = table_rows[0].find_all(['th', 'td'])
            if len(first_cells) < 4:
                # Too narrow to be the record table (e.g. a summary box).
                continue

            # Column names come from the first row only. Collecting every <th>
            # in the table would also pick up row-header cells further down and
            # leave the header count out of step with the row width.
            if any(cell.name == 'th' for cell in first_cells):
                headers = [cell.get_text(strip=True) for cell in first_cells]
                data_rows = table_rows[1:]
            else:
                # No header row: keep every row as data and let pandas number the columns.
                headers = None
                data_rows = table_rows

            rows = [
                [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
                for row in data_rows
            ]
            df = pd.DataFrame(rows, columns=headers).rename(
                columns={'Date': "Date raw", 'Res.': 'Result'}
            )
            return df

    print('No suitable table found with at least 4 columns.')
    return None

In [8]:
# Scrape every fighter page and collect the record tables in a dict keyed by
# the page title taken from the URL.
ufc_records = {}

for url in urls:
    print(url)
    # Take whatever follows "/wiki/" instead of slicing at a fixed offset, so
    # the name stays correct if the URL prefix differs (http, mobile site, ...).
    fighter_name = url.rsplit('/wiki/', 1)[-1]
    print('fighter_name:', fighter_name)
    try:
        record = extract_ufc_record(url)
    except Exception as exc:
        # Best-effort scrape: report the failing page and carry on. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print(f"broke on: {url} ({exc!r})")
    else:
        if record is not None:
            ufc_records[fighter_name] = record

https://en.wikipedia.org/wiki/Derrick_Lewis
fighter_name: Derrick_Lewis
https://en.wikipedia.org/wiki/Jack_Della_Maddalena
fighter_name: Jack_Della_Maddalena
https://en.wikipedia.org/wiki/Kelvin_Gastelum
fighter_name: Kelvin_Gastelum
https://en.wikipedia.org/wiki/Reinier_de_Ridder
fighter_name: Reinier_de_Ridder
https://en.wikipedia.org/wiki/Vitor_Petrino
fighter_name: Vitor_Petrino
https://en.wikipedia.org/wiki/Carlos_Ulberg
fighter_name: Carlos_Ulberg
https://en.wikipedia.org/wiki/Drakkar_Klose
fighter_name: Drakkar_Klose
https://en.wikipedia.org/wiki/Kang_Kyung-ho
fighter_name: Kang_Kyung-ho
https://en.wikipedia.org/wiki/Daniel_Pineda_(fighter)
fighter_name: Daniel_Pineda_(fighter)
https://en.wikipedia.org/wiki/Walt_Harris_(fighter)
fighter_name: Walt_Harris_(fighter)
https://en.wikipedia.org/wiki/Andre_Petroski
fighter_name: Andre_Petroski
https://en.wikipedia.org/wiki/Rinya_Nakamura
fighter_name: Rinya_Nakamura
https://en.wikipedia.org/wiki/Jonathan_Martinez
fighter_name: Jonathan

In [None]:
# Turn each scraped DataFrame into a plain dict (via DataFrame.to_dict()) and
# wrap it in a single-key {fighter name: record} mapping, one list entry per fighter.
ufc_records_dict_lst = [
    {fighter: record.to_dict()}
    for fighter, record in ufc_records.items()
]

# Persist the raw scrape so later steps can work from the file.
with open('data/ufc_wiki_raw.json', 'w') as f:
    json.dump(ufc_records_dict_lst, f, indent=4)