In [16]:
'''
Overview
this notebook parses all fighters' details and tale of the tape

scrape ufc fighters' details
includes first, last, nickname, url
from url scrape scrape fighter's tale of the tape, 
includes fighter, height, weight, reach, stance, dob
'''

"\nOverview\nthis notebook parses all fighters' details and tale of the tape\n\nscrape ufc fighters' details\nincludes first, last, nickname, url\nfrom url scrape scrape fighter's tale of the tape, \nincludes fighter, height, weight, reach, stance, dob\n"

In [17]:
# imports
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm_notebook

# import library
import scrape_ufc_stats_library as LIB
import importlib
importlib.reload(LIB)

# import configs
import yaml
config = yaml.safe_load(open('scrape_ufc_stats_config.yaml'))

# Parse Fighter Details
Includes:
<br>
First
<br>
Last
<br>
Nickname
<br>
URL

In [18]:
# generate list of urls for fighter details
list_of_alphabetical_urls = LIB.generate_alphabetical_urls()

# don't spam the notebook; show a preview + count
print(f"{len(list_of_alphabetical_urls)} pages")
print(list_of_alphabetical_urls[:3], "...", list_of_alphabetical_urls[-3:])

27 pages
['http://ufcstats.com/statistics/fighters?char=a&page=all', 'http://ufcstats.com/statistics/fighters?char=b&page=all', 'http://ufcstats.com/statistics/fighters?char=c&page=all'] ... ['http://ufcstats.com/statistics/fighters?char=y&page=all', 'http://ufcstats.com/statistics/fighters?char=z&page=all', 'http://ufcstats.com/statistics/fighters?char=other&page=all']


In [19]:
from pathlib import Path

# create empty list to collect per-page dataframes
dfs = []
errors = []

want_cols = config['fighter_details_column_names']
out_dir = Path(config.get('output_dir', '.'))
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / config['fighter_details_file_name']

for url in tqdm_notebook(list_of_alphabetical_urls, desc="Fighter index pages"):
    try:
        soup = LIB.get_soup(url)
        df = LIB.parse_fighter_details(soup, want_cols)

        # skip empty results gracefully
        if df is None or df.empty:
            continue

        # enforce exact column order + ensure missing columns exist
        df = df.reindex(columns=want_cols, fill_value="")

        dfs.append(df)

    except Exception as e:
        errors.append((url, str(e)))

# single concat (fast)
all_fighter_details_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame(columns=want_cols)

# de-dup (prefer URL if present)
subset = ['URL'] if 'URL' in all_fighter_details_df.columns else ['FIRST', 'LAST']
all_fighter_details_df = all_fighter_details_df.drop_duplicates(subset=subset, keep='first')

# optional tidy sort for readability
sort_cols = [c for c in ['LAST', 'FIRST'] if c in all_fighter_details_df.columns]
if sort_cols:
    all_fighter_details_df = all_fighter_details_df.sort_values(sort_cols, kind='stable').reset_index(drop=True)

# show a quick preview + counts (avoid printing thousands of rows)
display(all_fighter_details_df.head(10))
print(f"Total fighters: {len(all_fighter_details_df)}  |  pages with errors: {len(errors)}")

# write to file
all_fighter_details_df.to_csv(out_path, index=False, encoding='utf-8')
print(f"Saved → {out_path}")

# if any errors, show a small sample to debug
if errors:
    print("Sample errors:")
    for u, msg in errors[:5]:
        print(" -", u, "→", msg)

Fighter index pages:   0%|          | 0/27 [00:00<?, ?it/s]

Unnamed: 0,FIRST,LAST,NICKNAME,URL
0,Tom,Aaron,,http://ufcstats.com/fighter-details/93fe7332d1...
1,Danny,Abbadi,The Assassin,http://ufcstats.com/fighter-details/15df64c02b...
2,Nariman,Abbasov,Bayraktar,http://ufcstats.com/fighter-details/59a9d6dac6...
3,David,Abbott,Tank,http://ufcstats.com/fighter-details/b361180739...
4,Hamdy,Abdelwahab,The Hammer,http://ufcstats.com/fighter-details/3329d692ae...
5,Mansur,Abdul-Malik,,http://ufcstats.com/fighter-details/841695e02c...
6,Shamil,Abdurakhimov,Abrek,http://ufcstats.com/fighter-details/2f5cbecbbe...
7,Daichi,Abe,,http://ufcstats.com/fighter-details/5140122c3e...
8,Hiroyuki,Abe,Abe Ani,http://ufcstats.com/fighter-details/c0ed7b2081...
9,Papy,Abedi,Makambo,http://ufcstats.com/fighter-details/c9f6385af6...


Total fighters: 4404  |  pages with errors: 0
Saved → data/ufc_fighter_details.csv


# Parse Fighter Tale of the Tape

Includes:
<br>
Fighter
<br>
Height
<br>
Weight
<br>
Reach
<br>
Stance
<br>
DOB
<br>
URL

In [23]:
# define list of urls of fighters to parse
list_of_fighter_urls = list(all_fighter_details_df['URL'])
print(f"Size of fighter URLs: {len(list_of_fighter_urls)}")

Size of fighter URLs: 4404


In [25]:
# create empty df to store fighters' tale of the tape
all_fighter_tott_df = pd.DataFrame(columns=config['fighter_tott_column_names'])

# loop through list_of_fighter_urls
for url in tqdm_notebook(list_of_fighter_urls):
    # get soup
    soup = LIB.get_soup(url)
    # parse fighter tale of the tape
    fighter_tott = LIB.parse_fighter_tott(soup)
    # organise fighter tale of the tape
    fighter_tott_df = LIB.organise_fighter_tott(fighter_tott, config['fighter_tott_column_names'], url)
    # concat fighter
    all_fighter_tott_df = pd.concat([all_fighter_tott_df, fighter_tott_df])

# show all fighters' tale of the tape
display(all_fighter_tott_df)

# write to file
all_fighter_tott_df.to_csv(config['fighter_tott_file_name'], index=False)

  0%|          | 0/4404 [00:00<?, ?it/s]

Unnamed: 0,FIGHTER,HEIGHT,WEIGHT,REACH,STANCE,DOB,URL
0,Tom Aaron,--,155 lbs.,--,,"Jul 13, 1978",http://ufcstats.com/fighter-details/93fe7332d1...
0,Danny Abbadi,"5' 11""",155 lbs.,--,Orthodox,"Jul 03, 1983",http://ufcstats.com/fighter-details/15df64c02b...
0,Nariman Abbasov,"5' 8""",155 lbs.,"66""",Orthodox,"Feb 01, 1994",http://ufcstats.com/fighter-details/59a9d6dac6...
0,David Abbott,"6' 0""",265 lbs.,--,Switch,"Apr 26, 1965",http://ufcstats.com/fighter-details/b361180739...
0,Hamdy Abdelwahab,"6' 2""",264 lbs.,"72""",Southpaw,"Jan 22, 1993",http://ufcstats.com/fighter-details/3329d692ae...
...,...,...,...,...,...,...,...
0,Geronimo dos Santos,"6' 3""",264 lbs.,--,Orthodox,"Aug 17, 1980",http://ufcstats.com/fighter-details/b51f6791c7...
0,Rayanne dos Santos,"5' 2""",115 lbs.,"62""",Orthodox,"Jun 08, 1995",http://ufcstats.com/fighter-details/02bb48869e...
0,Tiago dos Santos e Silva,"5' 9""",145 lbs.,"70""",Orthodox,"Apr 12, 1987",http://ufcstats.com/fighter-details/8ce87f7e3a...
0,Mike van Arsdale,"6' 2""",205 lbs.,--,Switch,"Jun 20, 1965",http://ufcstats.com/fighter-details/2ee09ec2a0...
