In [36]:
import pandas as pd
import numpy as np
import altair as alt
import requests
from bs4 import BeautifulSoup

In [37]:
# boxer_wiki_urls contains the wikipedia URLs of a large list of boxers
with open('data/boxer_wiki_urls.txt', 'r') as file:
    urls = file.readlines()

urls = [url.strip() for url in urls]

In [38]:
def extract_boxing_record(url):
    """
    Given a boxer's Wikipedia URL, this will extract the table called "Professional boxing record" and do some cleanup.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    section = None
    for header in soup.find_all(['h2', 'h3', 'h4']):
        if 'Professional boxing record' in header.get_text():
            section = header
            break
    
    if section:
        tables = section.find_all_next('table')
        
        for table in tables:
            first_row = table.find('tr')
            columns = first_row.find_all(['th', 'td'])
            
            if len(columns) >= 4:
                headers = [header.get_text(strip=True) for header in table.find_all('th')]
                rows = []
                for row in table.find_all('tr')[1:]:  # Skip header row if present
                    cells = row.find_all(['th', 'td'])
                    rows.append([cell.get_text(strip=True) for cell in cells])
                df = pd.DataFrame(rows, columns=headers if headers else None).rename(columns={'Date': "Date raw", 'Res.':'Result'})
                return df 
                
    print('No suitable table found with at least 4 columns.')
    return None

In [39]:
boxing_records = {}

for url in urls:
    print(url)
    boxer_name = url[30:]
    try:
        record = extract_boxing_record(url)
        if record is not None:
            boxing_records[boxer_name] = record
    except:
        print(f"broke on: {url}")

https://en.wikipedia.org/wiki/William_Abelyan
No suitable table found with at least 4 columns.
https://en.wikipedia.org/wiki/Arthur_Abraham
https://en.wikipedia.org/wiki/Miguel_Acosta_(boxer)
https://en.wikipedia.org/wiki/Tomasz_Adamek
https://en.wikipedia.org/wiki/Ola_Afolabi
No suitable table found with at least 4 columns.
https://en.wikipedia.org/wiki/Joachim_Alcine
https://en.wikipedia.org/wiki/Devon_Alexander
https://en.wikipedia.org/wiki/Jos%C3%A9_Alfaro_(boxer)
https://en.wikipedia.org/wiki/Muhammad_Ali
https://en.wikipedia.org/wiki/Canelo_%C3%81lvarez
https://en.wikipedia.org/wiki/Elvis_%C3%81lvarez
No suitable table found with at least 4 columns.
https://en.wikipedia.org/wiki/Sa%C3%BAl_%C3%81lvarez
https://en.wikipedia.org/wiki/Lou_Ambers
https://en.wikipedia.org/wiki/Sammy_Angott
https://en.wikipedia.org/wiki/Alfredo_Angulo
https://en.wikipedia.org/wiki/Vito_Antuofermo
https://en.wikipedia.org/wiki/Fred_Apostoli
https://en.wikipedia.org/wiki/Jorge_Arce
https://en.wikipedia.or

## Parse the raw data from wikipedia into a propert pandas datetime series.

In [40]:
def parse_dates(date_list):
    date_series = pd.Series(date_list)
    parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')
    return parsed_dates

boxing_records = {k: v for k, v in boxing_records.items() if all([c in v.columns for c in ['Date raw', 'Result', 'Result']])}

for k, v in boxing_records.items():
    v['Date'] = parse_dates(v['Date raw'])

  parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')
  parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')
  parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')
  parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')
  parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')
  parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')
  parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')


Parsing dates, column naming and page naming aren't perfect. So we have to make some adjustments. Ultimately, if there's a match we can't put a date to, we drop it.

In [41]:
import urllib.parse

boxing_matches = []
for k, v in boxing_records.items():
    fighter = urllib.parse.unquote(k.replace('_', ' ').replace('(boxer)', '').strip())

    if fighter == 'Boxing career of Manny Pacquiao':
        fighter = 'Manny Pacquiao'

    v = v.rename(columns={'Res.':'Result'})
    logi = v['Date'].isnull()
    print(f"Dropping: {logi.sum()}")
    
    boxing_matches.append(v[~logi].assign(Fighter=fighter)[['Fighter', 'Opponent', 'Result', 'Date', 'Date raw']])

boxing_matches = pd.concat(boxing_matches, axis=0).reset_index(drop=True)

Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 33
Dropping: 0
Dropping: 1
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 1
Dropping: 0
Dropping: 0
Dropping: 1
Dropping: 0
Dropping: 0
Dropping: 69
Dropping: 24
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 10
Dropping: 0
Dropping: 0
Dropping: 24
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 1
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 3
Dropping: 1
Dropping: 0
Dropping: 62
Dropping: 0
Dropping: 35
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 29
Dropping: 1
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 1
Dropping: 23
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Droppin

## Clean the data so that all the Wins are converts to 1's and 0's are converted to losses.

The tricky thing is, Wikipedia has many ways to say "Win" or "Loss."

In [42]:
boxing_matches['Result'].unique()

array(['Win', 'Loss', 'Draw', 'NC', 'Won', 'ND', 'N/C|', 'Lose', 'Wim'],
      dtype=object)

In [43]:
boxing_matches = boxing_matches[boxing_matches['Result'].isin(['Loss', 'Lost', 'L by TKO', 'L by KO', 'Lose', 'LOST', 'Wim', 'Win', 'Won', 'W by KO', 'W by TKO', 'W by SD', 'W by PTS'])]

In [44]:
mapper = {'Loss':0, 'Lost':0, 'L by TKO':0, 'L by KO':0, 'Lose':0, 'LOST':0,
          'Win':1, 'Wim':1, 'Won':1, 'W by KO':1, 'W by TKO':1, 'W by SD':1, 'W by PTS':1}
boxing_matches['Outcome'] = boxing_matches['Result'].apply(lambda x: mapper[x])

## Cleaning up Fighter Names

Sometimes, a fighter's name will appear differently on their own page than on another fighters page. So we have to map them to be the same string. We do this by guessing which names are likely misspellings of each (by counting overlaps in their character histogram). Then we map the names with a manually created dictionary.

In [45]:
fighters = set(boxing_matches['Fighter'])
opponents = set(boxing_matches['Opponent'])
# fighters - opponents

In [46]:
from collections import Counter

def count_histogram_overlap(str1, str2):
    hist1 = Counter(str1)
    hist2 = Counter(str2)
    common_chars = set(hist1.keys()).intersection(set(hist2.keys()))
    overlap = 0
    for char in common_chars:
        overlap += min(hist1[char], hist2[char])
    
    return overlap / (len(str1) + len(str2))

In [47]:
names_all = list(fighters | opponents)
names_all = sorted(names_all)
chr_overlap = [np.nan]

for i in range(1, len(names_all)):
    chr_overlap.append(count_histogram_overlap(names_all[i-1], names_all[i]))

df = pd.DataFrame(dict(names_all=names_all, chr_overlap = chr_overlap))

In [48]:
top_indices = df.sort_values('chr_overlap', ascending=False).head(20).index
indices_ch = []

for ti in top_indices:
    indices_ch.append(ti - 1)
    indices_ch.append(ti)

df.loc[indices_ch]

Unnamed: 0,names_all,chr_overlap
10885,Miguel Angel Saurez,0.414634
10886,Miguel Angel Suarez,0.5
11426,Nikolay Emereev,0.310345
11427,Nikolay Eremeev,0.5
5392,George Monroe,0.37037
5393,George Moreno,0.5
1573,Bill Haderman,0.26087
1574,Bill Hardeman,0.5
12719,Rey Megrino,0.291667
12720,Rey Migreno,0.5


In [49]:
name_changes = {}
name_changes["Rey Migreno"] = "Rey Megrino"
name_changes["George Ashe"] = "George Ashie"
name_changes["Stanyslav Tomkachov"] = "Stanyslav Tovkachov"
name_changes["Greg Scott-Briggs"] = "Greg Scott Briggs"
name_changes["Kongthawat Sorkitti"] = "Kongthawat Sor Kitti"
name_changes["Rogelio Castañeda"] = "Rogelio Castaneda"
name_changes["Miguel Angel Suarez"] = "Miguel Angel Saurez"
name_changes["Nikolay Eremeev"] = "Nikolay Emereev"
name_changes["Bill Haderman"] = "Bill Hardeman"
name_changes["Rubén Darío Palacios"] = "Rubén Darío Palacio"
name_changes["George Kambosos Jr."] = "George Kambosos Jr"
name_changes["Sven Erik Paulsen"] = "Svein Erik Paulsen"
name_changes["Singnum Chuwatana"] = "Singnum Chuwattana"
name_changes["Mohammed Medjadi	"] = "Mohammed Medjadji"

# Remove Double Counting

We are trying to figure out all the boxing matches based on the boxing records from people's wikipedia page. However, that will double count! A fight will get represented twice, once for each fighter's wikipedia. So we remove them here.

In [50]:
for c in ['Fighter', 'Opponent']:
    boxing_matches[c] = boxing_matches[c].apply(lambda x: name_changes[x] if x in name_changes else x)
    boxing_matches[c] = boxing_matches[c].str.replace('Jr.', 'Jr')
    boxing_matches[c] = boxing_matches[c].str.replace('Sr.', 'Sr')

In [51]:
boxing_matches['key'] = np.nan

for i, row in boxing_matches.iterrows():
    f, o, d = row['Fighter'], row['Opponent'], row['Date']
    if f < o:
        k = f'{f}_{o}_{d}'
    else:
        k = f'{o}_{f}_{d}'
    boxing_matches.loc[i, 'key'] = k

In [52]:
boxing_matches = boxing_matches.drop_duplicates(subset=['key']).sort_values(['Date', 'Fighter']).reset_index(drop=True)

## Save the data, since the above is slow.

In [54]:
boxing_matches.to_parquet('data/boxing_matches.parquet')

# To-Do's

- [ ] The `boxing_matches` dataset isn't quite in the right format. It needs to be the format seen in the `true_skill_wc3_dev.ipynb` notebook. The difference is, that has a `Winner` and `Loser` column. This data hasn't been organized that way, so some work needs to be done there.
- [ ] Once it's in the right format, apply the `TrueSkillThroughTimeApplied` class, in a similar way seen in the `true_skill_wc3_dev` notebook.
- [ ] You'll need to optimize the hyperparameters. You can use the `.learn_optimal_parameters` method for that.
- [ ] Inspect the results. Answer the question.. who is the Greatest Of All Time? FYI, it's OK to trim the results. We don't need to consider all matches since 1911.
- [ ] Read this paper for best practices on how to do this: https://www.herbrich.me/papers/ttt.pdf
- [ ] To make sure we're doing things right, check out this: https://glandfried.github.io/TrueSkillThroughTime.jl/man/examples/. We could run `TrueSkillThroughTimeApplied` on the data given their (there CSV file you can download.) We should be fine, since my code uses the repo developed on that site.
- [ ] Do some sort of out of sample test evaluation. Everything done so far has been in sample. We should hold out some future data and see if this model is good. 