In [1]:
import pandas as pd
import numpy as np
import altair as alt
import requests
from bs4 import BeautifulSoup
import json

In [2]:
from true_skill_through_time import *

In [2]:
# boxer_wiki_urls contains the wikipedia URLs of a large list of boxers
with open('data/boxer_wiki_urls.txt', 'r') as file:
    urls = file.readlines()

urls = [url.strip() for url in urls]

In [3]:
def extract_boxing_record(url):
    """
    Given a boxer's Wikipedia URL, this will extract the table called "Professional boxing record" and do some cleanup.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    section = None
    for header in soup.find_all(['h2', 'h3', 'h4']):
        if 'Professional boxing record' in header.get_text():
            section = header
            break
    
    if section:
        tables = section.find_all_next('table')
        
        for table in tables:
            first_row = table.find('tr')
            columns = first_row.find_all(['th', 'td'])
            
            if len(columns) >= 4:
                headers = [header.get_text(strip=True) for header in table.find_all('th')]
                rows = []
                for row in table.find_all('tr')[1:]:  # Skip header row if present
                    cells = row.find_all(['th', 'td'])
                    rows.append([cell.get_text(strip=True) for cell in cells])
                df = pd.DataFrame(rows, columns=headers if headers else None).rename(columns={'Date': "Date raw", 'Res.':'Result'})
                return df 
                
    print('No suitable table found with at least 4 columns.')
    return None

### It takes about 20 mins to pull the raw data from Wikipedia

In [4]:
boxing_records = {}

for url in urls:
    print(url)
    boxer_name = url[30:]
    try:
        record = extract_boxing_record(url)
        if record is not None:
            boxing_records[boxer_name] = record
    except:
        print(f"broke on: {url}")

https://en.wikipedia.org/wiki/William_Abelyan
No suitable table found with at least 4 columns.
https://en.wikipedia.org/wiki/Arthur_Abraham
https://en.wikipedia.org/wiki/Miguel_Acosta_(boxer)
https://en.wikipedia.org/wiki/Tomasz_Adamek
https://en.wikipedia.org/wiki/Ola_Afolabi
No suitable table found with at least 4 columns.
https://en.wikipedia.org/wiki/Joachim_Alcine
https://en.wikipedia.org/wiki/Devon_Alexander
https://en.wikipedia.org/wiki/Jos%C3%A9_Alfaro_(boxer)
https://en.wikipedia.org/wiki/Muhammad_Ali
https://en.wikipedia.org/wiki/Canelo_%C3%81lvarez
https://en.wikipedia.org/wiki/Elvis_%C3%81lvarez
No suitable table found with at least 4 columns.
https://en.wikipedia.org/wiki/Sa%C3%BAl_%C3%81lvarez
https://en.wikipedia.org/wiki/Lou_Ambers
https://en.wikipedia.org/wiki/Sammy_Angott
https://en.wikipedia.org/wiki/Alfredo_Angulo
https://en.wikipedia.org/wiki/Vito_Antuofermo
https://en.wikipedia.org/wiki/Fred_Apostoli
https://en.wikipedia.org/wiki/Jorge_Arce
https://en.wikipedia.or

### save the raw Wikipedia data

In [11]:
# boxing_records_dict_lst = []
# for k, v in boxing_records.items():
#     boxing_records_dict_lst.append({k: v.to_dict()})
# with open('data/boxer_wiki_raw.json', 'w') as f:
#     json.dump(boxing_records_dict_lst, f, indent=4) 

In [13]:
boxing_records["Arthur_Abraham"].head()

Unnamed: 0,No.,Result,Record,Opponent,Type,"Round, time",Date raw,Location,Notes,Date
0,53,Win,47–6,Patrick Nielsen,SD,12,28 Apr 2018,"Baden-Arena, Offenburg, Germany",Won vacant WBO International super-middleweigh...,2018-04-28
1,52,Loss,46–6,Chris Eubank Jr.,UD,12,15 Jul 2017,"The SSE Arena Wembley,London, England",ForIBO super-middleweight title,2017-07-15
2,51,Win,46–5,Robin Krasniqi,UD,12,22 Apr 2017,"Messe, Erfurt, Germany",,2017-04-22
3,50,Win,45–5,Tim-Robin Lihaug,TKO,"8 (12),1:09",16 Jul 2016,"Max-Schmeling-Halle, Berlin, Germany",Won vacant WBO International super-middleweigh...,2016-07-16
4,49,Loss,44–5,Gilberto Ramírez,UD,12,9 Apr 2016,"MGM Grand Garden Arena,Paradise, Nevada, US",Lost WBO super-middleweight title,2016-04-09


## Parse the raw data from wikipedia into a propert pandas datetime series.

In [12]:
def parse_dates(date_list):
    date_series = pd.Series(date_list)
    parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')
    return parsed_dates

boxing_records = {k: v for k, v in boxing_records.items() if all([c in v.columns for c in ['Date raw', 'Result', 'Result']])}

for k, v in boxing_records.items():
    v['Date'] = parse_dates(v['Date raw'])

  parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')
  parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')
  parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')
  parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')
  parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')
  parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')
  parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')


In [14]:
boxing_records["Arthur_Abraham"].head()

Unnamed: 0,No.,Result,Record,Opponent,Type,"Round, time",Date raw,Location,Notes,Date
0,53,Win,47–6,Patrick Nielsen,SD,12,28 Apr 2018,"Baden-Arena, Offenburg, Germany",Won vacant WBO International super-middleweigh...,2018-04-28
1,52,Loss,46–6,Chris Eubank Jr.,UD,12,15 Jul 2017,"The SSE Arena Wembley,London, England",ForIBO super-middleweight title,2017-07-15
2,51,Win,46–5,Robin Krasniqi,UD,12,22 Apr 2017,"Messe, Erfurt, Germany",,2017-04-22
3,50,Win,45–5,Tim-Robin Lihaug,TKO,"8 (12),1:09",16 Jul 2016,"Max-Schmeling-Halle, Berlin, Germany",Won vacant WBO International super-middleweigh...,2016-07-16
4,49,Loss,44–5,Gilberto Ramírez,UD,12,9 Apr 2016,"MGM Grand Garden Arena,Paradise, Nevada, US",Lost WBO super-middleweight title,2016-04-09


Parsing dates, column naming and page naming aren't perfect. So we have to make some adjustments. Ultimately, if there's a match we can't put a date to, we drop it.

In [15]:
import urllib.parse

boxing_matches = []
for k, v in boxing_records.items():
    fighter = urllib.parse.unquote(k.replace('_', ' ').replace('(boxer)', '').strip())

    if fighter == 'Boxing career of Manny Pacquiao':
        fighter = 'Manny Pacquiao'

    v = v.rename(columns={'Res.':'Result'})
    logi = v['Date'].isnull()
    print(f"Dropping: {logi.sum()}")
    
    boxing_matches.append(v[~logi].assign(Fighter=fighter)[['Fighter', 'Opponent', 'Result', 'Date', 'Date raw']])

boxing_matches_df = pd.concat(boxing_matches, axis=0).reset_index(drop=True)

Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 33
Dropping: 0
Dropping: 1
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 1
Dropping: 0
Dropping: 0
Dropping: 1
Dropping: 0
Dropping: 0
Dropping: 69
Dropping: 24
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 10
Dropping: 0
Dropping: 0
Dropping: 24
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 1
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 3
Dropping: 1
Dropping: 0
Dropping: 62
Dropping: 0
Dropping: 35
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 29
Dropping: 1
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 1
Dropping: 23
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Droppin

In [16]:
boxing_matches_df.head()

Unnamed: 0,Fighter,Opponent,Result,Date,Date raw
0,Arthur Abraham,Patrick Nielsen,Win,2018-04-28,28 Apr 2018
1,Arthur Abraham,Chris Eubank Jr.,Loss,2017-07-15,15 Jul 2017
2,Arthur Abraham,Robin Krasniqi,Win,2017-04-22,22 Apr 2017
3,Arthur Abraham,Tim-Robin Lihaug,Win,2016-07-16,16 Jul 2016
4,Arthur Abraham,Gilberto Ramírez,Loss,2016-04-09,9 Apr 2016


In [18]:
boxing_matches_df.shape

(27710, 5)

In [19]:
#boxing_matches_df.to_csv("data/boxing_matches_parsed.csv", index=False)

In [20]:
boxing_matches = boxing_matches_df

## Clean the data so that all the Wins are converts to 1's and 0's are converted to losses.

The tricky thing is, Wikipedia has many ways to say "Win" or "Loss."

In [21]:
boxing_matches['Result'].unique()

array(['Win', 'Loss', 'Draw', 'NC', 'Won', 'ND', 'N/C|', 'Lose', 'Wim'],
      dtype=object)

In [22]:
boxing_matches = boxing_matches[boxing_matches['Result'].isin(['Loss', 'Lost', 'L by TKO', 'L by KO', 'Lose', 'LOST', 'Wim', 'Win', 'Won', 'W by KO', 'W by TKO', 'W by SD', 'W by PTS'])]

In [23]:
mapper = {'Loss':0, 'Lost':0, 'L by TKO':0, 'L by KO':0, 'Lose':0, 'LOST':0,
          'Win':1, 'Wim':1, 'Won':1, 'W by KO':1, 'W by TKO':1, 'W by SD':1, 'W by PTS':1}
boxing_matches['Outcome'] = boxing_matches['Result'].apply(lambda x: mapper[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  boxing_matches['Outcome'] = boxing_matches['Result'].apply(lambda x: mapper[x])


## Cleaning up Fighter Names

Sometimes, a fighter's name will appear differently on their own page than on another fighters page. So we have to map them to be the same string. We do this by guessing which names are likely misspellings of each (by counting overlaps in their character histogram). Then we map the names with a manually created dictionary.

In [24]:
fighters = set(boxing_matches['Fighter'])
opponents = set(boxing_matches['Opponent'])
# fighters - opponents

In [25]:
from collections import Counter

def count_histogram_overlap(str1, str2):
    hist1 = Counter(str1)
    hist2 = Counter(str2)
    common_chars = set(hist1.keys()).intersection(set(hist2.keys()))
    overlap = 0
    for char in common_chars:
        overlap += min(hist1[char], hist2[char])
    
    return overlap / (len(str1) + len(str2))

In [26]:
names_all = list(fighters | opponents)
names_all = sorted(names_all)
chr_overlap = [np.nan]

for i in range(1, len(names_all)):
    chr_overlap.append(count_histogram_overlap(names_all[i-1], names_all[i]))

df = pd.DataFrame(dict(names_all=names_all, chr_overlap = chr_overlap))

In [27]:
top_indices = df.sort_values('chr_overlap', ascending=False).head(20).index
indices_ch = []

for ti in top_indices:
    indices_ch.append(ti - 1)
    indices_ch.append(ti)

df.loc[indices_ch]

Unnamed: 0,names_all,chr_overlap
10885,Miguel Angel Saurez,0.414634
10886,Miguel Angel Suarez,0.5
11426,Nikolay Emereev,0.310345
11427,Nikolay Eremeev,0.5
5392,George Monroe,0.37037
5393,George Moreno,0.5
1573,Bill Haderman,0.26087
1574,Bill Hardeman,0.5
12719,Rey Megrino,0.291667
12720,Rey Migreno,0.5


In [28]:
name_changes = {}
name_changes["Rey Migreno"] = "Rey Megrino"
name_changes["George Ashe"] = "George Ashie"
name_changes["Stanyslav Tomkachov"] = "Stanyslav Tovkachov"
name_changes["Greg Scott-Briggs"] = "Greg Scott Briggs"
name_changes["Kongthawat Sorkitti"] = "Kongthawat Sor Kitti"
name_changes["Rogelio Castañeda"] = "Rogelio Castaneda"
name_changes["Miguel Angel Suarez"] = "Miguel Angel Saurez"
name_changes["Nikolay Eremeev"] = "Nikolay Emereev"
name_changes["Bill Haderman"] = "Bill Hardeman"
name_changes["Rubén Darío Palacios"] = "Rubén Darío Palacio"
name_changes["George Kambosos Jr."] = "George Kambosos Jr"
name_changes["Sven Erik Paulsen"] = "Svein Erik Paulsen"
name_changes["Singnum Chuwatana"] = "Singnum Chuwattana"
name_changes["Mohammed Medjadi	"] = "Mohammed Medjadji"

# Remove Double Counting

We are trying to figure out all the boxing matches based on the boxing records from people's wikipedia page. However, that will double count! A fight will get represented twice, once for each fighter's wikipedia. So we remove them here.

In [29]:
for c in ['Fighter', 'Opponent']:
    boxing_matches[c] = boxing_matches[c].apply(lambda x: name_changes[x] if x in name_changes else x)
    boxing_matches[c] = boxing_matches[c].str.replace('Jr.', 'Jr')
    boxing_matches[c] = boxing_matches[c].str.replace('Sr.', 'Sr')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  boxing_matches[c] = boxing_matches[c].apply(lambda x: name_changes[x] if x in name_changes else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  boxing_matches[c] = boxing_matches[c].str.replace('Jr.', 'Jr')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  boxing_matches[c] = boxing_matches[c].str.r

In [30]:
boxing_matches['key'] = np.nan

for i, row in boxing_matches.iterrows():
    f, o, d = row['Fighter'], row['Opponent'], row['Date']
    if f < o:
        k = f'{f}_{o}_{d}'
    else:
        k = f'{o}_{f}_{d}'
    boxing_matches.loc[i, 'key'] = k

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  boxing_matches['key'] = np.nan
  boxing_matches.loc[i, 'key'] = k


In [31]:
boxing_matches = boxing_matches.drop_duplicates(subset=['key']).sort_values(['Date', 'Fighter']).reset_index(drop=True)

In [32]:
boxing_matches.head()

Unnamed: 0,Fighter,Opponent,Result,Date,Date raw,Outcome,key
0,Jake Kilrain,Jack Daley,Win,1879-01-01,"Jan 1, 1879",1,Jack Daley_Jake Kilrain_1879-01-01 00:00:00
1,Jake Kilrain,Jem Driscoll,Win,1879-02-01,"Feb 1, 1879",1,Jake Kilrain_Jem Driscoll_1879-02-01 00:00:00
2,Jake Kilrain,Dan Dwyer,Win,1879-03-10,"Mar 10, 1879",1,Dan Dwyer_Jake Kilrain_1879-03-10 00:00:00
3,John L. Sullivan,Jack Curley,Win,1879-03-13,"Mar 13, 1879",1,Jack Curley_John L. Sullivan_1879-03-13 00:00:00
4,John L. Sullivan,Johnny Cocky Woods,Win,1879-03-14,"Mar 14, 1879",1,John L. Sullivan_Johnny Cocky Woods_1879-03-14...


## Save the data, since the above is slow.

In [34]:
boxing_matches.to_parquet('data/boxing_matches.parquet')

### Change the data format to Winner, Loser, Date

In [36]:
len(boxing_matches), len(boxing_matches[boxing_matches.Outcome==1]), len(boxing_matches[boxing_matches.Outcome==0])

(25651, 22161, 3490)

In [37]:
len(boxing_matches)==len(boxing_matches[boxing_matches.Outcome==1])+len(boxing_matches[boxing_matches.Outcome==0])

True

In [39]:
type(boxing_matches.iloc[0].Date)

pandas._libs.tslibs.timestamps.Timestamp

In [40]:
boxing_matches['Winner'] = boxing_matches.apply(lambda row: row.Fighter if row.Outcome==1 else row.Opponent, axis=1)

In [44]:
boxing_matches['Loser'] = boxing_matches.apply(lambda row: row.Opponent if row.Outcome==1 else row.Fighter, axis=1)

In [45]:
boxing_matches.tail(20)

Unnamed: 0,Fighter,Opponent,Result,Date,Date raw,Outcome,key,Winner,Loser
25631,Devon Alexander,Vlad Panin,Loss,2024-05-24,"May 24, 2024",0,Devon Alexander_Vlad Panin_2024-05-24 00:00:00,Vlad Panin,Devon Alexander
25632,Deontay Wilder,Zhilei Zhang,Loss,2024-06-01,"Jun 1, 2024",0,Deontay Wilder_Zhilei Zhang_2024-06-01 00:00:00,Zhilei Zhang,Deontay Wilder
25633,Filip Hrgović,Daniel Dubois,Loss,2024-06-01,1 Jun 2024,0,Daniel Dubois_Filip Hrgović_2024-06-01 00:00:00,Daniel Dubois,Filip Hrgović
25634,Gervonta Davis,Frank Martin,Win,2024-06-15,"Jun 15, 2024",1,Frank Martin_Gervonta Davis_2024-06-15 00:00:00,Gervonta Davis,Frank Martin
25635,Marco Huck,Evgenios Lazaridis,Win,2024-06-29,29 Jun 2024,1,Evgenios Lazaridis_Marco Huck_2024-06-29 00:00:00,Marco Huck,Evgenios Lazaridis
25636,Nate Diaz,Jorge Masvidal,Win,2024-07-06,"Jul 6, 2024",1,Jorge Masvidal_Nate Diaz_2024-07-06 00:00:00,Nate Diaz,Jorge Masvidal
25637,Robson Conceição,O'Shaquie Foster,Win,2024-07-06,6 Jul 2024,1,O'Shaquie Foster_Robson Conceição_2024-07-06 0...,Robson Conceição,O'Shaquie Foster
25638,Shakur Stevenson,Artem Harutyunyan,Win,2024-07-06,"Jul 6, 2024",1,Artem Harutyunyan_Shakur Stevenson_2024-07-06 ...,Shakur Stevenson,Artem Harutyunyan
25639,Román González,Rober Barrera,Win,2024-07-12,12 Jul 2024,1,Rober Barrera_Román González_2024-07-12 00:00:00,Román González,Rober Barrera
25640,Tony Yoka,Amine Boucetta,Win,2024-07-27,27 Jul 2024,1,Amine Boucetta_Tony Yoka_2024-07-27 00:00:00,Tony Yoka,Amine Boucetta


In [46]:
boxing_matches[['Winner', 'Loser', 'Date']].to_parquet("data/boxing_matches_refined.parquet")

### Run true_skill_through_time

In [3]:
games = pd.read_parquet("data/boxing_matches_refined.parquet")

In [5]:
games = games.rename(columns={'Winner': 'winner', 'Loser': 'loser', 'Date': 'timestamp'})

In [7]:
games.to_parquet("data/boxing_matches_refined_tstt.parquet")

In [3]:
games = pd.read_parquet("data/boxing_matches_refined_tstt.parquet")

### another run after we increase the lower bounds of params; then improve neg_log_evidence

In [4]:
games = pd.read_parquet("data/boxing_matches_refined_tstt.parquet")

In [5]:
self = TrueSkillThroughTimeApplied(games)

In [6]:
print(games.head().to_string())

             winner               loser  timestamp  time_0_to_999_int
0      Jake Kilrain          Jack Daley 1879-01-01                  0
1      Jake Kilrain        Jem Driscoll 1879-02-01                  0
2      Jake Kilrain           Dan Dwyer 1879-03-10                  1
3  John L. Sullivan         Jack Curley 1879-03-13                  1
4  John L. Sullivan  Johnny Cocky Woods 1879-03-14                  1


In [7]:
self.learn_optimal_parameters()


gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 10906.1092

gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 10906.1092

gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 10906.1092

gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 10906.1092

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 17218.8764

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 17218.8765

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 17218.8764

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 17218.8764

gamma: 0.0323, sigma: 0.0323, beta: 0.0323, 
NLE: 10055.8885

gamma: 0.0323, sigma: 0.0323, beta: 0.0323, 
NLE: 10055.8885

gamma: 0.0323, sigma: 0.0323, beta: 0.0323, 
NLE: 10055.8885

gamma: 0.0323, sigma: 0.0323, beta: 0.0323, 
NLE: 10055.8885

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 9948.2152

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 9948.2152

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 9948.2152

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 9948.2152

gamma: 0.10

In [9]:
optimal_gamma = 0.0195
optimal_sigma = 0.1454
optimal_beta = 0.0758

In [10]:
self.set_optimal_parameters(gamma = optimal_gamma, sigma = optimal_sigma, beta = optimal_beta)

In [11]:
skill_curves = self.set_skill_curves()

In [14]:
# https://sports.betmgm.com/en/blog/ranking-the-worlds-top-10-boxers/
# supringly Mike Tyson is not on this list
top_boxers_online = ['Muhammad Ali', 'Joe Louis', 'Sugar Ray Robinson', 'Rocky Marciano', 'Floyd Mayweather Jr', 'Manny Pacquiao', 'Jack Dempsey', 'Roberto Durán', 'Henry Armstrong', 'Willie Pep']

In [15]:
self.plot_player_skills(players = top_boxers_online[:]+['Mike Tyson'], width=1500, burnin=0)

In [16]:
self.plot_player_skills(players = top_boxers_online[:]+['Mike Tyson'], width=1500, burnin=10)

### Considering every available match, the GOAT of boxing is Floyd Mayweather Jr, the second best boxer would be Manny Pacquiao(his number of matches is less than 20 though)
### Many other great boxers have an increasing skill trend at early career then gradually decay over time, however, Floyd Mayweather Jr has a steadily increasing skill trend throughout the career.

In [17]:
self.plot_calibration()

  x=alt.X('win_prob_midpoint', title='Predicted Win Probability'),


# To-Do's

- [ ] The `boxing_matches` dataset isn't quite in the right format. It needs to be the format seen in the `true_skill_wc3_dev.ipynb` notebook. The difference is, that has a `Winner` and `Loser` column. This data hasn't been organized that way, so some work needs to be done there.
- [ ] Once it's in the right format, apply the `TrueSkillThroughTimeApplied` class, in a similar way seen in the `true_skill_wc3_dev` notebook.
- [ ] You'll need to optimize the hyperparameters. You can use the `.learn_optimal_parameters` method for that.
- [ ] Inspect the results. Answer the question.. who is the Greatest Of All Time? FYI, it's OK to trim the results. We don't need to consider all matches since 1911.
- [ ] Read this paper for best practices on how to do this: https://www.herbrich.me/papers/ttt.pdf
- [ ] To make sure we're doing things right, check out this: https://glandfried.github.io/TrueSkillThroughTime.jl/man/examples/. We could run `TrueSkillThroughTimeApplied` on the data given their (there CSV file you can download.) We should be fine, since my code uses the repo developed on that site.
- [ ] Do some sort of out of sample test evaluation. Everything done so far has been in sample. We should hold out some future data and see if this model is good. 