In [2]:
import pandas as pd
import numpy as np
import altair as alt
import requests
from bs4 import BeautifulSoup
import json
import urllib.parse
from collections import Counter

In [3]:
from true_skill_through_time import *

In [4]:
# ufc_wiki_urls contains the wikipedia URLs of a large list of ufc fighters
with open('data/ufc_wiki_urls.txt', 'r') as file:
    urls = file.readlines()

urls = [url.strip() for url in urls]

In [5]:
with open('data/ufc_wiki_urls_v2.txt', 'r') as file:
    urls_v2 = file.readlines()
urls_v2 = [url.strip() for url in urls_v2]

In [6]:
len(urls), len(urls_v2)

(385, 389)

In [7]:
added_url_lst = [url for url in urls_v2 if url not in urls]

In [10]:
def extract_ufc_record(url):
    """
    Given a ufc's Wikipedia URL, this will extract the table called "Mixed martial arts record" and do some cleanup.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    section = None
    for header in soup.find_all(['h2', 'h3', 'h4']):
        if 'Mixed martial arts record' in header.get_text():
            section = header
            break
    
    if section:
        tables = section.find_all_next('table')
        
        for table in tables:
            first_row = table.find('tr')
            columns = first_row.find_all(['th', 'td'])
            
            if len(columns) >= 4:
                headers = [header.get_text(strip=True) for header in table.find_all('th')]
                rows = []
                for row in table.find_all('tr')[1:]:  # Skip header row if present
                    cells = row.find_all(['th', 'td'])
                    rows.append([cell.get_text(strip=True) for cell in cells])
                df = pd.DataFrame(rows, columns=headers if headers else None).rename(columns={'Date': "Date raw", 'Res.':'Result'})
                return df 
                
    print('No suitable table found with at least 4 columns.')
    return None

In [8]:
ufc_records = {}

for url in urls[:]:
    print(url)
    fighter_name = url[30:]
    print('fighter_name:', fighter_name)
    try:
        record = extract_ufc_record(url)
        if record is not None:
            ufc_records[fighter_name] = record
    except:
        print(f"broke on: {url}")

https://en.wikipedia.org/wiki/Derrick_Lewis
fighter_name: Derrick_Lewis
https://en.wikipedia.org/wiki/Jack_Della_Maddalena
fighter_name: Jack_Della_Maddalena
https://en.wikipedia.org/wiki/Kelvin_Gastelum
fighter_name: Kelvin_Gastelum
https://en.wikipedia.org/wiki/Reinier_de_Ridder
fighter_name: Reinier_de_Ridder
https://en.wikipedia.org/wiki/Vitor_Petrino
fighter_name: Vitor_Petrino
https://en.wikipedia.org/wiki/Carlos_Ulberg
fighter_name: Carlos_Ulberg
https://en.wikipedia.org/wiki/Drakkar_Klose
fighter_name: Drakkar_Klose
https://en.wikipedia.org/wiki/Kang_Kyung-ho
fighter_name: Kang_Kyung-ho
https://en.wikipedia.org/wiki/Daniel_Pineda_(fighter)
fighter_name: Daniel_Pineda_(fighter)
https://en.wikipedia.org/wiki/Walt_Harris_(fighter)
fighter_name: Walt_Harris_(fighter)
https://en.wikipedia.org/wiki/Andre_Petroski
fighter_name: Andre_Petroski
https://en.wikipedia.org/wiki/Rinya_Nakamura
fighter_name: Rinya_Nakamura
https://en.wikipedia.org/wiki/Jonathan_Martinez
fighter_name: Jonathan

In [13]:
ufc_records_added = {}
for url in added_url_lst[:]:
    print(url)
    fighter_name = url[30:]
    print('fighter_name:', fighter_name)
    try:
        record = extract_ufc_record(url)
        if record is not None:
            ufc_records_added[fighter_name] = record
    except:
        print(f"broke on: {url}")

https://en.wikipedia.org/wiki/Ariane_Lipski
fighter_name: Ariane_Lipski
https://en.wikipedia.org/wiki/Muhammad_Mokaev
fighter_name: Muhammad_Mokaev
https://en.wikipedia.org/wiki/Francis_Ngannou
fighter_name: Francis_Ngannou
https://en.wikipedia.org/wiki/Michelle_Waterson-Gomez
fighter_name: Michelle_Waterson-Gomez


In [9]:
# ufc_records_dict_lst = []
# for k, v in ufc_records.items():
#     ufc_records_dict_lst.append({k: v.to_dict()})
# with open('data/ufc_wiki_raw.json', 'w') as f:
#     json.dump(ufc_records_dict_lst, f, indent=4) 

In [15]:
ufc_records_dict_lst_added = []
for k, v in ufc_records_added.items():
    ufc_records_dict_lst_added.append({k: v.to_dict()})

In [25]:
with open('data/ufc_wiki_raw.json', 'r') as file:
    ufc_records_dict_lst = json.load(file)

In [None]:
ufc_records_dict_lst_added = []
for k, v in ufc_records_added.items():
    ufc_records_dict_lst_added.append({k: v.to_dict()})

In [28]:
len(ufc_records_dict_lst)

378

In [30]:
ufc_records_dict_lst = ufc_records_dict_lst + ufc_records_dict_lst_added

In [31]:
len(ufc_records_dict_lst), len(ufc_records_dict_lst_added)

(382, 4)

In [32]:
with open('data/ufc_wiki_raw_v2.json', 'w') as f:
    json.dump(ufc_records_dict_lst, f, indent=4) 

In [39]:
ufc_records = {}
for item in ufc_records_dict_lst:
    # Each item is a dictionary with a single key-value pair
    # We use .items() to get the key and value
    for k, v in item.items():
        ufc_records[k] = pd.DataFrame(v) if isinstance(v, dict) else v 

In [40]:
def parse_dates(date_list):
    date_series = pd.Series(date_list)
    parsed_dates = pd.to_datetime(date_series.str.replace('–', '-').str.replace('[377]', ''), errors='coerce')
    return parsed_dates

ufc_records = {k: v for k, v in ufc_records.items() if all([c in v.columns for c in ['Date raw', 'Result', 'Result']])}

for k, v in ufc_records.items():
    v['Date'] = parse_dates(v['Date raw'])

In [41]:
ufc_records["Li_Jingliang"].head()

Unnamed: 0,Result,Record,Opponent,Method,Event,Date raw,Round,Time,Location,Notes,Date
0,Loss,19–9,Carlos Prates,KO (punches),UFC 305,"August 18, 2024",2,4:02,"Perth, Australia",,2024-08-18
1,Loss,19–8,Daniel Rodriguez,Decision (split),UFC 279,"September 10, 2022",3,5:00,"Las Vegas,Nevada, United States",Catchweight (180 lb) bout.,2022-09-10
2,Win,19–7,Muslim Salikhov,TKO (punches and elbows),UFC on ABC: Ortega vs. Rodríguez,"July 16, 2022",2,4:38,"Elmont, New York, United States",Performance of the Night.,2022-07-16
3,Loss,18–7,Khamzat Chimaev,Technical Submission (rear-naked choke),UFC 267,"October 30, 2021",1,3:16,"Abu Dhabi, United Arab Emirates",,2021-10-30
4,Win,18–6,Santiago Ponzinibbio,KO (punch),UFC on ABC: Holloway vs. Kattar,"January 16, 2021",1,4:25,"Abu Dhabi, United Arab Emirates",Performance of the Night.,2021-01-16


In [42]:
ufc_matches = []
for k, v in ufc_records.items():
    fighter = urllib.parse.unquote(k.replace('_', ' ').replace('(fighter)', '').strip())
    logi = v['Date'].isnull()
    print(f"Dropping: {logi.sum()}")
    
    ufc_matches.append(v[~logi].assign(Fighter=fighter)[['Fighter', 'Opponent', 'Result', 'Date', 'Date raw']])

ufc_matches_df = pd.concat(ufc_matches, axis=0).reset_index(drop=True)

Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 1
Dropping: 1
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 2
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 1
Dropping: 1
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 1
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Dropping: 0
Drop

In [43]:
ufc_matches_df.head()

Unnamed: 0,Fighter,Opponent,Result,Date,Date raw
0,Derrick Lewis,Rodrigo Nascimento,Win,2024-05-11,"May 11, 2024"
1,Derrick Lewis,Jailton Almeida,Loss,2023-11-04,"November 4, 2023"
2,Derrick Lewis,Marcos Rogério de Lima,Win,2023-07-29,"July 29, 2023"
3,Derrick Lewis,Serghei Spivac,Loss,2023-02-04,"February 4, 2023"
4,Derrick Lewis,Sergei Pavlovich,Loss,2022-07-30,"July 30, 2022"


In [44]:
ufc_matches_df.shape

(8795, 5)

In [45]:
ufc_matches_df['Result'].unique()

array(['Win', 'Loss', 'NC', 'Draw'], dtype=object)

In [46]:
ufc_matches_df = ufc_matches_df[ufc_matches_df['Result'].isin(['Win', 'Loss'])]

In [47]:
mapper = {'Loss':0, 
          'Win':1}
ufc_matches_df['Outcome'] = ufc_matches_df['Result'].apply(lambda x: mapper[x])

In [48]:
fighters = set(ufc_matches_df['Fighter'])
opponents = set(ufc_matches_df['Opponent'])
# fighters - opponents

In [49]:
def count_histogram_overlap(str1, str2):
    hist1 = Counter(str1)
    hist2 = Counter(str2)
    common_chars = set(hist1.keys()).intersection(set(hist2.keys()))
    overlap = 0
    for char in common_chars:
        overlap += min(hist1[char], hist2[char])
    
    return overlap / (len(str1) + len(str2))

In [50]:
names_all = list(fighters | opponents)
names_all = sorted(names_all)
chr_overlap = [np.nan]

for i in range(1, len(names_all)):
    chr_overlap.append(count_histogram_overlap(names_all[i-1], names_all[i]))

df = pd.DataFrame(dict(names_all=names_all, chr_overlap = chr_overlap))

In [51]:
top_indices = df.sort_values('chr_overlap', ascending=False).head(20).index
indices_ch = []

for ti in top_indices:
    indices_ch.append(ti - 1)
    indices_ch.append(ti)

df.loc[indices_ch]

Unnamed: 0,names_all,chr_overlap
518,Baasankhuu Damlanpurev,0.16129
519,Baasankhuu Damnlanpurev,0.488889
25,Abdulhalik Magomedov,0.35
26,Abdulkhalik Magomedov,0.487805
1658,Gaetano Pirello,0.277778
1659,Gaetano Pirrello,0.483871
4593,Tyler Bialeck,0.272727
4594,Tyler Bialecki,0.481481
3781,Rafael Correa,0.392857
3782,Rafael Correia,0.481481


In [52]:
name_changes = {}
name_changes["Baasankhuu Damnlanpurev"] = "Baasankhuu Damlanpurev"
name_changes["Abdulhalik Magomedov"] = "Abdulkhalik Magomedov"
name_changes["Gaetano Pirello"] = "Gaetano Pirrello"
name_changes["Rafael Correa"] = "Rafael Correia"
name_changes["Tyler Bialeck"] = "Tyler Bialecki"
name_changes["Dave Moran"] = "Dave Morgan"
name_changes["Benoît Saint Denis"] = "Benoît Saint-Denis"
name_changes["Isabela de Padua"] = "Isabela de Pádua"
name_changes["Gilberto Galvao"] = "Gilberto Galvão"
name_changes["Piera Rodriguez"] = "Piera Rodríguez"
name_changes["Adrian Yanez"] = "Adrian Yañez"
name_changes["Diego Lopez"] = "Diego Lopes"

In [53]:
for c in ['Fighter', 'Opponent']:
    ufc_matches_df[c] = ufc_matches_df[c].apply(lambda x: name_changes[x] if x in name_changes else x)
    ufc_matches_df[c] = ufc_matches_df[c].str.replace('Jr.', 'Jr')
    ufc_matches_df[c] = ufc_matches_df[c].str.replace('Sr.', 'Sr')

In [54]:
ufc_matches_df['key'] = np.nan

for i, row in ufc_matches_df.iterrows():
    f, o, d = row['Fighter'], row['Opponent'], row['Date']
    if f < o:
        k = f'{f}_{o}_{d}'
    else:
        k = f'{o}_{f}_{d}'
    ufc_matches_df.loc[i, 'key'] = k

  ufc_matches_df.loc[i, 'key'] = k


In [55]:
ufc_matches_df = ufc_matches_df.drop_duplicates(subset=['key']).sort_values(['Date', 'Fighter']).reset_index(drop=True)

In [56]:
ufc_matches_df.head()

Unnamed: 0,Fighter,Opponent,Result,Date,Date raw,Outcome,key
0,Andrei Arlovski,Viacheslav Datsik,Loss,1999-04-09,9 April 1999,0,Andrei Arlovski_Viacheslav Datsik_1999-04-09 0...
1,Andrei Arlovski,Roman Zentsov,Win,2000-04-09,9 April 2000,1,Andrei Arlovski_Roman Zentsov_2000-04-09 00:00:00
2,Andrei Arlovski,Michael Tielrooy,Win,2000-04-09,9 April 2000,1,Andrei Arlovski_Michael Tielrooy_2000-04-09 00...
3,Andrei Arlovski,John Dixson,Win,2000-05-13,13 May 2000,1,Andrei Arlovski_John Dixson_2000-05-13 00:00:00
4,Andrei Arlovski,Aaron Brink,Win,2000-11-17,17 November 2000,1,Aaron Brink_Andrei Arlovski_2000-11-17 00:00:00


In [57]:
ufc_matches_df.shape

(7660, 7)

In [58]:
ufc_matches_df.to_parquet('data/ufc_matches_v2.parquet')

In [59]:
len(ufc_matches_df), len(ufc_matches_df[ufc_matches_df.Outcome==1]), len(ufc_matches_df[ufc_matches_df.Outcome==0])

(7660, 5975, 1685)

In [60]:
len(ufc_matches_df)==len(ufc_matches_df[ufc_matches_df.Outcome==1])+len(ufc_matches_df[ufc_matches_df.Outcome==0])

True

In [61]:
type(ufc_matches_df.iloc[0].Date)

pandas._libs.tslibs.timestamps.Timestamp

In [62]:
ufc_matches_df['Winner'] = ufc_matches_df.apply(lambda row: row.Fighter if row.Outcome==1 else row.Opponent, axis=1)

In [63]:
ufc_matches_df['Loser'] = ufc_matches_df.apply(lambda row: row.Opponent if row.Outcome==1 else row.Fighter, axis=1)

In [64]:
ufc_matches_df.head()

Unnamed: 0,Fighter,Opponent,Result,Date,Date raw,Outcome,key,Winner,Loser
0,Andrei Arlovski,Viacheslav Datsik,Loss,1999-04-09,9 April 1999,0,Andrei Arlovski_Viacheslav Datsik_1999-04-09 0...,Viacheslav Datsik,Andrei Arlovski
1,Andrei Arlovski,Roman Zentsov,Win,2000-04-09,9 April 2000,1,Andrei Arlovski_Roman Zentsov_2000-04-09 00:00:00,Andrei Arlovski,Roman Zentsov
2,Andrei Arlovski,Michael Tielrooy,Win,2000-04-09,9 April 2000,1,Andrei Arlovski_Michael Tielrooy_2000-04-09 00...,Andrei Arlovski,Michael Tielrooy
3,Andrei Arlovski,John Dixson,Win,2000-05-13,13 May 2000,1,Andrei Arlovski_John Dixson_2000-05-13 00:00:00,Andrei Arlovski,John Dixson
4,Andrei Arlovski,Aaron Brink,Win,2000-11-17,17 November 2000,1,Aaron Brink_Andrei Arlovski_2000-11-17 00:00:00,Andrei Arlovski,Aaron Brink


In [65]:
ufc_matches_df = ufc_matches_df.rename(columns={'Winner': 'winner', 'Loser': 'loser', 'Date': 'timestamp'})

In [66]:
ufc_matches_df[['winner', 'loser', 'timestamp']].to_parquet("data/ufc_matches_refined_v2.parquet")

In [None]:
#ufc_matches_df = pd.read_parquet("data/ufc_matches_refined.parquet")

In [67]:
self = TrueSkillThroughTimeApplied(ufc_matches_df)

In [68]:
print(ufc_matches_df.head().to_string())

           Fighter           Opponent Result  timestamp          Date raw  Outcome                                                    key             winner             loser  time_0_to_999_int
0  Andrei Arlovski  Viacheslav Datsik   Loss 1999-04-09      9 April 1999        0  Andrei Arlovski_Viacheslav Datsik_1999-04-09 00:00:00  Viacheslav Datsik   Andrei Arlovski                  0
1  Andrei Arlovski      Roman Zentsov    Win 2000-04-09      9 April 2000        1      Andrei Arlovski_Roman Zentsov_2000-04-09 00:00:00    Andrei Arlovski     Roman Zentsov                 39
2  Andrei Arlovski   Michael Tielrooy    Win 2000-04-09      9 April 2000        1   Andrei Arlovski_Michael Tielrooy_2000-04-09 00:00:00    Andrei Arlovski  Michael Tielrooy                 39
3  Andrei Arlovski        John Dixson    Win 2000-05-13       13 May 2000        1        Andrei Arlovski_John Dixson_2000-05-13 00:00:00    Andrei Arlovski       John Dixson                 42
4  Andrei Arlovski        Aaro

In [69]:
self.learn_optimal_parameters()


gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 4455.1450

gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 4455.1450

gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 4455.1450

gamma: 0.0200, sigma: 0.0200, beta: 0.0200, 
NLE: 4455.1450

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 5221.3064

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 5221.3064

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 5221.3064

gamma: 0.1000, sigma: 0.1000, beta: 0.1000, 
NLE: 5221.3064

gamma: 0.0366, sigma: 0.0366, beta: 0.0366, 
NLE: 4210.0036

gamma: 0.0366, sigma: 0.0366, beta: 0.0366, 
NLE: 4210.0036

gamma: 0.0366, sigma: 0.0366, beta: 0.0366, 
NLE: 4210.0036

gamma: 0.0366, sigma: 0.0366, beta: 0.0366, 
NLE: 4210.0036

gamma: 0.0697, sigma: 0.0697, beta: 0.0697, 
NLE: 4209.2526

gamma: 0.0697, sigma: 0.0697, beta: 0.0697, 
NLE: 4209.2526

gamma: 0.0697, sigma: 0.0697, beta: 0.0697, 
NLE: 4209.2526

gamma: 0.0697, sigma: 0.0697, beta: 0.0697, 
NLE: 4209.2526

gamma: 0.0679, sigma: 0

In [70]:
#NLE: 4209.2409
optimal_gamma = 0.0659
optimal_sigma = 3.9321
optimal_beta = 2.4791

In [71]:
self.set_optimal_parameters(gamma = optimal_gamma, sigma = optimal_sigma, beta = optimal_beta)

In [72]:
skill_curves = self.set_skill_curves()

In [73]:
# https://en.wikipedia.org/wiki/UFC_rankings
top_fighters_online = ['Islam Makhachev', 'Alex Pereira', 'Jon Jones', 'Ilia Topuria', 'Belal Muhammad', 'Dricus du Plessis', \
                       'Merab Dvalishvili', 'Tom Aspinall', 'Leon Edwards', 'Alexander Volkanovski', 'Max Holloway', \
                        'Alexandre Pantoja', "Sean O'Malley", 'Sean Strickland', 'Charles Oliveira']

In [74]:
self.plot_player_skills(players = top_fighters_online, width=1500, burnin=0)

In [75]:
self.plot_calibration()

  bucket_means = df.groupby('win_prob_bucket').agg(


### the calibration plot on the entire 7660 games dataset is very similar to the boxing data