In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import random
import time
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

pd.set_option('display.max_columns', None)

In [5]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

session = requests.Session()

retry_strategy = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)

adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("http://", adapter)
session.mount("https://", adapter)


def get_fighters():
    fighters = []
    
    for c in "abcdefghijklmnopqrstuvwxyz":
        url = f"http://ufcstats.com/statistics/fighters?char={c}&page=all"
        html = requests.get(url, headers=headers).text
        soup = BeautifulSoup(html, "html.parser")

        rows = soup.select("table.b-statistics__table tbody tr")

        for r in rows:
            link = r.select_one("td:nth-of-type(1) a")
            if link is None:
                continue

            name = link.text.strip()
            profile_url = link['href']

            surname = r.select_one("td:nth-of-type(2)")
            surname = surname.text.strip() if surname else None

            fighters.append({
                "name": name,
                "surname": surname,
                "profile_url": profile_url
            })

    return pd.DataFrame(fighters)

In [37]:
fighters_df = get_fighters()
fighters_df.head(5)

Unnamed: 0,name,surname,profile_url
0,Tom,Aaron,http://ufcstats.com/fighter-details/93fe7332d1...
1,Danny,Abbadi,http://ufcstats.com/fighter-details/15df64c02b...
2,Nariman,Abbasov,http://ufcstats.com/fighter-details/59a9d6dac6...
3,Darion,Abbey,http://ufcstats.com/fighter-details/4961467134...
4,David,Abbott,http://ufcstats.com/fighter-details/b361180739...


In [3]:
def get_fighter_info(url):
    url = f"{url}"
        
    try:
        response = session.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception as e:
        print("Retry failed:", e)
        return {}
    
    soup = BeautifulSoup(response.text, "html.parser")

    stats = {
        "height": None,
        "weight": None,
        "reach": None,
        "stance": None,
        "dob": None,
        "slpm": None,
        "stracc": None,
        "sapm": None,
        "strdef": None,
        "tdavg": None,
        "tdacc": None,
        "tddef": None,
        "subavg": None
    }

    items = soup.select("li.b-list__box-list-item.b-list__box-list-item_type_block")

    for item in items:
        parts = item.text.strip().split(":")
        if len(parts) != 2:
            continue

        key = parts[0].strip().lower()
        value = parts[1].strip()

        if key == "height":
            stats["height"] = value
        elif key == "weight":
            stats["weight"] = value
        elif key == "reach":
            stats["reach"] = value
        elif key == "stance":
            stats["stance"] = value
        elif key == "dob":
            stats["dob"] = value
        elif key == "slpm":
            stats["slpm"] = value
        elif key == "str. acc.":
            stats["stracc"] = value
        elif key == "sapm":
            stats["sapm"] = value
        elif key == "str. def":
            stats["strdef"] = value
        elif key == "td avg.":
            stats["tdavg"] = value
        elif key == "td acc.":
            stats["tdacc"] = value
        elif key == "td def.":
            stats["tddef"] = value
        elif key == "sub. avg.":
            stats["subavg"] = value
            
    return stats

In [19]:
def get_events():
    events = []
    
    url = "http://www.ufcstats.com/statistics/events/completed?page=all"
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, "html.parser")
    
    rows = soup.select("table.b-statistics__table-events tbody tr")

    for r in rows:
        link = r.select_one("td:nth-of-type(1) a")
        date = r.select_one("td:nth-of-type(1) span")
        if not link or not date:
            continue

        event_name = link.text.strip()
        event_url = link['href']
        event_date = int(date.text.strip().split(",")[1])

        events.append({
            "event_name": event_name,
            "event_url": event_url,
            "event_date": event_date
        })
    del events[0]
    
    return events

In [20]:
def get_fights(event):
    url = event['event_url']
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, 'html.parser')
    
    fights = []
    
    rows = soup.select("table.b-fight-details__table tbody tr")
    
    for r in rows:
        winner = r.select_one("td:nth-of-type(2) p:nth-of-type(1) a").text.strip()
        looser = r.select_one("td:nth-of-type(2) p:nth-of-type(2) a").text.strip()
        fighter1_url = r.select_one("td:nth-of-type(2) p:nth-of-type(1) a")['href']
        fighter2_url = r.select_one("td:nth-of-type(2) p:nth-of-type(2) a")['href']
        
        fights.append({
            "winner": winner,
            "winner_url": fighter1_url,
            "looser": looser,
            "looser_url": fighter2_url,
            "fight_date": event['event_date']
        })
    return fights

In [None]:
events = get_events()
all_fights = [get_fights(event) for event in events]
all_fights = [fight for event in all_fights for fight in event]

KeyboardInterrupt: 

In [10]:
fights_dataset = pd.read_csv("../data/raw/fights_dataset.csv")
fights_dataset.drop('Unnamed: 0', axis=1)

Unnamed: 0,winner,winner_url,looser,looser_url
0,Islam Makhachev,http://www.ufcstats.com/fighter-details/275aca...,Jack Della Maddalena,http://www.ufcstats.com/fighter-details/6b453b...
1,Valentina Shevchenko,http://www.ufcstats.com/fighter-details/132deb...,Zhang Weili,http://www.ufcstats.com/fighter-details/1ebe20...
2,Michael Morales,http://www.ufcstats.com/fighter-details/c32aeb...,Sean Brady,http://www.ufcstats.com/fighter-details/45f7cb...
3,Carlos Prates,http://www.ufcstats.com/fighter-details/7ee0fd...,Leon Edwards,http://www.ufcstats.com/fighter-details/f1fac9...
4,Benoit Saint Denis,http://www.ufcstats.com/fighter-details/c2299e...,Beneil Dariush,http://www.ufcstats.com/fighter-details/08af93...
...,...,...,...,...
8449,Orlando Wiet,http://www.ufcstats.com/fighter-details/a6c2f5...,Robert Lucarelli,http://www.ufcstats.com/fighter-details/96087e...
8450,Frank Hamaker,http://www.ufcstats.com/fighter-details/c3c23c...,Thaddeus Luster,http://www.ufcstats.com/fighter-details/505934...
8451,Johnny Rhodes,http://www.ufcstats.com/fighter-details/319fa1...,David Levicki,http://www.ufcstats.com/fighter-details/49590e...
8452,Patrick Smith,http://www.ufcstats.com/fighter-details/46c8ec...,Ray Wizard,http://www.ufcstats.com/fighter-details/ea0ad1...


In [11]:
for idx, fight in fights_dataset.iterrows():
    random_state = random.randint(0, 1)
    
    if random_state == 1:
        fights_dataset.at[idx, 'outcome'] = 1
    else:
        fights_dataset.at[idx, 'outcome'] = 0
        
        winner_temp = fight['winner']
        looser_temp = fight['looser']
        winner_url_temp = fight['winner_url']
        looser_url_temp = fight['looser_url']
        
        fights_dataset.at[idx, 'winner'] = looser_temp
        fights_dataset.at[idx, 'winner_url'] = looser_url_temp
        fights_dataset.at[idx, 'looser'] = winner_temp
        fights_dataset.at[idx, 'looser_url'] = winner_url_temp

In [12]:
fights_dataset = fights_dataset.drop('Unnamed: 0', axis=1)

In [162]:
fights_dataset.to_csv(os.path.join(r"C:\Users\user\Desktop\UFC-fights-prediction\data\raw", "fights_dataset_randomized.csv"))

In [13]:
fights_dataset = pd.read_csv('../data/raw/fights_dataset_randomized.csv').drop('Unnamed: 0', axis=1)

In [9]:
def get_win_streaks(fighter1_url, opponent_name):
    try:
        r = session.get(fighter1_url, headers=headers, timeout=10)
        r.raise_for_status()
    except:
        return {"cur_streak": 0, "max_streak": 0}

    soup = BeautifulSoup(r.text, "html.parser")

    name = soup.select_one("span.b-content__title-highlight").text.strip()

    rows = soup.select("table.b-fight-details__table tbody tr")

    fight_idx = None
    for idx, row in enumerate(rows):
        names = row.select("td.b-fight-details__table-col:nth-of-type(2) a")
        if len(names) < 2:
            continue

        p1, p2 = names[0].text.strip(), names[1].text.strip()

        if {p1, p2} == {name, opponent_name}:
            fight_idx = idx
            break

    if fight_idx is None:
        return {"cur_streak": 0, "max_streak": 0}

    last_fights = rows[fight_idx + 1:]
    last_fights = last_fights[::-1]

    results = []
    for r in last_fights:
        res = r.select_one("td.b-fight-details__table-col:nth-of-type(1) p")
        if res:
            results.append(res.text.strip().upper())

    cur = 0
    maxs = 0
    for r in results:
        if r == "WIN":
            cur += 1
        else:
            cur = 0
        maxs = max(maxs, cur)
    
    return {
        "cur_streak": cur,
        "max_streak": maxs,
    }

In [7]:
def get_fights_ds_with_stats(fights_dataset):
    fighter_cache = {}
    
    for idx, fight in fights_dataset.iterrows():
        url_1 = fights_dataset.at[idx, 'winner_url']
        url_2 = fights_dataset.at[idx, 'looser_url']
        
        if url_1 not in fighter_cache:
            fighter_cache[url_1] = get_fighter_info(url_1)
            fighter_1_stats = fighter_cache[url_1]
            time.sleep(random.uniform(1.0, 2.5))
        else:
            fighter_1_stats = fighter_cache[url_1]

        if url_2 not in fighter_cache:
            fighter_cache[url_2] = get_fighter_info(url_2)
            fighter_2_stats = fighter_cache[url_2]
            time.sleep(random.uniform(1.0, 2.5))
        else:
            fighter_2_stats = fighter_cache[url_2]
        
        for item in fighter_1_stats:
            fights_dataset.at[idx, item + "_1"] = fighter_1_stats[item]
            
        for item in fighter_2_stats:
            fights_dataset.at[idx, item + "_2"] = fighter_2_stats[item]
            
        if idx % 500 == 0:
            fights_dataset.to_csv("../data/raw/progress.csv", index=False)
            
    columns = list(fights_dataset.columns.values)
    columns.remove('outcome')
    columns.append('outcome')
    fights_dataset = fights_dataset[columns].drop(['winner_url', 'looser_url'], axis=1)
    
    return fights_dataset

In [14]:
def get_win_streak_dataset(fights_dataset):
    fighter_cache = {}
    ws_df = pd.DataFrame()
    
    for idx, fight in fights_dataset.iterrows():
        url_1 = fights_dataset.at[idx, 'winner_url']
        url_2 = fights_dataset.at[idx, 'looser_url']
        opp_name_1 = fights_dataset.at[idx, 'looser']
        opp_name_2 = fights_dataset.at[idx, 'winner']
        
        if url_1 not in fighter_cache:
            fighter_cache[url_1] = get_win_streaks(url_1, opp_name_1)
            fighter_1_stats = fighter_cache[url_1]
            time.sleep(random.uniform(1.0, 2.5))
        else:
            fighter_1_stats = fighter_cache[url_1]
            
        if url_2 not in fighter_cache:
                fighter_cache[url_2] = get_win_streaks(url_2, opp_name_2)
                fighter_2_stats = fighter_cache[url_2]
                time.sleep(random.uniform(1.0, 2.5))
        else:
            fighter_2_stats = fighter_cache[url_2]
        
        for item in fighter_1_stats:
            ws_df.at[idx, item + "_1"] = fighter_1_stats[item]
            
        for item in fighter_2_stats:
            ws_df.at[idx, item + "_2"] = fighter_2_stats[item]
            
        if idx % 500 == 0:
            ws_df.to_csv("../data/raw/progress_ws2.csv", index=False)
            
    return ws_df

In [21]:
def get_fight_dates():
    events = get_events()
    all_fights = [get_fights(event) for event in events]
    all_fights = [fight for event in all_fights for fight in event]
    
    df = pd.DataFrame(all_fights)
    
    return df['fight_date']

In [22]:
fight_dates = get_fight_dates()

In [16]:
fights_dataset_with_stats = get_fights_ds_with_stats(fights_dataset)

In [17]:
fights_dataset_with_stats.to_csv(os.path.join(r"C:\Users\user\Desktop\UFC-fights-prediction\data\raw", "fights_dataset_with_stats.csv"))

In [15]:
df_for_ws = pd.read_csv("../data/raw/fights_dataset_randomized.csv")

In [16]:
winstreak_dataset = get_win_streak_dataset(df_for_ws)

In [17]:
winstreak_dataset.to_csv("../data/raw/winstreak_dataset.csv", index=False)

In [28]:
winstreak_dataset = pd.read_csv("../data/raw/winstreak_dataset.csv")

In [29]:
fights_dataset = pd.read_csv('../data/raw/fights_dataset_with_stats.csv')
fights_dataset = pd.concat([fights_dataset, winstreak_dataset], axis=1)
fights_dataset

Unnamed: 0,winner,looser,height_1,weight_1,reach_1,stance_1,dob_1,slpm_1,stracc_1,sapm_1,strdef_1,tdavg_1,tdacc_1,tddef_1,subavg_1,height_2,weight_2,reach_2,stance_2,dob_2,slpm_2,stracc_2,sapm_2,strdef_2,tdavg_2,tdacc_2,tddef_2,subavg_2,fight_date,outcome,cur_streak_1,max_streak_1,cur_streak_2,max_streak_2
0,Islam Makhachev,Jack Della Maddalena,"5' 10""",170 lbs.,"70""",Southpaw,"Oct 27, 1991",2.45,58%,1.45,61%,3.10,56%,91%,1.0,"5' 11""",170 lbs.,"73""",Switch,"Sep 10, 1996",5.57,51%,3.84,63%,0.13,10%,64%,0.1,2025.0,1.0,15.0,15.0,9.0,9.0
1,Zhang Weili,Valentina Shevchenko,"5' 4""",115 lbs.,"63""",Switch,"Aug 13, 1989",4.66,52%,2.63,53%,1.96,44%,63%,0.5,"5' 5""",125 lbs.,"66""",Southpaw,"Mar 07, 1988",3.04,52%,2.00,63%,2.65,61%,76%,0.3,2025.0,0.0,5.0,5.0,2.0,9.0
2,Michael Morales,Sean Brady,"6' 0""",170 lbs.,"79""",Orthodox,"Jun 24, 1999",5.68,49%,3.26,53%,1.04,41%,89%,0.0,"5' 10""",170 lbs.,"72""",Orthodox,"Nov 23, 1992",3.96,55%,3.05,59%,3.53,53%,86%,0.9,2025.0,1.0,7.0,7.0,3.0,5.0
3,Leon Edwards,Carlos Prates,"6' 2""",170 lbs.,"74""",Southpaw,"Aug 25, 1991",2.60,54%,2.44,52%,1.25,37%,63%,0.4,"6' 1""",170 lbs.,"78""",Switch,"Aug 17, 1993",3.77,55%,4.53,47%,0.21,100%,80%,0.0,2025.0,0.0,0.0,8.0,1.0,5.0
4,Benoit Saint Denis,Beneil Dariush,"5' 11""",155 lbs.,"73""",Southpaw,"Dec 18, 1995",5.07,55%,4.21,42%,4.33,37%,70%,1.6,"5' 10""",155 lbs.,"72""",Southpaw,"May 06, 1989",3.78,49%,2.62,57%,2.11,38%,82%,0.8,2025.0,1.0,2.0,5.0,1.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8449,Orlando Wiet,Robert Lucarelli,"5' 10""",170 lbs.,--,Southpaw,"Oct 24, 1965",0.00,0%,0.00,0%,0.00,0%,0%,0.0,"6' 2""",245 lbs.,--,,--,0.00,0%,0.00,0%,0.00,0%,0%,0.0,1994.0,1.0,1.0,1.0,0.0,0.0
8450,Frank Hamaker,Thaddeus Luster,--,--,--,,--,0.00,0%,0.00,0%,0.00,0%,0%,0.0,"6' 3""",210 lbs.,--,,--,0.00,0%,0.00,0%,0.00,0%,0%,0.0,1994.0,1.0,0.0,0.0,0.0,0.0
8451,David Levicki,Johnny Rhodes,"6' 5""",275 lbs.,--,,--,0.00,0%,0.00,0%,0.00,0%,0%,0.0,"6' 0""",210 lbs.,--,Orthodox,--,0.00,0%,0.00,0%,0.00,0%,0%,0.0,1994.0,0.0,0.0,0.0,2.0,2.0
8452,Ray Wizard,Patrick Smith,--,--,--,,--,0.00,0%,0.00,0%,0.00,0%,0%,0.0,"6' 2""",225 lbs.,--,Orthodox,"Aug 28, 1963",0.00,0%,0.00,0%,0.00,0%,0%,0.0,1994.0,0.0,0.0,0.0,0.0,3.0


In [4]:
complete_df = pd.concat([fights_dataset, winstreak_dataset], axis=1)

In [40]:
complete_df = pd.read_csv('../data/raw/fights_dataset_with_stats.csv')
complete_df = pd.concat([complete_df, fight_dates.to_frame().head(8453)], axis=1)

In [30]:
outcome = fights_dataset.outcome
fights_dataset = fights_dataset.drop('outcome', axis=1)
fights_dataset = pd.concat([fights_dataset, outcome.to_frame()], axis=1)

In [31]:
fights_dataset.to_csv("../data/raw/fights_dataset_with_stats.csv", index=False)

In [32]:
df = pd.read_csv("../data/raw/fights_dataset_with_stats.csv")

In [33]:
from sklearn.model_selection import train_test_split

test, train = train_test_split(df, test_size=0.7, random_state=123, shuffle=False)

In [34]:
train.to_csv("../data/raw/train_raw.csv", index=False)
test.to_csv("../data/raw/test_raw.csv", index=False)