In [113]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import os
import re

In [114]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.169 Safari/537.36"}

years = list(range(2010,2026))

output_dir = "html_pages"
os.makedirs(output_dir, exist_ok=True)

for year in years:
    url = f'https://www.hockeyfights.com/fightlog/1/reg{year}/1'
    print(f"Scraping URL: {url}")

    response = requests.get(url, headers=headers)
    
    if response.status_code == 403:
        print(f"Access denied for {url}. Status code: 403.")
        break  
    elif response.status_code != 200:
        print(f"Failed to fetch {url}. Status code: {response.status_code}")
        break 

    output_file = os.path.join(output_dir, f"hockeyfights_{year}.html")
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(response.text)

    print(f"HTML content for {year} saved to {output_file}")

print("Scraping complete!")

Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2010/1
HTML content for 2010 saved to html_pages/hockeyfights_2010.html
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2011/1
HTML content for 2011 saved to html_pages/hockeyfights_2011.html
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2012/1
HTML content for 2012 saved to html_pages/hockeyfights_2012.html
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2013/1
HTML content for 2013 saved to html_pages/hockeyfights_2013.html
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2014/1
HTML content for 2014 saved to html_pages/hockeyfights_2014.html
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2015/1
HTML content for 2015 saved to html_pages/hockeyfights_2015.html
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2016/1
HTML content for 2016 saved to html_pages/hockeyfights_2016.html
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2017/1
HTML content for 2017 saved to ht

In [115]:
folder_path = 'html_pages'
pages = {}

for filename in os.listdir(folder_path):
    if filename.endswith(".html"):
        file_path = os.path.join(folder_path, filename)
        
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        
        year_match = re.search(r'(\d{4})', filename)
        if year_match:
            year = year_match.group(1)
        
        match = re.search(r'lastPage.*?(\d+)', content)
        if match:
            last_page = match.group(1) 
            pages[year] = int(last_page)

urls = []
for year, last_page in pages.items():
    for page in range(1, last_page + 1):
        url = f'https://www.hockeyfights.com/fightlog/1/reg{year}/{page}'
        urls.append(url)
        print(f"Scraping URL: {url}")

Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2012/1
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2012/2
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2012/3
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2012/4
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2012/5
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2012/6
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2012/7
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2012/8
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2012/9
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2012/10
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2012/11
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2012/12
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2012/13
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2012/14
Scraping URL: https://www.hockeyfights.com/fightlog/1/reg2012/15
Scraping URL: https://www.hockeyfi

In [116]:
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
})

for url in urls:
    try:
        response_main = session.get(url, timeout=10)
        response_main.raise_for_status()
        
        safe_filename = url.replace("https://", "").replace("/", "_").replace(":", "_")
        output_file = os.path.join(output_dir, f"fights_{safe_filename}.html")
        with open(output_file, "w", encoding="utf-8") as file:
            file.write(response_main.text)
        
        print(f"Saved HTML for URL: {url} to {output_file}")
        
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")


Saved HTML for URL: https://www.hockeyfights.com/fightlog/1/reg2012/1 to html_pages/fights_www.hockeyfights.com_fightlog_1_reg2012_1.html
Saved HTML for URL: https://www.hockeyfights.com/fightlog/1/reg2012/2 to html_pages/fights_www.hockeyfights.com_fightlog_1_reg2012_2.html
Saved HTML for URL: https://www.hockeyfights.com/fightlog/1/reg2012/3 to html_pages/fights_www.hockeyfights.com_fightlog_1_reg2012_3.html
Saved HTML for URL: https://www.hockeyfights.com/fightlog/1/reg2012/4 to html_pages/fights_www.hockeyfights.com_fightlog_1_reg2012_4.html
Saved HTML for URL: https://www.hockeyfights.com/fightlog/1/reg2012/5 to html_pages/fights_www.hockeyfights.com_fightlog_1_reg2012_5.html
Saved HTML for URL: https://www.hockeyfights.com/fightlog/1/reg2012/6 to html_pages/fights_www.hockeyfights.com_fightlog_1_reg2012_6.html
Saved HTML for URL: https://www.hockeyfights.com/fightlog/1/reg2012/7 to html_pages/fights_www.hockeyfights.com_fightlog_1_reg2012_7.html
Saved HTML for URL: https://www.ho

In [117]:
html_dir = "html_pages" 
output_file = "extracted_fight_links.txt"  

fight_links_main = []

for filename in os.listdir(html_dir):
    if filename.endswith(".html"):
        file_path = os.path.join(html_dir, filename)

        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        soup = BeautifulSoup(content, "html.parser")
        fight_links = soup.find_all('a', href=lambda href: href and href.startswith("/fights/"))

        for fight in fight_links:
            href = fight['href']  
            full_url = f"https://www.hockeyfights.com{href}"  
            fight_links_main.append(full_url)

with open(output_file, "w", encoding="utf-8") as f:
    for link in fight_links_main:
        f.write(link + "\n")

print(f"Success! We have {len(fight_links_main)} links!")

Success! We have 17835 links!


In [118]:
fights_file = "extracted_fight_links.txt"

output_dir = "fights_html"
os.makedirs(output_dir, exist_ok=True)

with open(fights_file, "r", encoding="utf-8") as file:
    fight_links = file.readlines()

fight_links = [link.strip() for link in fight_links]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
}

for link in fight_links:
    try:
        response = requests.get(link, headers=headers)

        if response.status_code == 403:
            print(f"Access forbidden for {link} (403 error). Skipping...")
            continue
        elif response.status_code != 200:
            print(f"Failed to fetch {link}, status code: {response.status_code}. Skipping...")
            continue

        file_name = link.split("/")[-1] + ".html"
        output_path = os.path.join(output_dir, file_name)

        with open(output_path, "w", encoding="utf-8") as file:
            file.write(response.text)

        print(f"Saved HTML for {link} to {output_path}")

    except Exception as e:
        print(f"Error processing {link}: {e}")

Saved HTML for https://www.hockeyfights.com/fights/110287 to fights_html/110287.html
Saved HTML for https://www.hockeyfights.com/fights/110287 to fights_html/110287.html
Saved HTML for https://www.hockeyfights.com/fights/110287 to fights_html/110287.html
Saved HTML for https://www.hockeyfights.com/fights/110281 to fights_html/110281.html
Saved HTML for https://www.hockeyfights.com/fights/110281 to fights_html/110281.html
Saved HTML for https://www.hockeyfights.com/fights/110281 to fights_html/110281.html
Saved HTML for https://www.hockeyfights.com/fights/110278 to fights_html/110278.html
Saved HTML for https://www.hockeyfights.com/fights/110278 to fights_html/110278.html
Saved HTML for https://www.hockeyfights.com/fights/110278 to fights_html/110278.html
Saved HTML for https://www.hockeyfights.com/fights/110279 to fights_html/110279.html
Saved HTML for https://www.hockeyfights.com/fights/110279 to fights_html/110279.html
Saved HTML for https://www.hockeyfights.com/fights/110279 to figh

In [119]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import re

html_dir = "fights_html"

fights_data = []

for filename in os.listdir(html_dir):
    if filename.endswith(".html"):
        file_path = os.path.join(html_dir, filename)

        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        soup = BeautifulSoup(content, "html.parser")

        # Extract the fight date
        fight_date = soup.find('span', class_="text-gray-400 dark:text-gray-200")
        fight_date = fight_date.text.strip() if fight_date else None

        # Extract the period and time
        time = soup.find('span', class_="text-center text-gray-400")
        time = time.text.strip() if time else None
        
        # Split the time string (e.g., "1st Period - 5:02")
        if time:
            period, period_time = time.split(" - ")
            # Extract only the numeric part of the period (e.g., "1" from "1st")
            period = re.sub(r'\D', '', period)  # Remove any non-digit characters
            time = period_time.strip()  # Get the time (e.g., "5:02")

        # Extract player and team names
        players_1 = soup.find_all('span', class_="text-center text-lg text-gray-800 dark:text-gray-50 md:text-2xl md:text-end")
        players_2 = soup.find_all('span', class_="text-center text-lg text-gray-800 dark:text-gray-50 md:text-2xl md:text-start")
        
        teams_1 = soup.find_all('span', class_="text-center text-sm text-gray-500 md:text-base md:text-end")
        teams_2 = soup.find_all('span', class_="text-center text-sm text-gray-500 md:text-base md:text-start")
        
        # Extract voting results
        voting_results = soup.find_all('a', href=True)  # Targets <a> tags with player links
        players = []
        votes = []

        for result in voting_results:
            player_name = result.find('span', class_="text-base text-gray-800 dark:text-gray-100")
            player_name = player_name.text.strip() if player_name else None

            vote_percentage = result.find('span', class_="text-base text-gray-600 dark:text-gray-100")
            vote_percentage = vote_percentage.text.strip() if vote_percentage else None

            vote_count = result.find('span', class_="text-xs leading-[18px] text-gray-400")
            vote_count = vote_count.text.strip() if vote_count else None

            if player_name and vote_percentage and vote_count:
                players.append(player_name)
                votes.append((vote_percentage, vote_count))

        # Only proceed if we have two players (a matchup)
        if len(players) == 2:
            # Extract the team names for the players
            team1_name = teams_1[0].text.strip() if teams_1 else None
            team2_name = teams_2[0].text.strip() if teams_2 else None

            fights_data.append({
                "Date": fight_date,
                "Period": period,  # Only numeric part of the period (e.g., "1")
                "Time": time,  # Time extracted (e.g., "5:02")
                "Player 1": players[0],
                "Team 1": team2_name,  # Team for Player 1
                "Player 1 Votes": votes[0][1],  # Votes for Player 1
                "Player 1 Vote Percentage": votes[0][0],  # Vote percentage for Player 1
                "Player 2": players[1],
                "Team 2": team1_name,  # Team for Player 2
                "Player 2 Votes": votes[1][1],  # Votes for Player 2
                "Player 2 Vote Percentage": votes[1][0], 
            })

# Convert to a pandas DataFrame
df = pd.DataFrame(fights_data)

# Save to Excel
output_file = "fights_data_with_teams.xlsx"
df.to_excel(output_file, index=False, engine="openpyxl")

print(f"Extraction complete. Data saved to {output_file}.")

Extraction complete. Data saved to fights_data_with_teams.xlsx.
