In [1]:
import requests
from bs4 import BeautifulSoup
import re
import json

# Root of the site being scraped; site-relative hrefs are resolved against it.
BASE_URL = "http://www.collegehockeystats.net"
# Index page listing the per-team stats pages
# (presumably the 2019-20 ECAC men's season, judging by the path -- confirm).
SEASON_URL = BASE_URL + "/1920/teamstats/ecachm"

def fetch_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200; an empty
        string for any other status code or when the request itself fails
        (connection error, timeout, invalid URL, ...).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat transport failures like a non-200 answer so that one
        # unreachable page does not abort the whole scrape.
        return ""
    return response.text if response.status_code == 200 else ""

def extract_team_links(html):
    """Collect the team-stats links found in the season index page.

    Args:
        html: Markup of the season index page.

    Returns:
        A list of ``(team_label, absolute_url)`` tuples, one per distinct
        href containing ``/1920/teamstats/``, in document order. The label is
        the anchor's visible text with surrounding whitespace stripped.
    """
    # Local import keeps this block self-contained; urljoin resolves
    # root-relative, dot-relative and already-absolute hrefs correctly,
    # whereas plain concatenation only works for root-relative ones.
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, "html.parser")
    collected_links = []
    unique_paths = set()

    for tag in soup.find_all("a", href=True):
        href = tag["href"]
        if "/1920/teamstats/" not in href or href in unique_paths:
            continue
        unique_paths.add(href)
        collected_links.append((tag.get_text(strip=True), urljoin(BASE_URL, href)))
    return collected_links

def extract_home_record(page_text):
    """Pull a ``(wins, losses, ties)`` record out of a team page's text.

    The patterns are tried in order and the first match wins:

    1. ``Home Record: W-L-T``
    2. ``Overall - N GP (W-L-T`` -- note that this fallback is the team's
       overall record, not a home-specific one.

    Args:
        page_text: Plain text of the team page.

    Returns:
        A tuple of three ints, or ``(0, 0, 0)`` when neither pattern matches.
    """
    candidate_patterns = (
        r"Home Record:\s+(\d+)-(\d+)-(\d+)",
        r"Overall\s*-\s*\d+\s*GP\s*\((\d+)-(\d+)-(\d+)",
    )
    for pattern in candidate_patterns:
        found = re.search(pattern, page_text)
        if found is not None:
            return tuple(int(part) for part in found.groups())
    return (0, 0, 0)

def extract_player_info(soup):
    """Build a roster from every table row that looks like a player line.

    A row qualifies when it has at least two ``<td>`` cells, the first cell's
    stripped text consists only of digits (the jersey number) and the second
    cell's stripped text (the player name) is non-empty.

    Args:
        soup: Parsed team page; only ``find_all`` on it and on the elements
            it returns, plus ``get_text(strip=True)`` on cells, are used.

    Returns:
        A list of ``{"number": str, "name": str}`` dicts in first-seen order,
        with each (number, name) pair appearing only once.
    """
    roster = []
    # find_all searches all descendants, so rows inside nested tables are
    # reached once per enclosing table, and a player listed in several tables
    # would otherwise be appended repeatedly.
    seen = set()
    for table in soup.find_all("table"):
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) < 2:
                continue
            jersey_num = cells[0].get_text(strip=True)
            player_name = cells[1].get_text(strip=True)
            if not (jersey_num.isdigit() and player_name):
                continue
            key = (jersey_num, player_name)
            if key in seen:
                continue
            seen.add(key)
            roster.append({
                "number": jersey_num,
                "name": player_name
            })
    return roster

def collect_team_data():
    """Scrape every team linked from the season page.

    Fetches ``SEASON_URL``, follows each team link found there, and for each
    team page extracts the record returned by ``extract_home_record`` and the
    roster returned by ``extract_player_info``. A progress line is printed
    before each team page is fetched.

    Returns:
        A list of dicts with the keys ``team``, ``home_wins``,
        ``home_losses``, ``home_ties`` and ``players``, in link order.
    """
    results = []
    team_links = extract_team_links(fetch_html(SEASON_URL))

    for team_name, team_url in team_links:
        print(f"Fetching data for team: {team_name}")
        team_soup = BeautifulSoup(fetch_html(team_url), "html.parser")

        # The record is parsed from the flattened page text, the roster from
        # the table structure.
        won, lost, tied = extract_home_record(team_soup.get_text())
        players = extract_player_info(team_soup)

        results.append({
            "team": team_name,
            "home_wins": won,
            "home_losses": lost,
            "home_ties": tied,
            "players": players,
        })

    return results

if __name__ == "__main__":
    # Run the full scrape and dump the result as indented JSON on stdout.
    print(json.dumps(collect_team_data(), indent=2))


Fetching data for team: Brown
Fetching data for team: Clarkson
Fetching data for team: Colgate
Fetching data for team: Cornell
Fetching data for team: Dartmouth
Fetching data for team: Harvard
Fetching data for team: Princeton
Fetching data for team: Quinnipiac
Fetching data for team: Rensselaer
Fetching data for team: St. Lawrence
Fetching data for team: Union
Fetching data for team: Yale
[
  {
    "team": "Brown",
    "home_wins": 8,
    "home_losses": 21,
    "home_ties": 2,
    "players": [
      {
        "number": "8",
        "name": "Zach Giuttari"
      },
      {
        "number": "13",
        "name": "Brent Beaudoin"
      },
      {
        "number": "7",
        "name": "Justin Jallen"
      },
      {
        "number": "17",
        "name": "Bradley Cocca"
      },
      {
        "number": "6",
        "name": "Tony Stillwell"
      },
      {
        "number": "2",
        "name": "Luke Krys"
      },
      {
        "number": "25",
        "name": "Michael Maloney"
  