In [1]:
import itertools
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

# Module-level Firefox WebDriver session; the scraping functions in this
# notebook read it as a global rather than taking it as an argument.
# NOTE(review): nothing in the visible cells calls browser.quit(), so the
# Firefox process presumably stays open until the kernel/user closes it.
browser = webdriver.Firefox()

In [2]:
def get_wfh_teams(year=2024, division=1):
    """Scrape the stats.ncaa.org team list for sport code ``WFH``.

    Loads the team-list page for the given academic year and division in the
    shared module-level ``browser`` and collects every link found inside the
    first ``<table>`` of the page.

    Args:
        year: Value sent as the ``academic_year`` query parameter.
        division: Value sent as the ``division`` query parameter.

    Returns:
        A ``(team_names, team_links)`` tuple of two equally long lists: the
        link texts and the corresponding ``href`` values, in page order.

    Raises:
        AttributeError: If the page contains no ``<table>`` element (for
            example when the site returns an error page).
    """
    url = f"https://stats.ncaa.org/team/inst_team_list?academic_year={year}&division={division}&sport_code=WFH"
    browser.get(url)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a GuessedAtParserWarning.
    soup = BeautifulSoup(browser.page_source, "html.parser")
    anchors = soup.find('table').find_all('a', href=True)
    team_names = []
    team_links = []
    for anchor in anchors:
        team_names.append(anchor.text)
        team_links.append(anchor['href'])
    return team_names, team_links

In [3]:
def format_goals(txt: str) -> int:
    """Extract the integer formed by the digit characters in ``txt``.

    Every non-digit character is discarded and the remaining digits are
    concatenated, so ``"4/"`` becomes ``4`` and ``"1 2"`` becomes ``12``.

    Args:
        txt: Cell text to parse.

    Returns:
        The parsed integer, or ``0`` when ``txt`` is empty or contains no
        digit at all (e.g. a lone ``"-"`` placeholder), instead of failing
        on ``int('')``.
    """
    digits = ''.join(ch for ch in txt if ch.isdigit())
    if not digits:
        return 0
    return int(digits)

In [4]:
def is_home(text):
    """Classify an opponent label by where the ``@`` sign appears.

    Args:
        text: Opponent label text; surrounding whitespace is ignored when
            checking the first character.

    Returns:
        ``-1`` if the label starts with ``@``, ``0`` if ``@`` appears
        anywhere else in it, and ``1`` if there is no ``@`` at all
        (including an empty or whitespace-only label).
        NOTE(review): presumably -1/0/1 mean away/neutral-site/home, going by
        the function name — confirm against the site's opponent format.
    """
    # startswith() is safe on an empty string, unlike indexing [0].
    if text.strip().startswith('@'):
        return -1
    if '@' in text:
        return 0
    return 1

In [5]:
def get_team_matches(url_end):
    """Scrape one team's game-by-game table into a DataFrame.

    The class-less ``<tr>`` rows of the first ``mytable`` table inside
    ``#game_breakdown_div`` are consumed in pairs: the first row of each pair
    supplies date, opponent link, result and ``gf``; the second row supplies
    ``ga`` and ``time``.
    NOTE(review): presumably the first row is the team's own stat line and the
    second the opponent's, and cell 13 is a playing-time column — confirm
    against the live page layout.

    Args:
        url_end: Team link of the form ``/<a>/<team_id>/<ctl_id>``; it must
            split on ``/`` into exactly four parts.

    Returns:
        DataFrame with columns ``team_id``, ``date``, ``opponent`` (third
        path segment of the opponent link), ``home`` (see ``is_home``),
        ``result``, ``gf``, ``ga`` and ``time``; one row per complete pair.

    Raises:
        ValueError: If ``url_end`` does not split into four parts.
        AttributeError, IndexError: If the expected div/table/cells are
            missing from the page.
        TypeError: If a paired row has no class-less opponent link.
    """
    _, _, team_id, game_sport_year_ctl_id = url_end.split('/')
    url = f"https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id={game_sport_year_ctl_id}&org_id={team_id}&stats_player_seq=-100"
    browser.get(url)
    # Name the parser explicitly so results do not depend on which optional
    # parser is installed (and to avoid GuessedAtParserWarning).
    page = BeautifulSoup(browser.page_source, "html.parser")
    table = page.find('div', {'id': 'game_breakdown_div'}).find_all('table', attrs={'class': 'mytable'})[0]

    # Query the rows once and split them into the two halves of each pair
    # *before* extracting cells, so a column is only indexed on rows that
    # are actually expected to have it.
    rows = table.find_all('tr', attrs={'class': None})
    first_rows = rows[::2]
    second_rows = rows[1::2]
    # A trailing unpaired row is dropped.
    played = min(len(first_rows), len(second_rows))

    first_cells = [row.find_all('td') for row in first_rows[:played]]
    second_cells = [row.find_all('td') for row in second_rows[:played]]
    opponent = [cells[1].find('a', attrs={'class': None}) for cells in first_cells]

    return pd.DataFrame({
        'team_id': team_id,
        'date': [cells[0].text.strip() for cells in first_cells],
        'opponent': [link['href'].split('/')[2] for link in opponent],
        'home': [is_home(link.text) for link in opponent],
        'result': [cells[2].text.strip() for cells in first_cells],
        'gf': [format_goals(cells[3].text.strip()) for cells in first_cells],
        'ga': [format_goals(cells[3].text.strip()) for cells in second_cells],
        'time': [cells[13].text.strip() for cells in second_cells]
    })

In [6]:
def get_year_data(year=2024, division=1):
    """Scrape every listed team's matches for one year and save them.

    Fetches the team list, scrapes each team's match table, attaches the
    team name via an inner join on ``team_id`` and writes the result to
    ``data/matches_{year}.feather`` (zstd-compressed). Returns ``None``.

    Args:
        year: Academic year passed to ``get_wfh_teams`` and used in the
            output file name.
        division: Division passed to ``get_wfh_teams``. The output file name
            does not include it, so runs for different divisions of the same
            year overwrite each other.
    """
    out_file = f"data/matches_{year}.feather"
    # Create the output directory before scraping: otherwise a missing folder
    # is only discovered after the whole crawl, and the scraped data is lost.
    Path(out_file).parent.mkdir(parents=True, exist_ok=True)

    team_names, team_links = get_wfh_teams(year=year, division=division)
    teams = pd.DataFrame({
        'name': team_names,
        'team_id': [link.split('/')[2] for link in team_links]
    })
    matches = pd.concat(map(get_team_matches, team_links))
    matches.merge(teams, on='team_id', how='inner').to_feather(out_file, compression="zstd")

In [7]:
# Scrape and store one feather file per year, 2022 through 2024 inclusive.
for year in range(2022, 2025):
    get_year_data(year)