In [1]:
import selenium
from selenium import webdriver
from bs4 import BeautifulSoup
#import requests
import pandas as pd

browser = webdriver.Firefox()


In [2]:
def get_wfh_teams(year=2024, division=1):
    url = f"https://stats.ncaa.org/team/inst_team_list?academic_year={year}&division={division}&sport_code=WFH"
    browser.get(url)
    source = browser.page_source
    soup = BeautifulSoup(source)
    teams = soup.find('table').find_all('a', href=True)
    team_names = [a.text for a in teams]
    team_links = [a['href'] for a in teams]
    return team_names, team_links

In [3]:
team_names, team_links = get_wfh_teams(year=2022)

In [4]:
teams = pd.DataFrame({
    'name': team_names,
    'team_id': [x.split('/')[2] for x in team_links]
})

In [5]:
import itertools
def format_goals(txt: str) -> int:
    if len(txt) == 0:
        return 0
    return int(''.join([s for s in itertools.islice(txt, 0, None) if s.isdigit()]))

In [6]:
def get_team_matches(url_end):
    _, _, team_id, game_sport_year_ctl_id = url_end.split('/')
    url = f"https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id={game_sport_year_ctl_id}&org_id={team_id}&stats_player_seq=-100"
    print(url)
    browser.get(url)
    s = BeautifulSoup(browser.page_source)
    soup = s.find('div', {'id': 'game_breakdown_div'}).find_all('table', attrs={'class': 'mytable'})[0]
    date = [row.find_all('td')[0].text.strip() for row in soup.find_all('tr', attrs={'class': None})][::2]
    opponent = [row.find_all('td')[1].find('a', attrs={'class': None}) for row in soup.find_all('tr', attrs={'class': None})][::2]
    result = [row.find_all('td')[2].text.strip() for row in soup.find_all('tr', attrs={'class': None})][::2]
    gf = [format_goals(row.find_all('td')[3].text.strip()) for row in soup.find_all('tr', attrs={'class': None})][::2]
    ga = [format_goals(row.find_all('td')[3].text.strip()) for row in soup.find_all('tr', attrs={'class': None})][1::2]
    time = [row.find_all('td')[13].text.strip() for row in soup.find_all('tr', attrs={'class': None})][1::2]
    played = min(len(opponent), len(gf), len(ga), len(time))
    return pd.DataFrame({
        'team_id': team_id,
        'date': date[0:played],
        'opponent': [y['href'].split('/')[2] for y in opponent[0:played]],
        'result': result[0:played],
        'gf': gf[0:played],
        'ga': ga[0:played],
        'time': time[0:played]
    })

In [7]:
get_team_matches(team_links[0])

https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15760&org_id=23&stats_player_seq=-100


Unnamed: 0,team_id,date,opponent,result,gf,ga,time
0,23,08/27/2021,740,2-1 (1),2,1,62:29
1,23,08/29/2021,523,2-3 (1),2,3,66:54
2,23,09/03/2021,180,0-2,0,2,60:00
3,23,09/10/2021,575,2-1,2,1,60:00
4,23,09/12/2021,400,3-2 (2),3,2,73:21
5,23,09/18/2021,342,1-0 (1),1,0,62:20
6,23,09/24/2021,361,3-0,3,0,60:00
7,23,09/26/2021,392,0-4,0,4,60:00
8,23,10/02/2021,83,3-2,3,2,60:00
9,23,10/09/2021,352,2-0,2,0,60:00


In [8]:
df = pd.concat(map(get_team_matches, team_links))
df

https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15760&org_id=23&stats_player_seq=-100
https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15760&org_id=27&stats_player_seq=-100
https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15760&org_id=47&stats_player_seq=-100
https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15760&org_id=52&stats_player_seq=-100
https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15760&org_id=67&stats_player_seq=-100
https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15760&org_id=68&stats_player_seq=-100
https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15760&org_id=80&stats_player_seq=-100
https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15760&org_id=81&stats_player_seq=-100
https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15760&org_id=83&stats_player_seq=-100
https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_i

Unnamed: 0,team_id,date,opponent,result,gf,ga,time
0,23,08/27/2021,740,2-1 (1),2,1,62:29
1,23,08/29/2021,523,2-3 (1),2,3,66:54
2,23,09/03/2021,180,0-2,0,2,60:00
3,23,09/10/2021,575,2-1,2,1,60:00
4,23,09/12/2021,400,3-2 (2),3,2,73:21
...,...,...,...,...,...,...,...
12,813,10/20/2021,590,4-2,4,2,60:00
13,813,10/23/2021,540,1-3,1,3,60:00
14,813,10/24/2021,283,6-1,6,1,60:00
15,813,10/30/2021,172,4-0,4,0,60:00


In [9]:
df.groupby('team_id')['gf'].sum().sort_values()

team_id
47     13
590    16
711    17
172    17
410    17
       ..
688    66
312    66
606    67
509    79
355    80
Name: gf, Length: 79, dtype: int64

In [10]:
df.merge(teams, on='team_id', how='inner').to_feather("data/matches_2022.feather", compression="zstd")