In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [None]:
seasons = ['f24', 's24', 'f23', 's23', 'f22','s22']

# Load races scraped on an earlier run so their regattas are not fetched again.
# Only "no usable cache file" is treated as a fresh start; any other failure
# (permissions, corrupt CSV, Ctrl-C) is surfaced rather than silently swallowed.
try:
    df_races = pd.read_csv("racesf24s24f23.csv")
except (FileNotFoundError, pd.errors.EmptyDataError):
    df_races = pd.DataFrame(columns=["Score", "Div", "Sailor", "Position", "Partner", "Venue", "Regatta", "Teams"])

# Regatta keys ("<season>/<href>") already present in the cache; computed once
# instead of once per link.
knownRegattas = set(df_races['Regatta'].unique())

# Scoring formats that are collected; links with any other scoring text are ignored.
wantedScoring = ("3 Divisions", "2 Divisions", "Combined")

# regattas maps "<season>/<href>" -> {"link": same key, "scoring": scoring text}
regattas = {}
for season in seasons:
    url = f"https://scores.collegesailing.org/{season}/"
    # A timeout keeps a stalled connection from hanging the cell forever, and
    # raise_for_status reports a bad response directly instead of as a
    # confusing "NoneType has no attribute 'find'" further down.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    listSoup = BeautifulSoup(page.content, 'html.parser')

    tbody = listSoup.find('table', class_="season-summary").find('tbody')

    for link in tbody.find_all("a", href=True):
        regattaKey = season + "/" + link['href']
        if regattaKey in knownRegattas:
            continue
        # The scoring text is read from the third sibling after the link's parent element.
        scoring = link.parent.next_sibling.next_sibling.next_sibling.text
        if scoring in wantedScoring:
            regattas[regattaKey] = {"link": regattaKey, "scoring": scoring}

In [16]:
# regattaSoups maps each regatta key to its parsed "full-scores" and "sailors"
# pages plus the scoring text recorded for it in `regattas`.
regattaSoups = {}
totalRegattas = len(regattas)

# Iterate key and value together instead of rebuilding list(regattas.keys())
# and list(regattas.values()) on every pass.
for position, (regattaKey, regatta) in enumerate(regattas.items(), start=1):
    print(f"({position}/{totalRegattas}) getting soup for {regattaKey}")
    baseUrl = f"https://scores.collegesailing.org/{regatta['link']}"

    # full scores (timeout so one stalled connection cannot hang the whole scrape)
    page = requests.get(f"{baseUrl}/full-scores/", timeout=30)
    fullScores = BeautifulSoup(page.content, 'html.parser')

    # sailors
    page = requests.get(f"{baseUrl}/sailors/", timeout=30)
    sailors = BeautifulSoup(page.content, 'html.parser')

    regattaSoups[regattaKey] = {"fullScores": fullScores, "sailors": sailors, "scoring": regatta['scoring']}

(1/694) getting soup for f24/fall-pacific-coast
(2/694) getting soup for f24/fall-pcc-jv
(3/694) getting soup for f24/2024-triangle-tango-intercollegiate
(4/694) getting soup for f24/frosh-soph
(5/694) getting soup for f24/mcsa-open-fall
(6/694) getting soup for f24/north-north
(7/694) getting soup for f24/kiara-broudy
(8/694) getting soup for f24/saisa-fall-women
(9/694) getting soup for f24/women-urn
(10/694) getting soup for f24/maisa-women-fall-dinghy
(11/694) getting soup for f24/war
(12/694) getting soup for f24/neisa-fall-tournament
(13/694) getting soup for f24/schell-neisa-open
(14/694) getting soup for f24/pccsc-women-fall
(15/694) getting soup for f24/saisa-fall-open
(16/694) getting soup for f24/seisa-women-fall
(17/694) getting soup for f24/nickerson
(18/694) getting soup for f24/cedarfest
(19/694) getting soup for f24/kingfisher
(20/694) getting soup for f24/top-fall-invite
(21/694) getting soup for f24/terrier-invite
(22/694) getting soup for f24/open-atlantic-coast-fina

In [17]:
def getRaceNums(oldNums, scoresLen):
    """Expand parsed race tokens into a flat list of race numbers.

    Parameters
    ----------
    oldNums : list of list of str
        One entry per token, each already split on "-": ``['3']`` is the single
        race 3 and ``['1', '4']`` is the inclusive range 1 through 4. The exact
        value ``[['']]`` (a single blank token) stands for every race.
    scoresLen : int
        Number of races to return for the ``[['']]`` case.

    Returns
    -------
    list of int
        Race numbers in the order the tokens were given. Tokens with a blank
        bound (for example the empty piece left by a trailing comma) are
        skipped instead of raising ``ValueError``.
    """
    if oldNums == [['']]:
        return list(range(1, scoresLen + 1))

    newNums = []
    for num in oldNums:
        # Only the first two pieces are meaningful (start and optional end).
        bounds = [str(part).strip() for part in num[:2]]
        if not bounds or not all(bounds):
            continue
        first = int(bounds[0])
        last = int(bounds[1]) if len(bounds) > 1 else first
        newNums.extend(range(first, last + 1))
    return newNums
def makeRaceSeries(score, team, raceNum, division, name, position, partner, venue, regatta, teams, date, teamlink):
    """Build the record for one sailor in one race as a pandas Series.

    Parameters
    ----------
    score : int or any
        Finishing score. Any value that is not an ``int`` is recorded as
        ``len(teams) + 1``.
    team, teamlink : str
        Stored under "Team" and "Teamlink".
    raceNum : int
        Race number; combined with `regatta` and `division` into "raceID".
    division : str
        Stored under "Div" and appended to "raceID".
    name, position, partner : str
        Stored under "Sailor", "Position" and "Partner".
    venue, regatta, date : str
        Stored under "Venue", "Regatta" and "Date".
    teams : list
        Stored as-is under "Teams"; its length sets the fallback score.

    Returns
    -------
    pandas.Series
        Object-dtype Series with the labels raceID, Score, Date, Div, Sailor,
        Position, Partner, Team, Teamlink, Venue, Regatta, Teams (in that order).
    """
    finalScore = score if isinstance(score, int) else len(teams) + 1

    # Built in one step from a dict: avoids the dtype-less empty Series (which
    # warns on older pandas) and the reallocation on every added label.
    return pd.Series(
        {
            "raceID": regatta + "/" + str(raceNum) + division,
            "Score": finalScore,
            "Date": date,
            "Div": division,
            "Sailor": name,
            "Position": position,
            "Partner": partner,
            "Team": team,
            "Teamlink": teamlink,
            "Venue": venue,
            "Regatta": regatta,
            "Teams": teams,
        },
        dtype=object,
    )

In [18]:
def _divisionScores(scoreRow, raceCount):
    """Return the integer race scores from one division row of the full-scores table.

    Reads up to `raceCount` cells starting at position 4 of `scoreRow.contents`
    and keeps only the cells whose text consists of digits.
    """
    # NOTE(review): non-numeric cells are dropped rather than kept as
    # placeholders, so such a cell in the middle of a row would shift every
    # later score to an earlier race number. Confirm against the site markup
    # whether that can happen (makeRaceSeries has a fallback for non-int scores).
    return [int(cell.text) for cell in scoreRow.contents[4:4 + raceCount] if cell.text.isdigit()]


def _sailedRaces(nameCell, scoredRaces):
    """Parse the race list held in the cell that follows a sailor's name cell."""
    tokens = nameCell.next_sibling.text.split(",")
    return getRaceNums([token.split("-", 1) for token in tokens], scoredRaces)


def _sailorRowRaces(row, teamScores, teamHome, teamLink, teamHomes, host, regatta, date):
    """Build one race record per race sailed by the skipper and/or crew listed in `row`.

    Returns a list of Series produced by makeRaceSeries (possibly empty).
    """
    # The division cell may only be present on an earlier row of the same
    # group, so walk backwards until one is found.
    divisionCells = row.find_all('td', class_="division-cell")
    curRow = row
    while not divisionCells:
        curRow = curRow.previous_sibling
        divisionCells = curRow.find_all('td', class_="division-cell")
    division = divisionCells[0].text
    scores = teamScores[division]

    # Skipper and crew name cells sit 4th and 2nd from the end of the row; the
    # text before " '" is kept as the name, and "No show" means nobody.
    skipper = row.contents[len(row.contents) - 4]
    skipperName = skipper.text.split(" '", 1)[0]
    if skipperName == "No show":
        skipperName = ""

    crew = row.contents[len(row.contents) - 2]
    crewName = crew.text.split(" '", 1)[0]
    if crewName == "No show":
        crewName = ""

    def record(score, raceNum, name, position, partner):
        # Every record gets its own copy of the team list.
        return makeRaceSeries(score, teamHome, raceNum, division, name, position, partner,
                              host, regatta, list(teamHomes), date, teamLink)

    records = []

    if skipperName != "" and crewName != "":
        skipperRaceNums = _sailedRaces(skipper, len(scores))
        if len(skipperRaceNums) == 0:
            print(f"skipper {skipperName} sailed no races? skipping")
            return records

        crewRaceNums = _sailedRaces(crew, len(scores))
        if len(crewRaceNums) == 0:
            print(f"crew {crewName} sailed no races? skipping")
            return records

        for raceNum, score in enumerate(scores, start=1):
            # The partner is looked up by race number directly, so the order in
            # which race numbers were listed does not matter.
            if raceNum in skipperRaceNums:
                partner = crewName if raceNum in crewRaceNums else "Unknown"
                records.append(record(score, raceNum, skipperName, "Skipper", partner))
            if raceNum in crewRaceNums:
                partner = skipperName if raceNum in skipperRaceNums else "Unknown"
                records.append(record(score, raceNum, crewName, "Crew", partner))

    elif skipperName != "":
        raceNums = _sailedRaces(skipper, len(scores))

        # TODO: use the previous person as the partner when it is unknown.
        # The previous implementation no longer works because it relied on
        # changing a person's races; the same approach would now have to edit
        # records that were already created.
        for raceNum, score in enumerate(scores, start=1):
            if raceNum in raceNums:
                records.append(record(score, raceNum, skipperName, "Skipper", "Unknown"))

    elif crewName != "":
        raceNums = _sailedRaces(crew, len(scores))

        # TODO: still needs the previous-person lookup (see the skipper-only case).
        for raceNum, score in enumerate(scores, start=1):
            if raceNum in raceNums:
                records.append(record(score, raceNum, crewName, "Crew", "Unknown"))

    return records


# Records are collected here and appended to df_races once at the end, instead
# of re-copying the whole frame with pd.concat for every single race row.
newRows = []
totalRegattas = len(regattaSoups)

for regattaNum, (regatta, soups) in enumerate(regattaSoups.items(), start=1):
    print(f"({regattaNum}/{totalRegattas}) analyzing {regatta}")
    fullScores = soups['fullScores']
    sailors = soups['sailors']
    scoring = soups['scoring']

    resultsTables = fullScores.find_all('table', class_="results")
    if len(resultsTables) == 0:
        print(f"no scores entered for {regatta}, skipping")
        continue

    resultsTable = resultsTables[0]
    scoreData = resultsTable.contents[1].contents
    header = resultsTable.find_all('th', class_="right")
    # The second-to-last right-aligned header cell is read as the race count.
    raceCount = int(header[len(header) - 2].text)

    # Infer the number of divisions from the classes of the first rows.
    numDivisions = 1
    if scoreData[1]['class'][0] == 'divB' and scoreData[2]['class'][0] == 'totalrow':
        numDivisions = 2
    if scoreData[2]['class'][0] == 'divC':
        numDivisions = 3

    # Each team occupies one row per division plus one more row.
    rowsPerTeam = numDivisions + 1
    teamCount = len(scoreData) // rowsPerTeam

    teamHomes = [scoreData[teamIdx * rowsPerTeam].find('a').text for teamIdx in range(teamCount)]

    host = fullScores.find("span", itemprop='location').text
    date = fullScores.find("time").attrs['datetime']
    date = date[:10]

    if scoring == "Combined":
        teamHomes = teamHomes * numDivisions

    # Loop through teams. Zero-based so that every team block is visited,
    # including the last one.
    for teamIdx in range(teamCount):
        base = teamIdx * rowsPerTeam
        teamHome = scoreData[base].find('a').text
        teamName = scoreData[base + 1].contents[2].text
        teamLink = scoreData[base].find('a')['href']

        teamScores = {'A': _divisionScores(scoreData[base], raceCount), 'B': [], 'C': []}
        if numDivisions > 1:
            teamScores["B"] = _divisionScores(scoreData[base + 1], raceCount)
        if numDivisions > 2:
            teamScores["C"] = _divisionScores(scoreData[base + 2], raceCount)

        teamNameEls = [cell for cell in sailors.find_all('td', class_="teamname") if cell.text == teamName]
        if len(teamNameEls) == 0:
            print("team name entered wrong. Skipping team", teamName)
            continue

        # Walk the team's rows in the sailors table: always take the first row,
        # then continue until the next "topborder" row, a "reserves-row", or
        # the end of the table (the final row is processed too).
        row = teamNameEls[0].parent
        isFirstRow = True
        while row is not None and (isFirstRow or row['class'][0] not in ("topborder", "reserves-row")):
            newRows.extend(_sailorRowRaces(row, teamScores, teamHome, teamLink, teamHomes, host, regatta, date))
            row = row.next_sibling
            isFirstRow = False

if newRows:
    df_races = pd.concat([df_races, pd.DataFrame(newRows)], ignore_index=True)

(1/694) analyzing f24/fall-pacific-coast
(2/694) analyzing f24/fall-pcc-jv
skipper Maximilian Miesen sailed no races? skipping
(3/694) analyzing f24/2024-triangle-tango-intercollegiate
(4/694) analyzing f24/frosh-soph
(5/694) analyzing f24/mcsa-open-fall
(6/694) analyzing f24/north-north
(7/694) analyzing f24/kiara-broudy
(8/694) analyzing f24/saisa-fall-women
(9/694) analyzing f24/women-urn
(10/694) analyzing f24/maisa-women-fall-dinghy
(11/694) analyzing f24/war
(12/694) analyzing f24/neisa-fall-tournament
(13/694) analyzing f24/schell-neisa-open
(14/694) analyzing f24/pccsc-women-fall
(15/694) analyzing f24/saisa-fall-open
(16/694) analyzing f24/seisa-women-fall
(17/694) analyzing f24/nickerson
(18/694) analyzing f24/cedarfest
(19/694) analyzing f24/kingfisher
(20/694) analyzing f24/top-fall-invite
(21/694) analyzing f24/terrier-invite
(22/694) analyzing f24/open-atlantic-coast-final
(23/694) analyzing f24/open-atlantic-coast-tournament
(24/694) analyzing f24/yale-women-interconfere

In [38]:
# Number of distinct values in the Regatta column (a missing value, if any, counts once).
df_races['Regatta'].nunique(dropna=False)

367

In [19]:
# Write every collected race row to disk, without the index column.
# NOTE(review): the cache read at the top of the notebook is "racesf24s24f23.csv",
# not "races.csv", so this output is not picked up on the next run unless the
# file is renamed - confirm whether that is intentional.
df_races.to_csv("races.csv", index=False)