In [1]:
import pandas as pd
import numpy as np
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

cred = credentials.Certificate("thecrowsnestapp-creds.json")
firebase_admin.initialize_app(cred)

db = firestore.client()

In [2]:
from openskill.models import PlackettLuce, BradleyTerryFull
model = PlackettLuce(beta=25.0/120.0)

In [None]:
%load_ext scalene

In [3]:
targetElo = 1000
baseElo = 500
targetSeason = 'f24'
targetTRSeason = 's24'
# baseSigma = baseElo // 3
# offset = baseElo * 2

In [4]:
class Sailor:
    def __init__(self, name,key, year, links, teams, seasons=[], skipperRank=0, crewRank=0,skipperRating=baseElo, crewRating=baseElo, races=[], gender=""):
        self.name = name
        self.key = key
        self.gender = gender
        self.year = year
        self.links = links
        self.teams = teams
        
        self.skipperRank = skipperRank
        self.crewRank = crewRank
        self.womenSkipperRank = 0
        self.womenCrewRank = 0
        
        self.skipperRankTR = 0
        self.crewRankTR = 0
        self.womenSkipperRankTR = 0
        self.womenCrewRankTR = 0
        
        self.seasons = seasons
        self.races = []
        self.rivals = {}
        # self.sr = model.rating(skipperRating, skipperRating / 3, name)
        # self.cr = model.rating(crewRating, crewRating / 3, name)
        
        # fleet racing
        self.wsr = model.rating(name=name)
        self.wcr = model.rating(name=name)
        self.sr = model.rating(name=name)
        self.cr = model.rating(name=name)
        # Team racing 
        self.wtsr = model.rating(name=name)
        self.wtcr = model.rating(name=name)
        self.tsr = model.rating(name=name)
        self.tcr = model.rating(name=name)
        self.avgSkipperRatio = 0
        self.avgCrewRatio = 0
        
    def __repr__(self):
        return f"{self.name}: {self.teams}, {str(self.sr.ordinal())} {str(self.cr.ordinal())} {self.seasons} {len(self.races)}"

In [5]:
def adjust_race_id(row):
    if row['Scoring'] == 'Combined':
        return row['raceID'][:-1]  # Remove the last character (A/B) for combined scoring
    return row['raceID']

In [6]:
df_races = pd.read_json("races_new_test2.json")
# converters={"Teams": lambda x: [y.strip().split("'")[1] for y in x.strip("[]").split(", ")]}
df_races['raceNum'] = df_races['raceID'].apply(lambda id: int(id.split("/")[2][:-1]))  # Numeric part
df_races['raceDiv'] = df_races['raceID'].apply(lambda id: id.split("/")[2][-1])  # Division part (e.g., 'A', 'B')
df_races['adjusted_raceID'] = df_races.apply(adjust_race_id, axis=1) # to make combined division combined
df_races['Link'] = df_races['Link'].fillna('Unknown') # fill empty links
# df_races['key'] = np.where(df_races['Link'] == 'Unknown', df_races['Sailor'], df_races['Link'])
df_races['key'] = df_races.apply(
    lambda row: row['Sailor'] + "-" + row['Team'] if row['Link'] == 'Unknown' else row['Link'],
    axis=1
)
df_races['partnerKey'] = df_races.apply(
    lambda row: row['Partner'] + "-" + row['Team'] if row['PartnerLink'] == 'Unknown' else row['PartnerLink'],
    axis=1
)
# df_races = df_races[df_races['Regatta'] == 'f24/seisa-women-fall'] # example combined regatta

# to exclude f24
# df_races = df_races.loc[df_races['raceID'].apply(lambda id: id.split("/")[0] != 'f24')]
# df_races = df_races.loc[df_races['raceID'].apply(lambda id: id.split("/")[1] != 'east-open-national-semi-final')]
# df_races = df_races.loc[df_races['raceID'].apply(lambda id: id.split("/")[1] != 'open-dinghy-national')]

df_races_full = df_races.sort_values(['Date', 'raceNum', 'raceDiv']).reset_index(drop=True)

df_races_skipper = df_races_full.loc[df_races_full['Position'].str.contains('Skipper')].sort_values(['Date', 'raceNum']).reset_index(drop=True)
df_races_crew = df_races_full.loc[df_races_full['Position'].str.contains('Crew')].sort_values(['Date', 'raceNum']).reset_index(drop=True) 

df_sailor_info = pd.read_json("sailor_data2.json")

df_races_tr = pd.read_json("allseasonsteamraces.json")
df_sailorTRinfo = pd.read_json('trSailorInfoAll.json')
df_races_tr = df_races_tr.sort_values(['date', 'raceNum']).reset_index(drop=True)

# df_sailor_info
# df_races

In [7]:
teamRegions = {'Hawaii': 'PCCSC', 'Brown': 'NEISA', 'Southern Cal': 'PCCSC', 'Salve Regina': 'NEISA', 'UC Santa Barbara': 'PCCSC', 'Cal Poly': 'PCCSC', 'Washington': 'NWICSA', 'Channel Islands': 'PCCSC', 'UC San Diego': 'PCCSC', 'British Columbia': 'NWICSA', 'UC Los Angeles': 'PCCSC', 'Westmont College': 'PCCSC', 'Arizona State': 'PCCSC', 'Texas A&M Galveston': 'SEISA', 'Texas A&M': 'SEISA', 'Tulane': 'SEISA', 'Rice': 'SEISA', 'Texas': 'SEISA', 'Oklahoma State': 'SEISA', 'Texas A&M C. Christ': 'SEISA', 'Central Oklahoma': 'SEISA', 'Notre Dame': 'MCSA', 'Jacksonville': 'SAISA', 'Florida': 'SAISA', 'Tennessee': 'SAISA', 'Rollins': 'SAISA', 'North Carolina State': 'SAISA', 'Georgia Tech': 'SAISA', 'Auburn': 'SAISA', 'Charleston': 'SAISA', 'South Florida': 'SAISA', 'Old Dominion': 'MAISA', 'Eckerd': 'SAISA', 'Florida State': 'SAISA', 'U. Miami': 'SAISA', 'UW Milwaukee': 'MCSA', 'Stony Brook': 'MAISA', 'Duke': 'SAISA', 'Clemson': 'SAISA', 'U South Carolina': 'SAISA', 'UNC Wilmington': 'SAISA', 'Georgia': 'SAISA', 'Berkeley': 'PCCSC', 'CSU Long Beach': 'PCCSC', 'Monterey Bay': 'PCCSC', 'UC Irvine': 'PCCSC', 'UC Davis': 'PCCSC', 'Rhode Island': 'NEISA', 'Georgetown': 'MAISA', 'Dartmouth': 'NEISA', 'MIT': 'NEISA', 'George Washington': 'MAISA', 'Navy': 'MAISA', 'Fordham': 'MAISA', 'Northeastern': 'NEISA', 'Christopher Newport': 'MAISA', 'Victoria': 'NWICSA', 'Boston University': 'NEISA', 'Miami University': 'MCSA', 'Hampton': 'MAISA', 'Virginia': 'MAISA', 'Stevens': 'MAISA', 'Columbia': 'MAISA', 'NY Maritime': 'MAISA', 'Kings Point': 'MAISA', "St. Mary's": 'MAISA', 'Maryland': 'MAISA', 'Virginia Tech': 'MAISA', 'Drexel': 'MAISA', 'Maryland/Baltimore': 'MAISA', 'Buffalo': 'MAISA', 'UC Santa Cruz': 'PCCSC', 'Santa Clara': 'PCCSC', 'Wisconsin': 'MCSA', 'Michigan': 'MCSA', 'Washington College': 'MAISA', 'Minnesota': 'MCSA', 'Yale': 'NEISA', 'Hobart & William': 'MAISA', 'Vermont': 'NEISA', 'Connecticut College': 'NEISA', 'Harvard': 'NEISA', 'Roger Williams': 'NEISA', 'Syracuse': 'MAISA', 'Tufts': 'NEISA', 'Middlebury': 'NEISA', 'New College': 'SAISA', 'William and Mary': 'MAISA', 'Gannon': 'MAISA', 'Boston College': 'NEISA', 'Stanford': 'PCCSC', 'Bowdoin': 'NEISA', 'Lewis & Clark': 'NWICSA', 'Monmouth': 'MAISA', 'American': 'MAISA', 'Michigan State': 'MCSA', 'Hope': 'MCSA', 'Western Michigan': 'MCSA', 'Toledo': 'MCSA', 'Ohio State': 'MCSA', 'Mass Maritime': 'NEISA', 'Coast Guard': 'NEISA', 'Bates': 'NEISA', 'Fairfield': 'NEISA', 'Sacred Heart': 'NEISA', 'Wentworth Institute': 'NEISA', 'Providence': 'NEISA', 'Iowa State': 'MCSA', 'Iowa': 'MCSA', 'Indiana': 'MCSA', 'Davidson': 'SAISA', 'Oregon State': 'NWICSA', 'Western Washington': 'NWICSA', 'U. Rochester': 'MAISA', 'Army': 'MAISA', 'New Hampshire': 'NEISA', 'U. Connecticut': 'NEISA', 'UMass Dartmouth': 'NEISA', 'Wesleyan': 'NEISA', 'U. Mass/ Amherst': 'NEISA', 'U New England': 'NEISA', 'Denison': 'MCSA', 'Northern Michigan': 'MCSA', 'Ohio': 'MCSA', 'Pennsylvania': 'MAISA', 'Villanova': 'MAISA', 'Maine Maritime': 'NEISA', 'Michigan Tech': 'MCSA', 'Illinois': 'MCSA', 'Chicago': 'MCSA', 'Northwestern': 'MCSA', 'Grand Valley State': 'MCSA', 'Washington U': 'MCSA', 'Marquette': 'MCSA', 'Lake Forest': 'MCSA', 'Cornell': 'MAISA', 'Oregon': 'NWICSA', 'Portland State': 'NWICSA', 'Princeton': 'MAISA', "Queen's": 'MAISA', 'Penn State': 'MAISA', 'Ocean County': 'MAISA', 'Delaware': 'MAISA', 'Rutgers': 'MAISA', 'Worcester Polytech': 'NEISA', 'Emmanuel College': 'NEISA', "St. John's": 'MAISA', 'U Pittsburgh': 'MAISA', 'Webb Institute': 'MAISA', 'McGill': 'NEISA', 'Citadel': 'SAISA', 'Colgate': 'MAISA', 'Catholic U America': 'MAISA', 'Loyola College': 'MAISA', 'Ottawa': 'MAISA', 'Royal Military': 'MAISA', 'Dalhousie': 'NEISA', 'U Toronto': 'MAISA', 'New Orleans': 'SEISA', 'Kansas': 'SEISA', 'Bentley': 'NEISA', 'Brandeis': 'NEISA', 'Cal Maritime': 'PCCSC', 'San Diego State': 'PCCSC', 'Loyola': 'SEISA', 'North Texas': 'SEISA', 'Vanderbilt': 'SAISA', 'Purdue': 'MCSA', 'North Carolina': 'SAISA', 'Hillsdale': 'MCSA', 'Amherst': 'NEISA', 'Williams': 'NEISA', 'Hamilton': 'MAISA', 'Rochester': 'MAISA', 'Wellesley': 'NEISA', 'Hosei Univerisity': 'GUEST', 'Colorado': 'SEISA', 'John Carroll': 'MCSA', 'U.  Mass/ Boston': 'NEISA', 'Mercyhurst': 'MAISA', 'Penn State Behrend': 'MAISA', 'Indiana U Pennsylvan': 'MAISA', 'U Nebraska': 'MCSA', 'U Maine': 'NEISA', 'Texas Christian': 'SEISA', 'Embry-Riddle': 'SAISA', 'Palm Beach Atlantic': 'SAISA', 'U of Central Florida': 'SAISA', 'Baldwin-Wallace': 'MCSA', "Saint Mary's College": 'MCSA', 'Olin': 'NEISA', 'Baylor': 'SEISA', 'Texas Tech': 'SEISA', 'Wake Forest': 'SAISA', 'Georgia Southern': 'SAISA', 'East Carolina': 'SAISA', 'Florida Tech': 'SAISA', 'Saint Thomas': 'MCSA', 'Cincinnati': 'MCSA', 'Florida Gulf Coast': 'SAISA', 'Saginaw Valley': 'MCSA', 'Coastal Georgia': 'SAISA', 'Cleveland State': 'MCSA', 'Sewanee': 'SAISA', 'Case Western': 'MCSA', 'Oklahoma': 'SEISA', 'Gonzaga': 'PCCSC'}


In [8]:
merges = [{'first': 'carter-anderson', 'second': 'carter-anderson-2027'}, {'first': 'elliott-bates', 'second':'elliott-bates-2021'}, {'first': 'ian-hopkins-guerra','second': 'ian-hopkins-guerra-2026'}, {'first': 'connor-nelson', 'second':'connor-nelson-2024'}]

In [9]:
# Function to add a sailor to the dictionary
def add_sailor(group,names_group, links_group, seasons_group,teams_group, years_group, people):
    """Summary

    Args:
        group (pandas group): The grouped list of sailor keys and teams
        names_group (pandas group): The list of names grouped by sailor key
        links_group (pandas group): The list of links grouped by sailor key
        seasons_group (pandas group): The list of seasons grouped by sailor key
        teams_group (pandas group): The list of teams grouped by sailor key
        years_group (pandas group): The list of years grouped by sailor key
        people (dict): The people dictionary to be added to
    """

    for key, teams in group.items():
        if key not in people.keys():
            # If no teams are associated, set "Unknown"
            teams = teams if len(teams) > 0 else ["Unknown"]

            # Retrieve the precomputed values
            name = names_group.get(key,[])[0]
            link = links_group.get(key,[])
            seasons = seasons_group.get(key,[])
            teams = teams_group.get(key,[])
            year = years_group.get(key, [])[0]
            gender = ""
            
            if key in list(df_sailor_info['link']):
                data = df_sailor_info.loc[df_sailor_info['link'] == key]
                gender = data['gender'].iat[0]
                year = data['year'].iat[0]
            
            # Add the sailor to the people dictionary
            people[key] = Sailor(name, key, year, list([link]), teams, {'skipper': seasons.get('Skipper', []), 'crew': seasons.get('Crew', [])}, gender=gender)
            
def setupPeople():
    """Generates a dictionary with all of the sailors based on the df_races_full dataframe

    Returns:
        dict: The filled out dictionary of people
    """
    
    people = {}
    
    try:
        df_s = pd.read_json("sailorsasf.json")
    except:
        df_s = pd.DataFrame(columns=['Sailor'])

    # create sailors from file (NOT WORKING)
    for sailor in list(df_s['Sailor'].unique()):
        # print(sailor)
        positions = df_s.loc[df_s['Sailor'] == sailor, 'Pos']
        for pos in positions:
            teams = df_s.loc[(df_s['Sailor'] == sailor)& (df_s['Pos'] == pos), 'Teams'].iat[0]
            seasons = df_s.loc[(df_s['Sailor'] == sailor) & (df_s['Pos'] == pos), 'Seasons'].iat[0]
            year = df_s.loc[(df_s['Sailor'] == sailor) & (df_s['Pos'] == pos), 'GradYear'].iat[0]
            link = df_s.loc[(df_s['Sailor'] == sailor) & (df_s['Pos'] == pos), 'Link'].iat[0]
            rating = df_s.loc[(df_s['Sailor'] == sailor) & (df_s['Pos'] == pos), 'Elo'].iat[0]
            rank = df_s.loc[(df_s['Sailor'] == sailor) & (df_s['Pos'] == pos), 'Rank'].iat[0]
            races = df_s.loc[(df_s['Sailor'] == sailor) & (df_s['Pos'] == pos), 'Races'].iat[0]
            people[link] = Sailor(sailor, year, link, teams, pos, seasons, rank, rating, races)
    
    # Do merges if necessary (merging two techscore links)
    # We must merge here before the calculation is done, because each new rating will need the accurate history
    for merge in merges:
        if merge['second'] in people.keys():
            people[merge['first']].links.append(people[merge['second']])
            del people[merge['second']]
        df_races_full['key'] = df_races_full['key'].replace(merge['second'], merge['first'])
        df_races_skipper['key'] = df_races_skipper['key'].replace(merge['second'], merge['first'])
        df_races_crew['key'] = df_races_crew['key'].replace(merge['second'], merge['first'])

    # Pre-group the data for skippers and crews
    grouped = df_races_full.groupby(['key'])['Team'].unique()
    
    # Precompute seasons for skippers and crew
    names = (
        df_races_full.assign(Season=df_races_full['raceID'].str.split('/').str[0])
        .groupby('key')['Sailor']
        .unique()
    )
    links = (
        df_races_full.assign(Season=df_races_full['raceID'].str.split('/').str[0])
        .groupby('key')['Link']
        .unique()
    )
    seasons = (
        df_races_full.assign(Season=df_races_full['raceID'].str.split('/').str[0])
        .groupby(['key', 'Position'])['Season']
        .unique()
    )
    teams = (
        df_races_full.assign(Season=df_races_full['raceID'].str.split('/').str[0])
        .groupby('key')['Team']
        .unique()
    )

    years = (
        df_races_full.assign(Season=df_races_full['raceID'].str.split('/').str[0])
        .groupby('key')['GradYear']
        .unique()
    )
    
    # Add all sailors to the people dictionary
    add_sailor(grouped, names, links, seasons, teams, years, people)
    
    return people

In [53]:
# %%scalene # for profiling the code

# Set up people dictionary
people = setupPeople()
# Pre calculate the number of races to rate
leng = len(df_races['raceID'].unique()) * 2

# List of residuals (errors)
residuals = []

# Current race count for print statement
i = 0

# Creates pandas group object
grouped = df_races_full.groupby(['Date', 'Regatta', 'adjusted_raceID'], sort=False)

# Iterate through each race
for (date, regatta, race), row in grouped:
    # Calculate for each position
    for type in ['Skipper', 'Crew']:
        
        # Print status every 1000 to help with performance and output length
        if i % 1000 == 0:
            print(f"Currently analyzing race {i}/{leng} Regatta:{regatta}, Date:{date}")
        i += 1
        
        # Filter by current position 
        scores = row[row['Position'] == type] 
        # Grab lists for each datapoint
        keys = scores['key'] # the sailor keys
        names = scores['Sailor'] # the sailor names
        scoreVals = list(scores['Score']) # the score values
        
        # check for invalid race conditions
        if len(keys) < 2: # less than two sailors
            continue
        if np.isnan(scoreVals[0]): # B division did not complete the set
            continue
        
        # Grab people objects 
        racers = [people[key] if key != 'Unknown'
                and key is not None 
                else people[name] for key,name in zip(keys,names)]
        
        # Check for womens regatta
        partnerKeys = row[row['Position'] != type]['key']
        partnerNames = row[row['Position'] != type]['Sailor']
        partners = [people[key] if key != 'Unknown'
                and key is not None 
                else people[name] for key,name in zip(partnerKeys,partnerNames)]
        
        genders = [p.gender for p in racers + partners]
        womenCount = sum([1 if g == "F" else 0 for g in genders])
        womens = 'M' not in genders and womenCount >= 4
        
        # Seperate out the openskill rating objects for use in the model
        if not womens:
            ratings = [[r.sr] if type == 'Skipper' else [r.cr] for r in racers]
        else:
            ratings = [[r.wsr] if type == 'Skipper' else [r.wcr] for r in racers]

        # grab starting rating values for change calculation later
        startingRating = [r[0].ordinal(target=targetElo, alpha=200 / model.sigma) for r in ratings]
        
        # Calculate regatta average
        regattaAvg = sum(startingRating) / len(racers)
        
        # Rate using the model
        ratings = model.rate(ratings, scoreVals)
        
        # Make predictions
        predictions = model.predict_rank(ratings)
        
        # calculate error and add to list (residuals)
        for pred, score, racer in zip(predictions, scoreVals, racers):
            residuals.append(score - pred[0])

        # Update racers' ratings
        for racer, new_rating in zip(racers, ratings):
            if type == 'Skipper':
                if womens:
                    racer.wsr = new_rating[0]
                else: 
                    racer.sr = new_rating[0]
            else:
                if womens:
                    racer.wcr = new_rating[0]
                else:
                    racer.cr = new_rating[0]

        # Pre-calculate lists for sailor's race values
        if womens:
            changes = [(racers[i].wsr.ordinal(target=targetElo, alpha=200 / model.sigma) if type == 'Skipper' else racers[i].wcr.ordinal(target=targetElo, alpha=200 / model.sigma)) - startingRating[i] for i in range(len(racers))]
        else:
            changes = [(racers[i].sr.ordinal(target=targetElo, alpha=200 / model.sigma) if type == 'Skipper' else racers[i].cr.ordinal(target=targetElo, alpha=200 / model.sigma)) - startingRating[i] for i in range(len(racers))]
        
        # Common values for each sailor
        venue = scores['Venue'].iat[0]
        scoring = scores['Scoring'].iat[0]
        actualID = scores['raceID'].iat[0]
        
        # Make list of regions and combine PCCSC and NWICSA (those shouldnt count as cross regional for rating purposes)
        regions = [teamRegions[p.teams[-1]] if p.teams[-1] in teamRegions.keys() else None for p in racers]
        regions = ['PCCSC' if reg == 'NWICSA' else reg for reg in regions]
        
        # Check if race has any out of region sailors
        isCross = 1 if len(set(regions)) > 1 else 0
        
        # Only calculate number of cross regional sailors if it is the current season
        doCr = race.split("/")[0] == targetSeason and isCross == 1
        
        # Loop through each sailor and the associated values
        for sailor, score, pred, change, partner in zip(racers, scoreVals, predictions, changes, partners):
            outLinks = 0
            
            if(isCross == 1):
                # Calculate the number of sailors that are not in the sailor's region
                outLinks = sum(1 for reg in regions # adds 1 each time that a region in the regatta ... 
                               if reg is not None # Double check that the region is not none # first double check that the sailor's team is in the list of regions
                               and sailor.teams[-1] in teamRegions.keys()
                               and ('PCCSC' if reg == 'NWICSA' else reg) != ('PCCSC' if teamRegions[sailor.teams[-1]] == 'NWICSA' else teamRegions[sailor.teams[-1]])) # The sailor's region is not the same as the opponent)
                # Note: We don't need to filter out the sailor themselves from this list, because they will have the same region as themseleves so it will not be counted.
            
            # add race to each sailor's score
            sailor.races.append({
                'score': int(score), # Need to rewrite to include DNF and such (correctly evaluating score but its hard to tell )
                'pos': type,
                'predicted': pred[0],
                'ratio': 1 - ((int(score) - 1) / (len(racers) - 1)), # Calculate ratio here
                'change': change,
                'regAvg': regattaAvg,
                'cross': isCross,
                'outLinks': outLinks,
                'skipperRating': sailor.sr.ordinal(target=targetElo, alpha=200 / model.sigma), # add offset to prevent negative ratings
                'crewRating': sailor.cr.ordinal(target=targetElo, alpha=200 / model.sigma), # add offset to prevent negative ratings
                'womenSkipperRating': sailor.wsr.ordinal(target=targetElo, alpha=200 / model.sigma), # add offset to prevent negative ratings
                'womenCrewRating': sailor.wcr.ordinal(target=targetElo, alpha=200 / model.sigma), # add offset to prevent negative ratings
                'womens': womens,
                'date' :date,
                'partner': {'name': partner.name, 'key': partner.key},
                'venue': venue,
                'raceID': actualID,
                'type': 'fleet',
                'scoring': scoring
            })

# Calculate statiscs about the accuracy of the model. (Lower is better)
me = np.array(residuals).mean()
mse = (np.array(residuals) ** 2).mean()
print(me, mse)

Currently analyzing race 0/65162 Regatta:s16/peter-wenner-rainbow-invite, Date:2016-01-16 00:00:00
Currently analyzing race 1000/65162 Regatta:s16/vietor, Date:2016-03-19 00:00:00
Currently analyzing race 2000/65162 Regatta:s16/oberg, Date:2016-04-16 00:00:00
Currently analyzing race 3000/65162 Regatta:s16/nwicsa-co-ed-fleet-race, Date:2016-04-30 00:00:00
Currently analyzing race 4000/65162 Regatta:f16/fj-invitational, Date:2016-09-11 00:00:00
Currently analyzing race 5000/65162 Regatta:f16/st-mary-fall-interconference, Date:2016-09-24 00:00:00
Currently analyzing race 6000/65162 Regatta:f16/david-lee-arnoff, Date:2016-10-08 00:00:00
Currently analyzing race 7000/65162 Regatta:f16/women-atlantic-coast-backup, Date:2016-11-12 00:00:00
Currently analyzing race 8000/65162 Regatta:s17/hawkeye-invitational, Date:2017-03-25 00:00:00
Currently analyzing race 9000/65162 Regatta:s17/alymers, Date:2017-04-08 00:00:00
Currently analyzing race 10000/65162 Regatta:s17/army-spring-open, Date:2017-04

In [54]:
#TR calculation

i = 0
for _,raceRow in df_races_tr.iterrows():
    i += 1
    # print(raceRow)
    for type in ['Skipper', 'Crew']:
        
        womens = False
        
        teamAName = raceRow['teamAName']
        teamAKeys = [boat[type.lower() + 'Key'] if boat[type.lower() + 'Key'] is not None else 'Unknown' for boat in raceRow['teamABoats']] # the sailor keys
        teamANames = [boat[type.lower() + 'Name'] for boat in raceRow['teamABoats']] # the sailor keys
        teamARacers = [people[key] for key in teamAKeys if 'Unknown' not in key and key in people.keys()]
        
        teamBName = raceRow['teamBName']
        teamBKeys = [boat[type.lower() + 'Key'] if boat[type.lower() + 'Key'] is not None else 'Unknown' for boat in raceRow['teamBBoats']] # the sailor keys
        teamBNames = [boat[type.lower() + 'Name'] for boat in raceRow['teamBBoats']] # the sailor keys
        teamBRacers = [people[key] for key in teamBKeys if 'Unknown' not in key and key in people.keys()]
        
        genders = [p.gender for p in teamARacers + teamBRacers]
        womenCount = sum([1 if g == "F" else 0 for g in genders])
        womens = 'M' not in genders and womenCount >= 4
        
        teamARatings = []
        if womens:
            teamARatings = [r.wtsr if type == 'Skipper' else r.wtcr for r in teamARacers]
        else:
            teamARatings = [r.tsr if type == 'Skipper' else r.tcr for r in teamARacers]
            
        teamBRatings = []
        if womens:
            teamBRatings = [r.wtsr if type == 'Skipper' else r.wtcr for r in teamBRacers]
        else:
            teamBRatings = [r.tsr if type == 'Skipper' else r.tcr for r in teamBRacers]
        
        startingARating = [r.ordinal(target=targetElo, alpha=200 / model.sigma) for r in teamARatings]
        startingBRating = [r.ordinal(target=targetElo, alpha=200 / model.sigma) for r in teamBRatings]
        
        if len(teamARatings) < 1 or len(teamBRatings) < 1:
            print("not enough sailors in this race, skipping", raceRow['raceID'])
            continue
        
        predictions = model.predict_rank([teamARatings, teamBRatings])
        
        ratings = model.rate([teamARatings, teamBRatings], 
                             ranks=[1 if raceRow['teamAOutcome'] == 'win' else 2, 1 if raceRow['teamBOutcome'] == 'win' else 2])
        
        
        for team, name, newRatings in zip([teamARacers, teamBRacers], [teamAName, teamBName], ratings):
            for racer, new_rating in zip(team, newRatings):
                # print(new_rating.ordinal(target=targetElo, alpha=200 / model.sigma))
                # racer.teams = [name]
                if raceRow['raceID'].split("/")[0] not in racer.seasons[type.lower()]:
                    np.append(racer.seasons[type.lower()], [raceRow['raceID'].split("/")[0]])
                if type == 'Skipper':
                    if womens:
                        racer.wtsr = new_rating
                    else: 
                        racer.tsr = new_rating
                else:
                    if womens:
                        racer.wtcr = new_rating
                    else:
                        racer.tcr = new_rating
                        
        if womens:
            teamARatings = [r.wtsr if type == 'Skipper' else r.wtcr for r in teamARacers]
        else:
            teamARatings = [r.tsr if type == 'Skipper' else r.tcr for r in teamARacers]
            
        if womens:
            teamBRatings = [r.wtsr if type == 'Skipper' else r.wtcr for r in teamBRacers]
        else:
            teamBRatings = [r.tsr if type == 'Skipper' else r.tcr for r in teamBRacers]
            
        # if type == 'Skipper':
        #     # print(womens)
        #     # print(ratings)
        #     print(raceRow['teamAOutcome'], raceRow['teamBOutcome'], [1 if raceRow['teamAOutcome'] == 'win' else 2, 1 if raceRow['teamBOutcome'] == 'win' else 2], type) 
        endingARatings = [r.ordinal(target=targetElo, alpha=200 / model.sigma) for r in teamARatings]
        endingBRatings = [r.ordinal(target=targetElo, alpha=200 / model.sigma) for r in teamBRatings]
        #     print(teamAName,startingARating,"->",endingARatings)
        #     print(teamBName,startingBRating,"->",endingBRatings)
        
        AChanges = [e-s for s,e in zip(startingARating, endingARatings)]
        BChanges = [e-s for s,e in zip(startingBRating, endingBRatings)]
        
        for tscore, toutcome, oppt, oppn, i, racers, sratings, eratings, changes in zip([raceRow['teamAScore'], raceRow['teamBScore']], 
                                                   [raceRow['teamAOutcome'],raceRow['teamBOutcome']],
                                                   [raceRow['teamBName'], raceRow['teamAName']], 
                                                   [raceRow['teamBNick'], raceRow['teamANick']], [0,1], 
                                                   [teamARacers, teamBRacers], 
                                                   [startingARating, startingBRating],
                                                   [endingARatings, endingBRatings], 
                                                   [AChanges, BChanges]):
            for racer, sr, er, change in zip(racers, sratings, eratings, changes):
                newRating = 0
                if type == 'Skipper':
                    newRating = racer.wtsr.ordinal(target=targetElo, alpha=200 / model.sigma) if womens else racer.tsr.ordinal(target=targetElo, alpha=200 / model.sigma)
                else:
                    newRating = racer.wtcr.ordinal(target=targetElo, alpha=200 / model.sigma) if womens else racer.tcr.ordinal(target=targetElo, alpha=200 / model.sigma)

                racer.races.append({'raceID': raceRow['raceID'], 'raceNum': raceRow['raceNum'], 'round':  raceRow['round'], 
                                    'pos': type,
                                    'type': 'womens' if womens else 'open',
                                    'opponentTeam': raceRow['teamBName'], 
                                    'opponentNick': raceRow['teamBNick'],
                                    'score': raceRow['teamAScore'],
                                    'outcome': raceRow['teamAOutcome'], 
                                    'predicted': 'win' if predictions[0][0] == 1 else 'lose',
                                    'newRating': newRating,
                                    'change': float(change),
                                    'type': 'team'
                                    })
        # for racer, change in zip(teamBRacers, BChanges):
        #     newRating = 0
        #     if type == 'Skipper':
        #         newRating = racer.wtsr.ordinal(target=targetElo, alpha=200 / model.sigma) if womens else racer.tsr.ordinal(target=targetElo, alpha=200 / model.sigma)
        #     else:
        #         newRating = racer.wtcr.ordinal(target=targetElo, alpha=200 / model.sigma) if womens else racer.tcr.ordinal(target=targetElo, alpha=200 / model.sigma)
        #     racer.races.append({'raceID': raceRow['raceID'], 'raceNum': raceRow['raceNum'], 'round':  raceRow['round'], 
        #                         'pos': type,
        #                         'type':'womens' if womens else 'open',
        #                         'opponentTeam': raceRow['teamAName'], 
        #                         'opponentNick': raceRow['teamANick'],
        #                         'score': raceRow['teamBScore'],
        #                         'outcome': raceRow['teamBOutcome'], 
        #                         'predicted': 'win' if predictions[1][0] == 1 else 'lose',
        #                         'newRating': newRating, 
        #                         'change': float(change),
        #                         'type': 'team'})
        
        # print(ratings)
    # if i >= 20:
    #     break

not enough sailors in this race, skipping s16/2016-jeremy-mcintyre-team-race/41
not enough sailors in this race, skipping s16/2016-jeremy-mcintyre-team-race/41
not enough sailors in this race, skipping s16/mendelblatt-team-race/55
not enough sailors in this race, skipping s16/mendelblatt-team-race/55
not enough sailors in this race, skipping s16/mendelblatt-team-race/59
not enough sailors in this race, skipping s16/mendelblatt-team-race/59
not enough sailors in this race, skipping s16/mendelblatt-team-race/62
not enough sailors in this race, skipping s16/mendelblatt-team-race/62
not enough sailors in this race, skipping s16/mendelblatt-team-race/67
not enough sailors in this race, skipping s16/mendelblatt-team-race/67
not enough sailors in this race, skipping s16/nw-team-race/6
not enough sailors in this race, skipping s16/nw-team-race/6
not enough sailors in this race, skipping s16/nw-team-race/7
not enough sailors in this race, skipping s16/nw-team-race/7
not enough sailors in this r

In [59]:
# Compute season rivals
grouped = df_races_full.groupby('adjusted_raceID') # [df_races_full['raceID'].str.startswith('f24')]

i = 0
leng = len(grouped)
for raceID, scores in grouped:
    # Report status every 100 races
    if i % 1000 == 0:
        print(f"Currently analyzing race {i}/{leng} Regatta:{raceID}")
    i += 1
    
    season = raceID.split("/")[0]
    
    sailor_scores = scores.set_index('key')[['Position', 'Score', 'Team', 'Sailor']]

    for sailor, sailor_data in sailor_scores.iterrows():
        pos = sailor_data['Position']
        score = sailor_data['Score']
        p = people[sailor]
        
        others = sailor_scores.iterrows()

        for other_key, other_sailor_data in others:
            if other_key != sailor:
                other_pos = other_sailor_data['Position']
                if pos == other_pos:
                    other_score = other_sailor_data['Score']
                    other_team = other_sailor_data['Team']
                    other_name = other_sailor_data['Sailor']
                    
                    if pos not in p.rivals:
                        p.rivals[pos] = {}
                    
                    if other_key not in p.rivals[pos]:
                        p.rivals[pos][other_key] = {'name': other_name,'races': {}, 'team': other_team, 'wins': {}}
                        
                    if season not in p.rivals[pos][other_key]['races'].keys():
                        p.rivals[pos][other_key]['races'][season] = 0
                    if season not in p.rivals[pos][other_key]['wins'].keys():
                        p.rivals[pos][other_key]['wins'][season] = 0
                    
                    p.rivals[pos][other_key]['races'][season] += 1
                    if other_score > score:
                        p.rivals[pos][other_key]['wins'][season] += 1

Currently analyzing race 0/29202 Regatta:f16/34th-stedman-hood/10A
Currently analyzing race 1000/29202 Regatta:f16/nickerson-backup/5B
Currently analyzing race 2000/29202 Regatta:f17/atlantic-coast-championships/8A
Currently analyzing race 3000/29202 Regatta:f17/oberg/2A
Currently analyzing race 4000/29202 Regatta:f18/cedarfest/1B
Currently analyzing race 5000/29202 Regatta:f18/pccsc-fall-women-champs/2
Currently analyzing race 6000/29202 Regatta:f19/canam-cup/3B
Currently analyzing race 7000/29202 Regatta:f19/north-north-jv/6B
Currently analyzing race 8000/29202 Regatta:f21/2021-women-atlantic-coast/2A
Currently analyzing race 9000/29202 Regatta:f21/mccurdy/9B
Currently analyzing race 10000/29202 Regatta:f21/ucsd-open/5A
Currently analyzing race 11000/29202 Regatta:f22/maisa-fall-women/10A
Currently analyzing race 12000/29202 Regatta:f22/tufts-invite/2
Currently analyzing race 13000/29202 Regatta:f23/invite/5B
Currently analyzing race 14000/29202 Regatta:f23/sugar-bowl-womens/7
Curren

In [55]:
# Filter sailors who have 'f24' in their seasons list
eligible_skippers = [p for p in people.values()
                    if targetSeason in p.seasons['skipper'] 
                    and sum([race['outLinks'] for race in p.races if 'outLinks' in race.keys()]) > 70]

eligible_crews = [p for p in people.values()
                    if targetSeason in p.seasons['crew']
                    and sum([race['outLinks'] for race in p.races if 'outLinks' in race.keys()]) > 70]

eligible_skippers_tr = [p for p in people.values()
                    if targetTRSeason in p.seasons['skipper']]
eligible_crews_tr = [p for p in people.values()
                    if targetTRSeason in p.seasons['crew']]


for p in people.values():
    p.skipperRank = 0
    p.crewRank = 0
    p.womenSkipperRank = 0
    p.womenCrewRank = 0
    p.skipperRankTR = 0
    p.crewRankTR = 0
    p.womenSkipperRankTR = 0
    p.womenCrewRankTR = 0

for i,s in enumerate(sorted([p for p in eligible_skippers if p.sr.mu != model.mu], key=lambda p: p.sr.ordinal(), reverse=True)):
    s.skipperRank = i + 1
for i,s in enumerate(sorted([p for p in eligible_crews if p.cr.mu != model.mu], key=lambda p: p.cr.ordinal(), reverse=True)):
    s.crewRank = i + 1

for i,s in enumerate(sorted([p for p in eligible_skippers if p.wsr.mu != model.mu], key=lambda p: p.wsr.ordinal(), reverse=True)):
    s.womenSkipperRank = i + 1
for i,s in enumerate(sorted([p for p in eligible_crews if p.wcr.mu != model.mu], key=lambda p: p.wcr.ordinal(), reverse=True)):
    s.womenCrewRank = i + 1
    
for i,s in enumerate(sorted([p for p in eligible_skippers_tr if p.tsr.mu != model.mu], key=lambda p: p.tsr.ordinal(), reverse=True)):
    s.skipperRankTR = i + 1
for i,s in enumerate(sorted([p for p in eligible_crews_tr if p.tcr.mu != model.mu], key=lambda p: p.tcr.ordinal(), reverse=True)):
    s.crewRankTR = i + 1

for i,s in enumerate(sorted([p for p in eligible_skippers_tr if p.wtsr.mu != model.mu], key=lambda p: p.wtsr.ordinal(), reverse=True)):
    s.womenSkipperRankTR = i + 1
for i,s in enumerate(sorted([p for p in eligible_crews_tr if p.wtcr.mu != model.mu], key=lambda p: p.wtcr.ordinal(), reverse=True)):
    s.womenCrewRankTR = i + 1

allRows = []
for sailor,p in people.items():
    avgSkipperRatio = float(np.array([r['ratio'] for r in p.races if r['pos'] == 'Skipper' and 'ratio' in r.keys()] ).mean())
    avgCrewRatio = float(np.array([r['ratio'] for r in p.races if r['pos'] == 'Crew' and 'ratio' in r.keys()] ).mean())
    p.avgSkipperRatio = avgSkipperRatio
    p.avgCrewRatio = avgCrewRatio
    
    allRows.append([p.name, sailor, 
                    p.skipperRank,p.crewRank, p.womenSkipperRank, 
                    p.womenCrewRank,p.skipperRankTR, p.womenSkipperRankTR,  
                    p.teams, 
                    p.sr.ordinal(target=targetElo, alpha=200 / model.sigma), 
                    p.wsr.ordinal(target=targetElo, alpha=200 / model.sigma), 
                    p.tsr.ordinal(target=targetElo, alpha=200 / model.sigma),
                    p.wtsr.ordinal(target=targetElo, alpha=200 / model.sigma), 
                    p.cr.ordinal(target=targetElo, alpha=200 / model.sigma), 
                    sum([race['outLinks'] for race in p.races if 'outLinks' in race.keys()]), 
                    p.year, p.links, p.sr.mu, 
                    p.cr.mu, avgSkipperRatio,avgCrewRatio, 
                    p.sr.sigma, p.cr.sigma, p.seasons, 
                    sum([race['cross'] for race in p.races if 'cross' in race.keys()]), 
                    p.races,len(p.races), p.rivals])

df_sailors = pd.DataFrame(allRows, columns=['Sailor', 'key', 'SkipperRank', 'CrewRank','WomenSkipperRank', 'WomenCrewRank', 'TRSkipperRank', 'TRWomenSkipperRank', 'Teams', 'SkipperOrdinal','WomenSkipperOrdinal', 'SkipperOrdinalTR', 'WomenSkipperOrdinalTR', 'CrewOrdinal','outLinks','GradYear', 'Links', 'SkipperMU','CrewMU', 'skipperAvgRatio', 'crewAvgRatio', 'SkipperSigma', 'CrewSigma', 'Seasons', 'Cross', 'Races', 'numRaces', 'Rivals'])

# df_sailors.to_json('sailorsexperiment20.json', index=False)
df_sailors.head()

  avgCrewRatio = float(np.array([r['ratio'] for r in p.races if r['pos'] == 'Crew' and 'ratio' in r.keys()] ).mean())
  ret = ret.dtype.type(ret / rcount)
  avgSkipperRatio = float(np.array([r['ratio'] for r in p.races if r['pos'] == 'Skipper' and 'ratio' in r.keys()] ).mean())


Unnamed: 0,Sailor,key,SkipperRank,CrewRank,WomenSkipperRank,WomenCrewRank,TRSkipperRank,TRWomenSkipperRank,Teams,SkipperOrdinal,...,CrewMU,skipperAvgRatio,crewAvgRatio,SkipperSigma,CrewSigma,Seasons,Cross,Races,numRaces,Rivals
0,A.J. Crane,A.J. Crane-Tufts,0,0,0,0,0,0,[Tufts],1044.264422,...,25.0,0.488095,,8.216095,8.333333,"{'skipper': ['s23'], 'crew': []}",0,"[{'score': 10, 'pos': 'Skipper', 'predicted': ...",4,{}
1,Aaron Babier,Aaron Babier-U Toronto,0,0,0,0,0,0,[U Toronto],1000.0,...,17.833008,,0.327381,8.333333,7.210074,"{'skipper': [], 'crew': ['f16']}",14,"[{'score': 9, 'pos': 'Crew', 'predicted': 9, '...",14,{}
2,Aaron Heard,Aaron Heard-Oregon State,0,0,0,0,0,0,[Oregon State],886.801179,...,21.211826,0.27927,0.279377,5.889675,5.90617,"{'skipper': ['s23', 'f23'], 'crew': ['f22', 's...",6,"[{'score': 8, 'pos': 'Crew', 'predicted': 6, '...",41,{}
3,Aaron Klein,Aaron Klein-Tufts,0,0,0,0,0,0,[Tufts],1029.450167,...,25.0,0.397436,,7.911811,8.333333,"{'skipper': ['s23'], 'crew': []}",0,"[{'score': 9, 'pos': 'Skipper', 'predicted': 1...",6,{}
4,Aaron Le Roy,Aaron Le Roy-Marquette,0,0,0,0,0,0,[Marquette],989.631584,...,25.0,0.083333,,7.12988,8.333333,"{'skipper': ['s22'], 'crew': []}",0,"[{'score': 4, 'pos': 'Skipper', 'predicted': 4...",4,{}


In [56]:
# for r in people['benjamin-stone'].races:
#     if r['cross'] == 1:
#         print(f"{r['raceID']} {r['outLinks']}")
        
# print(teamRegions['Western Washington'])
# print(teamRegions['Oregon State'])
# print(teamRegions['Washington'])
# print(teamRegions['Oregon'])
# [race for race in people['carter-anderson'].races if race['type'] == 'team']
people['carter-anderson'].teams

array(['UC Santa Cruz', 'Northeastern'], dtype=object)

In [None]:
win_ratios = []

for rival, stats in people['carter-anderson'].rivals['Skipper'].items():
    races = stats['races']
    wins = stats['wins']
    team = stats['team']
    win_ratio = wins / races if races > 0 else 0  # Handle division by zero
    win_ratios.append((rival, races, win_ratio, team))

# Sort the list by win_ratio in descending order
win_ratios_sorted = sorted(win_ratios, key=lambda x: x[1], reverse=True)

# Print the sorted win ratios
for rival, races, win_ratio, team in win_ratios_sorted:
    print(f"{rival}, Team: {team}, Races: {races}, Win Percentage: {win_ratio * 100:.0f}%")

In [None]:
# Updates only changed sailors
col = db.collection('sailorsElo')

for p in [p for p in people.values() if targetSeason in p['seasons']['skipper'] or targetSeason in p['seasons']['crew']]:
    changes = {}
    if len(p.races) < 1:
        print("no races found for sailor, skipping...")
        continue

    # print(data['Name'])
    # try:
    #     data['Link']
    # except:
    #     changes['Link'] = list(p.link)[0]
    #     # print("No link!")
    # try:
    #     data['Year']
    # except:
    #     changes['Year'] = int(list(p.year)[0][:2])
    #     # print("No year!")

    # print(p.races[0])
    # check races
    changes['races'] = firestore.ArrayUnion(p.races)
    changes['lastUpdate'] = firestore.SERVER_TIMESTAMP
    # for raceID in [r['raceID'] for r in person.races]:
    #     if raceID not in [r['raceID'] for r in data['races']]:
    #         changes['races'].append(raceID)
    #         print("needs update!", data['races'])
    #         print(person.races)
    print(changes)
    col.document(doc_id).update(changes)

In [None]:
people['elliott-chalcraft'].races

In [61]:
# Initialize Firestore client
col = db.collection('sailorsElo')

# Initialize the batch
batch = db.batch()

# Number of documents to commit in each batch
batch_size = 40

# eligible = [p for p in people.values() if targetSeason in p['seasons']['skipper'] 
                                        # or targetSeason in p['seasons']['crew']]

# Iterate over the people values
# for i, p in enumerate(people.values()):
# for p in eligible:

p = people['carter-anderson']
# Prepare the document data to be written
doc_data = {
    "Name": p.name,
    "key": p.key,
    'gender': p.gender,
    "Teams": p.teams.tolist() if isinstance(p.teams, np.ndarray) else p.teams,
    "SkipperRating": int(p.sr.ordinal(target=targetElo, alpha=200 / model.sigma)),
    "CrewRating": int(p.cr.ordinal(target=targetElo, alpha=200 / model.sigma)),
    "WomenSkipperRating": int(p.wsr.ordinal(target=targetElo, alpha=200 / model.sigma)),
    "WomenCrewRating": int(p.wcr.ordinal(target=targetElo, alpha=200 / model.sigma)),
    "SkipperRank": int(p.skipperRank),
    "CrewRank": int(p.crewRank),
    "WomenSkipperRank": int(p.womenSkipperRank),
    "WomenCrewRank": int(p.womenCrewRank),
    "Links": p.links.tolist() if isinstance(p.links, np.ndarray) else p.links if isinstance(p.links, str) else p.links[0].tolist(),
    "Year": p.year,
    "Seasons": {'skipper': list(p.seasons['skipper']), 'crew': list(p.seasons['crew'])},
    "Cross":  sum([race['cross'] for race in p.races if 'cross' in race.keys()]),
    "OutLinks": sum([race['outLinks'] for race in p.races if 'outLinks' in race.keys()]),
    'races': p.races,
    "Rivals": p.rivals,
    "lastUpdate": firestore.SERVER_TIMESTAMP
}

# Add the set operation to the batch
doc_ref = col.document(p.key)
batch.set(doc_ref, doc_data)

# Commit the batch every 20 documents
# if (i + 1) % batch_size == 0:
#     batch.commit()
#     batch = db.batch()  # Start a new batch for the next set of documents

# Commit any remaining operations if there are less than 20 documents left
if (i + 1) % batch_size != 0:
    batch.commit()

In [58]:
# %%scalene
import requests
from bs4 import BeautifulSoup
from collections import defaultdict

def getCounts(races):
    # season_counts = defaultdict(int)
    season_counts = {}
    
    for race in races:
        season = race["raceID"].split("/")[0]
        if season not in season_counts.keys():
            season_counts[season] = {}
        if race['pos'] not in season_counts[season].keys():
            season_counts[season][race['pos']] = 0
        season_counts[season][race['pos']] += 1

    return dict(season_counts)

# batch = db.batch()
col = db.collection('eloTeams')
teams = []
scrape = False
teamNames = teamRegions.keys()
lenteams = len(teamNames)

season_mask = df_sailors['Seasons'].apply(lambda x: targetSeason in x['skipper'] or targetSeason in x['crew'])

# Explode the Teams column to enable grouping
df_exploded = df_sailors.loc[season_mask].explode('Teams')

# Group by team and compute necessary aggregates
team_stats = df_exploded.groupby('Teams').agg(
    numCurMembers=('Teams', 'count'),
    avgSkipperOrdinal=('SkipperOrdinal', 'mean'),
    avgCrewOrdinal=('CrewOrdinal', 'mean'),
    avgSkipperRatio=('skipperAvgRatio', 'mean'),
    avgCrewRatio=('crewAvgRatio', 'mean')
)

# Calculate the average values as in the original code
team_stats['avg'] = (team_stats['avgSkipperOrdinal'] + team_stats['avgCrewOrdinal']) / 2
team_stats['avgRatio'] = (team_stats['avgSkipperRatio'] + team_stats['avgCrewRatio']) / 2

team_stats = team_stats.reindex(teamNames, fill_value=0)

team_link_map = df_races.drop_duplicates('Team').set_index('Team')['Teamlink'].to_dict()

# Optional: Loop for printing (if necessary)
for i, (team, row) in enumerate(team_stats.iterrows()):
    # if team != "MIT":
    #     continue
    print(f"{i}/{len(team_stats)} {team}")
    avg = row['avg']
    avgRatio = row['avgRatio']
    numCurMembers = row['numCurMembers']

# for i,team in enumerate(teamNames):
#     print(f"{i}/{lenteams} {team}")
#     temp = df_sailors.loc[(df_sailors['Teams'].apply(lambda x: team in x)) & (df_sailors['Seasons'].apply(lambda x: 'f24' in x['skipper'] or 'f24' in x['crew']))]
#     avg = (temp.loc[df_sailors['Seasons'].apply(lambda x: 'f24' in x['skipper']), 'SkipperOrdinal'].mean() + temp.loc[df_sailors['Seasons'].apply(lambda x: 'f24' in x['crew']), 'CrewOrdinal'].mean()) / 2
#     avgRatio = (temp.loc[df_sailors['Seasons'].apply(lambda x: 'f24' in x['crew']),'skipperAvgRatio'].mean() + temp.loc[df_sailors['Seasons'].apply(lambda x: 'f24' in x['crew']),'crewAvgRatio'].mean()) / 2
#     numCurMembers = len(temp)
    
    region = teamRegions[team]
    # teamLink = df_races.loc[df_races['Team'] == team, 'Teamlink'].iloc[0]
    teamLink = team_link_map.get(team, None)  # Default to None if team not found
    url = f"https://scores.collegesailing.org/schools/{teamLink.split("/")[2]}"
    
    if scrape:
        page = requests.get(url)
        teamPage = BeautifulSoup(page.content, 'html.parser')
        
        try:
            region = teamPage.find('span', class_="page-info-value").contents[0].contents[0]
        except:
            print(url)
            continue
        
    filtered_people = [p for p in people.values() if team in p.teams]
    
    members = [{"name": p.name,
                "key": p.key,
                "gender": p.gender,
                "year": str(p.year),
                'teams': list(p.teams),
                'skipperRating': int(p.sr.ordinal(target=targetElo, alpha=200 / model.sigma)),
                'crewRating': int(p.cr.ordinal(target=targetElo, alpha=200 / model.sigma)),
                'womenSkipperRating': int(p.wsr.ordinal(target=targetElo, alpha=200 / model.sigma)),
                'womenCrewRating': int(p.wcr.ordinal(target=targetElo, alpha=200 / model.sigma)),
                'tsr': p.tsr.ordinal(target=targetElo, alpha=200 / model.sigma),
                'wtsr': p.wtsr.ordinal(target=targetElo, alpha=200 / model.sigma),
                'tcr': p.tcr.ordinal(target=targetElo, alpha=200 / model.sigma),
                'wtcr': p.wtcr.ordinal(target=targetElo, alpha=200 / model.sigma),
                'avgSkipperRatio': float(p.avgSkipperRatio),
                'avgCrewRatio': float(p.avgCrewRatio),
                'raceCount': getCounts(p.races),
                'seasons':{'skipper': list(p.seasons['skipper']), 'crew': list(p.seasons['crew'])},
                'cross': sum([race['cross'] for race in p.races if 'cross' in race.keys()]),
                'outLinks': sum([race['outLinks'] for race in p.races if 'outLinks' in race.keys()]),
                'skipperRank': int(p.skipperRank),
                'crewRank': int(p.crewRank),
                'womenSkipperRank': int(p.womenSkipperRank),
                'womenCrewRank': int(p.womenCrewRank)
                } for p in filtered_people]
    
    teamRating = 0
    if numCurMembers > 0:
        teamRatingSkipper = sum([p['skipperRating'] * (p['raceCount'][targetSeason]['Skipper']/ 5) for p in members 
                          if targetSeason in p['raceCount'].keys()
                          if 'Skipper' in p['raceCount'][targetSeason].keys()
                          and p['raceCount'][targetSeason]['Skipper'] > 5])
        teamRatingCrew = sum([p['crewRating'] * (p['raceCount'][targetSeason]['Crew']/ 5) for p in members 
                          if targetSeason in p['raceCount'].keys()
                          if 'Crew' in p['raceCount'][targetSeason].keys()
                          and p['raceCount'][targetSeason]['Crew'] > 5])
        teamRating = (teamRatingSkipper + teamRatingCrew) / numCurMembers
    
    topRating = 0
    topWomenRating = 0
    
    numTops = 3
    if numCurMembers > 0:
        topSkippers = sorted([p['skipperRating'] for p in members
                                #   if p['skipperRank'] != 0 # eligible as in outLinks > 70 and f24 in seasons
                                  if p['cross'] > 20
                                  and p['outLinks'] > 70
                                  and targetSeason in p['seasons']['skipper']
                                #   and 'f24' in p['raceCount'].keys() 
                                #   and p['raceCount']['f24'] > 5
                                  ], reverse=True)[:numTops]
        topSkipperSum = sum(topSkippers)
        
        topCrews = sorted([p['crewRating'] for p in members
                            #    if p['crewRank'] != 0
                               if p['cross'] > 20
                               and p['outLinks'] > 70
                               and targetSeason in p['seasons']['crew']
                            #    and 'f24' in p['raceCount'].keys()
                            #    and p['raceCount']['f24'] > 5
                               ], reverse=True)[:numTops]
        topCrewsSum = sum(topCrews)
        
        topRating = (topSkipperSum + topCrewsSum) / (numTops * 2)
        # print(topSkippers, topCrews, topRating)

        # Women's
        numTops = 2
        topWomenSkippers = sorted([p['womenSkipperRating'] for p in members
                                #   if p['skipperRank'] != 0
                                  if p['cross'] > 20
                                  and p['gender'] == 'F'
                                  and targetSeason in p['raceCount'].keys() 
                                  and 'Skipper' in p['raceCount'][targetSeason].keys() 
                                  and p['raceCount'][targetSeason]['Skipper'] > 5
                                  ], reverse=True)[:numTops]
        topWomenSkipperSum = sum(topWomenSkippers)
        
        topWomenCrews = sorted([p['womenCrewRating'] for p in members
                            #    if p['crewRank'] != 0
                               if p['cross'] > 20
                               and p['gender'] == 'F'
                               and targetSeason in p['raceCount'].keys()
                               and 'Crew' in p['raceCount'][targetSeason].keys() 
                               and p['raceCount'][targetSeason]['Crew'] > 5
                               ], reverse=True)[:numTops]
        topWomenCrewsSum = sum(topWomenCrews)
        
        topWomenRating = (topWomenSkipperSum + topWomenCrewsSum) / (numTops * 2)
        
        # Team racing:
        
        numTops = 3
        topSkippersTR = sorted([p['tsr'] for p in members 
                            if p['tsr'] != 1000
                            and targetSeason in p['seasons']['skipper']
                            ], reverse=True)[:numTops]
        topCrewsTR = sorted([p['tcr'] for p in members 
                             if p['tcr'] != 1000 
                             and targetSeason in p['seasons']['crew']], reverse=True)[:numTops]
        
        topRatingTR = (sum(topSkippersTR) + sum(topCrewsTR)) / (numTops * 2)
        
        numTops = 3
        topWomenSkippersTR = sorted([p['wtsr'] for p in members if p['wtsr'] != 1000 and targetSeason in p['seasons']['skipper']], reverse=True)[:numTops]
        topWomenCrewsTR = sorted([p['wtcr'] for p in members if p['wtcr'] != 1000 and targetSeason in p['seasons']['crew']], reverse=True)[:numTops]
        
        topWomenRatingTR = (sum(topWomenSkippersTR) + sum(topWomenCrewsTR)) / (numTops * 2)
    
    #'#1': ,'#2': ,
    teams.append({"name":team, "avg": avg, 'avgRatio': avgRatio, 'topRating': topRating, 'topWomenRating': topWomenRating,'topRatingTR': topRatingTR,'topWomenRatingTR': topWomenRatingTR, 'teamRating': teamRating, "region": region, "link": url, 'memberCount': numCurMembers})
    # col.document(team.replace(" ", "-").replace("/", "-").lower()).set({"name":team, "avg": avg, 'avgRatio': avgRatio, "region": region, "link": url, 'members': members})
    # if i > 20:
    #     break
#     if i % 20 == 0: # commit every 20 documents
#             batch.commit()
# batch.commit()
doc = db.collection('vars').document('eloTeams').set({"teams": teams})
# teams

0/207 Hawaii
1/207 Brown
2/207 Southern Cal
3/207 Salve Regina
4/207 UC Santa Barbara
5/207 Cal Poly
6/207 Washington
7/207 Channel Islands
8/207 UC San Diego
9/207 British Columbia
10/207 UC Los Angeles
11/207 Westmont College
12/207 Arizona State
13/207 Texas A&M Galveston
14/207 Texas A&M
15/207 Tulane
16/207 Rice
17/207 Texas
18/207 Oklahoma State
19/207 Texas A&M C. Christ
20/207 Central Oklahoma
21/207 Notre Dame
22/207 Jacksonville
23/207 Florida
24/207 Tennessee
25/207 Rollins
26/207 North Carolina State
27/207 Georgia Tech
28/207 Auburn
29/207 Charleston
30/207 South Florida
31/207 Old Dominion
32/207 Eckerd
33/207 Florida State
34/207 U. Miami
35/207 UW Milwaukee
36/207 Stony Brook
37/207 Duke
38/207 Clemson
39/207 U South Carolina
40/207 UNC Wilmington
41/207 Georgia
42/207 Berkeley
43/207 CSU Long Beach
44/207 Monterey Bay
45/207 UC Irvine
46/207 UC Davis
47/207 Rhode Island
48/207 Georgetown
49/207 Dartmouth
50/207 MIT
51/207 George Washington
52/207 Navy
53/207 Fordham
54

In [14]:
topSkippers = []
for p in sorted([p for p in people.values() if p.skipperRank <= 100 and p.skipperRank != 0 and 'f24' in p.seasons['skipper']],key=lambda p: p.skipperRank):
    topSkippers.append({'name': p.name,'key': p.key, 'year':p.year, 'rank': int(p.skipperRank), 'team': list(p.teams),'gender': p.gender, 'rating': p.sr.ordinal(target=targetElo, alpha=200 / model.sigma), 'seasons': list(p.seasons['skipper'])})
doc = db.collection('vars').document('topSkippers').set({"sailors": topSkippers})

topCrews = []
for p in sorted([p for p in people.values() if p.crewRank <= 100 and p.crewRank != 0 and 'f24' in p.seasons['crew']],key=lambda p: p.crewRank):
    topCrews.append({'name': p.name,'key': p.key, 'year':p.year, 'rank': int(p.crewRank), 'team': list(p.teams),'gender': p.gender, 'rating': p.cr.ordinal(target=targetElo, alpha=200 / model.sigma), 'seasons': list(p.seasons['crew'])})
doc = db.collection('vars').document('topCrews').set({"sailors": topCrews})

#Womens
topSkippers = []
for p in sorted([p for p in people.values() if p.womenSkipperRank <= 100 and p.womenSkipperRank != 0 and 'f24' in p.seasons['skipper']],key=lambda p: p.womenSkipperRank):
    topSkippers.append({'name': p.name,'key': p.key, 'year':p.year, 'rank': int(p.womenSkipperRank), 'team': list(p.teams),'gender': p.gender, 'rating': p.wsr.ordinal(target=targetElo, alpha=200 / model.sigma), 'seasons': list(p.seasons['skipper'])})
doc = db.collection('vars').document('topWomenSkippers').set({"sailors": topSkippers})
topCrews = []
for p in sorted([p for p in people.values() if p.womenCrewRank <= 100 and p.womenCrewRank != 0 and 'f24' in p.seasons['crew']],key=lambda p: p.womenCrewRank):
    topCrews.append({'name': p.name,'key': p.key, 'year':p.year, 'rank': int(p.womenCrewRank), 'team': list(p.teams),'gender': p.gender, 'rating': p.wcr.ordinal(target=targetElo, alpha=200 / model.sigma), 'seasons': list(p.seasons['crew'])})
doc = db.collection('vars').document('topWomenCrews').set({"sailors": topCrews})

In [52]:
import json
flattened_dict = {p.key: {'team': p.teams[-1], 'year': p.year, 'name': p.name} for p in people.values() if 'f24' in p.seasons['skipper'] or 'f24' in p.seasons['crew']}
# print(flattened_dict)
doc = db.collection('vars').document('allSailors').set({'allSailors': json.dumps(flattened_dict, separators=(',', ':'))})

In [16]:
len(df_races_full['Regatta'].unique())

1724

In [None]:

# 'score': int(score), # Need to rewrite to include DNF and such (correctly evaluating score but its hard to tell )
#                 'pos': type,
#                 'predicted': pred[0],
#                 'ratio': 1 - ((int(score) - 1) / (len(racers) - 1)), # Calculate ratio here
#                 'change': change,
#                 'regAvg': regattaAvg,
#                 'cross': isCross,
#                 'outLinks': outLinks,
#                 'skipperRating': sailor.sr.ordinal(target=targetElo, alpha=200 / model.sigma), # add offset to prevent negative ratings
#                 'crewRating': sailor.cr.ordinal(target=targetElo, alpha=200 / model.sigma), # add offset to prevent negative ratings
#                 'womenSkipperRating': sailor.wsr.ordinal(target=targetElo, alpha=200 / model.sigma), # add offset to prevent negative ratings
#                 'womenCrewRating': sailor.wcr.ordinal(target=targetElo, alpha=200 / model.sigma), # add offset to prevent negative ratings
#                 'womens': womens,
#                 'date' :date,
#                 'partner': {'name': partner, 'link': partnerLink},
#                 'venue': venue,
#                 'raceID': actualID,
#                 'scoring': scoring
                
for regatta in list(df_races_full['Regatta'].unique()):
    races = df_races_full[df_races_full['Regatta'] == regatta]
    raceIDs = list(races['raceID'].unique())
    links = races['Sailor'].unique()
    date = races['Date'].unique()[0]
    venue = races['Venue'].unique()[0]
    scoring = races['scoring'].unique()[0]
    
    racePpl = [{
        "key":p.key, 
        "Name":p.name, 
        'Year': p.year,
        "Teams": list(p.teams),
        "Rating": int(p.r.mu),
        "GlobalRank": int(p.rank),
        "races": [{
                'sailor': p.name,
                'key': p.key,
                'pos': race['pos'],
                "raceID": race['raceID'],
                "score": float(race['score']),
                "predicted": int(race['predicted']), 
                "change": float(race['change']), 
                'newRating': float(race['newRating']),
                'partner':race['partner'],
                'ratio': float(race['ratio']),
                } for race in p.races if race['raceID'].split("/")[0] + "/" + race['raceID'].split("/")[1] == regatta]
        } for p in people.values() if p.name in links]
    
    # race = {'raceID':'', 'raceNum':0, 'div': '', 'sailors':[]}
    # person = {'name':'', 'rating':0, 'change':0, 'team': '', 'pos': '', 'div':'', 'partner': ''}
    doc = {'regattaName': regatta,'raceIDs':raceIDs,  'sailors': racePpl}
    
    # for race in races['raceID'].unique():
    #     sailors = races[races['raceID'] == race, 'Sailor'].unique()
    # for p in [p for p in people.values() if p.name in sailors]:
        # racePpl.append({'name':p.name, 'rating':p.rating, 'changes':p.changes, 'team': p.team, 'pos': '', 'div':'', 'partner': ''})
    # print(regatta)
    # db.collection('eloRegattas').document().set({'regattaName': regatta,'raceIDs':raceIDs, 'sailors': racePpl}, timeout=15)

s16/peter-wenner-rainbow-invite 2016-01-16 00:00:00
s16/jeff-simon 2016-01-23 00:00:00
s16/seisa-qualifier 2016-01-30 00:00:00
s16/harris-kempner 2016-02-06 00:00:00
s16/old-south 2016-02-13 00:00:00
s16/usf-women 2016-02-20 00:00:00
s16/nelson-roltsch 2016-02-20 00:00:00
s16/saisa-open 2016-02-20 00:00:00
s16/anteater-open 2016-02-20 00:00:00
s16/charleston-women 2016-02-27 00:00:00
s16/barnyard-bazaar 2016-02-27 00:00:00
s16/sea-to-sky 2016-02-27 00:00:00
s16/eckerd-interconference 2016-03-05 00:00:00
s16/odu-open 2016-03-05 00:00:00
s16/mustang-open 2016-03-05 00:00:00
s16/saisa-north-points 2016-03-05 00:00:00
s16/charleston-spring-coed 2016-03-12 00:00:00
s16/navy-spring-women 2016-03-12 00:00:00
s16/2016-sailpack-invitational 2016-03-12 00:00:00
s16/ice-breaker 2016-03-12 00:00:00
s16/south-points-ncf 2016-03-12 00:00:00
s16/saisa-sp 2016-03-12 00:00:00
s16/woollum 2016-03-12 00:00:00
s16/st-mary-women-interconference 2016-03-19 00:00:00
s16/west-canada-cup 2016-03-19 00:00:00
s1

KeyboardInterrupt: 

In [None]:
def delete_collection(coll_ref, batch_size):
    if batch_size == 0:
        return

    docs = coll_ref.list_documents(page_size=batch_size)
    deleted = 0

    for doc in docs:
        if deleted % 50 == 0:
            print(f"{deleted} Deleting doc {doc.id} => {doc.get().to_dict()}")
        doc.delete()
        deleted = deleted + 1

    if deleted >= batch_size:
        return delete_collection(coll_ref, batch_size)
col = db.collection('eloRegattas')
delete_collection(col, 400)