In [1]:
from openskill.models import PlackettLuce
import pandas as pd
import numpy as np
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

cred = credentials.Certificate("thecrowsnestapp-creds.json")
firebase_admin.initialize_app(cred)

db = firestore.client()
model = PlackettLuce()

In [2]:
class Sailor:
    def __init__(self, name, teams, pos, seasons=[], rank=0, rating=1500):
        self.name = name
        self.teams = teams
        self.pos = pos
        self.rank = rank
        self.seasons = seasons
        self.races = []
        self.changes = []
        self.r = model.rating(rating,rating // 3,name)
        
    def rerate(self, rating):
        self.r = model.rating(rating.mu, rating.sigma, self.name)
        
    def __repr__(self):
        return f"{self.name}: {self.teams}, {self.pos} {str(self.r)}"

In [93]:
p1 = Sailor("p1", ['nu'], 'Skipper')
p2 = Sailor("p2", ['nu'], 'Skipper')
p3 = Sailor("p3", ['nu'], 'Skipper')
p4 = Sailor("p4", ['nu'], 'Skipper')
players = [p1,p2,p3,p4]
ratings = [[p.r] for p in players]
ratings = model.rate(ratings, [3,2,1,4], weights=[[3.0]] * 4)
for p,n in zip(players, ratings):
    p.r = n[0]
print(p1,p2,p3,p4)

p1: Plackett-Luce Player Data: 

id: ad16bdc0c0104be0921137ba9ff870f2
name: p1
mu: 1489.5836948596302
sigma: 489.5844366494107
 p2: Plackett-Luce Player Data: 

id: 52c34cc14e5649d98d05ec0fb4a48f5a
name: p2
mu: 1708.3261028073944
sigma: 473.70353025398146
 p3: Plackett-Luce Player Data: 

id: 031367610f89420bbcc83782462d1683
name: p3
mu: 1874.98698505331
sigma: 488.14186177390417
 p4: Plackett-Luce Player Data: 

id: 376a0d80cc2047c78423cab70c0a5555
name: p4
mu: 1364.5880331751937
sigma: 489.5844366494107



In [13]:
#create people
# converters={"Teams": lambda x: [y.strip().split("'")[1] for y in x.strip("[]").split(", ")]}
df_races = pd.read_json("races.json")

df_races['Ratio'] = 1 - (df_races['Score'] / df_races['Teams'].apply(len))
df_races['Ratio'] = df_races['Ratio'].astype(float)
df_races['numTeams'] = df_races['Teams'].apply(len)
df_races['raceNum'] = df_races['raceID'].apply(lambda id: int(id.split("/")[2][:-1]))

# df_races['Date'] = df_races['Date'].apply(lambda date: (int(date.split("-")[0]), int(date.split("-")[1]), int(date.split("-")[2])))
df_races_full = df_races.sort_values(['Date', 'raceNum']).reset_index(drop=True)

# df_races = df_races.loc[df_races['raceID'].str.contains('f24')] # filter for f24
df_races_skipper = df_races_full.loc[df_races_full['Position'].str.contains('Skipper')] # filter for skippers
df_races_crew = df_races_full.loc[df_races_full['Position'].str.contains('Crew')] # filter for skippers
people = {}

# Pre-group the data for skippers and crews
skipper_groups = df_races_skipper.groupby('Sailor')['Team'].unique()
crew_groups = df_races_crew.groupby('Sailor')['Team'].unique()

# Precompute seasons for skippers and crew
skipper_seasons = (
    df_races_skipper.assign(Season=df_races_skipper['raceID'].str.split('/').str[0])
    .groupby('Sailor')['Season']
    .unique()
)

crew_seasons = (
    df_races_crew.assign(Season=df_races_crew['raceID'].str.split('/').str[0])
    .groupby('Sailor')['Season']
    .unique()
)

# Function to add a sailor to the dictionary
def add_sailor(group, seasons_group, role):
    for sailor, teams in group.items():
        # If no teams are associated, set "Unknown"
        teams = teams if len(teams) > 0 else ["Unknown"]

        # Retrieve the precomputed seasons
        seasons = seasons_group.get(sailor, [])
        
        # Add the sailor to the people dictionary
        people[f"{sailor}/{role}"] = Sailor(sailor, teams, role, list(seasons))

# Add skippers and crew
add_sailor(skipper_groups, skipper_seasons, 'Skipper')
add_sailor(crew_groups, crew_seasons, 'Crew')

In [14]:
stype = 'Ratio'
i = 0
residuals = []

for type, df_races in zip(['/Skipper', '/Crew'], [df_races_skipper, df_races_crew]):
    grouped = df_races.groupby(['Date', 'Regatta', 'raceID'])

    for (date, regatta, race), scores in grouped:
        if i % 1000 == 0:
            print(f"Currently analyzing race {i}/{len(df_races['raceID'].unique()) * 2} Regatta:{regatta}, Date:{date}")
        i += 1

        sailors = scores['Sailor']
        if sailors.empty:
            continue

        # Recalculate global average
        globalAvg = sum([p.r.mu for p in people.values()]) / len(people)

        # Compute regatta average
        regattaAvg = sum([people[p + type].r.mu for p in sailors]) / len(sailors)
        multiplier = regattaAvg / globalAvg

        # Initialize racers and ratings
        racers = [people[p + type] for p in sailors]
        startingElos = [r.r.mu for r in racers]
        ratings = [[r.r] for r in racers]

        # Skip races with fewer than 2 participants
        if len(ratings) < 2:
            print(regatta, "did not have enough sailors??")
            continue

        # Rate using the model
        ratings = model.rate(ratings, list(scores['Score']), weights=[[multiplier]] * len(ratings))

        predictions = model.predict_rank(ratings)
        # print(predictions)
        
        for pred, score in zip(predictions, scores['Score']):
            residuals.append(score - pred[0])
        # print(list(scores['Score']))

        # Update racers' ratings
        for racer, new_rating in zip(racers, ratings):
            racer.r = new_rating[0]

        # Calculate changes
        changes = [racers[i].r.mu - startingElos[i] for i in range(len(racers))]

        # Update sailors' race data
        for idx, sailor in enumerate(sailors):
            sailor_obj = people[sailor + type]
            sailor_obj.races.append(race)
            sailor_obj.changes.append({
                'score': scores.loc[scores['Sailor'] == sailor, 'Score'].iat[0],
                'predicted': predictions[idx][0],
                'change':changes[idx],
                'regAvg':regattaAvg,
                'newRating':sailor_obj.r.mu,
                'date':date,
                'partner':scores.loc[scores['Sailor'] == sailor, 'Partner'].iat[0],
                'ratio':scores.loc[scores['Sailor'] == sailor, 'Ratio'].iat[0], 
                'venue':scores.loc[(scores['Sailor'] == sailor) & (scores['raceID'] == race), 'Venue'].iat[0],
                'raceID': race
            })
#         if i == 5:
#             break
#     if i == 5:
#         break
# print(residuals)

Currently analyzing race 0/65010 Regatta:s16/peter-wenner-rainbow-invite, Date:2016-01-16 00:00:00
Currently analyzing race 1000/65010 Regatta:s16/navy-spring, Date:2016-04-16 00:00:00
Currently analyzing race 2000/65010 Regatta:f16/lark-invitational, Date:2016-09-10 00:00:00
Currently analyzing race 3000/65010 Regatta:f16/philly-fleet-race, Date:2016-10-01 00:00:00
f16/south did not have enough races??
f16/south did not have enough races??
f16/south did not have enough races??
f16/south did not have enough races??
f16/south did not have enough races??
f16/south did not have enough races??
f16/south did not have enough races??
f16/south did not have enough races??
f16/south did not have enough races??
f16/south did not have enough races??
f16/south did not have enough races??
f16/south did not have enough races??
f16/south did not have enough races??
f16/south did not have enough races??
f16/south did not have enough races??
f16/south did not have enough races??
f16/south did not have 

In [11]:
mse = (np.array(residuals)**2).mean()
# r2 = 1 - (mse/ynew.var())
print(mse)

8.625


In [16]:
import plotly.express as px
import plotly.figure_factory as ff

# predictions = model.predict_rank([[p.r] for p in list(people.values()) if p.name in ['Carter Anderson', 'Justin Callahan', 'Blake Roberts'] and p.pos == 'Skipper'])
# print([p.name for p in list(people.values()) if p.name in ['Carter Anderson', 'Justin Callahan', 'Blake Roberts']  and p.pos == 'Skipper'])
# print(predictions)

# for raceID in list(df_races_full['raceID'].unique()):
# print(raceID)
# raceID = 'f24/open-atlantic-coast-final/1A'
raceID = "f24/mcsa-open-fall/10A"
# race = df_races_full.loc[df_races_full['raceID'] == raceID]
race = df_races_full[df_races_full['raceID'] == raceID]
df_preds3 = pd.DataFrame(columns=['pred', 'actual', 'Sailor'])
for pos in ['Skipper', 'Crew']:
    sailors = [p for p in people.values() if p.name in list(race['Sailor']) and race.loc[race['Sailor'] == p.name, 'Position'].iat[0] == pos and p.pos == pos]
    predictions = model.predict_rank([[p.r] for p in sailors])

    for pred, sailor in zip(predictions, sailors):
        score = float(race.loc[race['Sailor'] == sailor.name, 'Score'])
        # print(f"predicted: {pred[0]}, actual: {int(score)}, name: {sailor.name}")
        df_preds3.loc[len(df_preds3)] = [float(pred[0]), score, sailor.name]
    

df_preds = pd.read_csv('predsLR.csv')
df_preds2 = pd.read_csv('predsRF.csv')

df_preds3['predLR'] = df_preds['pred']
df_preds3['predRF'] = df_preds2['Predicted Score']

err = (abs(df_preds3['pred'] - df_preds3['actual'])).mean()
print(f"Mean Absolute Error for predictions in Race {raceID}: {err:.2f}")
err = (abs(df_preds3['predLR'] - df_preds3['actual'])).mean()
print(f"Mean Absolute Error for predictions in Race {raceID}: {err:.2f}")
err = (abs(df_preds3['predRF'] - df_preds3['actual'])).mean()
print(f"Mean Absolute Error for predictions in Race {raceID}: {err:.2f}")

fig = px.bar(df_preds3,x='Sailor', y=['actual', 'pred', 'predLR', 'predRF'],color_discrete_map={'actual': '#0101fd', 'pred': '#ffa500'})
fig.update_layout(width=1100, title='Model Predicted and Actual Score vs Sailor (lower is better)',
    xaxis_title='Num Teams',
    
    yaxis_title='Score',
    barmode='group')
fig.show()

  score = float(race.loc[race['Sailor'] == sailor.name, 'Score'])
  score = float(race.loc[race['Sailor'] == sailor.name, 'Score'])
  score = float(race.loc[race['Sailor'] == sailor.name, 'Score'])
  score = float(race.loc[race['Sailor'] == sailor.name, 'Score'])
  score = float(race.loc[race['Sailor'] == sailor.name, 'Score'])
  score = float(race.loc[race['Sailor'] == sailor.name, 'Score'])
  score = float(race.loc[race['Sailor'] == sailor.name, 'Score'])
  score = float(race.loc[race['Sailor'] == sailor.name, 'Score'])
  score = float(race.loc[race['Sailor'] == sailor.name, 'Score'])
  score = float(race.loc[race['Sailor'] == sailor.name, 'Score'])
  score = float(race.loc[race['Sailor'] == sailor.name, 'Score'])
  score = float(race.loc[race['Sailor'] == sailor.name, 'Score'])
  score = float(race.loc[race['Sailor'] == sailor.name, 'Score'])
  score = float(race.loc[race['Sailor'] == sailor.name, 'Score'])
  score = float(race.loc[race['Sailor'] == sailor.name, 'Score'])
  score = 

Mean Absolute Error for predictions in Race f24/mcsa-open-fall/10A: 2.00
Mean Absolute Error for predictions in Race f24/mcsa-open-fall/10A: 3.88
Mean Absolute Error for predictions in Race f24/mcsa-open-fall/10A: 3.56


In [25]:
df_elo = pd.DataFrame(columns=['Sailor', 'Teams', 'Pos', 'Elo'])

new_rows = []
for p in people.values():
    df_elo.loc[len(df_elo)] = [p.name, p.teams, p.pos, p.r.mu]
    row = {"Sailor": p.name}
    row.update(zip(p.races, [(int(c['score']), float(c['change']), float(c['regAvg']), float(c['newRating'])) for c in p.changes]))
    new_rows.append(row)

all_keys = ['Sailor'] + list(df_races['raceID'].unique())
new_df = pd.DataFrame(new_rows, columns=all_keys)

df_elo = pd.concat([df_elo.set_index('Sailor'), new_df.set_index('Sailor')], axis=1).reset_index()

# Filter sailors who have 'f24' in their seasons list
eligible_sailors = [p.name for p in people.values() if 'f24' in p.seasons]

# Filter the df_elo DataFrame to only include eligible sailors
df_elo_filtered = df_elo[df_elo['Sailor'].isin(eligible_sailors)]

# Sort the filtered df_elo by Elo and reset index
df_elo_filtered = df_elo_filtered.sort_values('Elo', ascending=False)
df_elo_filtered.reset_index(drop=True, inplace=True)

# Rank sailors within each position (Skipper/Crew) based on Elo
df_elo_filtered['Rank'] = df_elo_filtered.groupby('Pos')['Elo'].rank(method='dense', ascending=False).astype(int)

# Merge the df_elo_filtered with the sailors' data to get the rank in bulk
# df_rank = df_elo_filtered[['Sailor', 'Pos', 'Rank']]

# # Merge with people data to assign ranks in one go
# people_df = pd.DataFrame({
#     'Sailor': [p.name for p in people.values()],
#     'Pos': [p.pos for p in people.values()],
#     'rank': [None] * len(people)  # Initialize with None
# })

# Perform a merge to get the rank assigned to each sailor
# people_df = people_df.merge(df_rank, on=['Sailor', 'Pos'], how='left')

# Update the rank attribute of each sailor in the people dictionary
# for p in people.values():
    # p.rank = people_df.loc[people_df['Sailor'] == p.name, 'Rank'].values[0]


# Update rank attribute for each sailor (only for those in the filtered list)
for i,p in enumerate(people.values()):
    if i % 1000 == 0:
        print(f"{i}/{len(people.values())}")
    if p.name in eligible_sailors:
        p.rank = df_elo_filtered.loc[(df_elo_filtered['Sailor'] == p.name) & (df_elo_filtered['Pos'] == p.pos), 'Rank'].values[0]

0/17009
1000/17009
2000/17009
3000/17009
4000/17009
5000/17009
6000/17009
7000/17009
8000/17009
9000/17009
10000/17009
11000/17009
12000/17009
13000/17009
14000/17009
15000/17009
16000/17009
17000/17009


In [None]:
col = db.collection('sailorsElo')
batch = db.batch()

for i,p in enumerate(people.values()):
    batch.set(col.document(),
        {"Name":p.name, 
         "Position": p.pos,
         "Teams": list(p.teams),
         "Rating": int(p.r.mu),
         "GlobalRank": int(p.rank),
         "races": [{"raceID": raceid, 
                    "score": float(c['score']), 
                    "change": float(c['change']), 
                    'regAvg': float(c['regAvg']), 
                    'newRating': float(c['newRating']),
                    'date': c['date'],
                    'partner':c['partner'],
                    'ratio': float(c['ratio']), 
                    'venue': c['venue']
                    } for raceid, c in zip(p.races, p.changes)]})
    if i % 20 == 0: # commit every 20 documents
        batch.commit()
batch.commit()

[update_time {
   seconds: 1733634325
   nanos: 651949000
 },
 update_time {
   seconds: 1733634325
   nanos: 651949000
 },
 update_time {
   seconds: 1733634325
   nanos: 651949000
 },
 update_time {
   seconds: 1733634325
   nanos: 651949000
 },
 update_time {
   seconds: 1733634325
   nanos: 651949000
 },
 update_time {
   seconds: 1733634325
   nanos: 651949000
 },
 update_time {
   seconds: 1733634325
   nanos: 651949000
 },
 update_time {
   seconds: 1733634325
   nanos: 651949000
 },
 update_time {
   seconds: 1733634325
   nanos: 651949000
 },
 update_time {
   seconds: 1733634325
   nanos: 651949000
 },
 update_time {
   seconds: 1733634325
   nanos: 651949000
 },
 update_time {
   seconds: 1733634325
   nanos: 651949000
 },
 update_time {
   seconds: 1733634325
   nanos: 651949000
 },
 update_time {
   seconds: 1733634325
   nanos: 651949000
 },
 update_time {
   seconds: 1733634325
   nanos: 651949000
 }]

In [10]:
import requests
from bs4 import BeautifulSoup

batch = db.batch()
col = db.collection('eloTeams')
lenteams = len(list(df_races['Team'].unique()))
teams = []
bySailors = df_races_full.groupby('Sailor')

for i,team in enumerate(list(df_races['Team'].unique())):
    print(f"{i}/{lenteams} {team}")
    avg = df_elo.loc[df_elo['Teams'].apply(lambda x: team in x), 'Elo'].mean()
    region = df_races
    
    teamLink = df_races.loc[df_races['Team'] == team, 'Teamlink'].iloc[0]
    url = f"https://scores.collegesailing.org/schools/{teamLink.split("/")[2]}"
    # print(url)
    page = requests.get(url)
    teamPage = BeautifulSoup(page.content, 'html.parser')
    
    try:
        region = teamPage.find('span', class_="page-info-value").contents[0].contents[0]
        # print(team, avg, region)
    except:
        print(url)
        
    members = [{"name": p.name, 
                'pos':p.pos,
                'rating': int(p.r.mu),
                'seasons':list(df_races_full.loc[df_races_full['Sailor'] == p.name]['raceID'].str.split('/').str[0].unique()), 
                'globalrank': int(p.rank)} for p in people.values() if team in p.teams]
    
    teams.append({"name":team, "avg": avg,"region": region, "link": url})
    batch.set(col.document(),{"name":team, "avg": avg,"region": region, "link": url, 'members': members})
    if i % 20 == 0: # commit every 20 documents
            batch.commit()
batch.commit()
doc = db.collection('vars').document('eloTeams').set({"teams": teams})

0/184 Hawaii
1/184 UC Santa Barbara
2/184 UC Los Angeles
3/184 Washington
4/184 Cal Maritime
5/184 Oregon
6/184 UC Davis
7/184 Berkeley
8/184 Cal Poly
9/184 UC San Diego
10/184 Western Washington
11/184 CSU Long Beach
12/184 Channel Islands
13/184 Southern Cal
14/184 UC Santa Cruz
15/184 San Diego State
16/184 UC Irvine
17/184 Arizona State
18/184 Embry-Riddle
19/184 Rollins
20/184 Florida Tech
21/184 Texas A&M Galveston
22/184 Texas A&M
23/184 Texas
24/184 North Texas
25/184 Central Oklahoma
26/184 Lewis & Clark
27/184 Texas A&M C. Christ
28/184 Florida
29/184 Palm Beach Atlantic
30/184 New College
31/184 Georgia Tech
32/184 Georgia
33/184 North Carolina State
34/184 South Florida
35/184 Jacksonville
36/184 U. Miami
37/184 Salve Regina
38/184 Eckerd
39/184 Northwestern
40/184 Charleston
41/184 Clemson
42/184 U South Carolina
43/184 UNC Wilmington
44/184 North Carolina
45/184 Duke
46/184 Auburn
47/184 Georgetown
48/184 Navy
49/184 Old Dominion
50/184 Michigan
51/184 Maine Maritime
52/1

In [11]:
topSkippers = []
for p in sorted([p for p in people.values() if p.rank <= 100 and p.pos == 'Skipper'],key=lambda p: p.rank):
    topSkippers.append({'name': p.name, 'rank': int(p.rank), 'pos': p.pos, 'team': list(p.teams), 'rating': p.r.mu, 'seasons': list(p.seasons)})
topCrews = []
for p in sorted([p for p in people.values() if p.rank <= 100 and p.pos == 'Crew'],key=lambda p: p.rank):
    topCrews.append({'name': p.name, 'rank': int(p.rank), 'pos': p.pos, 'team': list(p.teams), 'rating': p.r.mu, 'seasons': list(p.seasons)})
doc = db.collection('vars').document('topSailors').set({"skippers": topSkippers, "crews": topCrews})

In [42]:
len(list(df_races_full['Regatta'].unique()))

1723

In [44]:
for regatta in list(df_races_full['Regatta'].unique())[704:]:
    races = df_races_full[df_races_full['Regatta'] == regatta]
    raceIDs = list(races['raceID'].unique())
    sailors = races['Sailor'].unique()
    
    racePpl = [{"Name":p.name, 
         "Position": p.pos,
         "Teams": list(p.teams),
         "Rating": int(p.r.mu),
         "GlobalRank": int(p.rank),
         "races": [{"raceID": raceid, 
                    "score": float(c['score']), 
                    "predicted": int(c['predicted']), 
                    "change": float(c['change']), 
                    'regAvg': float(c['regAvg']), 
                    'newRating': float(c['newRating']),
                    'date': c['date'],
                    'partner':c['partner'],
                    'ratio': float(c['ratio']),
                    'venue': c['venue']
                    } for raceid, c in zip(p.races, p.changes) if raceid.split("/")[0] + "/" +raceid.split("/")[1] == regatta]
        } for p in people.values() if p.name in sailors]
    
    # race = {'raceID':'', 'raceNum':0, 'div': '', 'sailors':[]}
    # person = {'name':'', 'rating':0, 'change':0, 'team': '', 'pos': '', 'div':'', 'partner': ''}
    
    # for race in races['raceID'].unique():
    #     sailors = races[races['raceID'] == race, 'Sailor'].unique()
    # for p in [p for p in people.values() if p.name in sailors]:
        # racePpl.append({'name':p.name, 'rating':p.rating, 'changes':p.changes, 'team': p.team, 'pos': '', 'div':'', 'partner': ''})
    # print(regatta)
    db.collection('eloRegattas').document().set({'regattaName': regatta,'raceIDs':raceIDs, 'sailors': racePpl}, timeout=10)

In [7]:
df_elo.to_csv("elo19.csv",index=False)