In [1]:
import pandas as pd
from datetime import datetime

In [2]:
import re 

def clean_string(s):
    """
    Removes whitespace and non-alphanumeric characters from a string.
    
    Parameters:
    s (str): The input string to clean.

    Returns:
    str: A cleaned string with only alphanumeric characters.
    """
    if isinstance(s, str):
        return re.sub(r'\W+', '', s).lower()  # Removes non-alphanumeric characters and whitespace
    else:
        return ''  # Handle non-string cases, e.g., None or NaN


In [3]:
result_cols = ["bib", "racer_id", "run1", "run2"]
results1 = pd.read_csv('2024/240114FnGResultsBoth.csv', header=None, names=result_cols)
#results2 = pd.read_csv('2024/240121FnGResultsBoth.csv', header=None, names=result_cols)
#results3 = pd.read_csv('2024/240211FnGResultsBoth.csv', header=None, names=result_cols)
#results4 = pd.read_csv('2024/240218FnGResultsBoth.csv', header=None, names=result_cols)
#results1.columns = ["Bib", "Discipline", "racer_id", "Tier", "Club"]

In [4]:
startList_cols_keep = ["bib", "CAT", "name", "tier", "team"]
startList = pd.read_csv('raw/24FnGStartList.csv')[startList_cols_keep]
startList.columns = ["bib", "discipline", "racer_id", "tier", "team"]

In [5]:
startList.head(2)

Unnamed: 0,bib,discipline,racer_id,tier,team
0,1,SKI,Jennifer Hsiung,1,Mitch Perreault
1,2,SKI,Maurice Cacho,1,Will Carter


In [6]:
startList.tail(2)

Unnamed: 0,bib,discipline,racer_id,tier,team
98,122,SNBD,Kevin Kilmer Choi,13,Adam Grossman
99,123,SNBD,Bernard Oegema,13,Don French


In [7]:
results1.head(2)

Unnamed: 0,bib,racer_id,run1,run2
0,2,Maurice Cacho,34.16,32.71
1,4,Justin Rosenberg,35.81,33.66


In [8]:
# Clean up whitespace in Bib and Name
join_keys = ["bib", "racer_id"]
results1["racer_id"] = results1["racer_id"].apply(clean_string)
#results2["racer_id"] = results2["racer_id"].apply(clean_string)
#results3["racer_id"] = results3["racer_id"].apply(clean_string)
#results4["racer_id"] = results4["racer_id"].apply(clean_string)
startList["racer_id"] = startList["racer_id"].apply(clean_string)

In [9]:
results1.head(2)

Unnamed: 0,bib,racer_id,run1,run2
0,2,mauricecacho,34.16,32.71
1,4,justinrosenberg,35.81,33.66


In [10]:
combined1 = startList.merge(results1, how="left", on=join_keys)
#combined2 = startList.merge(results2, how="left", on=join_keys)
#combined3 = startList.merge(results3, how="left", on=join_keys)
#combined4 = startList.merge(results4, how="left", on=join_keys)

In [13]:
def calculate_points_corrected(df):
    df.replace({'DNF': 9998,"DSQ": 9998, "DNS": 9999, pd.NA: 9999}, inplace=True)
    df['run1'] = pd.to_numeric(df['run1']) # errors='coerce')
    df['run2'] = pd.to_numeric(df['run2']) # errors='coerce')
    df['best_time'] = df[['run1', 'run2']].min(axis=1)
    # Initialize a column for points
    df['points'] = 0
    
    # Process each tier
    for tier in df['tier'].unique():
        # Filter the tier
        tier_df = df[df['tier'] == tier]
    
        # Sort by best_time
        tier_sorted = tier_df.sort_values(by='best_time')
    
        # Assign points based on the number of racers in the tier
        num_racers = len(tier_sorted)
        tier_sorted['points'] = range(8, 8-num_racers, -1)
    
        # Set absent racer points to zero:
        tier_sorted.loc[tier_sorted['best_time'] == 9999, 'points'] = 0
        
        # Handle ties for DNF (split points for racers with 9998 as their best_time)
        dnf_racers = tier_sorted[tier_sorted['best_time'] == 9998]
        if not dnf_racers.empty:
            dnf_points = dnf_racers['points'].sum() / len(dnf_racers)
            tier_sorted.loc[tier_sorted['best_time'] == 9998, 'points'] = dnf_points
    
        # Update the main dataframe
        df.loc[df['tier'] == tier, 'points'] = tier_sorted['points']

    # Calculate team points
    teams = df['team'].unique()
    team_dfs = []
    team_points = {}
    for team in teams:
        team_df = df[df['team'] == team]
        # If a team is missing a racer in a tier, give them the average points of that tier
        for tier in df['tier'].unique():
            if tier not in team_df['tier'].values:
                #avg_points = df[df['tier'] == tier]['points'].mean()
                #team_df.loc[len(team_df)] = [pd.NA, pd.NA, tier, team, pd.NA, pd.NA, pd.NA, avg_points]
                team_df.loc[-1] = [pd.NA, pd.NA, pd.NA, tier, team, pd.NA, pd.NA, pd.NA, 4.5] #, avg_points]
                team_df.reset_index(drop=True, inplace=True)
        team_points[team] = team_df['points'].sum()
        team_dfs.append(team_df)
    
    df_out = pd.concat(team_dfs)
    
    return team_points, df_out
    

In [14]:
combined1.head(2)

Unnamed: 0,bib,discipline,racer_id,tier,team,run1,run2
0,1,SKI,jenniferhsiung,1,Mitch Perreault,,
1,2,SKI,mauricecacho,1,Will Carter,34.16,32.71


In [22]:
combined1.shape

(100, 9)

In [15]:
team_points1, points = calculate_points_corrected(combined1)
#team_points2, points2 = calculate_points_corrected(combined2)
#team_points3, points3 = calculate_points_corrected(combined3)
#team_points4, points4 = calculate_points_corrected(combined4)

  team_df.loc[-1] = [pd.NA, pd.NA, pd.NA, tier, team, pd.NA, pd.NA, pd.NA, 4.5] #, avg_points]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df.loc[-1] = [pd.NA, pd.NA, pd.NA, tier, team, pd.NA, pd.NA, pd.NA, 4.5] #, avg_points]
  team_df.loc[-1] = [pd.NA, pd.NA, pd.NA, tier, team, pd.NA, pd.NA, pd.NA, 4.5] #, avg_points]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df.loc[-1] = [pd.NA, pd.NA, pd.NA, tier, team, pd.NA, pd.NA, pd.NA, 4.5] #, avg_points]
  team_df.loc[-1] = [pd.NA, pd.NA, pd.NA, tier, team, pd.NA, pd.NA, pd.NA, 4.5] #, avg_points]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://

# Alrighty! things are finally looking good, 
now time to upload results, in a way that makes this an easy to use function:
1. using 

In [11]:
points.to_csv("240114_scores_groupbyClub.csv")
points.sort_values(["Tier", 'Points'], ascending=False).to_csv("240114_scores_groupbyTier.csv")

points2.to_csv("240121_scores_groupbyClub.csv")
points2.sort_values(["Tier", 'Points'], ascending=False).to_csv("240121_scores_groupbyTier.csv")

points3.to_csv("240211_scores_groupbyClub.csv")
points3.sort_values(["Tier", 'Points'], ascending=False).to_csv("240211_scores_groupbyTier.csv")

points4.to_csv("240218_scores_groupbyClub.csv")
points4.sort_values(["Tier", 'Points'], ascending=False).to_csv("240218_scores_groupbyTier.csv")

In [21]:
for i in range (13, 0, -1):
    assert points[points.tier == i].sort_values(["tier", 'points'], ascending=False).shape == (8,9)
    #assert points2[points2.Tier == i].sort_values(["Tier", 'Points'], ascending=False).shape == (8,9)
    #assert points3[points3.Tier == i].sort_values(["Tier", 'Points'], ascending=False).shape == (8,9)
    #assert points4[points4.Tier == i].sort_values(["Tier", 'Points'], ascending=False).shape == (8,9)

In [11]:
to_drop = ["Bib", "Name", "Tier", "Club"]
to_rename = ["Run1", "Run2", "Best_Time", "Points"]
for col in to_rename:
    points.rename(columns={col: f"Race_1_{col}"},inplace=True)
    points2.rename(columns={col: f"Race_2_{col}"},inplace=True)
    points3.rename(columns={col: f"Race_3_{col}"},inplace=True)
    points4.rename(columns={col: f"Race_4_{col}"},inplace=True)
    

In [12]:
points.index = points.Club + points.Tier.astype(str)
points2.index = points2.Club + points2.Tier.astype(str)
points3.index = points3.Club + points3.Tier.astype(str)
points4.index = points4.Club + points4.Tier.astype(str)

points2.drop(columns=to_drop, inplace=True)
points3.drop(columns=to_drop, inplace=True)
points4.drop(columns=to_drop, inplace=True)

In [13]:
combined_points = pd.concat([points, points2, points3, points4], axis=1)

In [14]:
combined_points['sum_top_3_finishes'] = combined_points.apply(lambda row: sum(sorted(row[['Race_1_Points', 'Race_2_Points', 'Race_3_Points', 'Race_4_Points']], reverse=True)[:3]), axis=1)

In [16]:
combined_points

Unnamed: 0,Bib,Name,Tier,Club,Race_1_Run1,Race_1_Run2,Race_1_Best_Time,Race_1_Points,Race_2_Run1,Race_2_Run2,...,Race_2_Points,Race_3_Run1,Race_3_Run2,Race_3_Best_Time,Race_3_Points,Race_4_Run1,Race_4_Run2,Race_4_Best_Time,Race_4_Points,sum_top_3_finishes
Mitch Perreault1,1,JenniferHsiung,1,Mitch Perreault,9999.0,9999.0,9999.0,0.0,39.63,35.94,...,2.0,54.9,55.54,54.9,3.0,54.02,52.8,52.8,2.0,7.0
Mitch Perreault2,13,KevinBrown,2,Mitch Perreault,34.17,9999.0,34.17,4.0,31.36,9999.0,...,6.0,9998.0,49.98,49.98,8.0,47.47,48.49,47.47,7.0,21.0
Mitch Perreault3,22,RichardFridman,3,Mitch Perreault,33.05,31.37,31.37,6.0,9999.0,9999.0,...,0.0,9998.0,44.21,44.21,6.0,44.94,45.01,44.94,7.0,19.0
Mitch Perreault4,30,MitchPerreault,4,Mitch Perreault,31.73,9999.0,31.73,6.0,29.59,29.82,...,4.0,40.79,41.05,40.79,7.0,43.81,43.93,43.81,7.0,20.0
Mitch Perreault5,38,SheriRamshaw,5,Mitch Perreault,32.88,31.84,31.84,5.0,29.44,30.47,...,4.0,9998.0,46.75,46.75,4.0,44.06,45.21,44.06,7.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Graham Ramshaw 10,74,GrahamRamshaw,10,Graham Ramshaw,9998.0,25.92,25.92,3.0,24.34,24.54,...,3.0,31.63,9998.0,31.63,4.0,37.89,37.88,37.88,4.0,11.0
Graham Ramshaw 11,103,MarkSandell,11,Graham Ramshaw,48.02,43.83,43.83,7.0,48.53,9999.0,...,6.0,52.81,47.41,47.41,6.0,9999.0,9999.0,9999.0,0.0,19.0
Graham Ramshaw 12,111,SheriRamshaw,12,Graham Ramshaw,50.37,51.92,50.37,3.0,50.38,49.19,...,5.0,51.74,50.75,50.75,5.0,32.89,33.73,32.89,6.0,16.0
Graham Ramshaw 13,121,GrahamRamshaw,13,Graham Ramshaw,44.74,42.34,42.34,6.0,44.85,50.82,...,3.0,45.46,9998.0,45.46,4.0,28.53,28.58,28.53,6.0,16.0


In [24]:
combined_points.groupby("Club").agg({"sum_top_3_finishes": "sum"}).sort_values("sum_top_3_finishes", ascending=False).to_csv("foo.csv")

In [23]:
combined_points.to_csv("Combined_Points.csv")