In [10]:
import pandas as pd
import joblib

In [11]:
def extract_year(df):
    df['year'] = df['gameID'].str[:4]
    return df

def calculate_completions_and_attempts(df):
    df['completion'] = df['turnover'].apply(lambda x: 1 if x == 0 else 0)
    df['attempt'] = 1  
    return df

def calculate_yearly_stats(df):
    yearly_stats = df.groupby('year').agg(
        completions=('completion', 'sum'),
        attempts=('attempt', 'sum')
    ).reset_index()
    yearly_stats['completion_percent'] = (yearly_stats['completions'] / yearly_stats['attempts']) * 100
    return yearly_stats

def calculate_completions_per_game(df):
    completions_per_game = df.groupby(['year', 'gameID']).agg(completions_per_game=('completion', 'sum')).reset_index()
    completions_per_game_avg = completions_per_game.groupby('year').agg(
        completions_per_game_avg=('completions_per_game', 'mean')
    ).reset_index()
    return completions_per_game_avg

def calculate_completions_per_possession(df):
    completions_per_possession = df.groupby(['year', 'gameID', 'home_team_score', 'away_team_score', 'possession_num']).agg(
        completions_per_possession=('completion', 'sum')
    ).reset_index()
    completions_per_possession_avg = completions_per_possession.groupby('year').agg(
        completions_per_possession_avg=('completions_per_possession', 'mean')
    ).reset_index()
    return completions_per_possession_avg

def calculate_average_scores(df):
    # Calculate average scores for each game
    avg_scores = df.groupby('gameID').agg(
        winning_team_score=('home_team_score', 'max'),
        losing_team_score=('away_team_score', 'max')
    ).reset_index()
    avg_scores['year'] = avg_scores.gameID.apply(lambda x:x[:4])
    # Calculate yearly averages
    yearly_avg_scores = avg_scores.groupby('year').agg(
        avg_winning_score=('winning_team_score', 'mean'),
        avg_losing_score=('losing_team_score', 'mean')
    ).reset_index()
    yearly_avg_scores.rename(columns={yearly_avg_scores.columns[0]: 'year'}, inplace=True)

    # Calculate overall averages
    overall_avg_scores = avg_scores.groupby(['winning_team_score', 'losing_team_score']).agg(
        avg_winning_score=('winning_team_score', 'mean'),
        avg_losing_score=('losing_team_score', 'mean')
    ).reset_index()

    return yearly_avg_scores, overall_avg_scores

def calculate_summary_stats(df):
    # Create a summary DataFrame for yearly stats
    yearly_stats = df.groupby('year').agg(
        Total_Throws=('attempt', 'sum'),
        Number_of_Games=('gameID', 'nunique'),
        Number_of_Goals=('receiver_y', lambda x: (x > 100).sum()),
        Number_of_Turnovers=('turnover', 'sum'),
        Unique_Players=('thrower', lambda x: len(set(x).union(df.loc[x.index, 'receiver'])))  # Unique throwers and receivers
    ).reset_index()

    # Calculate overall stats
    overall_stats = {
        'year': 'Overall',
        'Total_Throws': df['attempt'].sum(),
        'Number_of_Games': df['gameID'].nunique(),
        'Number_of_Goals': (df['receiver_y'] > 100).sum(),
        'Number_of_Turnovers': df['turnover'].sum(),
        'Unique_Players': len(set(df['thrower']).union(df['receiver']))  # Unique throwers and receivers
    }

    # Append overall stats to yearly stats
    overall_stats_df = pd.DataFrame([overall_stats])
    final_summary = pd.concat([yearly_stats, overall_stats_df], ignore_index=True)

    return final_summary


In [12]:
df = joblib.load('../data/processed/data_1003.jblb')['df']

df = extract_year(df)
df = calculate_completions_and_attempts(df)

yearly_stats = calculate_yearly_stats(df)
completions_per_game_avg = calculate_completions_per_game(df)
completions_per_possession_avg = calculate_completions_per_possession(df)
yearly_avg_scores, avg_scores = calculate_average_scores(df)

# Merge stats
yearly_stats = yearly_stats.merge(completions_per_game_avg, on='year', how='left')
yearly_stats = yearly_stats.merge(completions_per_possession_avg, on='year', how='left')
yearly_stats = yearly_stats.merge(yearly_avg_scores, on='year', how='left')

# Overall statistics
overall_stats = pd.DataFrame({
    'year': ['Overall'],
    'completions': [df['completion'].sum()],
    'attempts': [df['attempt'].sum()],
    'completion_percent': [(df['completion'].sum() / df['attempt'].sum()) * 100],
    'completions_per_game_avg': [completions_per_game_avg['completions_per_game_avg'].mean()],
    'completions_per_possession_avg': [completions_per_possession_avg['completions_per_possession_avg'].mean()],
    'avg_winning_score': [avg_scores['avg_winning_score'].mean()],
    'avg_losing_score': [avg_scores['avg_losing_score'].mean()]
})
final_stats = pd.concat([yearly_stats, overall_stats], ignore_index=True)

summary_stats = calculate_summary_stats(df)


In [13]:
final_stats

Unnamed: 0,year,completions,attempts,completion_percent,completions_per_game_avg,completions_per_possession_avg,avg_winning_score,avg_losing_score
0,2021,55782,60007,92.959155,453.512195,9.375126,18.650407,18.276423
1,2022,74226,79795,93.020866,478.877419,9.060791,21.187097,19.890323
2,2023,69115,74155,93.203425,476.655172,9.921763,19.103448,17.772414
3,2024,71864,76869,93.488923,475.92053,10.294227,18.417219,17.298013
4,Overall,270987,290826,93.178395,471.241329,9.662977,18.836879,17.978723


In [14]:
summary_stats

Unnamed: 0,year,Total_Throws,Number_of_Games,Number_of_Goals,Number_of_Turnovers,Unique_Players
0,2021,60007,123,5785,4225,4272
1,2022,79795,155,8124,5569,5527
2,2023,74155,145,6723,5040,5131
3,2024,76869,151,6817,5005,5119
4,Overall,290826,574,27449,19839,18464
