In [1]:
import pandas as pd
import numpy as np
from os import listdir
from os import path
from typing import Union


def _compile_ranks(folder_path: str) -> pd.DataFrame:
    """ 
    Compiles all daily rankings csvs into one df.

    Parameters
    ----------
    folder_path
        Full path to the folder.

    Returns
    -------
    DataFrame
        Day/Player level data from every day in the
        folder path.
    """

    files = listdir(folder_path)

    dfs = []
    for file in files:
        if file[:15] == 'df_player_ranks':
            full_path = path.join(folder_path, file)

            df = pd.read_csv(full_path)

            dfs.append(df)

    df = pd.concat(dfs)

    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year

    df['player_key'] = df['player'] \
                    + ' - ' + df['date'].astype('str') \
                    + ' - ' + df['adp'].astype('str')   

    # Required to differentiate from derived rank
    df.rename(columns={'rank': 'rank_actual'}, inplace=True)

    return df


def read_drafts(folder_path: str, years: list) -> pd.DataFrame:
    """ 
    Compiles each year of draft data into one df.

    Parameters
    ----------
    folder_path
        Full path to the main data folder.
    years
        The years of draft data to pull in.
        Note that each year is a folder within the folder_path.

    Returns
    -------
    DataFrame
        Draft/Pick level draft data from every day in the
        folder path.
    """

    dfs = []
    for year in years:
        df_drafts = pd.read_csv(path.join(folder_path, f'{year}/df_drafts.csv'))
        df_info = pd.read_csv(path.join(folder_path, f'{year}/df_league_info.csv'))

        df_info = df_info[['id', 'source', 'title']]

        rename_vars = {'id': 'draft_id', 'source': 'draft_source', 'title': 'draft_title'}
        df_info.rename(columns=rename_vars, inplace=True)

        df = pd.merge(df_drafts, df_info, how='left', on='draft_id')
        dfs.append(df)

    df = pd.concat(dfs)
    df['full_name'] = df['first_name'] + ' ' + df['last_name']

    drop_vars = ['id', 'pick_slot_id', 'points', 'projection_points'
                , 'swapped', 'player_id', 'first_name', 'last_name']
    df.drop(columns=drop_vars, inplace=True)

    # Noticed a draft being duplicated.
    df.drop_duplicates(inplace=True)

    return df


def read_ranks(folder_path: str, years: list) -> pd.DataFrame:
    """ 
    Compiles each year of rankings data into one df.

    Note
    ----
    Some years will contain *actual* rankings (i.e. the true ranking
    of each player on a particular day) while others will used derived
    rankings (i.e. determined by the adp of each player from a draft for 
    a day).

    Parameters
    ----------
    folder_path
        Full path to the main data folder.
    years
        The years of draft data to pull in.
        Note that each year is a folder within the folder_path.

    Returns
    -------
    DataFrame
        Day/Player level rankings data from every day in the
        folder path.
    """

    dfs = []
    for year in years:
        df = _compile_ranks(path.join(folder_path, f'{year}/player_ranks'))
        dfs.append(df)
    
    df = pd.concat(dfs)

    # type field only available if a custom ranks file is created.
    try:
        df['type'] = df['type'].fillna('actual')
    except:
        df['type'] = 'actual'

    return df


def read_lookups(folder_path: str, years: list) -> pd.DataFrame:
    """ 
    Compiles each year of lookups data into one df.
    This is necessary to map each unique player from the 
    draft data to the rankings data.

    Parameters
    ----------
    folder_path
        Full path to the main data folder.
    years
        The years of draft data to pull in.
        Note that each year is a folder within the folder_path.

    Returns
    -------
    DataFrame
        Year/Lookup Type level data from year.
    """

    dfs = []
    for year in years:
        df = pd.read_csv(path.join(folder_path, f'{year}/lookups.csv'))
        dfs.append(df)
    
    df = pd.concat(dfs)

    return df


def update_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    Updates draft data columns to more appropriate dyptes. 

    Parameters
    ----------
    df
        Raw draft data.

    Returns
    -------
    DataFrame
        Draft/Pick level draft data.
    """
    
    # Replace null adps and update to float.
    df['projection_adp'] = np.where(df['projection_adp'] == '-', 216, df['projection_adp'])
    df['projection_adp'] = df['projection_adp'].astype('float')

    # Update created_at to datetime to use as possible filter.
    df['created_at'] = pd.to_datetime(df['created_at'], infer_datetime_format=True)

    return df


def drafts_w_player_data(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    Filters out drafts which do not have player attributes (team, position, etc.)
    as these will likely serve as features for the model.

    Parameters
    ----------
    df
        Draft data processed through update_dtypes.

    Returns
    -------
    DataFrame
        Draft/Pick level draft data.
    """

    df = df.copy()

    null_drafts = df.loc[df['full_name'].isnull()]

    null_drafts = null_drafts.drop_duplicates(subset='draft_id')['draft_id'].to_frame()
    null_drafts['ind_null_name_draft'] = 1

    df = pd.merge(df, null_drafts, on='draft_id', how='left')
    df = df.loc[df['ind_null_name_draft'].isnull()]

    df.drop(columns='ind_null_name_draft', inplace=True)

    return df


def _add_draft_dt(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    Filters out drafts which do not have player attributes (team, position, etc.)
    as these will likely serve as features for the model.

    Parameters
    ----------
    df
        Draft data processed through update_dtypes.

    Returns
    -------
    DataFrame
        Draft/Pick level draft data.
    """
    
    df_drafts = df[['draft_id', 'created_at']].copy()
    df_drafts.sort_values(by=['draft_id', 'created_at'], inplace=True)

    df_drafts.drop_duplicates(subset='draft_id', keep='first', inplace=True)
    df_drafts.rename(columns={'created_at': 'draft_datetime'}, inplace=True)

    df_drafts['draft_date'] = df_drafts['draft_datetime'].dt.normalize()
    df_drafts['draft_year'] = df_drafts['draft_datetime'].dt.year

    df = pd.merge(df, df_drafts, on='draft_id', how='left')

    return df


def add_draft_attrs(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    Adds draft level attributes to the df.

    Parameters
    ----------
    df
        Draft data processed through update_dtypes.

    Returns
    -------
    DataFrame
        Draft/Pick level draft data.
    """

    # Adds number of teams by draft.
    by_vars = ['draft_id', 'draft_entry_id']
    draft_teams = df[by_vars].drop_duplicates(subset=by_vars)

    num_teams = draft_teams.groupby('draft_id').size().to_frame('num_teams')

    df = pd.merge(df, num_teams, on='draft_id', how='left')

    # Adds round and pick of the round by draft.
    df['round'] = ((df['number'] - 1) / df['num_teams']).astype('int') + 1
    df['round_pick'] = df['number'] - ((df['round'] - 1) * df['num_teams'])

    # Add datetime, date, and year of draft and year.
    df = _add_draft_dt(df)

    return df


def add_draft_rank_type(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    This adds a rank_type column to the df which is required to determine
    if the date in df needs to be adjusted to map to the rankings used
    for the draft (see add_ranks_lookups function).

    Note
    ----
    IMPORTANT: The logic for this will need updated if more custom
    ranks are ever added.

    NO LONGER NEEDED - Custom ranks also need to be offset by one
    to prevent a player/day from having multiple adps.

    Parameters
    ----------
    df
        Draft data processed through add_draft_attrs.

    Returns
    -------
    DataFrame
        Draft/Pick level draft data.
    """

    df['ranks_type'] = np.where(df['draft_year'] == 2021, 'custom', 'actual')

    return df


def _add_lookup_vals(df_base: pd.DataFrame, df_lookups: pd.DataFrame, lookup_type: str
                    , join_col_name: str, final_col_name: str) -> pd.DataFrame:
    """ 
    Adds the ranks_val from the df_lookups dataset to df_base based off the lookup_type
    and updates its name to the final_col_name.
    Point of this is for the player attributes in the drafts data to align with those
    in the ranks data.

    Note
    ----
    IMPORTANT: If other years ever end up being added, they must be all be found on
    the df passed to df_lookups. Otherwise, only the last year's values will be shown.

    Parameters
    ----------
    df_base
        Draft data processed through add_draft_attrs.
    df_lookups
        Lookups data pulled from read_lookups.
    lookup_type
        Value from lookup_type field in df_lookups which indicates what will be mapped
        between the draft and rankings data.
        This can be "player", "team", or "position.
    join_col_name
        Name of the field from df_base that will be mapped to the rankings data.
    final_col_name
        Name of the column created.

    Returns
    -------
    DataFrame
        Draft/Pick level draft data.
    """

    df_base = df_base.copy()
    df_lookups = df_lookups.loc[df_lookups['lookup_type'] == lookup_type].copy()

    df = pd.merge(df_base, df_lookups, how='left'
                , left_on=['draft_year', join_col_name]
                , right_on=['draft_year', 'drafts_val'])

    df.drop(columns=['lookup_type', 'drafts_val'], inplace=True)
    df.rename(columns={'ranks_val': final_col_name}, inplace=True)

    return df


def _add_rank_draft_date(df: pd.DataFrame, hour_thresh: int=5) -> pd.DataFrame:
    """ 
    Adds a date col which aligns with the rankings that were used for the draft
    and accounts for early morning drafts which use rankings from the prior day.

    Parameters
    ----------
    df
        Draft data processed through add_draft_attrs.
    hour_thresh
        Last hour of the day that the date will be shifted
        back by a day.

    Returns
    -------
    DataFrame
        Draft/Pick level draft data.
    """

    date_change_filter = (df['draft_datetime'].dt.hour <= hour_thresh)
    df['ranks_draft_date'] = np.where(date_change_filter
                                    , df['draft_date'] - pd.Timedelta(days=1)
                                    , df['draft_date'])

    return df


def add_ranks_lookups(df: pd.DataFrame, df_lookups: pd.DataFrame) -> pd.DataFrame:
    """ 
    Adds the lookups required to map to the ranks df.

    Note
    ----
    Passed df must contain draft_year.

    Parameters
    ----------
    df
        Draft data processed through add_draft_attrs.
    df_lookups
        Lookups data pulled from read_lookups.

    Returns
    -------
    DataFrame
        Draft/Pick level draft data.
    """

    df = _add_lookup_vals(df, df_lookups, 'player', 'full_name', 'drafted_player')
    df = _add_lookup_vals(df, df_lookups, 'team', 'team_name', 'drafted_team')
    df = _add_lookup_vals(df, df_lookups, 'position', 'position', 'drafted_position')

    df.drop(columns=['full_name', 'team_name', 'position'], inplace=True)

    # Draft date appears to be offset by a day relative to the ranks
    # for early morning drafts (or at least those with that timestamp).
    df = _add_rank_draft_date(df, hour_thresh=5)

    # Ranks data will be stacked with derived ranks from drafts w/o ranks data.
    # This will allow those drafts to link back to the stacked ranks data.
    df['drafted_player_key'] = df['drafted_player'] \
                    + ' - ' + df['ranks_draft_date'].astype('str') \
                    + ' - ' + df['projection_adp'].astype('str') 

    return df


def _expand_draft(df_draft: pd.DataFrame, df_ranks: pd.DataFrame,
                num_players: int) -> pd.DataFrame:
    """ 
    Expands the draft data so that each pick is represented by the top 
    number of num_picks players left on the board and creates the data 
    level that is necessary for modeling.

    Parameters
    ----------
    df_draft
        Draft data for ONE draft processed through add_ranks_lookups.
    df_lookups
        Rankings data pulled from read_ranks.
    num_players
        Number of the top available players that will be expanded
        for each pick (e.g. if num_players = 40, then the row count
        will increase by a factor of 40).

    Returns
    -------
    DataFrame
        Draft/Pick/Available Player level draft data.
    """

    # Lagged value used to build list of players already selected for each pick.
    keep_vars = ['draft_id', 'ranks_draft_date', 'number', 'drafted_player_key']
    df = df_draft[keep_vars].sort_values(by='number')
    df['drafted_player_key_l1'] = df['drafted_player_key'].shift(1)

    # Required to pull the ranks used for the draft.
    draft_date = df['ranks_draft_date'].iloc[0].strftime('%Y-%m-%d')

    keep_vars = ['player_key', 'rank_actual', 'team', 'pos', 'adp']
    rename_vars = {'rank_actual': 'avail_rank_actual', 'team': 'avail_team'
                    , 'pos': 'avail_position', 'adp': 'avail_projection_adp'}    
    _df_ranks = df_ranks.loc[df_ranks['date'] == draft_date][keep_vars]
    _df_ranks.rename(columns=rename_vars, inplace=True)

    # Loops through each individual player selection.
    zipped_cols = zip(df['draft_id'], df['drafted_player_key'], df['drafted_player_key_l1'])
    selections = []
    dfs = []
    for draft, player, player_l1 in zipped_cols:
        selections.append(player_l1)

        top_x_players = _df_ranks.loc[~_df_ranks['player_key'].isin(selections)].iloc[:num_players]
        top_x_players.rename(columns={'player_key': 'avail_player_key'}, inplace=True)
        
        # Expands player selection row by the top num_picks available players.
        _df = pd.DataFrame([[draft, player]], columns=['draft_id', 'drafted_player_key'])
        _df = pd.merge(_df, top_x_players, how='cross')

        dfs.append(_df)

    keep_vars = ['drafted_player_key', 'avail_player_key', 'avail_rank_actual'
                , 'avail_team', 'avail_position', 'avail_projection_adp']
    df_expanded = pd.concat(dfs)[keep_vars]

    df_draft = pd.merge(df_expanded, df_draft, on='drafted_player_key', how='left')

    return df_draft


def expand_all_drafts(df: pd.DataFrame, df_ranks: pd.DataFrame
                    , num_players: int) -> pd.DataFrame:
    """ 
    Expands each individual draft to the selected player/available player
    level with final df at the draft/selected player/available player level.

    Parameters
    ----------
    df_draft
        Draft data for ALL drafts processed through add_ranks_lookups.
    df_lookups
        Rankings data pulled from read_ranks.
    num_players
        Number of the top available players that will be expanded
        for each pick (e.g. if num_players = 40, then the row count
        will increase by a factor of 40).

    Returns
    -------
    DataFrame
        Draft/Pick/Available Player level draft data.
    """

    drafts = []
    draft_ids = list(df['draft_id'].drop_duplicates())
    for draft_id in draft_ids:
        df_draft = df.loc[df['draft_id'] == draft_id].copy()
        df_draft = _expand_draft(df_draft, df_ranks, num_players)

        drafts.append(df_draft)

    df_all_drafts = pd.concat(drafts)

    return df_all_drafts


def _add_avail_player_number(df_expanded: pd.DataFrame) -> pd.DataFrame:
    """ 
    Adds the pick each available player was actually drafted at.
    This will be used to determine if the player was available
    in the next round for the user.

    Parameters
    ----------
    df_expanded
        Draft data for ALL drafts processed through expand_all_drafts.

    Returns
    -------
    DataFrame
        Draft/Pick/Available Player level draft data.
    """

    keep_vars = ['draft_id', 'drafted_player_key', 'number']
    df_drafted_players = df_expanded[keep_vars].drop_duplicates()

    rename_vars = {'number': 'avail_number', 'drafted_player_key': 'avail_player_key'}
    df_drafted_players.rename(columns=rename_vars, inplace=True)

    df = pd.merge(df_expanded, df_drafted_players
                    , on=['draft_id', 'avail_player_key']
                    , how='left')

    return df


def _add_next_pick_number(df_expanded: pd.DataFrame) -> pd.DataFrame:
    """ 
    Adds the pick number of the next time the user will draft.
    Used to determine if the player was available in the next round.

    Note
    ----
    Picks at the turn use the pick following the next since the user
    will also be drafting back to back.

    Each user's last pick will be NULL, so these are filtered out.
    The second to last pick of the user drafting last will also removed
    since this is a turn pick.

    Parameters
    ----------
    df_expanded
        Draft data for ALL drafts processed through expand_all_drafts.

    Returns
    -------
    DataFrame
        Draft/Pick/Available Player level draft data.
    """

    df = df_expanded[['draft_id', 'draft_entry_id', 'number']].drop_duplicates()
    df.sort_values(by=['draft_id', 'draft_entry_id', 'number'], inplace=True)

    df['number_rl1'] = df['number'].shift(-1)
    df['number_rl2'] = df['number'].shift(-2)

    # Accounts for picks at the turn.
    df['next_pick_number'] = np.where(df['number_rl1'] - df['number'] == 1
                                    , df['number_rl2']
                                    , df['number_rl1'])

    # Fills picks pulled from other users/drafts with null values.
    df['next_pick_number'] = np.where(df['next_pick_number'] - df['number'] < 0
                                    , np.nan
                                    , df['next_pick_number'])

    df = pd.merge(df_expanded, df[['draft_id', 'number', 'next_pick_number']]
                    , on=['draft_id', 'number'], how='left')

    df = df.loc[df['next_pick_number'].notna()]

    return df


def add_picked_indicator(df_expanded: pd.DataFrame) -> pd.DataFrame:
    """ 
    Adds a flag indicating if the available player was available in 
    the next round.

    This will serve as the response variable for the model.

    Parameters
    ----------
    df_expanded
        Draft data for ALL drafts processed through expand_all_drafts.

    Returns
    -------
    DataFrame
        Draft/Pick/Available Player level draft data.
    """

    df = _add_avail_player_number(df_expanded)
    df = _add_next_pick_number(df)

    df['ind_avail'] = np.where(df['avail_number'] >= df['next_pick_number'], 1, 0)

    return df


def process_data(df_drafts: pd.DataFrame, df_ranks: pd.DataFrame
                , df_lookups: pd.DataFrame, num_players: int) -> pd.DataFrame:
    """ 
    Processes a (mostly) featureless dataset at the 
    Draft/Pick/Available Player level for modeling.

    Note
    ----
    Filters out the follwing: a\n
    - Drafts without player attributes a\n
    - Last pick of the draft for each user

    Drafts which use derived ranks will need further filtering to 
    account for later rounds lacking an adequate representation
    of rankings.

    Parameters
    ----------
    df_drafts
        Raw Draft/Pick level draft data.
    df_ranks
        Raw Day/Player level rankings data.
    df_lookups
        Raw Year/Lookup Type/Lookup Value lookups data. 
    num_players
        Number of the top available players that will be expanded
        for each pick (e.g. if num_players = 40, then the row count
        will increase by a factor of 40).

    Returns
    -------
    DataFrame
        Draft/Pick/Available Player level draft data.
    """
    
    df_updated_types = update_dtypes(df_drafts)
    df_complete_players = drafts_w_player_data(df_updated_types)
    df_draft_attrs = add_draft_attrs(df_complete_players)
    df_w_rank_type = add_draft_rank_type(df_draft_attrs)
    df_w_rank_lookups = add_ranks_lookups(df_w_rank_type, df_lookups)
    df_expanded = expand_all_drafts(df_w_rank_lookups, df_ranks, 40)
    df_final = add_picked_indicator(df_expanded)

    return df_final


def add_model_vars(df: pd.DataFrame) -> pd.DataFrame:
    """ Adds additional variables to test in the model. """

    df['actual_proj_adp_diff'] = df['projection_adp'] - df['number']
    df['rank_pick_diff'] = df['rank_actual'] - df['number']

    return df


pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 300)

DATA_FOLDER = '/home/cdelong/Python-Projects/UD-Draft-Model/Repo-Work/UD-Draft-Model/data'
RANKS_FOLDER = '/home/cdelong/Python-Projects/UD-Draft-Model/Repo-Work/UD-Draft-Model\
/data/2022/player_ranks'

df_ranks = read_ranks(DATA_FOLDER, [2021, 2022])
df_lookups = read_lookups(DATA_FOLDER, [2021, 2022])
df_drafts = read_drafts(DATA_FOLDER, [2021, 2022])

# df_updated_types = update_dtypes(df_drafts)
# df_complete_players = drafts_w_player_data(df_updated_types)
# df_draft_attrs = add_draft_attrs(df_complete_players)
# df_w_rank_type = add_draft_rank_type(df_draft_attrs)
# df_w_rank_lookups = add_ranks_lookups(df_w_rank_type, df_lookups)
# df_expanded = expand_all_drafts(df_w_rank_lookups, df_ranks, 40)
# df_final = add_picked_indicator(df_expanded)

df_final = process_data(df_drafts, df_ranks, df_lookups, 40)

print(df_final.columns)
print(df_ranks.columns)

# print(df_drafts.columns)
# df_drafts
# df_final
# df_final
# df_ranks
df_final

Index(['drafted_player_key', 'avail_player_key', 'avail_rank_actual',
       'avail_team', 'avail_position', 'avail_projection_adp', 'appearance_id',
       'created_at', 'draft_entry_id', 'number', 'projection_adp', 'draft_id',
       'draft_source', 'draft_title', 'num_teams', 'round', 'round_pick',
       'draft_datetime', 'draft_date', 'draft_year', 'ranks_type',
       'drafted_player', 'drafted_team', 'drafted_position',
       'ranks_draft_date', 'avail_number', 'next_pick_number', 'ind_avail'],
      dtype='object')
Index(['player', 'pos', 'team', 'adp', 'rank_actual', 'date', 'type', 'year',
       'player_key'],
      dtype='object')


Unnamed: 0,drafted_player_key,avail_player_key,avail_rank_actual,avail_team,avail_position,avail_projection_adp,appearance_id,created_at,draft_entry_id,number,projection_adp,draft_id,draft_source,draft_title,num_teams,round,round_pick,draft_datetime,draft_date,draft_year,ranks_type,drafted_player,drafted_team,drafted_position,ranks_draft_date,avail_number,next_pick_number,ind_avail
0,Christian McCaffrey - 2021-08-24 - 1.0,Christian McCaffrey - 2021-08-24 - 1.0,1,Carolina Panthers,RB,1.0,78a5634d-93aa-4cd9-b1af-dfb829df452c,2021-08-25 01:44:33,74b3fdcc-6128-4e7a-ae1a-fda04c1859d3,1,1.0,d525469e-276a-4cf3-ad07-a268841faea3,,,12,1,1,2021-08-25 01:44:33,2021-08-25,2021,custom,Christian McCaffrey,Carolina Panthers,RB,2021-08-24,1.0,24.0,0
1,Christian McCaffrey - 2021-08-24 - 1.0,Dalvin Cook - 2021-08-24 - 2.1,2,Minnesota Vikings,RB,2.1,78a5634d-93aa-4cd9-b1af-dfb829df452c,2021-08-25 01:44:33,74b3fdcc-6128-4e7a-ae1a-fda04c1859d3,1,1.0,d525469e-276a-4cf3-ad07-a268841faea3,,,12,1,1,2021-08-25 01:44:33,2021-08-25,2021,custom,Christian McCaffrey,Carolina Panthers,RB,2021-08-24,2.0,24.0,0
2,Christian McCaffrey - 2021-08-24 - 1.0,Alvin Kamara - 2021-08-24 - 3.7,3,New Orleans Saints,RB,3.7,78a5634d-93aa-4cd9-b1af-dfb829df452c,2021-08-25 01:44:33,74b3fdcc-6128-4e7a-ae1a-fda04c1859d3,1,1.0,d525469e-276a-4cf3-ad07-a268841faea3,,,12,1,1,2021-08-25 01:44:33,2021-08-25,2021,custom,Christian McCaffrey,Carolina Panthers,RB,2021-08-24,3.0,24.0,0
3,Christian McCaffrey - 2021-08-24 - 1.0,Derrick Henry - 2021-08-24 - 4.3,4,Tennessee Titans,RB,4.3,78a5634d-93aa-4cd9-b1af-dfb829df452c,2021-08-25 01:44:33,74b3fdcc-6128-4e7a-ae1a-fda04c1859d3,1,1.0,d525469e-276a-4cf3-ad07-a268841faea3,,,12,1,1,2021-08-25 01:44:33,2021-08-25,2021,custom,Christian McCaffrey,Carolina Panthers,RB,2021-08-24,4.0,24.0,0
4,Christian McCaffrey - 2021-08-24 - 1.0,Ezekiel Elliott - 2021-08-24 - 5.6,5,Dallas Cowboys,RB,5.6,78a5634d-93aa-4cd9-b1af-dfb829df452c,2021-08-25 01:44:33,74b3fdcc-6128-4e7a-ae1a-fda04c1859d3,1,1.0,d525469e-276a-4cf3-ad07-a268841faea3,,,12,1,1,2021-08-25 01:44:33,2021-08-25,2021,custom,Christian McCaffrey,Carolina Panthers,RB,2021-08-24,5.0,24.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1295490,Tyrion Davis-Price - 2022-05-15 - 179.8,Mitchell Trubisky - 2022-05-15 - 214.5,237,PIT,QB,214.5,cb618523-b22c-40d6-825c-a0ced8339239,2022-05-15 23:58:14,c488824b-b11d-42f0-b16c-e59fc361e425,203,179.8,133d89ab-ba4f-4230-9148-396bee781f5c,sit_and_go,,12,17,11,2022-05-15 23:09:01,2022-05-15,2022,actual,Tyrion Davis-Price,SF,RB,2022-05-15,,206.0,0
1295491,Tyrion Davis-Price - 2022-05-15 - 179.8,T.Y. Hilton - 2022-05-15 - 214.5,238,IND,WR,214.5,cb618523-b22c-40d6-825c-a0ced8339239,2022-05-15 23:58:14,c488824b-b11d-42f0-b16c-e59fc361e425,203,179.8,133d89ab-ba4f-4230-9148-396bee781f5c,sit_and_go,,12,17,11,2022-05-15 23:09:01,2022-05-15,2022,actual,Tyrion Davis-Price,SF,RB,2022-05-15,,206.0,0
1295492,Tyrion Davis-Price - 2022-05-15 - 179.8,Matt Breida - 2022-05-15 - 214.9,240,NYG,RB,214.9,cb618523-b22c-40d6-825c-a0ced8339239,2022-05-15 23:58:14,c488824b-b11d-42f0-b16c-e59fc361e425,203,179.8,133d89ab-ba4f-4230-9148-396bee781f5c,sit_and_go,,12,17,11,2022-05-15 23:09:01,2022-05-15,2022,actual,Tyrion Davis-Price,SF,RB,2022-05-15,,206.0,0
1295493,Tyrion Davis-Price - 2022-05-15 - 179.8,Nelson Agholor - 2022-05-15 - 214.9,241,NE,WR,214.9,cb618523-b22c-40d6-825c-a0ced8339239,2022-05-15 23:58:14,c488824b-b11d-42f0-b16c-e59fc361e425,203,179.8,133d89ab-ba4f-4230-9148-396bee781f5c,sit_and_go,,12,17,11,2022-05-15 23:09:01,2022-05-15,2022,actual,Tyrion Davis-Price,SF,RB,2022-05-15,214.0,206.0,1


In [12]:
df = df_final.copy()

df = df.loc[df['next_pick_number'].notna()]
# df_g = df.groupby(['draft_id', 'draft_year'])['round'].agg('min').reset_index()

# df = df.loc[df['draft_id'] == '0072c9f8-d403-46e3-818c-ecfccc6dfa43']

df

Unnamed: 0,drafted_player_key,avail_player_key,avail_rank_actual,avail_team,avail_position,avail_projection_adp,appearance_id,created_at,draft_entry_id,number,projection_adp,draft_id,draft_source,draft_title,num_teams,round,round_pick,draft_datetime,draft_date,draft_year,ranks_type,drafted_player,drafted_team,drafted_position,ranks_draft_date,avail_number,next_pick_number,ind_avail
0,Christian McCaffrey - 2021-08-24 - 1.0,Christian McCaffrey - 2021-08-24 - 1.0,1,Carolina Panthers,RB,1.0,78a5634d-93aa-4cd9-b1af-dfb829df452c,2021-08-25 01:44:33,74b3fdcc-6128-4e7a-ae1a-fda04c1859d3,1,1.0,d525469e-276a-4cf3-ad07-a268841faea3,,,12,1,1,2021-08-25 01:44:33,2021-08-25,2021,custom,Christian McCaffrey,Carolina Panthers,RB,2021-08-24,1.0,24.0,0
1,Christian McCaffrey - 2021-08-24 - 1.0,Dalvin Cook - 2021-08-24 - 2.1,2,Minnesota Vikings,RB,2.1,78a5634d-93aa-4cd9-b1af-dfb829df452c,2021-08-25 01:44:33,74b3fdcc-6128-4e7a-ae1a-fda04c1859d3,1,1.0,d525469e-276a-4cf3-ad07-a268841faea3,,,12,1,1,2021-08-25 01:44:33,2021-08-25,2021,custom,Christian McCaffrey,Carolina Panthers,RB,2021-08-24,2.0,24.0,0
2,Christian McCaffrey - 2021-08-24 - 1.0,Alvin Kamara - 2021-08-24 - 3.7,3,New Orleans Saints,RB,3.7,78a5634d-93aa-4cd9-b1af-dfb829df452c,2021-08-25 01:44:33,74b3fdcc-6128-4e7a-ae1a-fda04c1859d3,1,1.0,d525469e-276a-4cf3-ad07-a268841faea3,,,12,1,1,2021-08-25 01:44:33,2021-08-25,2021,custom,Christian McCaffrey,Carolina Panthers,RB,2021-08-24,3.0,24.0,0
3,Christian McCaffrey - 2021-08-24 - 1.0,Derrick Henry - 2021-08-24 - 4.3,4,Tennessee Titans,RB,4.3,78a5634d-93aa-4cd9-b1af-dfb829df452c,2021-08-25 01:44:33,74b3fdcc-6128-4e7a-ae1a-fda04c1859d3,1,1.0,d525469e-276a-4cf3-ad07-a268841faea3,,,12,1,1,2021-08-25 01:44:33,2021-08-25,2021,custom,Christian McCaffrey,Carolina Panthers,RB,2021-08-24,4.0,24.0,0
4,Christian McCaffrey - 2021-08-24 - 1.0,Ezekiel Elliott - 2021-08-24 - 5.6,5,Dallas Cowboys,RB,5.6,78a5634d-93aa-4cd9-b1af-dfb829df452c,2021-08-25 01:44:33,74b3fdcc-6128-4e7a-ae1a-fda04c1859d3,1,1.0,d525469e-276a-4cf3-ad07-a268841faea3,,,12,1,1,2021-08-25 01:44:33,2021-08-25,2021,custom,Christian McCaffrey,Carolina Panthers,RB,2021-08-24,5.0,24.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1295490,Tyrion Davis-Price - 2022-05-15 - 179.8,Mitchell Trubisky - 2022-05-15 - 214.5,237,PIT,QB,214.5,cb618523-b22c-40d6-825c-a0ced8339239,2022-05-15 23:58:14,c488824b-b11d-42f0-b16c-e59fc361e425,203,179.8,133d89ab-ba4f-4230-9148-396bee781f5c,sit_and_go,,12,17,11,2022-05-15 23:09:01,2022-05-15,2022,actual,Tyrion Davis-Price,SF,RB,2022-05-15,,206.0,0
1295491,Tyrion Davis-Price - 2022-05-15 - 179.8,T.Y. Hilton - 2022-05-15 - 214.5,238,IND,WR,214.5,cb618523-b22c-40d6-825c-a0ced8339239,2022-05-15 23:58:14,c488824b-b11d-42f0-b16c-e59fc361e425,203,179.8,133d89ab-ba4f-4230-9148-396bee781f5c,sit_and_go,,12,17,11,2022-05-15 23:09:01,2022-05-15,2022,actual,Tyrion Davis-Price,SF,RB,2022-05-15,,206.0,0
1295492,Tyrion Davis-Price - 2022-05-15 - 179.8,Matt Breida - 2022-05-15 - 214.9,240,NYG,RB,214.9,cb618523-b22c-40d6-825c-a0ced8339239,2022-05-15 23:58:14,c488824b-b11d-42f0-b16c-e59fc361e425,203,179.8,133d89ab-ba4f-4230-9148-396bee781f5c,sit_and_go,,12,17,11,2022-05-15 23:09:01,2022-05-15,2022,actual,Tyrion Davis-Price,SF,RB,2022-05-15,,206.0,0
1295493,Tyrion Davis-Price - 2022-05-15 - 179.8,Nelson Agholor - 2022-05-15 - 214.9,241,NE,WR,214.9,cb618523-b22c-40d6-825c-a0ced8339239,2022-05-15 23:58:14,c488824b-b11d-42f0-b16c-e59fc361e425,203,179.8,133d89ab-ba4f-4230-9148-396bee781f5c,sit_and_go,,12,17,11,2022-05-15 23:09:01,2022-05-15,2022,actual,Tyrion Davis-Price,SF,RB,2022-05-15,214.0,206.0,1


In [36]:
_df = df.loc[df['draft_year'] == 2022]
df_g = _df.groupby('draft_id').size().to_frame('num_players').reset_index()

df_g

Unnamed: 0,draft_id,num_players
0,03c05fa1-0f9b-4390-b2e2-1822135a4791,8640
1,106c07dd-72ad-4405-b2e1-c1b699ff423a,8640
2,133d89ab-ba4f-4230-9148-396bee781f5c,8640
3,15ce5cbe-f227-4255-9a34-dbc9282ff5c5,8640
4,18b7bc6c-edd1-44d9-bb04-2448d0b224da,8640
5,1a0c4e6a-cd7c-44e0-b5b3-9112b2d845ab,8640
6,26d9229b-9ce5-4a5d-993a-175294a397c6,8640
7,2a35ed1c-8e6a-456a-93c9-71b3d436f056,8640
8,34b0228a-98af-4b65-b675-5da3c1fff455,8640
9,37bd69de-5060-4dca-a912-a3977de0af4c,8640


In [None]:
""""
POTENTIAL TESTS:
    - Verify data types for imports
    - Any ranks lookup vals that are null? (i.e. does anything need added to the lookup file)
    - Does the draft ADP align with Ranks ADP when joined on player/position/date?

"""

In [None]:
###########################################################################################
################################### Scratch to keep #######################################
###########################################################################################

In [23]:
def validate_adp_ranks(df_drafts: pd.DataFrame, df_ranks: pd.DataFrame) -> pd.DataFrame:
    """ Verifies the adp from the drafts and ranks data is the same """
    
    df_base = df_drafts.copy()
    df_ranks = df_ranks.copy()

    keep_vars = ['draft_id', 'draft_datetime', 'created_at', 'drafted_player'
                , 'drafted_position', 'drafted_team'
                , 'ranks_draft_date', 'projection_adp']
    df_base = df_base[keep_vars]

    keep_vars = ['player', 'pos', 'team', 'date', 'adp']
    df_ranks = df_ranks[keep_vars]

    df = pd.merge(df_base, df_ranks, how='left'
                , left_on=['drafted_player', 'projection_adp', 'ranks_draft_date']
                , right_on=['player', 'adp', 'date'])

    df = df.loc[df['projection_adp'] != df['adp']]

    return df


def find_missing_lookups(df_drafts: pd.DataFrame, df_ranks: pd.DataFrame) -> pd.DataFrame:
    """
    Find players in the drafts data that do not map to the ranks data.
    Note that these will need added to the lookups file.
    """

    ranks_var = 'player'
    # drafts_var = 'full_name'
    drafts_var = 'final_player_name'

    df_ranks = df_ranks[[ranks_var]].drop_duplicates(subset=ranks_var)
    df_drafts = df_drafts[[drafts_var]].drop_duplicates(subset=drafts_var)

    df = pd.merge(df_drafts, df_ranks, how='left'
                    , left_on=drafts_var, right_on=ranks_var)

    df = df.loc[df['player'].isnull()]

    return df


def summarize_actual_der_rnk_diff(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    Summarizes the difference between actual and derived rank by round.
    Used to determine which rounds to drop for drafts that don't have
    actual rank data (e.g. 2021 drafts).
    """

    keep_vars = ['draft_id', 'final_player_name', 'number', 'round'
                , 'rank_derived', 'projection_adp', 'rank_actual', 'appearance_id']
    df = df[keep_vars].copy()

    df['ind_rank_diff'] = np.where(abs(df['rank_actual'] - df['rank_derived']) > 1, 1, 0)

    df = df.groupby('round').agg({'appearance_id': 'count'
                                , 'ind_rank_diff': 'sum'}).reset_index()

    df.rename(columns={'appearance_id': 'total_num_picks'
                    , 'ind_rank_diff': 'num_picks_w_diff'}, inplace=True)

    return df


_df_drafts = df_final.copy()
_df_ranks = df_ranks.copy()

df_adp_val = validate_adp_ranks(_df_drafts, _df_ranks)
# df_missing_lookups = find_missing_lookups(_df_drafts, _df_ranks)
# df_rank_diff_summary = summarize_actual_der_rnk_diff(_df_drafts)

df_adp_val


Unnamed: 0,draft_id,draft_datetime,created_at,drafted_player,drafted_position,drafted_team,ranks_draft_date,projection_adp,player,pos,team,date,adp


In [None]:
###########################################################################################
################################### Basic Exploration #####################################
###########################################################################################

In [77]:
df = df_final.copy()
condition = (df['drafted_player_key'] == df['avail_player_key'])
df['ind_selected'] = np.where(condition, 1, 0)
df['diff_abs_number_rank'] = df['avail_rank_actual'] - df['next_pick_number']
df['sq_diff_abs_number_rank'] = df['diff_abs_number_rank'] ** 2
df['log_diff_abs_number_rank'] = np.log(df['diff_abs_number_rank'])

df = df.loc[(df['draft_year'] == 2022) & (df['round'] < 10)]

corr_vars = ['ind_avail', 'diff_abs_number_rank', 'sq_diff_abs_number_rank'
            , 'log_diff_abs_number_rank'] 
df[corr_vars].corr()

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,ind_avail,diff_abs_number_rank,sq_diff_abs_number_rank,log_diff_abs_number_rank
ind_avail,1.0,0.743844,0.204675,0.340893
diff_abs_number_rank,0.743844,1.0,0.498436,0.908638
sq_diff_abs_number_rank,0.204675,0.498436,1.0,0.773892
log_diff_abs_number_rank,0.340893,0.908638,0.773892,1.0


In [198]:
# df = df_final.copy()
df = df_final.loc[df_final['draft_year'] == 2022].copy()

dfs = []
for round in range(1, 19):
    df = df_final.loc[df_final['round'] == round]

    df = df['rank_pick_diff'].quantile([.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]).to_frame()
    df = df.transpose()

    df['round'] = round

    cols = df.columns.to_list()
    cols = cols[-1:] + cols[:-1]

    df = df[cols]

    dfs.append(df)

df = pd.concat(dfs)


# df[['0.01', '0.05']]
df


Unnamed: 0,round,0.01,0.05,0.25,0.5,0.75,0.95,0.99
rank_pick_diff,1,-3.0,-2.0,-1.0,0.0,1.0,3.0,5.0
rank_pick_diff,2,-6.0,-4.0,-1.25,0.0,2.0,5.0,9.0
rank_pick_diff,3,-8.0,-6.0,-2.0,0.0,2.0,6.0,12.01
rank_pick_diff,4,-10.0,-7.0,-2.25,0.0,3.0,10.0,14.0
rank_pick_diff,5,-12.01,-9.0,-4.0,0.0,3.0,10.0,17.0
rank_pick_diff,6,-13.01,-8.0,-3.0,0.0,3.0,10.0,18.01
rank_pick_diff,7,-13.0,-9.0,-3.0,0.0,4.0,11.0,22.0
rank_pick_diff,8,-16.01,-11.0,-4.0,0.0,4.0,12.0,21.01
rank_pick_diff,9,-18.01,-11.0,-4.0,0.0,5.0,12.05,19.02
rank_pick_diff,10,-19.0,-12.0,-5.0,-1.0,4.0,15.0,38.0


In [121]:
# Check correlations between primary modeling variables and draft pick
df[['number', 'projection_adp', 'actual_proj_adp_diff']].corr()

Unnamed: 0,number,projection_adp,actual_proj_adp_diff
number,1.0,0.989551,-0.024938
projection_adp,0.989551,1.0,0.119459
actual_proj_adp_diff,-0.024938,0.119459,1.0


In [70]:
df = df_complete_players.copy()
df = df[['draft_id', 'draft_source']].drop_duplicates(subset='draft_id')

df = df.groupby('draft_source', dropna=False).size().to_frame('num_drafts').reset_index()

df

Unnamed: 0,draft_source,num_drafts
0,sit_and_go,68
1,tournament,33
2,,55
