In [52]:
import pandas as pd
import numpy as np
from os import listdir
from os import path
from typing import Union


def compile_ranks(folder_path: str) -> pd.DataFrame:
    """
    Stacks every player-ranks csv found in folder_path into one df.

    Only files whose name starts with 'df_player_ranks' are read. The result
    gains a 'year' column (taken from 'date'), a 'final_player_id' key of the
    form '<player> - <date> - <adp>', and has 'rank' renamed to 'rank_actual'.
    """

    rank_frames = [
        pd.read_csv(path.join(folder_path, file_name))
        for file_name in listdir(folder_path)
        if file_name.startswith('df_player_ranks')
    ]
    ranks = pd.concat(rank_frames)

    ranks['date'] = pd.to_datetime(ranks['date'])
    ranks['year'] = ranks['date'].dt.year

    # Key built from player, rank date and adp (all as strings)
    date_str = ranks['date'].astype('str')
    adp_str = ranks['adp'].astype('str')
    ranks['final_player_id'] = ranks['player'] + ' - ' + date_str + ' - ' + adp_str

    # Required to differentiate from derived rank
    return ranks.rename(columns={'rank': 'rank_actual'})


def read_drafts(folder_path: str, years: list) -> pd.DataFrame:
    """
    Reads each year's picks ('<year>/df_drafts.csv') and left-joins the draft
    source and title from '<year>/df_league_info.csv' on draft_id, then stacks
    all years into one df.

    Adds 'full_name' (first + ' ' + last name) and drops the id/points style
    columns that are not used downstream.
    """

    info_cols = {'id': 'draft_id', 'source': 'draft_source', 'title': 'draft_title'}
    unused_cols = ['id', 'pick_slot_id', 'points', 'projection_points',
                   'swapped', 'player_id', 'first_name', 'last_name']

    yearly_picks = []
    for year in years:
        picks = pd.read_csv(path.join(folder_path, f'{year}/df_drafts.csv'))
        league_info = pd.read_csv(path.join(folder_path, f'{year}/df_league_info.csv'))

        # Keep only the draft-level attributes, named as they appear on the picks
        league_info = league_info[list(info_cols)].rename(columns=info_cols)

        yearly_picks.append(picks.merge(league_info, how='left', on='draft_id'))

    stacked = pd.concat(yearly_picks)
    stacked['full_name'] = stacked['first_name'] + ' ' + stacked['last_name']

    return stacked.drop(columns=unused_cols)


def read_ranks(folder_path: str, years: list) -> pd.DataFrame:
    """
    Reads in all Ranks data.

    For every year in years, compiles the rank files found under
    '<folder_path>/<year>/player_ranks' (via compile_ranks) and stacks the
    yearly results into one df.
    """

    # Resolve against the folder_path argument rather than a notebook global,
    # so the function reads from wherever the caller points it.
    yearly_ranks = [
        compile_ranks(path.join(folder_path, f'{year}/player_ranks'))
        for year in years
    ]

    return pd.concat(yearly_ranks)


def read_lookups(folder_path: str, years: list) -> pd.DataFrame:
    """
    Reads in all Lookups files.

    Loads '<folder_path>/<year>/lookups.csv' for every year in years and
    stacks them into one df.
    """

    # Resolve against the folder_path argument rather than a notebook global,
    # so the function reads from wherever the caller points it.
    yearly_lookups = [
        pd.read_csv(path.join(folder_path, f'{year}/lookups.csv'))
        for year in years
    ]

    return pd.concat(yearly_lookups)


def add_lookup_vals(df_base: pd.DataFrame, df_lookups: pd.DataFrame, lookup_type: str
                    , join_col_name: str, final_col_name: str) -> pd.DataFrame:
    """
    Left-joins the ranks_val of the requested lookup_type onto df_base and
    exposes it under final_col_name.

    Rows are matched on draft_year plus join_col_name (against drafts_val), so
    that player attributes in the drafts data line up with those in the ranks
    data.
    IMPORTANT: Because draft_year is part of the join key, df_lookups must hold
    entries for every draft year present in df_base; rows without a matching
    entry get a null final_col_name.
    """

    lookups_of_type = df_lookups[df_lookups['lookup_type'] == lookup_type]

    merged = df_base.merge(lookups_of_type, how='left'
                        , left_on=['draft_year', join_col_name]
                        , right_on=['draft_year', 'drafts_val'])

    # Only the mapped value is kept; the join helpers are discarded
    return (
        merged
        .drop(columns=['lookup_type', 'drafts_val'])
        .rename(columns={'ranks_val': final_col_name})
    )


def update_dtypes(df: pd.DataFrame, null_adp: float = 216) -> pd.DataFrame:
    """
    Updates columns to more appropriate dtypes.

    - 'projection_adp': the '-' placeholder is replaced by null_adp and the
      column is cast to float.
    - 'created_at': parsed to datetime so it can be used as a filter.

    The passed df is left untouched; a converted copy is returned.
    null_adp defaults to 216 (presumably the last pick of an 18-round,
    12-team draft -- TODO confirm).
    """

    # Work on a copy so the caller's raw frame is not silently converted
    df = df.copy()

    # Replace null adps and update to float
    df['projection_adp'] = np.where(df['projection_adp'] == '-', null_adp
                                    , df['projection_adp'])
    df['projection_adp'] = df['projection_adp'].astype('float')

    # Update created_at to datetime to use as possible filter.
    # infer_datetime_format is deprecated (pandas 2.0+), so it is not passed.
    df['created_at'] = pd.to_datetime(df['created_at'])

    return df


def drafts_w_player_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drops every draft that has at least one pick with a null full_name.

    Player attributes (team, position, etc.) will likely serve as model
    features, so a draft with any unnamed pick is removed in its entirety.
    """

    picks = df.copy()

    # One flag row per draft that contains a pick without a player name
    flagged_drafts = (
        picks.loc[picks['full_name'].isnull(), ['draft_id']]
        .drop_duplicates(subset='draft_id')
        .assign(ind_null_name_draft=1)
    )

    picks = pd.merge(picks, flagged_drafts, on='draft_id', how='left')

    # Unflagged drafts carry a null indicator after the left join
    is_complete_draft = picks['ind_null_name_draft'].isnull()

    return picks.loc[is_complete_draft].drop(columns='ind_null_name_draft')


def _add_draft_dt(df: pd.DataFrame) -> pd.DataFrame:
    """
    Attaches draft_datetime, draft_date and draft_year to every pick.

    created_at is the datetime of each individual pick, so the earliest
    created_at within a draft_id is taken as the draft's datetime.
    """

    # Earliest pick per draft: sort, then keep the first row of each draft_id
    first_picks = (
        df[['draft_id', 'created_at']]
        .sort_values(by=['draft_id', 'created_at'])
        .drop_duplicates(subset='draft_id', keep='first')
        .rename(columns={'created_at': 'draft_datetime'})
    )

    first_picks['draft_date'] = first_picks['draft_datetime'].dt.normalize()
    first_picks['draft_year'] = first_picks['draft_datetime'].dt.year

    return pd.merge(df, first_picks, on='draft_id', how='left')


def add_draft_attrs(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds draft level attributes: num_teams, round, round_pick and the
    draft's datetime / date / year.
    """

    # Number of teams = number of distinct entries within each draft
    entry_keys = ['draft_id', 'draft_entry_id']
    teams_per_draft = (
        df[entry_keys]
        .drop_duplicates(subset=entry_keys)
        .groupby('draft_id')
        .size()
        .to_frame('num_teams')
    )
    picks = pd.merge(df, teams_per_draft, on='draft_id', how='left')

    # Round is 1-based; round_pick is the pick's position inside its round
    picks_before = picks['number'] - 1
    picks['round'] = (picks_before / picks['num_teams']).astype('int') + 1
    picks['round_pick'] = picks['number'] - ((picks['round'] - 1) * picks['num_teams'])

    # Datetime, date and year of the draft
    return _add_draft_dt(picks)


def add_ranks_lookups(df: pd.DataFrame, df_lookups: pd.DataFrame) -> pd.DataFrame:
    """
    Adds the lookups required to map to the ranks df, plus the
    final_draft_date and final_player_id keys.
    IMPORTANT: Passed df must contain draft_year.
    """

    # (lookup_type, column to join on, name of the resulting column)
    lookup_specs = [
        ('player', 'full_name', 'final_player_name'),
        ('team', 'team_name', 'final_team_name'),
        ('position', 'position', 'final_position'),
    ]
    for lookup_type, join_col, final_col in lookup_specs:
        df = add_lookup_vals(df, df_lookups, lookup_type, join_col, final_col)

    # Draft date appears to be offset by a day relative to the ranks
    # for early morning drafts (or at least those with that timestamp).
    is_early_morning = df['draft_datetime'].dt.hour <= 5
    previous_day = df['draft_date'] - pd.Timedelta(days=1)
    df['final_draft_date'] = np.where(is_early_morning, previous_day, df['draft_date'])

    # Ranks data will be stacked with derived ranks from drafts w/o ranks data.
    # Falling back to full_name when no lookup exists lets those drafts link
    # back to the stacked ranks data.
    has_no_lookup = df['final_player_name'].isnull()
    player = np.where(has_no_lookup, df['full_name'], df['final_player_name'])

    date_str = df['final_draft_date'].astype('str')
    adp_str = df['projection_adp'].astype('str')
    df['final_player_id'] = player + ' - ' + date_str + ' - ' + adp_str

    return df


def _add_rank_actual(df_drafts: pd.DataFrame, df_ranks: pd.DataFrame) -> pd.DataFrame:
    """
    Left-joins rank_actual from df_ranks onto every draft pick.

    Picks are matched on player name, adp and (final) draft date. Used to
    determine how often the actual rank differs from the derived one.
    """

    # Ranks columns renamed to the names the drafts data joins on
    draft_key_names = {'player': 'final_player_name'
                        , 'adp': 'projection_adp'
                        , 'date': 'final_draft_date'}

    ranks_for_join = (
        df_ranks[['player', 'date', 'adp', 'rank_actual']]
        .rename(columns=draft_key_names)
    )

    return pd.merge(df_drafts, ranks_for_join, how='left'
                    , on=['final_player_name', 'projection_adp', 'final_draft_date'])


def _add_rank_derived(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds rank_derived: the position of each pick inside its draft when the
    draft's picks are ordered by projection_adp (ties broken by pick number).

    Note that even early round derived ranks won't align with actual due to
    multiple players having the same ADP. Rows are returned ordered by
    draft_date, draft_id and number.
    """

    by_adp = df.sort_values(by=['draft_id', 'projection_adp', 'number'])

    # Running count within each draft, in adp order
    by_adp['rank_derived'] = 1
    by_adp['rank_derived'] = by_adp.groupby('draft_id')['rank_derived'].cumsum()

    return by_adp.sort_values(by=['draft_date', 'draft_id', 'number'])


def add_ranks(df_drafts: pd.DataFrame, df_ranks: pd.DataFrame) -> pd.DataFrame:
    """
    Adds all rank versions to df_drafts: rank_actual (joined from df_ranks)
    followed by rank_derived (computed from adp).
    """

    with_actual_rank = _add_rank_actual(df_drafts, df_ranks)

    return _add_rank_derived(with_actual_rank)


def add_model_vars(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds additional variables to test in the model.

    - actual_proj_adp_diff: projection_adp minus the pick number.
    - rank_pick_diff: rank_actual minus the pick number.

    Returns a new df; the passed df is not modified (consistent with the
    other pipeline steps, which all work on copies).
    """

    return df.assign(
        actual_proj_adp_diff=df['projection_adp'] - df['number'],
        rank_pick_diff=df['rank_actual'] - df['number'],
    )


pd.set_option('display.max_rows', 300)

# NOTE(review): machine-specific absolute path; consider making it configurable.
DATA_FOLDER = '/home/cdelong/Python-Projects/UD-Draft-Model/Repo-Work/UD-Draft-Model/data'
RANKS_FOLDER = DATA_FOLDER + '/2022/player_ranks'

# Raw inputs
draft_years = [2021, 2022]
df_ranks = read_ranks(DATA_FOLDER, draft_years)
df_lookups = read_lookups(DATA_FOLDER, draft_years)
df_drafts = read_drafts(DATA_FOLDER, draft_years)

# Drafts pipeline: each step feeds the next; intermediates are kept for inspection
df_updated_types = update_dtypes(df_drafts)
df_complete_players = drafts_w_player_data(df_updated_types)
df_draft_attrs = add_draft_attrs(df_complete_players)
df_rank_lookups = add_ranks_lookups(df_draft_attrs, df_lookups)
df_w_ranks = add_ranks(df_rank_lookups, df_ranks)
df_final = add_model_vars(df_w_ranks)

for frame in (df_final, df_ranks):
    print(frame.columns)

df_lookups

Index(['appearance_id', 'created_at', 'draft_entry_id', 'number',
       'projection_adp', 'draft_id', 'position', 'team_name', 'draft_source',
       'draft_title', 'full_name', 'num_teams', 'round', 'round_pick',
       'draft_datetime', 'draft_date', 'draft_year', 'final_player_name',
       'final_team_name', 'final_position', 'final_draft_date',
       'final_player_id', 'rank_actual', 'rank_derived',
       'actual_proj_adp_diff', 'rank_pick_diff'],
      dtype='object')
Index(['player', 'pos', 'team', 'adp', 'rank_actual', 'date', 'year',
       'final_player_id'],
      dtype='object')


Unnamed: 0,lookup_type,draft_year,drafts_val,ranks_val
0,player,2021,Christian McCaffrey,Christian McCaffrey
1,player,2021,Dalvin Cook,Dalvin Cook
2,player,2021,Derrick Henry,Derrick Henry
3,player,2021,Saquon Barkley,Saquon Barkley
4,player,2021,Alvin Kamara,Alvin Kamara
...,...,...,...,...
331,team,2022,Washington Commanders,WAS
332,position,2022,RB,RB
333,position,2022,WR,WR
334,position,2022,TE,TE


In [47]:
# Rebinds df_ranks to the rank files found under '2022/player_ranks' only,
# replacing whatever df_ranks held before this cell ran.
df_ranks = compile_ranks(path.join(DATA_FOLDER, '2022/player_ranks'))
df_ranks

Unnamed: 0,player,pos,team,adp,rank_actual,date,year,final_player_id
0,Jonathan Taylor,RB,IND,1.5,1,2022-08-19,2022,Jonathan Taylor - 2022-08-19 - 1.5
1,Christian McCaffrey,RB,CAR,1.9,2,2022-08-19,2022,Christian McCaffrey - 2022-08-19 - 1.9
2,Justin Jefferson,WR,MIN,3.3,3,2022-08-19,2022,Justin Jefferson - 2022-08-19 - 3.3
3,Cooper Kupp,WR,LA,3.9,4,2022-08-19,2022,Cooper Kupp - 2022-08-19 - 3.9
4,Ja'Marr Chase,WR,CIN,5.4,5,2022-08-19,2022,Ja'Marr Chase - 2022-08-19 - 5.4
...,...,...,...,...,...,...,...,...
395,Dennis Houston,WR,DAL,216.0,396,2022-07-11,2022,Dennis Houston - 2022-07-11 - 216.0
396,Phillip Dorsett,WR,HOU,216.0,397,2022-07-11,2022,Phillip Dorsett - 2022-07-11 - 216.0
397,Todd Gurley,RB,,216.0,398,2022-07-11,2022,Todd Gurley - 2022-07-11 - 216.0
398,Tim Tebow,TE,,216.0,399,2022-07-11,2022,Tim Tebow - 2022-07-11 - 216.0


In [35]:
def _read_raw_data(folder_path):
    """
    Legacy reader with the two years hard-coded: stacks the 2021 picks as-is
    with the 2022 picks, the latter enriched with draft source/title from the
    2022 league info file.
    Might want to make more dynamic at some point.
    """

    picks_2021 = pd.read_csv(path.join(folder_path, '2021/df_drafts.csv'))
    picks_2022 = pd.read_csv(path.join(folder_path, '2022/df_drafts.csv'))

    # Draft-level attributes, renamed to match the picks data
    info_cols = {'id': 'draft_id', 'source': 'draft_source', 'title': 'draft_title'}
    league_info_2022 = pd.read_csv(path.join(folder_path, '2022/df_league_info.csv'))
    league_info_2022 = league_info_2022[list(info_cols)].rename(columns=info_cols)

    raw_2022 = picks_2022.merge(league_info_2022, how='left', on='draft_id')

    combined = pd.concat([picks_2021, raw_2022])
    combined['full_name'] = combined['first_name'] + ' ' + combined['last_name']

    unused_cols = ['id', 'pick_slot_id', 'points', 'projection_points',
                   'swapped', 'player_id', 'first_name', 'last_name']

    return combined.drop(columns=unused_cols)


def read_raw_data(folder_path: str, years: list) -> pd.DataFrame:
    """
    Year-driven reader: for each year, joins that year's league info (draft
    source and title) onto its picks, then stacks all years.

    Same logic as read_drafts; kept here to compare against the legacy
    _read_raw_data.
    """

    def _load_year(year) -> pd.DataFrame:
        """ Picks of one year with draft_source / draft_title attached. """
        picks = pd.read_csv(path.join(folder_path, f'{year}/df_drafts.csv'))
        info = pd.read_csv(path.join(folder_path, f'{year}/df_league_info.csv'))

        info = info[['id', 'source', 'title']].rename(
            columns={'id': 'draft_id', 'source': 'draft_source', 'title': 'draft_title'})

        return pd.merge(picks, info, how='left', on='draft_id')

    combined = pd.concat([_load_year(year) for year in years])
    combined['full_name'] = combined['first_name'] + ' ' + combined['last_name']

    return combined.drop(columns=['id', 'pick_slot_id', 'points', 'projection_points'
                                , 'swapped', 'player_id', 'first_name', 'last_name'])

# Compare the year-driven reader against the legacy hard-coded one.
# Both readers resolve '<year>/df_drafts.csv' below the folder they are given,
# which is the layout read_drafts consumes from DATA_FOLDER. DATA_FOLDER is
# used because DRAFTS_FOLDER is not defined anywhere in this notebook, so the
# cell failed with NameError on a fresh kernel.
df_drafts = read_raw_data(DATA_FOLDER, [2021, 2022])
df_old = _read_raw_data(DATA_FOLDER)
df_old2 = _read_raw_data(DATA_FOLDER)

print(len(df_drafts.columns))
print(len(df_old.columns))

11
11


In [39]:
# df_drafts.sort_values(by=['draft_id', 'created_at'], inplace=True)
# df_old.sort_values(by=['draft_id', 'created_at'], inplace=True)
# df_old2.sort_values(by=['draft_id', 'created_at'], inplace=True)

# draft_source / draft_title are left out of the old-vs-new comparison.
# errors='ignore' keeps this cell re-runnable: a second run (columns already
# gone) is a no-op instead of a KeyError.
comparison_excluded = ['draft_source', 'draft_title']
df_drafts = df_drafts.drop(columns=comparison_excluded, errors='ignore')
df_old = df_old.drop(columns=comparison_excluded, errors='ignore')

In [40]:
# Verify the year-driven reader reproduces the legacy reader exactly.
# pandas.util.testing was deprecated in pandas 1.0 and removed in 2.0;
# pandas.testing is the supported public home of assert_frame_equal.
pd.testing.assert_frame_equal(df_drafts, df_old)

In [5]:
keep_vars = ['draft_id', 'final_player_name', 'number', 'round', 'final_draft_date'
            , 'rank_derived', 'projection_adp', 'rank_actual', 'appearance_id']

# The single pick under inspection: one player in one 2022 draft
is_target_pick = (
    (df_final['draft_year'] == 2022)
    & (df_final['draft_id'] == '133d89ab-ba4f-4230-9148-396bee781f5c')
    & (df_final['final_player_name'] == 'Justin Jefferson')
)
df = df_final.loc[is_target_pick, keep_vars].copy()

# Top 20 ranked players on the date of that draft
is_top_20_on_date = (df_ranks['date'] == '2022-05-15') & (df_ranks['rank_actual'] <= 20)
_df_ranks = df_ranks.loc[is_top_20_on_date]

# Every (pick, ranked player) combination
df_merged = pd.merge(df, _df_ranks, how='cross')

df_merged

Unnamed: 0,draft_id,final_player_name,number,round,final_draft_date,rank_derived,projection_adp,rank_actual_x,appearance_id,player,pos,team,adp,rank_actual_y,date,year,final_player_id
0,133d89ab-ba4f-4230-9148-396bee781f5c,Justin Jefferson,1,1,2022-05-15,4,4.6,4.0,62154a84-9ad4-41f8-b3f3-23801013ebc8,Jonathan Taylor,RB,IND,1.1,1,2022-05-15,2022,Jonathan Taylor - 2022-05-15 - 1.1
1,133d89ab-ba4f-4230-9148-396bee781f5c,Justin Jefferson,1,1,2022-05-15,4,4.6,4.0,62154a84-9ad4-41f8-b3f3-23801013ebc8,Cooper Kupp,WR,LA,2.6,2,2022-05-15,2022,Cooper Kupp - 2022-05-15 - 2.6
2,133d89ab-ba4f-4230-9148-396bee781f5c,Justin Jefferson,1,1,2022-05-15,4,4.6,4.0,62154a84-9ad4-41f8-b3f3-23801013ebc8,Christian McCaffrey,RB,CAR,3.1,3,2022-05-15,2022,Christian McCaffrey - 2022-05-15 - 3.1
3,133d89ab-ba4f-4230-9148-396bee781f5c,Justin Jefferson,1,1,2022-05-15,4,4.6,4.0,62154a84-9ad4-41f8-b3f3-23801013ebc8,Justin Jefferson,WR,MIN,4.6,4,2022-05-15,2022,Justin Jefferson - 2022-05-15 - 4.6
4,133d89ab-ba4f-4230-9148-396bee781f5c,Justin Jefferson,1,1,2022-05-15,4,4.6,4.0,62154a84-9ad4-41f8-b3f3-23801013ebc8,Ja'Marr Chase,WR,CIN,5.3,5,2022-05-15,2022,Ja'Marr Chase - 2022-05-15 - 5.3
5,133d89ab-ba4f-4230-9148-396bee781f5c,Justin Jefferson,1,1,2022-05-15,4,4.6,4.0,62154a84-9ad4-41f8-b3f3-23801013ebc8,Derrick Henry,RB,TEN,5.9,6,2022-05-15,2022,Derrick Henry - 2022-05-15 - 5.9
6,133d89ab-ba4f-4230-9148-396bee781f5c,Justin Jefferson,1,1,2022-05-15,4,4.6,4.0,62154a84-9ad4-41f8-b3f3-23801013ebc8,Austin Ekeler,RB,LAC,6.9,7,2022-05-15,2022,Austin Ekeler - 2022-05-15 - 6.9
7,133d89ab-ba4f-4230-9148-396bee781f5c,Justin Jefferson,1,1,2022-05-15,4,4.6,4.0,62154a84-9ad4-41f8-b3f3-23801013ebc8,Najee Harris,RB,PIT,8.4,8,2022-05-15,2022,Najee Harris - 2022-05-15 - 8.4
8,133d89ab-ba4f-4230-9148-396bee781f5c,Justin Jefferson,1,1,2022-05-15,4,4.6,4.0,62154a84-9ad4-41f8-b3f3-23801013ebc8,Davante Adams,WR,LV,9.9,9,2022-05-15,2022,Davante Adams - 2022-05-15 - 9.9
9,133d89ab-ba4f-4230-9148-396bee781f5c,Justin Jefferson,1,1,2022-05-15,4,4.6,4.0,62154a84-9ad4-41f8-b3f3-23801013ebc8,Stefon Diggs,WR,BUF,10.4,10,2022-05-15,2022,Stefon Diggs - 2022-05-15 - 10.4


In [13]:
# Does any 2022 player appear with more than one team in the drafts data?
# Equal counts before/after de-duplicating on player alone means no.
keep = ['final_player_name', 'team_name']
picks_2022 = df_final.loc[df_final['draft_year'] == 2022, keep]

_df1 = picks_2022.drop_duplicates(subset=keep)
print(len(_df1))

_df1 = _df1.drop_duplicates(subset='final_player_name')
print(len(_df1))

300
300


In [24]:
# Number of distinct teams each 2022 player shows up with in the ranks data
keep = ['player', 'team']
_df1 = df_ranks.loc[df_ranks['year'] == 2022, keep].drop_duplicates(subset=keep)

df_g = (
    _df1.groupby('player')
    .size()
    .to_frame('num_teams')
    .reset_index()
    .sort_values(by='num_teams', ascending=False)
)

# Rank history of one player, oldest first
is_baker = df_ranks['player'] == 'Baker Mayfield'
df_ranks.loc[is_baker].sort_values(by='date')

Unnamed: 0,player,pos,team,adp,rank_actual,date,year,final_player_id
255,Baker Mayfield,QB,CLE,216.0,256,2022-05-02,2022,Baker Mayfield - 2022-05-02 - 216.0
255,Baker Mayfield,QB,CLE,214.9,256,2022-05-03,2022,Baker Mayfield - 2022-05-03 - 214.9
258,Baker Mayfield,QB,CLE,215.1,259,2022-05-04,2022,Baker Mayfield - 2022-05-04 - 215.1
281,Baker Mayfield,QB,CLE,215.6,282,2022-05-05,2022,Baker Mayfield - 2022-05-05 - 215.6
283,Baker Mayfield,QB,CLE,215.7,284,2022-05-06,2022,Baker Mayfield - 2022-05-06 - 215.7
279,Baker Mayfield,QB,CLE,215.6,280,2022-05-07,2022,Baker Mayfield - 2022-05-07 - 215.6
292,Baker Mayfield,QB,CLE,215.8,293,2022-05-08,2022,Baker Mayfield - 2022-05-08 - 215.8
270,Baker Mayfield,QB,CLE,215.6,271,2022-05-09,2022,Baker Mayfield - 2022-05-09 - 215.6
270,Baker Mayfield,QB,CLE,215.6,271,2022-05-10,2022,Baker Mayfield - 2022-05-10 - 215.6
267,Baker Mayfield,QB,CLE,215.6,268,2022-05-11,2022,Baker Mayfield - 2022-05-11 - 215.6


In [6]:
def expand_draft(df_draft: pd.DataFrame, df_ranks: pd.DataFrame,
                num_picks: int, ranks_date: str = '2022-05-15') -> pd.DataFrame:
    """
    Expands the draft data so that each pick is represented by the top
    num_picks players still on the board when the pick was made, which is
    the data level that is necessary for modeling.

    Parameters
    ----------
    df_draft : picks in pick order; must contain 'final_player_id'.
        NOTE(review): the taken-players list is never reset and the final join
        is on final_player_id alone, so this assumes a single draft -- confirm
        before passing several drafts at once.
    df_ranks : ranks data with 'final_player_id', 'rank_actual' and 'date'.
    num_picks : number of available players to attach to each pick.
    ranks_date : date of the ranks snapshot that forms the board. Defaults to
        the previously hard-coded '2022-05-15'.

    Returns
    -------
    One row per (pick, available player): 'final_player_id',
    'avail_player_id', then the remaining df_draft columns.
    """

    # Board for the requested date, best rank first. The stable sort keeps
    # the incoming order for equal ranks and removes the dependence on the
    # ranks file happening to be stored in rank order.
    ranks_on_date = df_ranks.loc[df_ranks['date'] == ranks_date]
    board_ids = ranks_on_date.sort_values(by='rank_actual', kind='stable')['final_player_id']

    expanded_rows = []
    taken = set()
    previous_pick = None
    for player in df_draft['final_player_id']:
        # A player leaves the board only after their own pick, so the player
        # being picked is still listed among the options for that pick.
        if previous_pick is not None:
            taken.add(previous_pick)
        previous_pick = player

        available = board_ids.loc[~board_ids.isin(taken)].iloc[:num_picks]
        expanded_rows.extend((player, avail_id) for avail_id in available)

    df_expanded = pd.DataFrame(expanded_rows
                            , columns=['final_player_id', 'avail_player_id'])

    return pd.merge(df_expanded, df_draft, on='final_player_id', how='left')

    
# Expand a single draft (selected by its draft_id) so that every pick is
# paired with the top 40 players still available at that pick.
df = df_final.loc[df_final['draft_id'] == '133d89ab-ba4f-4230-9148-396bee781f5c'].copy()

df_draft = expand_draft(df, df_ranks, 40)

# Show the columns of the expanded frame
df_draft.columns


Index(['final_player_id', 'avail_player_id', 'appearance_id', 'created_at',
       'draft_entry_id', 'number', 'projection_adp', 'draft_id', 'position',
       'team_name', 'draft_source', 'draft_title', 'full_name', 'num_teams',
       'round', 'round_pick', 'draft_datetime', 'draft_date', 'draft_year',
       'final_player_name', 'final_team_name', 'final_position',
       'final_draft_date', 'rank_actual', 'rank_derived',
       'actual_proj_adp_diff', 'rank_pick_diff'],
      dtype='object')

In [2]:
# Scratch version of expand_draft for a single draft: pairs every pick with
# the top 40 ranked players not yet taken before that pick.
keep_vars = ['draft_id', 'draft_date', 'number', 'final_player_id']
is_target_draft = df_final['draft_id'] == '133d89ab-ba4f-4230-9148-396bee781f5c'
df = df_final.loc[is_target_draft][keep_vars]

# Previous pick's player: the one who leaves the board before this pick
df['final_player_id_l1'] = df['final_player_id'].shift(1)

keep_vars = ['final_player_id', 'rank_actual']
is_draft_day = df_ranks['date'] == '2022-05-15'
_df_ranks = df_ranks.loc[is_draft_day][keep_vars]

selections = []
dfs = []
pick_rows = zip(df['draft_id'], df['final_player_id'], df['final_player_id_l1'])
for draft, player, player_l1 in pick_rows:
    selections.append(player_l1)

    still_on_board = ~_df_ranks['final_player_id'].isin(selections)
    top_x_players = (
        _df_ranks.loc[still_on_board]
        .iloc[:40]
        .rename(columns={'final_player_id': 'avail_player_id'})
    )

    pick = pd.DataFrame([[draft, player]], columns=['draft_id', 'final_player_id'])
    dfs.append(pd.merge(pick, top_x_players, how='cross'))

df_model = pd.concat(dfs).reset_index()

df_model

Unnamed: 0,index,draft_id,final_player_id,avail_player_id,rank_actual
0,0,133d89ab-ba4f-4230-9148-396bee781f5c,Justin Jefferson - 2022-05-15 - 4.6,Jonathan Taylor - 2022-05-15 - 1.1,1
1,1,133d89ab-ba4f-4230-9148-396bee781f5c,Justin Jefferson - 2022-05-15 - 4.6,Cooper Kupp - 2022-05-15 - 2.6,2
2,2,133d89ab-ba4f-4230-9148-396bee781f5c,Justin Jefferson - 2022-05-15 - 4.6,Christian McCaffrey - 2022-05-15 - 3.1,3
3,3,133d89ab-ba4f-4230-9148-396bee781f5c,Justin Jefferson - 2022-05-15 - 4.6,Justin Jefferson - 2022-05-15 - 4.6,4
4,4,133d89ab-ba4f-4230-9148-396bee781f5c,Justin Jefferson - 2022-05-15 - 4.6,Ja'Marr Chase - 2022-05-15 - 5.3,5
...,...,...,...,...,...
8635,35,133d89ab-ba4f-4230-9148-396bee781f5c,Calvin Austin - 2022-05-15 - 215.5,Quez Watkins - 2022-05-15 - 215.2,251
8636,36,133d89ab-ba4f-4230-9148-396bee781f5c,Calvin Austin - 2022-05-15 - 215.5,Desmond Ridder - 2022-05-15 - 215.2,252
8637,37,133d89ab-ba4f-4230-9148-396bee781f5c,Calvin Austin - 2022-05-15 - 215.5,Nick Westbrook-Ikhine - 2022-05-15 - 215.3,253
8638,38,133d89ab-ba4f-4230-9148-396bee781f5c,Calvin Austin - 2022-05-15 - 215.5,Terrace Marshall - 2022-05-15 - 215.3,254


In [None]:
# NOTE(review): unexecuted stub (no execution count). Binds draft_date to the
# whole `df` frame rather than to a date; looks unfinished -- confirm intent.
draft_date = df

In [169]:
# Scratch: a DataFrame column can hold Python lists as cell values, and such
# a column can be copied into another column through to_list().
a = [
    [1, [1, 2]],
    [2, [3, 4]],
    [3, [5, 6]],
]

df = pd.DataFrame(a, columns=['a', 'b'])
df = df.assign(check=df['b'].to_list())

df

Unnamed: 0,a,b,check
0,1,"[1, 2]","[1, 2]"
1,2,"[3, 4]","[3, 4]"
2,3,"[5, 6]","[5, 6]"


In [None]:
""""
POTENTIAL TESTS:
    - Verify data types for imports
    - 
    - Any ranks lookup vals that are null? (i.e. does anything need added to the lookup file)
    - Does the draft ADP align with Ranks ADP when joined on player/position/date?

"""

In [None]:
###########################################################################################
################################### Scratch to keep #######################################
###########################################################################################

In [197]:
def validate_adp_ranks(df_drafts: pd.DataFrame, df_ranks: pd.DataFrame) -> pd.DataFrame:
    """
    Returns the draft picks whose projection_adp differs from the ranks adp.

    Picks are left-joined to the ranks on player / position / date, so picks
    without any matching ranks row (null adp) are returned as well.
    """

    draft_cols = ['draft_id', 'draft_datetime', 'created_at', 'final_player_name'
                , 'final_position', 'final_team_name'
                , 'final_draft_date', 'projection_adp']
    rank_cols = ['player', 'pos', 'team', 'date', 'adp']

    joined = df_drafts[draft_cols].merge(
        df_ranks[rank_cols], how='left'
        , left_on=['final_player_name', 'final_position', 'final_draft_date']
        , right_on=['player', 'pos', 'date'])

    adp_mismatch = joined['projection_adp'] != joined['adp']

    return joined.loc[adp_mismatch]


def find_missing_lookups(df_drafts: pd.DataFrame, df_ranks: pd.DataFrame) -> pd.DataFrame:
    """
    Lists the distinct final_player_name values of the drafts data that have
    no matching player in the ranks data.
    Note that these will need added to the lookups file.
    """

    ranks_var = 'player'
    drafts_var = 'final_player_name'

    # Distinct names on each side, so every draft name is checked once
    rank_names = df_ranks[[ranks_var]].drop_duplicates(subset=ranks_var)
    draft_names = df_drafts[[drafts_var]].drop_duplicates(subset=drafts_var)

    matched = draft_names.merge(rank_names, how='left'
                                , left_on=drafts_var, right_on=ranks_var)

    # A null ranks name after the left join means no counterpart was found
    return matched.loc[matched[ranks_var].isnull()]


def summarize_actual_der_rnk_diff(df: pd.DataFrame) -> pd.DataFrame:
    """
    Per round: the number of picks and how many of them have an actual rank
    more than 1 away from the derived rank.

    Used to determine which rounds to drop for drafts that don't have
    actual rank data (e.g. 2021 drafts).
    """

    picks = df[['draft_id', 'final_player_name', 'number', 'round'
                , 'rank_derived', 'projection_adp', 'rank_actual', 'appearance_id']].copy()

    # 1 when the two ranks differ by more than one position, else 0
    rank_gap = (picks['rank_actual'] - picks['rank_derived']).abs()
    picks['ind_rank_diff'] = np.where(rank_gap > 1, 1, 0)

    return (
        picks.groupby('round')
        .agg(total_num_picks=('appearance_id', 'count')
            , num_picks_w_diff=('ind_rank_diff', 'sum'))
        .reset_index()
    )


# Run the validation helpers on the 2022 drafts against a copy of the ranks.
_df_drafts = df_final.loc[df_final['draft_year'] == 2022].copy()
_df_ranks = df_ranks.copy()

# Picks whose ADP disagrees with the ranks, player names without a ranks
# counterpart, and the per-round actual-vs-derived rank summary.
df_adp_val = validate_adp_ranks(_df_drafts, _df_ranks)
df_missing_lookups = find_missing_lookups(_df_drafts, _df_ranks)
df_rank_diff_summary = summarize_actual_der_rnk_diff(_df_drafts)

df_rank_diff_summary


Unnamed: 0,round,total_num_picks,num_picks_w_diff
0,1,600,0
1,2,600,0
2,3,600,0
3,4,600,0
4,5,600,0
5,6,600,0
6,7,600,0
7,8,600,1
8,9,600,0
9,10,600,1


In [None]:
###########################################################################################
################################### Basic Exploration #####################################
###########################################################################################

In [198]:
# Quantiles of rank_pick_diff (actual rank minus pick number) for each round.
# The per-round selection reads from the 2022-filtered frame, so the year
# filter actually takes effect; swap in df_final to cover all years.
quantile_levels = [.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
df_2022 = df_final.loc[df_final['draft_year'] == 2022]

quantile_rows = []
for round_num in range(1, 19):  # round_num avoids shadowing the builtin round()
    in_round = df_2022['round'] == round_num

    round_quantiles = (
        df_2022.loc[in_round, 'rank_pick_diff']
        .quantile(quantile_levels)
        .to_frame()
        .transpose()
    )

    # Round goes first, followed by one column per quantile level
    round_quantiles.insert(0, 'round', round_num)

    quantile_rows.append(round_quantiles)

df = pd.concat(quantile_rows)

df


Unnamed: 0,round,0.01,0.05,0.25,0.5,0.75,0.95,0.99
rank_pick_diff,1,-3.0,-2.0,-1.0,0.0,1.0,3.0,5.0
rank_pick_diff,2,-6.0,-4.0,-1.25,0.0,2.0,5.0,9.0
rank_pick_diff,3,-8.0,-6.0,-2.0,0.0,2.0,6.0,12.01
rank_pick_diff,4,-10.0,-7.0,-2.25,0.0,3.0,10.0,14.0
rank_pick_diff,5,-12.01,-9.0,-4.0,0.0,3.0,10.0,17.0
rank_pick_diff,6,-13.01,-8.0,-3.0,0.0,3.0,10.0,18.01
rank_pick_diff,7,-13.0,-9.0,-3.0,0.0,4.0,11.0,22.0
rank_pick_diff,8,-16.01,-11.0,-4.0,0.0,4.0,12.0,21.01
rank_pick_diff,9,-18.01,-11.0,-4.0,0.0,5.0,12.05,19.02
rank_pick_diff,10,-19.0,-12.0,-5.0,-1.0,4.0,15.0,38.0


In [121]:
# Check correlations between primary modeling variables and draft pick.
# Reads df_final explicitly: when the notebook is run top to bottom, `df` has
# been rebound by the preceding cell to the per-round quantile table, which
# has none of these columns.
df_final[['number', 'projection_adp', 'actual_proj_adp_diff']].corr()

Unnamed: 0,number,projection_adp,actual_proj_adp_diff
number,1.0,0.989551,-0.024938
projection_adp,0.989551,1.0,0.119459
actual_proj_adp_diff,-0.024938,0.119459,1.0


In [70]:
# Number of drafts per draft_source (drafts without a source are counted too)
df = (
    df_complete_players[['draft_id', 'draft_source']]
    .drop_duplicates(subset='draft_id')
    .groupby('draft_source', dropna=False)
    .size()
    .to_frame('num_drafts')
    .reset_index()
)

df

Unnamed: 0,draft_source,num_drafts
0,sit_and_go,68
1,tournament,33
2,,55
