In [1]:
# OVERVIEW

# Background: 
# The 1st Round of the NFL Draft is a phenomenon that can make or break a team's next season.
# We will experiment with the theory that if a team did well on Offense and poorly on Defense in the previous season, 
# then they would use their highest pick on a player that can improve their Defense (or vice-versa)

# Goal: Predict whether or not a team will draft an Offensive or Defensive player in the 1st Round of the NFL Draft

# Data Points:
# Independent Variable: What unit for the position each team drafted in the 1st Round (Offense=OFF or Defense=DEF)
# (teams can have multiple 1st round picks)
# Dependent Variable: A team's stats and league-wide rankings for the previous season

# Example:
# Bengals 2020 Draft QB=Offense=OFF - Bengals 2019 total passing yards, total rushing yards, turnovers, points for/against, etc.
# Bengals 2019 Stats - https://www.pro-football-reference.com/teams/cin/2019.htm

# Data Scraped From:
# 1st Round Draft History Data - http://www.drafthistory.com/index.php/rounds/round_1
# Team Stats and Rankings - https://www.pro-football-reference.com

In [2]:
# IMPORTS
# DataFrame
import pandas as pd

# Math
import math

In [3]:
# CONSTANTS
CURRENT_YEAR = 2020
YEARS = 20
NUM_NEIGHBORS = 5
TEAM_DICT = {'Bengals':'cin','Redskins':'was','Lions':'det','Giants':'nyg','Dolphins':'mia','Chargers':'sdg',
            'Panthers':'car','Cardinals':'crd','Jaguars':'jax','Browns':'cle','Jets':'nyj','Raiders':'rai',
             'Buccaneers':'tam','49ers':'sfo','Broncos':'den','Falcons':'atl','Cowboys':'dal','Eagles':'phi',
             'Vikings':'min','Saints':'nor','Packers':'gnb','Seahawks':'sea','Ravens':'rav','Titans':'oti',
             'Chiefs':'kan','Colts':'clt','Texans':'htx','Bears':'chi','Steelers':'pit','Rams':'ram','Bills':'buf',
             'Patriots':'nwe'}
EMPTY_COLS = ['Off. Rank Ply', 'Off. Rank Y/P', 'Off. Rank Cmp', 'Off. Rank 1stD', 'Off. Rank 1stD', 'Off. Rank Pen',
             'Off. Rank Yds', 'Off. Rank 1stPy', 'Off. Rank #Dr', 'Def. Rank Ply', 'Def. Rank Y/P', 'Def. Rank Cmp',
             'Def. Rank 1stD', 'Def. Rank 1stD', 'Def. Rank Pen', 'Def. Rank Yds', 'Def. Rank 1stPy', 'Def. Rank #Dr',
             'Off. Rank 1stD.1', 'Off. Rank 1stD.2', 'Off. Rank Yds.3', 'Def. Rank 1stD.1', 'Def. Rank 1stD.2', 'Def. Rank Yds.3']
UNIT_DICT = {'QB':'OFF','RB':'OFF','WR':'OFF','TE':'OFF',' C':'OFF',' G':'OFF',' T':'OFF','DT':'DEF','DE':'DEF','LB':'DEF',
             'DB':'DEF'}
BASIC_FACTORS = ['Position Drafted', 'Team', 'Draft Year', 'Pick Number']
TEAM_FACTORS = ['PF', 'Y/P', '1stD', 'Cmp', 'Att', 'Yds.1', 'TD', 'Int', 'NY/A', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', '1stD.2', 'Sc%', 'Yds.4', 'Pts']
OPP_FACTORS = ['PF', 'Y/P', 'TO', '1stD', 'Cmp', 'Yds.1', 'TD', 'Int', 'NY/A', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', '1stD.2', 'Sc%', 'Yds.4', 'Pts']
OFF_RANK_FACTORS = ['PF', 'Yds', '1stD', 'Att', 'Yds.1', 'TD', 'Int', 'NY/A', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', 'Sc%', 'Yds.4', 'Pts']
DEF_RANK_FACTORS = ['PF', 'Yds', 'TO', '1stD', 'Yds.1', 'TD', 'Int', 'NY/A', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', 'Sc%', 'Yds.4', 'Pts']
NEXT_DRAFT_ORDER = ['Jaguars', 'Jets', 'Dolphins', 'Falcons', 'Bengals', 'Eagles', 'Lions', 'Panthers', 'Broncos', 'Cowboys', 
                    'Giants', '49ers', 'Chargers', 'Vikings', 'Patriots', 'Cardinals', 'Raiders', 'Dolphins', 'Redskins', 'Bears', 
                    'Jaguars', 'Colts', 'Browns', 'Titans', 'Buccaneers', 'Ravens', 'Jets', 'Steelers',
                    'Saints', 'Bills', 'Packers', 'Chiefs']

In [4]:
# Recreate raw data from file so that empty values are not perceived as Null=NaN
raw = pd.read_csv('Raw Data For Last ' + str(YEARS) + ' Drafts.csv', encoding = 'utf-8')
raw

Unnamed: 0,Position Drafted,Team,Draft Year,Pick Number,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,Team FL,...,Def. Rank Yds.3,Def. Rank 1stPy,Def. Rank #Dr,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
0,QB,Bengals,2020,1,279.0,5169.0,1049.0,4.9,30.0,14.0,...,,,,9.0,27.0,12.0,18.0,15.0,28.0,22.0
1,DE,Redskins,2020,2,266.0,4395.0,885.0,5.0,21.0,8.0,...,,,,4.0,15.0,31.0,31.0,29.0,29.0,30.0
2,DB,Lions,2020,3,341.0,5549.0,1021.0,5.4,23.0,8.0,...,,,,7.0,25.0,3.0,21.0,28.0,30.0,24.0
3,T,Giants,2020,4,341.0,5416.0,1012.0,5.4,33.0,16.0,...,,,,8.0,29.0,25.0,19.0,17.0,20.0,25.0
4,QB,Dolphins,2020,5,306.0,4960.0,1022.0,4.9,26.0,8.0,...,,,,1.0,28.0,27.0,20.0,20.0,26.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,RB,Vikings,2001,27,397.0,5961.0,958.0,6.2,28.0,10.0,...,,,,6.0,31.0,7.0,30.0,30.0,30.0,27.0
632,DB,Raiders,2001,28,479.0,5776.0,1023.0,5.6,20.0,9.0,...,,,,24.0,7.0,1.0,5.0,15.0,18.0,11.0
633,DE,Rams,2001,29,540.0,7075.0,1014.0,7.0,35.0,12.0,...,,,,2.0,22.0,25.0,15.0,14.0,24.0,30.0
634,WR,Colts,2001,30,429.0,6141.0,1026.0,6.0,29.0,14.0,...,,,,12.0,23.0,3.0,25.0,28.0,25.0,19.0


In [5]:
# Data Cleaning
def clean_raw_df(raw, save, keep_id) -> 'DataFrame':
    
    # Convert non-Numerical data types to Numerical
    raw = _convert_string_data(raw)
    
    # Keep only the columns of data that we want
    raw = _isolate_key_columns(raw)
    
    # Create an ID column for later reference
    raw = _add_id_column(raw)
    
    # Save a copy for reference to identify what team/position/year occurred for stats
    if save:
        _save_reference_df(raw)
    
    # Remove Identifying Columns
    if keep_id == False:
        raw = _remove_id_columns(raw)
    
    return raw

def _convert_string_data(df) -> 'DataFrame':
    team_start_list = []
    opp_start_list = []
    for i in list(df['Team Start']):
        team_start_list.append(float(str(i)[4:].strip()))
    for i in list(df['Opp. Start']):
        opp_start_list.append(float(str(i)[4:].strip()))
    
    df['Team Start'] = team_start_list
    df['Opp. Start'] = opp_start_list
    
    team_time_list = []
    opp_time_list = []
    for i in list(df['Team Time']):
        team_time_list.append(float((int(str(i)[0])*60)+int(str(i)[2:])))
    for i in list(df['Opp. Time']):
        opp_time_list.append(float((int(str(i)[0])*60)+int(str(i)[2:])))
        
    df['Team Time'] = team_time_list
    df['Opp. Time'] = opp_time_list
    
    return df
    
def _isolate_key_columns(df):
    factors = []
    for i in TEAM_FACTORS:
        factors.append('Team ' + i)
    for i in OPP_FACTORS:
        factors.append('Opp. ' + i)
    for i in OFF_RANK_FACTORS:
        factors.append('Off. Rank ' + i)
    for i in DEF_RANK_FACTORS:
        factors.append('Def. Rank ' + i)
    print(factors)
    print(len(BASIC_FACTORS+factors))
    df = df[BASIC_FACTORS+factors]
    return df

def _add_id_column(df) -> 'DataFrame':
    df = df.reset_index()
    df = df.rename(columns={'index':'ID'})
    return df
    
def _save_reference_df(df):
    reference_df = df.copy(deep=True)
    reference_df.to_csv('Clean Reference For Last ' + str(YEARS) + ' Drafts.csv', encoding = 'utf-8-sig', index=False)
    
def _remove_id_columns(df):
    df = df.drop(columns=['Position Drafted', 'Team', 'Draft Year'])
    return df

In [6]:
# We will use this dataframe for our model since it only has Numerical values, 
# but we will not use the ID when calculating distance
clean = clean_raw_df(raw, True, False)
clean

['Team PF', 'Team Y/P', 'Team 1stD', 'Team Cmp', 'Team Att', 'Team Yds.1', 'Team TD', 'Team Int', 'Team NY/A', 'Team Att.1', 'Team Yds.2', 'Team TD.1', 'Team Y/A', 'Team 1stD.2', 'Team Sc%', 'Team Yds.4', 'Team Pts', 'Opp. PF', 'Opp. Y/P', 'Opp. TO', 'Opp. 1stD', 'Opp. Cmp', 'Opp. Yds.1', 'Opp. TD', 'Opp. Int', 'Opp. NY/A', 'Opp. Att.1', 'Opp. Yds.2', 'Opp. TD.1', 'Opp. Y/A', 'Opp. 1stD.2', 'Opp. Sc%', 'Opp. Yds.4', 'Opp. Pts', 'Off. Rank PF', 'Off. Rank Yds', 'Off. Rank 1stD', 'Off. Rank Att', 'Off. Rank Yds.1', 'Off. Rank TD', 'Off. Rank Int', 'Off. Rank NY/A', 'Off. Rank Att.1', 'Off. Rank Yds.2', 'Off. Rank TD.1', 'Off. Rank Y/A', 'Off. Rank Sc%', 'Off. Rank Yds.4', 'Off. Rank Pts', 'Def. Rank PF', 'Def. Rank Yds', 'Def. Rank TO', 'Def. Rank 1stD', 'Def. Rank Yds.1', 'Def. Rank TD', 'Def. Rank Int', 'Def. Rank NY/A', 'Def. Rank Att.1', 'Def. Rank Yds.2', 'Def. Rank TD.1', 'Def. Rank Y/A', 'Def. Rank Sc%', 'Def. Rank Yds.4', 'Def. Rank Pts']
68


Unnamed: 0,ID,Pick Number,Team PF,Team Y/P,Team 1stD,Team Cmp,Team Att,Team Yds.1,Team TD,Team Int,...,Def. Rank TD,Def. Rank Int,Def. Rank NY/A,Def. Rank Att.1,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank Yds.4,Def. Rank Pts
0,0,1,279.0,4.9,312.0,356.0,616.0,3652.0,18.0,16.0,...,17.0,22.0,31.0,32.0,32.0,27.0,25.0,9.0,28.0,22.0
1,1,2,266.0,5.0,248.0,298.0,479.0,2812.0,18.0,13.0,...,30.0,12.0,21.0,31.0,31.0,17.0,26.0,4.0,29.0,30.0
2,2,3,341.0,5.4,313.0,344.0,571.0,3900.0,28.0,15.0,...,27.0,30.0,28.0,26.0,21.0,14.0,10.0,7.0,30.0,24.0
3,3,4,341.0,5.4,311.0,376.0,607.0,3731.0,30.0,17.0,...,25.0,25.0,29.0,29.0,20.0,28.0,4.0,8.0,20.0,25.0
4,4,5,306.0,4.9,315.0,371.0,615.0,3804.0,22.0,18.0,...,32.0,12.0,32.0,30.0,27.0,21.0,22.0,1.0,26.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,631,27,397.0,6.2,319.0,307.0,495.0,3832.0,33.0,18.0,...,17.0,31.0,24.0,8.0,15.0,25.0,27.0,6.0,30.0,27.0
632,632,28,479.0,5.6,337.0,284.0,475.0,3306.0,28.0,11.0,...,23.0,6.0,16.0,4.0,5.0,5.0,14.0,24.0,18.0,11.0
633,633,29,540.0,7.0,380.0,380.0,587.0,5232.0,37.0,23.0,...,30.0,11.0,28.0,4.0,13.0,27.0,25.0,2.0,24.0,30.0
634,634,30,429.0,6.0,357.0,357.0,571.0,4282.0,33.0,15.0,...,16.0,23.0,18.0,20.0,25.0,16.0,23.0,12.0,25.0,19.0


In [7]:
# We will use this dataframe to reference the Team/Year/Position Drafted for our test values and our test's closest neighbors
reference = pd.read_csv('Clean Reference For Last ' + str(YEARS) + ' Drafts.csv', encoding = 'utf-8')
reference

Unnamed: 0,ID,Position Drafted,Team,Draft Year,Pick Number,Team PF,Team Y/P,Team 1stD,Team Cmp,Team Att,...,Def. Rank TD,Def. Rank Int,Def. Rank NY/A,Def. Rank Att.1,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank Yds.4,Def. Rank Pts
0,0,QB,Bengals,2020,1,279.0,4.9,312.0,356.0,616.0,...,17.0,22.0,31.0,32.0,32.0,27.0,25.0,9.0,28.0,22.0
1,1,DE,Redskins,2020,2,266.0,5.0,248.0,298.0,479.0,...,30.0,12.0,21.0,31.0,31.0,17.0,26.0,4.0,29.0,30.0
2,2,DB,Lions,2020,3,341.0,5.4,313.0,344.0,571.0,...,27.0,30.0,28.0,26.0,21.0,14.0,10.0,7.0,30.0,24.0
3,3,T,Giants,2020,4,341.0,5.4,311.0,376.0,607.0,...,25.0,25.0,29.0,29.0,20.0,28.0,4.0,8.0,20.0,25.0
4,4,QB,Dolphins,2020,5,306.0,4.9,315.0,371.0,615.0,...,32.0,12.0,32.0,30.0,27.0,21.0,22.0,1.0,26.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,631,RB,Vikings,2001,27,397.0,6.2,319.0,307.0,495.0,...,17.0,31.0,24.0,8.0,15.0,25.0,27.0,6.0,30.0,27.0
632,632,DB,Raiders,2001,28,479.0,5.6,337.0,284.0,475.0,...,23.0,6.0,16.0,4.0,5.0,5.0,14.0,24.0,18.0,11.0
633,633,DE,Rams,2001,29,540.0,7.0,380.0,380.0,587.0,...,30.0,11.0,28.0,4.0,13.0,27.0,25.0,2.0,24.0,30.0
634,634,WR,Colts,2001,30,429.0,6.0,357.0,357.0,571.0,...,16.0,23.0,18.0,20.0,25.0,16.0,23.0,12.0,25.0,19.0


In [8]:
# Split into Train / Test Dataframes to Test Model Accuracy on 2020 NFL Draft

def split_dfs_for_testing(final_df) -> list:

    test_df = final_df.iloc[:32]
    train_df = final_df.iloc[32:]
    print(train_df.shape, test_df.shape)

    combine = [train_df, test_df]
    return combine

In [9]:
combine = split_dfs_for_testing(clean)
train_df_1 = combine[0]
test_df_1 = combine[1]
train_df_1.to_csv('Train Data For ' + str(CURRENT_YEAR) + ' Draft.csv', encoding = 'utf-8-sig', index=False)
test_df_1.to_csv('Test Data For ' + str(CURRENT_YEAR) + ' Draft.csv', encoding = 'utf-8-sig', index=False)
train_df_1

(604, 66) (32, 66)


Unnamed: 0,ID,Pick Number,Team PF,Team Y/P,Team 1stD,Team Cmp,Team Att,Team Yds.1,Team TD,Team Int,...,Def. Rank TD,Def. Rank Int,Def. Rank NY/A,Def. Rank Att.1,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank Yds.4,Def. Rank Pts
32,32,1,225.0,4.3,239.0,283.0,495.0,2523.0,15.0,18.0,...,3.0,29.0,6.0,32.0,32.0,32.0,27.0,13.0,10.0,22.0
33,33,2,342.0,5.8,344.0,331.0,532.0,3867.0,26.0,20.0,...,31.0,32.0,14.0,25.0,14.0,15.0,7.0,8.0,9.0,26.0
34,34,3,333.0,4.9,258.0,299.0,524.0,3165.0,18.0,19.0,...,19.0,15.0,19.0,22.0,26.0,21.0,19.0,9.0,8.0,19.0
35,35,4,290.0,5.4,300.0,382.0,556.0,3751.0,19.0,10.0,...,32.0,14.0,32.0,30.0,30.0,21.0,22.0,2.0,29.0,32.0
36,36,5,396.0,6.3,388.0,408.0,625.0,5125.0,36.0,26.0,...,30.0,26.0,30.0,18.0,24.0,29.0,20.0,6.0,26.0,29.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,631,27,397.0,6.2,319.0,307.0,495.0,3832.0,33.0,18.0,...,17.0,31.0,24.0,8.0,15.0,25.0,27.0,6.0,30.0,27.0
632,632,28,479.0,5.6,337.0,284.0,475.0,3306.0,28.0,11.0,...,23.0,6.0,16.0,4.0,5.0,5.0,14.0,24.0,18.0,11.0
633,633,29,540.0,7.0,380.0,380.0,587.0,5232.0,37.0,23.0,...,30.0,11.0,28.0,4.0,13.0,27.0,25.0,2.0,24.0,30.0
634,634,30,429.0,6.0,357.0,357.0,571.0,4282.0,33.0,15.0,...,16.0,23.0,18.0,20.0,25.0,16.0,23.0,12.0,25.0,19.0


In [10]:
test_df_1

Unnamed: 0,ID,Pick Number,Team PF,Team Y/P,Team 1stD,Team Cmp,Team Att,Team Yds.1,Team TD,Team Int,...,Def. Rank TD,Def. Rank Int,Def. Rank NY/A,Def. Rank Att.1,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank Yds.4,Def. Rank Pts
0,0,1,279.0,4.9,312.0,356.0,616.0,3652.0,18.0,16.0,...,17.0,22.0,31.0,32.0,32.0,27.0,25.0,9.0,28.0,22.0
1,1,2,266.0,5.0,248.0,298.0,479.0,2812.0,18.0,13.0,...,30.0,12.0,21.0,31.0,31.0,17.0,26.0,4.0,29.0,30.0
2,2,3,341.0,5.4,313.0,344.0,571.0,3900.0,28.0,15.0,...,27.0,30.0,28.0,26.0,21.0,14.0,10.0,7.0,30.0,24.0
3,3,4,341.0,5.4,311.0,376.0,607.0,3731.0,30.0,17.0,...,25.0,25.0,29.0,29.0,20.0,28.0,4.0,8.0,20.0,25.0
4,4,5,306.0,4.9,315.0,371.0,615.0,3804.0,22.0,18.0,...,32.0,12.0,32.0,30.0,27.0,21.0,22.0,1.0,26.0,32.0
5,5,6,337.0,5.9,349.0,394.0,597.0,4426.0,24.0,20.0,...,8.0,22.0,20.0,21.0,18.0,21.0,15.0,15.0,15.0,18.0
6,6,7,340.0,5.1,335.0,382.0,633.0,3650.0,17.0,21.0,...,8.0,8.0,14.0,25.0,29.0,32.0,32.0,5.0,14.0,28.0
7,7,8,361.0,5.5,314.0,355.0,554.0,3477.0,20.0,12.0,...,31.0,30.0,27.0,23.0,24.0,5.0,20.0,3.0,32.0,29.0
8,8,9,300.0,5.4,298.0,364.0,589.0,3760.0,24.0,8.0,...,12.0,25.0,23.0,22.0,28.0,31.0,31.0,13.0,22.0,23.0
9,9,10,335.0,5.6,305.0,318.0,539.0,3554.0,22.0,21.0,...,17.0,8.0,17.0,28.0,30.0,28.0,30.0,17.0,17.0,21.0


In [11]:
# DATA PREPARATION FUNCTIONS

def convert_df_to_weighted_df(df):
    std_list = list(df.std(axis = 0, skipna= True))
    mean_list = list(df.mean(axis=0, skipna= True))
    frames = [pd.DataFrame(df.loc[df.index[0]], columns=df.columns)]
    for i in range(df.index[0], df.index[0]+len(df.index)):
        row_list = list(df.loc[i])
        weighted_row = []
        for j in range(len(row_list)):
            if j == 0:
                weighted_row.append(row_list[j])
            else:
                weighted_row.append((row_list[j] - mean_list[j])/std_list[j])
        weighted_df_row = pd.DataFrame([weighted_row], columns=df.columns)
        frames.append(weighted_df_row)
    weighted_df = pd.concat(frames)
    weighted_df.index = df.index
    return weighted_df

def ready_dataset_for_modeling(train_df, test_df) -> list:
    weighted_train_df = convert_df_to_weighted_df(train_df)
    weighted_test_df = convert_df_to_weighted_df(test_df)
    
    train_dataset = weighted_train_df.to_numpy().tolist()
    test_dataset = weighted_test_df.to_numpy().tolist()
    
    combine = [train_dataset, test_dataset]
    return combine

In [12]:
model_ready_combine = ready_dataset_for_modeling(train_df_1, test_df_1)
train_dataset_1 = model_ready_combine[0]
test_dataset_1 = model_ready_combine[1]

In [13]:
# K NEAREST NEIGHBORS MODEL FUNCTIONS

def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(1, len(row1)-1):
        distance += (row1[i] - row2[i])**2
    return math.sqrt(distance)

def get_neighbors(train, test_row, num_neighbors):
    distances = list()
    for i in range(0, len(train)-1):
        ID = train[i][0]
        train_row = train[i]
        dist = euclidean_distance(test_row, train_row)
        distances.append((train_row, dist, ID))
    distances.sort(key=lambda tup: tup[1])
    neighbors = list()
    for i in range(num_neighbors):
        neighbors.append((distances[i][2], round(distances[i][1], 2)))
    return neighbors

def prediction_data(train_dataset, test_dataset, test_row_index, reference_df) -> 'DataFrame':
    neighbors = get_neighbors(train_dataset, test_dataset[test_row_index], NUM_NEIGHBORS)
    
    ID = test_dataset[test_row_index][0]
    frames = []
    id_col = [ID]
    distance_col = [0]
    
    for neighbor in neighbors:
        id_col.append(neighbor[0])
        distance_col.append(neighbor[1])
        frames.append(pd.DataFrame([list(reference_df.loc[int(neighbor[0])])], columns=reference_df.columns))
        
    big_df = pd.concat(frames)
    big_df = pd.DataFrame([reference_df.loc[ID]], columns=reference_df.columns).append(big_df, ignore_index=True)
    big_df.insert(1, 'Unit', big_df['Position Drafted'].map(UNIT_DICT), True)
    big_df.insert(1, 'Distance', distance_col, True)
    
    return big_df

def unit_prediction(train_dataset, test_dataset, test_row_index, reference_df):
    predict_dict = {}
    neighbors = get_neighbors(train_dataset, test_dataset[test_row_index], NUM_NEIGHBORS)
    ID = test_dataset[test_row_index][0]
    
    sum = 0
    for neighbor in neighbors:
        position = list(reference_df.loc[int(neighbor[0])])[1]
        for i in UNIT_DICT.keys():
            if position == i:
                unit = UNIT_DICT[position]
        if unit not in predict_dict:
            predict_dict[unit] = 0
        predict_dict[unit] += 1
        sum += 1
    
    percent_dict = {'OFF':0,'DEF':0,'Predicted':'NA','Actual':'NA'}
    for i in predict_dict:
        percent_dict[i] = round(predict_dict[i]/sum, 4)*100
        
    if percent_dict['OFF'] > percent_dict['DEF']:
        percent_dict['Predicted'] = 'OFF'
    else:
        percent_dict['Predicted'] = 'DEF'
        
    try:
        percent_dict['Actual'] = UNIT_DICT[list(reference_df.loc[ID])[1]]
    except:
        percent_dict['Actual'] = 'NA'
        
    return percent_dict

In [14]:
# Test Closest Neighbors for Number 1 Pick of the 2020 NFL DRAFT
# In other words, out of the past 20 years, which Team Stats are the closest to the Bengals 2019 Season, 
# and what unit did That Team draft their following year?
PICK_NUM = 1
test = prediction_data(train_dataset_1, test_dataset_1, PICK_NUM-1, reference)
test

Unnamed: 0,ID,Distance,Unit,Position Drafted,Team,Draft Year,Pick Number,Team PF,Team Y/P,Team 1stD,...,Def. Rank TD,Def. Rank Int,Def. Rank NY/A,Def. Rank Att.1,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank Yds.4,Def. Rank Pts
0,0,0.0,OFF,QB,Bengals,2020,1,279.0,4.9,312.0,...,17.0,22.0,31.0,32.0,32.0,27.0,25.0,9.0,28.0,22.0
1,605,4.9,OFF,QB,Falcons,2001,1,252.0,4.3,256.0,...,17.0,21.0,30.0,21.0,27.0,24.0,24.0,7.0,26.0,25.0
2,511,4.91,OFF,T,Raiders,2004,2,270.0,4.6,258.0,...,20.0,20.0,29.0,32.0,32.0,30.0,25.0,5.0,24.0,27.0
3,352,5.02,OFF,T,Rams,2009,2,232.0,4.7,249.0,...,14.0,21.0,31.0,28.0,29.0,30.0,29.0,4.0,25.0,29.0
4,224,5.06,OFF,T,Jaguars,2013,2,255.0,4.8,269.0,...,12.0,20.0,28.0,32.0,30.0,31.0,12.0,3.0,25.0,24.0
5,193,5.16,OFF,QB,Jaguars,2014,3,247.0,4.6,271.0,...,21.0,26.0,26.0,32.0,29.0,29.0,19.0,7.0,23.0,25.0


In [15]:
# What percentage of the k-nearest neighbors drafted OFF vs. DEF, and what was the actual drafted for the test?
unit_prediction(train_dataset_1, test_dataset_1, PICK_NUM-1, reference)

{'OFF': 100.0, 'DEF': 0, 'Predicted': 'OFF', 'Actual': 'OFF'}

In [16]:
# MODEL TESTING FUNCTIONS

def test_unit_prediction(train_dataset, test_dataset, reference_df):
    correct_predictions = 0
    eighty_plus_percent_correct = 0
    total_eighty = 0
    top_10 = 0
    mid_10 = 0
    last_12 = 0
    print(list(reference_df.loc[test_dataset[0][0]])[3])
    
    length = len(test_dataset)
    for i in range(length):
        predict_dict = unit_prediction(train_dataset, test_dataset, i, reference_df)
        if predict_dict['Actual'] == predict_dict['Predicted']:
            correct_predictions += 1
            if predict_dict[predict_dict['Predicted']] >= 80:
                eighty_plus_percent_correct += 1
            if i <= 9:
                top_10 += 1
            elif i <= 19:
                mid_10 += 1
            else:
                last_12 += 1
        if predict_dict[predict_dict['Predicted']] >= 80:
            total_eighty += 1
        print(predict_dict, 'Total ' + str(correct_predictions)+'/'+str(i+1))
        
    correct_pred_percent = round(correct_predictions/length, 4)*100
    eighty_plus_percent_accuracy = round(eighty_plus_percent_correct / total_eighty, 4)*100
    top_10_accuracy = round(top_10 / 10, 4)*100
    mid_10_accuracy = round(mid_10 / 10, 4)*100
    last_12_accuracy = round(last_12 / 12, 4)*100
    
    print()
    print('Correct Predictions %: ', correct_pred_percent)
    print('Correct 80+% Predictions %: ', eighty_plus_percent_accuracy)
    print('Correct Top 10 Predictions %: ', top_10_accuracy)
    print('Correct Mid 10 Predictions %: ', mid_10_accuracy)
    print('Correct Last 12 Predictions %: ', last_12_accuracy)
    print()
    
    return (correct_pred_percent, eighty_plus_percent_accuracy, top_10_accuracy, mid_10_accuracy, last_12_accuracy)

def test_model_across_past_years(train_df, test_df, reference_df, years, include_current_year):
    total_correct = 0
    total_eighty_correct = 0
    total_top_10_correct = 0
    total_mid_10_correct = 0
    total_last_12_correct = 0
    train = train_df.copy(deep=True)
    test = test_df.copy(deep=True)
    for i in range(years):
        if include_current_year == False:
            test = train.iloc[:32]
            train = train.drop(train.head(32).index)
        
        model_ready_combine = ready_dataset_for_modeling(train, test)
        train_dataset = model_ready_combine[0]
        test_dataset = model_ready_combine[1]
        predict = test_unit_prediction(train_dataset, test_dataset, reference_df)
        
        total_correct += predict[0]
        total_eighty_correct += predict[1]
        total_top_10_correct += predict[2]
        total_mid_10_correct += predict[3]
        total_last_12_correct += predict[4]
        
        if include_current_year == True:
            test = train.iloc[:32]
            train = train.drop(train.head(32).index)
        
    average_correct = round(total_correct / years, 2)
    average_eighty_correct = round(total_eighty_correct / years, 2)
    average_top_10 = round(total_top_10_correct / years, 2)
    average_mid_10 = round(total_mid_10_correct / years, 2)
    average_last_12 = round(total_last_12_correct / years, 2)
    
    print('Average Correct Predictions %: ', average_correct)
    print('Average Correct 80+% Predictions %: ', average_eighty_correct)
    print('Average Top 10 Predictions %: ', average_top_10)
    print('Average Mid 10 Predictions %: ', average_mid_10)
    print('Average Last 12 Predictions %: ', average_last_12)
    
    return (average_correct, average_eighty_correct, average_top_10, average_mid_10, average_last_12)

In [17]:
# Out of 32 Picks for the 2020 NFL Draft, what percentage of them are correctly predicted 
# using data from the 5 Nearest Neighbors?
test_unit_prediction(train_dataset_1, test_dataset_1, reference)

2020
{'OFF': 100.0, 'DEF': 0, 'Predicted': 'OFF', 'Actual': 'OFF'} Total 1/1
{'OFF': 20.0, 'DEF': 80.0, 'Predicted': 'DEF', 'Actual': 'DEF'} Total 2/2
{'OFF': 60.0, 'DEF': 40.0, 'Predicted': 'OFF', 'Actual': 'DEF'} Total 2/3
{'OFF': 60.0, 'DEF': 40.0, 'Predicted': 'OFF', 'Actual': 'OFF'} Total 3/4
{'OFF': 80.0, 'DEF': 20.0, 'Predicted': 'OFF', 'Actual': 'OFF'} Total 4/5
{'OFF': 60.0, 'DEF': 40.0, 'Predicted': 'OFF', 'Actual': 'OFF'} Total 5/6
{'OFF': 40.0, 'DEF': 60.0, 'Predicted': 'DEF', 'Actual': 'DEF'} Total 6/7
{'OFF': 40.0, 'DEF': 60.0, 'Predicted': 'DEF', 'Actual': 'DEF'} Total 7/8
{'OFF': 40.0, 'DEF': 60.0, 'Predicted': 'DEF', 'Actual': 'DEF'} Total 8/9
{'OFF': 20.0, 'DEF': 80.0, 'Predicted': 'DEF', 'Actual': 'OFF'} Total 8/10
{'OFF': 60.0, 'DEF': 40.0, 'Predicted': 'OFF', 'Actual': 'OFF'} Total 9/11
{'OFF': 40.0, 'DEF': 60.0, 'Predicted': 'DEF', 'Actual': 'OFF'} Total 9/12
{'OFF': 20.0, 'DEF': 80.0, 'Predicted': 'DEF', 'Actual': 'OFF'} Total 9/13
{'OFF': 20.0, 'DEF': 80.0, 'Pre

(56.25, 46.67, 80.0, 60.0, 33.33)

In [18]:
# Now we can begin testing the model for drafts before the 2020 draft
# For example, we will test the average scores of the model by also predicting the 2019 Draft using draft data from 2001-2018
# And to test the 2018 Draft, we will use draft data from 2001-2017
test_model_across_past_years(train_df_1, test_df_1, reference, 3, True)

2020
{'OFF': 100.0, 'DEF': 0, 'Predicted': 'OFF', 'Actual': 'OFF'} Total 1/1
{'OFF': 20.0, 'DEF': 80.0, 'Predicted': 'DEF', 'Actual': 'DEF'} Total 2/2
{'OFF': 60.0, 'DEF': 40.0, 'Predicted': 'OFF', 'Actual': 'DEF'} Total 2/3
{'OFF': 60.0, 'DEF': 40.0, 'Predicted': 'OFF', 'Actual': 'OFF'} Total 3/4
{'OFF': 80.0, 'DEF': 20.0, 'Predicted': 'OFF', 'Actual': 'OFF'} Total 4/5
{'OFF': 60.0, 'DEF': 40.0, 'Predicted': 'OFF', 'Actual': 'OFF'} Total 5/6
{'OFF': 40.0, 'DEF': 60.0, 'Predicted': 'DEF', 'Actual': 'DEF'} Total 6/7
{'OFF': 40.0, 'DEF': 60.0, 'Predicted': 'DEF', 'Actual': 'DEF'} Total 7/8
{'OFF': 40.0, 'DEF': 60.0, 'Predicted': 'DEF', 'Actual': 'DEF'} Total 8/9
{'OFF': 20.0, 'DEF': 80.0, 'Predicted': 'DEF', 'Actual': 'OFF'} Total 8/10
{'OFF': 60.0, 'DEF': 40.0, 'Predicted': 'OFF', 'Actual': 'OFF'} Total 9/11
{'OFF': 40.0, 'DEF': 60.0, 'Predicted': 'DEF', 'Actual': 'OFF'} Total 9/12
{'OFF': 20.0, 'DEF': 80.0, 'Predicted': 'DEF', 'Actual': 'OFF'} Total 9/13
{'OFF': 20.0, 'DEF': 80.0, 'Pre

(55.21, 53.91, 60.0, 66.67, 41.66)

In [None]:
SUMMARY

What I want to do is use the most accurate data columns
over the last X years in order to make predictions for 2021

To find out the most accurate data columns over the last X years, 
We need to test varying data columns over different numbers of years
for the year 2020, and from there decide which combination will work best
to predict the results for 2021

The most accurate data columns for testing the years 2017-2019 and 2018-2019:
1. Test 2
2. Test 3
3. Test 4
4. Test 1

The results of those tests on predicting 2020:
1. Test 1 - (68.75, 81.25, 90.0, 80.0, 41.67)
2. Test 4 - (62.5, 66.67, 100.0, 60.0, 33.33)
3. Test 2 - (62.5, 58.82, 90.0, 70.0, 33.33)
4. Test 3 - (56.25, 46.67, 80.0, 60.0, 33.33)

In [19]:
TEST DATA COLUMNS

Test 0 - ALL COLUMNS
TEAM_FACTORS = ['PF', 'Yds', 'Ply', 'Y/P', 'TO', 'FL', '1stD', 'Cmp', 'Att', 'Yds.1', 'TD', 'Int', 'NY/A', '1stD.1', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', '1stD.2', 'Pen', 'Yds.3', '1stPy', '#Dr', 'Sc%', 'TO%', 'Start', 'Time', 'Plays', 'Yds.4', 'Pts']
OPP_FACTORS = ['PF', 'Yds', 'Ply', 'Y/P', 'TO', 'FL', '1stD', 'Cmp', 'Att', 'Yds.1', 'TD', 'Int', 'NY/A', '1stD.1', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', '1stD.2', 'Pen', 'Yds.3', '1stPy', '#Dr', 'Sc%', 'TO%', 'Start', 'Time', 'Plays', 'Yds.4', 'Pts']
OFF_RANK_FACTORS = ['PF', 'Yds', 'TO', 'FL', '1stD', 'Yds.1', 'TD', 'Int', 'NY/A', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', 'Sc%', 'TO%', 'Start', 'Time', 'Plays', 'Yds.4', 'Pts']
DEF_RANK_FACTORS = ['PF', 'Yds', 'TO', 'FL', '1stD', 'Yds.1', 'TD', 'Int', 'NY/A', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', 'Sc%', 'TO%', 'Start', 'Time', 'Plays', 'Yds.4', 'Pts']

Test 1
TEAM_FACTORS = ['PF', 'TO', 'Att', 'Yds.1', 'TD', 'Int', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', '1stD.2']
OPP_FACTORS = ['PF', 'TO', 'Att', 'Yds.1', 'TD', 'Int', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', '1stD.2']
OFF_RANK_FACTORS = ['PF', 'Yds.1', 'TO', 'Att']
DEF_RANK_FACTORS = ['PF', 'Yds.1', 'TO', 'Att']

Test 2
TEAM_FACTORS = ['PF', 'TO', 'Cmp', 'Yds.1', 'TD', 'Int', 'Att.1', 'Yds.2', 'TD.1', 'Y/A']
OPP_FACTORS = ['PF', 'TO', 'Cmp', 'Yds.1', 'TD', 'Int', 'Att.1', 'Yds.2', 'TD.1', 'Y/A']
OFF_RANK_FACTORS = ['PF', 'TO', 'Yds.1', 'TD', 'Int', 'Att.1', 'Yds.2', 'TD.1', 'Y/A']
DEF_RANK_FACTORS = ['PF', 'TO', 'Yds.1', 'TD', 'Int', 'Att.1', 'Yds.2', 'TD.1', 'Y/A']

Test 3
TEAM_FACTORS = ['PF', 'Y/P', '1stD', 'Cmp', 'Att', 'Yds.1', 'TD', 'Int', 'NY/A', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', '1stD.2', 'Sc%', 'Yds.4', 'Pts']
OPP_FACTORS = ['PF', 'Y/P', 'TO', '1stD', 'Cmp', 'Yds.1', 'TD', 'Int', 'NY/A', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', '1stD.2', 'Sc%', 'Yds.4', 'Pts']
OFF_RANK_FACTORS = ['PF', 'Yds', '1stD', 'Att', 'Yds.1', 'TD', 'Int', 'NY/A', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', 'Sc%', 'Yds.4', 'Pts']
DEF_RANK_FACTORS = ['PF', 'Yds', 'TO', '1stD', 'Yds.1', 'TD', 'Int', 'NY/A', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', 'Sc%', 'Yds.4', 'Pts']

Test 4
TEAM_FACTORS = ['PF', 'Yds', 'Y/P', 'TO', '1stD', 'Cmp', 'Att', 'Yds.1', 'TD', 'Int', 'NY/A', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', '1stD.2', 'Sc%', 'TO%', 'Time', 'Plays', 'Yds.4', 'Pts']
OPP_FACTORS = ['PF', 'Yds', 'Y/P', 'TO', '1stD', 'Cmp', 'Att', 'Yds.1', 'TD', 'Int', 'NY/A', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', '1stD.2', 'Sc%', 'TO%', 'Time', 'Plays', 'Yds.4', 'Pts']
OFF_RANK_FACTORS = ['PF', 'Yds', 'TO', '1stD', 'Att', 'Yds.1', 'TD', 'Int', 'NY/A', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', 'Sc%', 'TO%', 'Time', 'Plays', 'Yds.4', 'Pts']
DEF_RANK_FACTORS = ['PF', 'Yds', 'TO', '1stD', 'Att', 'Yds.1', 'TD', 'Int', 'NY/A', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', 'Sc%', 'TO%', 'Time', 'Plays', 'Yds.4', 'Pts']

SyntaxError: invalid syntax (<ipython-input-19-2a24d6678051>, line 1)

In [None]:
RESULTS FOR TESTING DIFFERENT DATA COLUMNS ON THE 2020 NFL DRAFT
TEST 0 - (53.12, 56.25, 80.0, 60.0, 25.0)
TEST 1 - (68.75, 81.25, 90.0, 80.0, 41.67)     1st 
TEST 2 - (62.5, 58.82, 90.0, 70.0, 33.33)      3rd
TEST 3 - (56.25, 46.67, 80.0, 60.0, 33.33)     4th
TEST 4 - (62.5, 66.67, 100.0, 60.0, 33.33)     2nd

In [None]:
RESULTS FOR TESTING DIFFERENT DATA COLUMNS OVER THE LAST 2 YEARS (include 2020)

Test 0 - (45.31, 51.2, 55.0, 55.0, 29.16)       ALL COLUMNS
Test 1 - (56.25, 69.79, 75.0, 60.0, 37.5)       1st
Test 2 - (56.25, 57.98, 65.0, 70.0, 37.5)       2nd
Test 3 - (51.56, 51.46, 65.0, 60.0, 33.33)      4th
Test 4 - (56.25, 53.34, 90.0, 55.0, 29.16)      3rd

In [None]:
RESULTS FOR TESTING DIFFERENT DATA COLUMNS OVER THE LAST 3 YEARS (include 2020)

Test 0 - (43.75, 44.55, 46.67, 50.0, 36.11)       ALL COLUMNS
Test 1 - (53.13, 61.34, 60.0, 56.67, 44.44)      3rd
Test 2 - (59.37, 58.65, 60.0, 73.33, 47.22)      1st
Test 3 - (55.21, 53.91, 60.0, 66.67, 41.66)      4th
Test 4 - (57.29, 53.1, 73.33, 56.67, 44.44)      2nd

In [None]:
RESULTS FOR TESTING DIFFERENT DATA COLUMNS OVER THE LAST 2 YEARS (not include 2020)

Test 0 - (39.06, 38.7, 30.0, 45.0, 41.66)       ALL COLUMNS
Test 1 - (45.31, 51.39, 45.0, 45.0, 45.83)      4th
Test 2 - (57.81, 58.57, 45.0, 75.0, 54.17)      1st
Test 3 - (54.69, 57.53, 50.0, 70.0, 45.83)      2nd
Test 4 - (54.69, 46.31, 60.0, 55.0, 50.0)       3rd

In [None]:
RESULTS FOR TESTING DIFFERENT DATA COLUMNS OVER THE LAST 3 YEARS (not include 2020)

Test 0 - (43.75, 47.23, 36.67, 46.67, 47.22)       ALL COLUMNS
Test 1 - (43.75, 56.48, 40.0, 46.67, 44.44)     4th
Test 2 - (56.25, 68.21, 46.67, 70.0, 52.78)     1st
Test 3 - (55.21, 58.87, 50.0, 66.67, 50.0)      2nd
Test 4 - (51.04, 41.99, 53.33, 50.0, 50.0)      3rd