In [None]:
# OVERVIEW

# Background: 
# The 1st Round of the NFL Draft is a phenomenon that can make or break a team's next season.
# We will experiment with the theory that if a team did well on Offense and poorly on Defense in the previous season, 
# then they would use their highest pick on a player that can improve their Defense (or vice-versa)

# Goal: Predict whether or not a team will draft an Offensive or Defensive player in the 1st Round of the NFL Draft

# Data Points:
# Dependent Variable (target): The unit of the position each team drafted in the 1st Round (Offense=OFF or Defense=DEF)
# (teams can have multiple 1st round picks)
# Independent Variables (features): A team's stats and league-wide rankings for the previous season

# Example:
# Bengals 2020 Draft QB=Offense=OFF - Bengals 2019 total passing yards, total rushing yards, turnovers, points for/against, etc.
# Bengals 2019 Stats - https://www.pro-football-reference.com/teams/cin/2019.htm

# Data Scraped From:
# 1st Round Draft History Data - http://www.drafthistory.com/index.php/rounds/round_1
# Team Stats and Rankings - https://www.pro-football-reference.com

In [1]:
# IMPORTS
# DataFrame
import pandas as pd
import numpy as np

# Web Scraping
from requests import get
from bs4 import BeautifulSoup

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# Math
import math

In [2]:
# CONSTANTS

# Most recent draft class in the historical data, and the number of draft classes
# it spans (YEARS is also part of the CSV file names used below).
CURRENT_YEAR = 2020
YEARS = 20

# NOTE(review): not referenced by the cells visible in this notebook; the KNN cells
# pass n_neighbors=3 directly.
NUM_NEIGHBORS = 5

# Team nickname -> three-letter team code (presumably the pro-football-reference
# URL slug, e.g. .../teams/cin/2019.htm - confirm against the scraper).
TEAM_DICT = {
    'Bengals': 'cin', 'Redskins': 'was', 'Lions': 'det', 'Giants': 'nyg',
    'Dolphins': 'mia', 'Chargers': 'sdg', 'Panthers': 'car', 'Cardinals': 'crd',
    'Jaguars': 'jax', 'Browns': 'cle', 'Jets': 'nyj', 'Raiders': 'rai',
    'Buccaneers': 'tam', '49ers': 'sfo', 'Broncos': 'den', 'Falcons': 'atl',
    'Cowboys': 'dal', 'Eagles': 'phi', 'Vikings': 'min', 'Saints': 'nor',
    'Packers': 'gnb', 'Seahawks': 'sea', 'Ravens': 'rav', 'Titans': 'oti',
    'Chiefs': 'kan', 'Colts': 'clt', 'Texans': 'htx', 'Bears': 'chi',
    'Steelers': 'pit', 'Rams': 'ram', 'Bills': 'buf', 'Patriots': 'nwe',
}

# Ranking columns listed as empty.
# NOTE(review): 'Off. Rank 1stD' and 'Def. Rank 1stD' each appear twice; kept as-is.
EMPTY_COLS = [
    'Off. Rank Ply', 'Off. Rank Y/P', 'Off. Rank Cmp',
    'Off. Rank 1stD', 'Off. Rank 1stD', 'Off. Rank Pen',
    'Off. Rank Yds', 'Off. Rank 1stPy', 'Off. Rank #Dr',
    'Def. Rank Ply', 'Def. Rank Y/P', 'Def. Rank Cmp',
    'Def. Rank 1stD', 'Def. Rank 1stD', 'Def. Rank Pen',
    'Def. Rank Yds', 'Def. Rank 1stPy', 'Def. Rank #Dr',
    'Off. Rank 1stD.1', 'Off. Rank 1stD.2', 'Off. Rank Yds.3',
    'Def. Rank 1stD.1', 'Def. Rank 1stD.2', 'Def. Rank Yds.3',
]

# Position code -> unit (1 = offensive positions, 2 = defensive positions).
# The single-letter positions are keyed with a leading space (' C', ' G', ' T').
UNIT_DICT = {
    'QB': 1, 'RB': 1, 'WR': 1, 'TE': 1, ' C': 1, ' G': 1, ' T': 1,
    'DT': 2, 'DE': 2, 'LB': 2, 'DB': 2,
}

# Position code -> distinct integer id per position.
POS_DICT = {
    'QB': 1, 'RB': 2, 'WR': 3, 'TE': 4, ' C': 5, ' G': 6, ' T': 7,
    'DT': 8, 'DE': 9, 'LB': 10, 'DB': 11,
}

# Identifying columns kept in front of the stat columns.
BASIC_FACTORS = ['Position Drafted', 'Team', 'Draft Year', 'Pick Number']

# Stat column suffixes; the cleaning step prefixes them with 'Team ' / 'Opp. '.
TEAM_FACTORS = [
    'PF', 'Yds', 'Ply', 'Y/P', 'TO', 'FL', '1stD', 'Cmp', 'Att', 'Yds.1',
    'TD', 'Int', 'NY/A', '1stD.1', 'Att.1', 'Yds.2', 'TD.1', 'Y/A', '1stD.2', 'Pen',
    'Yds.3', '1stPy', '#Dr', 'Sc%', 'TO%', 'Start', 'Time', 'Plays', 'Yds.4', 'Pts',
]
# Same suffixes as TEAM_FACTORS, held in an independent list.
OPP_FACTORS = list(TEAM_FACTORS)

# Ranking column suffixes; the cleaning step prefixes them with 'Off. Rank ' / 'Def. Rank '.
OFF_RANK_FACTORS = [
    'PF', 'Yds', 'TO', 'FL', '1stD', 'Yds.1', 'TD', 'Int', 'NY/A', 'Att.1',
    'Yds.2', 'TD.1', 'Y/A', 'Sc%', 'TO%', 'Start', 'Time', 'Plays', 'Yds.4', 'Pts',
]
# Same suffixes as OFF_RANK_FACTORS, held in an independent list.
DEF_RANK_FACTORS = list(OFF_RANK_FACTORS)

# Team nickname for each of the 32 upcoming first-round picks, in pick order
# (a team appears more than once when it holds several picks).
NEXT_DRAFT_ORDER = [
    'Jaguars', 'Jets', 'Dolphins', 'Falcons', 'Bengals', 'Eagles', 'Lions', 'Panthers',
    'Broncos', 'Cowboys', 'Giants', '49ers', 'Chargers', 'Vikings', 'Patriots', 'Cardinals',
    'Raiders', 'Dolphins', 'Redskins', 'Bears', 'Colts', 'Titans', 'Jets', 'Steelers',
    'Jaguars', 'Browns', 'Ravens', 'Saints', 'Packers', 'Bills', 'Buccaneers', 'Chiefs',
]

In [3]:
# Load the raw draft/stat table from its CSV (the file name carries the number of
# draft classes, YEARS). Cells without a value are read back as NaN.
raw = pd.read_csv(f'Raw Data For Last {YEARS} Drafts.csv', encoding='utf-8')
raw

Unnamed: 0,Position Drafted,Team,Draft Year,Pick Number,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,Team FL,...,Def. Rank Yds.3,Def. Rank 1stPy,Def. Rank #Dr,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
0,QB,Bengals,2020,1,279.0,5169.0,1049.0,4.9,30.0,14.0,...,,,,9.0,27.0,12.0,18.0,15.0,28.0,22.0
1,DE,Redskins,2020,2,266.0,4395.0,885.0,5.0,21.0,8.0,...,,,,4.0,15.0,31.0,31.0,29.0,29.0,30.0
2,DB,Lions,2020,3,341.0,5549.0,1021.0,5.4,23.0,8.0,...,,,,7.0,25.0,3.0,21.0,28.0,30.0,24.0
3,T,Giants,2020,4,341.0,5416.0,1012.0,5.4,33.0,16.0,...,,,,8.0,29.0,25.0,19.0,17.0,20.0,25.0
4,QB,Dolphins,2020,5,306.0,4960.0,1022.0,4.9,26.0,8.0,...,,,,1.0,28.0,27.0,20.0,20.0,26.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,RB,Vikings,2001,27,397.0,5961.0,958.0,6.2,28.0,10.0,...,,,,6.0,31.0,7.0,30.0,30.0,30.0,27.0
632,DB,Raiders,2001,28,479.0,5776.0,1023.0,5.6,20.0,9.0,...,,,,24.0,7.0,1.0,5.0,15.0,18.0,11.0
633,DE,Rams,2001,29,540.0,7075.0,1014.0,7.0,35.0,12.0,...,,,,2.0,22.0,25.0,15.0,14.0,24.0,30.0
634,WR,Colts,2001,30,429.0,6141.0,1026.0,6.0,29.0,14.0,...,,,,12.0,23.0,3.0,25.0,28.0,25.0,19.0


In [4]:
# Data Cleaning
def clean_raw_df(raw, save, keep_id, keep_position_drafted=True) -> 'DataFrame':
    """Turn the raw scraped table into the numeric table used for modelling.

    Steps, in order: convert the string-valued columns to numbers, keep only the
    configured identifying/stat columns, add an ``ID`` column taken from the
    frame's index, optionally save a reference copy to CSV, and optionally drop
    the identifying columns.

    Parameters
    ----------
    raw : DataFrame
        Raw table. Depending on ``_convert_string_data`` it may be modified in
        place; pass a copy if the original must be preserved.
    save : bool
        When truthy, the cleaned frame (still holding the identifying columns)
        is written out via ``_save_reference_df``.
    keep_id : bool
        When falsy, 'Pick Number', 'Team' and 'Draft Year' are dropped via
        ``_remove_id_columns``. The ``ID`` column itself is always kept.
    keep_position_drafted : bool, default True
        Forwarded to ``_convert_string_data``; when truthy the drafted position
        is replaced by its numeric unit code.

    Returns
    -------
    DataFrame
        The cleaned frame.
    """
    # Convert non-numerical data types to numerical
    cleaned = _convert_string_data(raw, keep_position_drafted)

    # Keep only the columns of data that we want
    cleaned = _isolate_key_columns(cleaned)

    # Create an ID column for later reference
    cleaned = _add_id_column(cleaned)

    # Save a copy so stats can later be traced back to team/position/year
    if save:
        _save_reference_df(cleaned)

    # Remove identifying columns
    if not keep_id:
        cleaned = _remove_id_columns(cleaned)

    return cleaned

def _convert_string_data(df, keep) -> 'DataFrame':
    """Return a copy of ``df`` with its string-valued columns converted to numbers.

    * 'Team Start' / 'Opp. Start': the first four characters are discarded and the
      remainder is parsed as a float (assumes a 4-character prefix such as
      'Own ' - TODO confirm against the scraped data).
    * 'Team Time' / 'Opp. Time': 'M:SS' (or 'MM:SS') strings become seconds, as a float.
    * 'Position Drafted': replaced by its ``UNIT_DICT`` code, only when ``keep`` is truthy.

    The input frame is left untouched, so the caller can clean the same raw frame
    more than once.

    Raises
    ------
    KeyError
        If ``keep`` is truthy and a position is not a key of ``UNIT_DICT``.
    ValueError
        If a start or time value does not have the expected format.
    """
    # Work on a copy: assigning into the caller's frame would make a second
    # cleaning pass fail on already-converted values.
    df = df.copy()

    def _start_to_float(value):
        return float(str(value)[4:].strip())

    def _time_to_seconds(value):
        # Split on the colon rather than slicing fixed positions, so two-digit
        # minute values such as '10:05' are handled as well.
        minutes, _, seconds = str(value).strip().partition(':')
        return float(int(minutes) * 60 + int(seconds))

    for column in ('Team Start', 'Opp. Start'):
        df[column] = [_start_to_float(value) for value in df[column]]

    for column in ('Team Time', 'Opp. Time'):
        df[column] = [_time_to_seconds(value) for value in df[column]]

    if keep:
        df['Position Drafted'] = [UNIT_DICT[position] for position in df['Position Drafted']]

    return df
    
def _isolate_key_columns(df):
    """Return ``df`` restricted to the identifying columns plus the configured stat columns.

    Column order: ``BASIC_FACTORS``, then every ``TEAM_FACTORS`` entry prefixed
    with 'Team ', ``OPP_FACTORS`` with 'Opp. ', ``OFF_RANK_FACTORS`` with
    'Off. Rank ' and ``DEF_RANK_FACTORS`` with 'Def. Rank '.
    """
    prefixed_groups = (
        ('Team ', TEAM_FACTORS),
        ('Opp. ', OPP_FACTORS),
        ('Off. Rank ', OFF_RANK_FACTORS),
        ('Def. Rank ', DEF_RANK_FACTORS),
    )
    stat_columns = [prefix + stat for prefix, stats in prefixed_groups for stat in stats]
    return df[BASIC_FACTORS + stat_columns]

def _add_id_column(df) -> 'DataFrame':
    """Return a new frame whose former index is exposed as a leading 'ID' column."""
    # reset_index() moves the index into a column named 'index'; rename it to 'ID'.
    return df.reset_index().rename(columns={'index': 'ID'})
    
def _save_reference_df(df):
    """Write a snapshot of ``df`` to the 'Clean Reference' CSV for the last ``YEARS`` drafts.

    The file is written without the index and encoded as UTF-8 with a BOM
    ('utf-8-sig'). Nothing is returned.
    """
    target = f'Clean Reference For Last {YEARS} Drafts.csv'
    df.copy(deep=True).to_csv(target, encoding='utf-8-sig', index=False)
    
def _remove_id_columns(df):
    """Return ``df`` without the 'Pick Number', 'Team' and 'Draft Year' columns.

    Raises ``KeyError`` if any of the three columns is missing. The 'ID' column
    is not touched.
    """
    identifying = ['Pick Number', 'Team', 'Draft Year']
    return df.drop(identifying, axis=1)

In [5]:
# All-numeric frame that the model inputs are built from. keep_id=False drops
# Team / Draft Year / Pick Number; the ID column is still present here and is
# removed from the features before fitting.
# A copy of `raw` is passed so that cleaning can never modify `raw` itself, which
# keeps this cell safe to re-run.
clean = clean_raw_df(raw.copy(), save=True, keep_id=False)
clean

Unnamed: 0,ID,Position Drafted,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,Team FL,Team 1stD,Team Cmp,...,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
0,0,1,279.0,5169.0,1049.0,4.9,30.0,14.0,312.0,356.0,...,32.0,27.0,25.0,9.0,27.0,12.0,18.0,15.0,28.0,22.0
1,1,2,266.0,4395.0,885.0,5.0,21.0,8.0,248.0,298.0,...,31.0,17.0,26.0,4.0,15.0,31.0,31.0,29.0,29.0,30.0
2,2,2,341.0,5549.0,1021.0,5.4,23.0,8.0,313.0,344.0,...,21.0,14.0,10.0,7.0,25.0,3.0,21.0,28.0,30.0,24.0
3,3,1,341.0,5416.0,1012.0,5.4,33.0,16.0,311.0,376.0,...,20.0,28.0,4.0,8.0,29.0,25.0,19.0,17.0,20.0,25.0
4,4,1,306.0,4960.0,1022.0,4.9,26.0,8.0,315.0,371.0,...,27.0,21.0,22.0,1.0,28.0,27.0,20.0,20.0,26.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,631,1,397.0,5961.0,958.0,6.2,28.0,10.0,319.0,307.0,...,15.0,25.0,27.0,6.0,31.0,7.0,30.0,30.0,30.0,27.0
632,632,2,479.0,5776.0,1023.0,5.6,20.0,9.0,337.0,284.0,...,5.0,5.0,14.0,24.0,7.0,1.0,5.0,15.0,18.0,11.0
633,633,2,540.0,7075.0,1014.0,7.0,35.0,12.0,380.0,380.0,...,13.0,27.0,25.0,2.0,22.0,25.0,15.0,14.0,24.0,30.0
634,634,1,429.0,6141.0,1026.0,6.0,29.0,14.0,357.0,357.0,...,25.0,16.0,23.0,12.0,23.0,3.0,25.0,28.0,25.0,19.0


In [6]:
# Lookup table (still holding Team / Draft Year / Pick Number) used to attach the
# team, year and drafted unit to each prediction. Read back from the CSV saved
# during cleaning.
reference = pd.read_csv(f'Clean Reference For Last {YEARS} Drafts.csv', encoding='utf-8')
reference

Unnamed: 0,ID,Position Drafted,Team,Draft Year,Pick Number,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,...,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
0,0,1,Bengals,2020,1,279.0,5169.0,1049.0,4.9,30.0,...,32.0,27.0,25.0,9.0,27.0,12.0,18.0,15.0,28.0,22.0
1,1,2,Redskins,2020,2,266.0,4395.0,885.0,5.0,21.0,...,31.0,17.0,26.0,4.0,15.0,31.0,31.0,29.0,29.0,30.0
2,2,2,Lions,2020,3,341.0,5549.0,1021.0,5.4,23.0,...,21.0,14.0,10.0,7.0,25.0,3.0,21.0,28.0,30.0,24.0
3,3,1,Giants,2020,4,341.0,5416.0,1012.0,5.4,33.0,...,20.0,28.0,4.0,8.0,29.0,25.0,19.0,17.0,20.0,25.0
4,4,1,Dolphins,2020,5,306.0,4960.0,1022.0,4.9,26.0,...,27.0,21.0,22.0,1.0,28.0,27.0,20.0,20.0,26.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,631,1,Vikings,2001,27,397.0,5961.0,958.0,6.2,28.0,...,15.0,25.0,27.0,6.0,31.0,7.0,30.0,30.0,30.0,27.0
632,632,2,Raiders,2001,28,479.0,5776.0,1023.0,5.6,20.0,...,5.0,5.0,14.0,24.0,7.0,1.0,5.0,15.0,18.0,11.0
633,633,2,Rams,2001,29,540.0,7075.0,1014.0,7.0,35.0,...,13.0,27.0,25.0,2.0,22.0,25.0,15.0,14.0,24.0,30.0
634,634,1,Colts,2001,30,429.0,6141.0,1026.0,6.0,29.0,...,25.0,16.0,23.0,12.0,23.0,3.0,25.0,28.0,25.0,19.0


In [7]:
# Split into Train / Test Dataframes to Test Model Accuracy on 2020 NFL Draft

def split_dfs_for_testing(final_df, n_test=32) -> list:
    """Split ``final_df`` by position into a training and a test frame.

    The first ``n_test`` rows become the test frame and every remaining row the
    training frame. With the default of 32 this holds out the rows of the most
    recent first round, provided the frame is ordered newest draft first.

    Parameters
    ----------
    final_df : DataFrame
        Frame to split; it is not modified.
    n_test : int, default 32
        Number of leading rows to hold out. Must not be negative.

    Returns
    -------
    list
        ``[train_df, test_df]``. Both shapes are also printed.

    Raises
    ------
    ValueError
        If ``n_test`` is negative (a negative slice bound would silently select
        the wrong rows).
    """
    if n_test < 0:
        raise ValueError('n_test must be non-negative, got ' + str(n_test))

    test_df = final_df.iloc[:n_test]
    train_df = final_df.iloc[n_test:]
    print(train_df.shape, test_df.shape)

    return [train_df, test_df]

In [8]:
# Hold out the first 32 rows as the test set; everything else is training data.
combine = split_dfs_for_testing(clean)
train_df, test_df = combine
# The row ID is only a reference key, so it is not kept as a training column.
train_df = train_df.drop(columns=['ID'])
train_df

(604, 102) (32, 102)


Unnamed: 0,Position Drafted,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,Team FL,Team 1stD,Team Cmp,Team Att,...,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
32,1,225.0,3865.0,902.0,4.3,28.0,10.0,239.0,283.0,495.0,...,32.0,32.0,27.0,13.0,26.0,25.0,28.0,21.0,10.0,22.0
33,2,342.0,5769.0,1003.0,5.8,32.0,12.0,344.0,331.0,532.0,...,14.0,15.0,7.0,8.0,32.0,30.0,16.0,19.0,9.0,26.0
34,2,333.0,4787.0,971.0,4.9,30.0,11.0,258.0,299.0,524.0,...,26.0,21.0,19.0,9.0,21.0,29.0,1.0,2.0,8.0,19.0
35,2,290.0,5379.0,995.0,5.4,24.0,14.0,300.0,382.0,556.0,...,30.0,21.0,22.0,2.0,23.0,27.0,18.0,17.0,29.0,32.0
36,2,396.0,6648.0,1055.0,6.3,35.0,9.0,388.0,408.0,625.0,...,24.0,29.0,20.0,6.0,28.0,22.0,10.0,7.0,26.0,29.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,1,397.0,5961.0,958.0,6.2,28.0,10.0,319.0,307.0,495.0,...,15.0,25.0,27.0,6.0,31.0,7.0,30.0,30.0,30.0,27.0
632,2,479.0,5776.0,1023.0,5.6,20.0,9.0,337.0,284.0,475.0,...,5.0,5.0,14.0,24.0,7.0,1.0,5.0,15.0,18.0,11.0
633,2,540.0,7075.0,1014.0,7.0,35.0,12.0,380.0,380.0,587.0,...,13.0,27.0,25.0,2.0,22.0,25.0,15.0,14.0,24.0,30.0
634,1,429.0,6141.0,1026.0,6.0,29.0,14.0,357.0,357.0,571.0,...,25.0,16.0,23.0,12.0,23.0,3.0,25.0,28.0,25.0,19.0


In [9]:
# Remove the label from the test rows. The frame is rebuilt from the split result
# (combine[1]) instead of from test_df itself, so re-running this cell does not
# fail on an already-dropped column.
test_df = combine[1].drop(columns='Position Drafted')
test_df

Unnamed: 0,ID,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,Team FL,Team 1stD,Team Cmp,Team Att,...,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
0,0,279.0,5169.0,1049.0,4.9,30.0,14.0,312.0,356.0,616.0,...,32.0,27.0,25.0,9.0,27.0,12.0,18.0,15.0,28.0,22.0
1,1,266.0,4395.0,885.0,5.0,21.0,8.0,248.0,298.0,479.0,...,31.0,17.0,26.0,4.0,15.0,31.0,31.0,29.0,29.0,30.0
2,2,341.0,5549.0,1021.0,5.4,23.0,8.0,313.0,344.0,571.0,...,21.0,14.0,10.0,7.0,25.0,3.0,21.0,28.0,30.0,24.0
3,3,341.0,5416.0,1012.0,5.4,33.0,16.0,311.0,376.0,607.0,...,20.0,28.0,4.0,8.0,29.0,25.0,19.0,17.0,20.0,25.0
4,4,306.0,4960.0,1022.0,4.9,26.0,8.0,315.0,371.0,615.0,...,27.0,21.0,22.0,1.0,28.0,27.0,20.0,20.0,26.0,32.0
5,5,337.0,5879.0,997.0,5.9,31.0,11.0,349.0,394.0,597.0,...,18.0,21.0,15.0,15.0,32.0,23.0,29.0,21.0,15.0,18.0
6,6,340.0,5469.0,1077.0,5.1,35.0,14.0,335.0,382.0,633.0,...,29.0,32.0,32.0,5.0,23.0,24.0,5.0,3.0,14.0,28.0
7,7,361.0,5467.0,1000.0,5.5,18.0,6.0,314.0,355.0,554.0,...,24.0,5.0,20.0,3.0,22.0,7.0,32.0,31.0,32.0,29.0
8,8,300.0,5468.0,1020.0,5.4,20.0,12.0,298.0,364.0,589.0,...,28.0,31.0,31.0,13.0,24.0,21.0,9.0,10.0,22.0,23.0
9,9,335.0,5455.0,973.0,5.6,28.0,7.0,305.0,318.0,539.0,...,30.0,28.0,30.0,17.0,17.0,13.0,13.0,11.0,17.0,21.0


In [10]:
def convert_df_to_weighted_df(df, stats_from=None):
    """Standardize every column of ``df`` except the first one (z-scores).

    The first column (the label or the row ID in this notebook) is passed through
    with its values unchanged. Every other column becomes
    ``(value - mean) / std``, using the sample standard deviation (pandas'
    default, ddof=1). A column with zero spread is mapped to 0.0 instead of
    dividing by zero.

    Parameters
    ----------
    df : DataFrame
        All-numeric frame; it is not modified. Any index is supported.
    stats_from : DataFrame, optional
        Frame whose column means and standard deviations are used instead of
        ``df``'s own, matched by column name. It must contain every column of
        ``df`` after the first. Use this to scale held-out rows with the
        statistics of the training rows.

    Returns
    -------
    DataFrame
        New float frame with the same columns and index as ``df``.
    """
    weighted_df = df.astype(float)
    feature_cols = list(weighted_df.columns[1:])
    if not feature_cols:
        return weighted_df

    stats_source = weighted_df if stats_from is None else stats_from
    means = stats_source[feature_cols].mean(axis=0, skipna=True)
    # A constant column has std 0; dividing by 1 instead leaves it at 0.0.
    stds = stats_source[feature_cols].std(axis=0, skipna=True).replace(0, 1)

    weighted_df[feature_cols] = (weighted_df[feature_cols] - means) / stds
    return weighted_df

In [11]:
# Standardize the feature columns of both splits.
# NOTE(review): each frame is scaled with its own column means/stds, so the test
# rows are not on the same scale as the training rows; scaling the test rows
# with the training statistics would be the usual approach.
train_df, test_df = convert_df_to_weighted_df(train_df), convert_df_to_weighted_df(test_df)
train_df

Unnamed: 0,Position Drafted,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,Team FL,Team 1stD,Team Cmp,Team Att,...,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
32,1.0,-1.647166,-2.215272,-2.272113,-1.777888,0.214048,-0.211270,-1.697137,-0.940876,-0.773119,...,1.612923,1.666576,1.113280,-0.258618,0.978279,0.883752,1.210413,0.416338,-0.793731,0.485588
33,2.0,-0.011434,0.708710,-0.136331,1.032242,0.793800,0.327157,1.061820,0.045059,-0.144795,...,-0.349882,-0.168731,-1.083457,-0.798373,1.636153,1.430105,-0.091810,0.198333,-0.902098,0.918176
34,2.0,-0.137260,-0.799352,-0.813014,-0.653836,0.503924,0.057943,-1.197897,-0.612231,-0.280649,...,0.958654,0.479024,0.234585,-0.690422,0.430051,1.320834,-1.719587,-1.654703,-1.010464,0.161146
35,2.0,-0.738426,0.109785,-0.305502,0.282874,-0.365705,0.865583,-0.094314,1.092616,0.262767,...,1.394833,0.479024,0.564096,-1.446079,0.649342,1.102293,0.125227,-0.019671,1.265233,1.567059
36,2.0,0.743519,2.058595,0.963280,1.968952,1.228614,-0.480483,2.217954,1.626664,1.434508,...,0.740565,1.342698,0.344422,-1.014275,1.197570,0.555941,-0.742921,-1.109692,0.940134,1.242618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,1.0,0.757499,1.003565,-1.087917,1.781610,0.214048,-0.211270,0.404925,-0.447908,-0.773119,...,-0.240837,0.910861,1.113280,-1.014275,1.526507,-1.083117,1.427450,1.397357,1.373600,1.026324
632,2.0,1.903909,0.719460,0.286596,0.657558,-0.945457,-0.480483,0.877889,-0.920336,-1.112754,...,-1.331284,-1.248323,-0.314599,0.928844,-1.104989,-1.738739,-1.285513,-0.237675,0.073201,-0.704031
633,2.0,2.756726,2.714341,0.096279,3.280346,1.228614,0.327157,2.007747,1.051535,0.789201,...,-0.458926,1.126780,0.893606,-1.446079,0.539696,0.883752,-0.200328,-0.346677,0.723400,1.350765
634,1.0,1.204879,1.279992,0.350035,1.406926,0.358986,0.865583,1.403405,0.579108,0.517493,...,0.849610,-0.060772,0.673933,-0.366569,0.649342,-1.520198,0.884857,1.179352,0.831767,0.161146


In [12]:
# Labels and feature matrices for the models: the label is split off the training
# frame, and the row ID is removed from the test features.
Y_train = train_df["Position Drafted"]
X_train = train_df.drop(columns="Position Drafted")
X_test = test_df.drop(columns="ID").copy()
X_train.shape, Y_train.shape, X_test.shape

((604, 100), (604,), (32, 100))

In [14]:
# Logistic regression. fit() returns the estimator, so construction and fitting are chained.
logreg = LogisticRegression(max_iter=1000).fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)  # Y_pred is overwritten by every model cell
# Accuracy on the rows the model was fitted on (training accuracy), in percent.
acc_log = round(100 * logreg.score(X_train, Y_train), 2)
acc_log

67.72

In [15]:
# One row per feature (every training column except the first, the label).
# The 'Correlation' column holds the fitted logistic-regression coefficients
# (logreg.coef_[0]), not correlation coefficients.
coeff_df = pd.DataFrame({'Feature': train_df.columns.delete(0)})
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

# Ten largest coefficients
coeff_df.sort_values(by='Correlation', ascending=False).iloc[:10]

Unnamed: 0,Feature,Correlation
44,Opp. Att.1,1.177419
38,Opp. Att,0.772528
47,Opp. Y/A,0.597795
26,Team Time,0.562363
10,Team TD,0.554151
22,Team #Dr,0.532513
66,Off. Rank TD,0.402271
98,Def. Rank Yds.4,0.379295
84,Def. Rank 1stD,0.329531
88,Def. Rank NY/A,0.292431


In [16]:
# Ten smallest (most negative) coefficients, still listed in descending order
coeff_df.sort_values('Correlation', ascending=False).iloc[-10:]

Unnamed: 0,Feature,Correlation
52,Opp. #Dr,-0.347176
70,Off. Rank Yds.2,-0.361266
23,Team Sc%,-0.362191
48,Opp. 1stD.2,-0.394779
97,Def. Rank Plays,-0.43475
92,Def. Rank Y/A,-0.435089
30,Opp. PF,-0.468484
89,Def. Rank Att.1,-0.564344
74,Off. Rank TO%,-0.639125
62,Off. Rank TO,-0.721202


In [17]:
# Support vector classifier with default settings.
svc = SVC().fit(X_train, Y_train)
Y_pred = svc.predict(X_test)  # Y_pred is overwritten by every model cell
# Accuracy on the rows the model was fitted on (training accuracy), in percent.
acc_svc = round(100 * svc.score(X_train, Y_train), 2)
acc_svc

74.5

In [18]:
# k-nearest neighbours with k=3.
# NOTE(review): the constants cell defines NUM_NEIGHBORS = 5, which is not used here - confirm which value is intended.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)  # Y_pred is overwritten by every model cell
# Accuracy on the rows the model was fitted on (training accuracy), in percent.
acc_knn = round(100 * knn.score(X_train, Y_train), 2)
acc_knn

73.68

In [19]:
# Gaussian naive Bayes with default settings.
gaussian = GaussianNB().fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)  # Y_pred is overwritten by every model cell
# Accuracy on the rows the model was fitted on (training accuracy), in percent.
acc_gaussian = round(100 * gaussian.score(X_train, Y_train), 2)
acc_gaussian

60.6

In [20]:
# Perceptron with default settings.
perceptron = Perceptron().fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)  # Y_pred is overwritten by every model cell
# Accuracy on the rows the model was fitted on (training accuracy), in percent.
acc_perceptron = round(100 * perceptron.score(X_train, Y_train), 2)
acc_perceptron

55.3

In [21]:
# Linear support vector classifier. random_state is pinned so that re-running the
# notebook reproduces the same fit and score.
linear_svc = LinearSVC(random_state=0)
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)  # Y_pred is overwritten by every model cell
# Accuracy on the rows the model was fitted on (training accuracy), in percent.
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc



68.71

In [22]:
# Linear classifier trained with stochastic gradient descent. random_state is
# pinned so that re-running the notebook reproduces the same fit and score.
sgd = SGDClassifier(random_state=0)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)  # Y_pred is overwritten by every model cell
# Accuracy on the rows the model was fitted on (training accuracy), in percent.
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

62.25

In [23]:
# Decision tree. random_state is pinned so that re-running the notebook
# reproduces the same tree and score.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)  # Y_pred is overwritten by every model cell
# Accuracy on the rows the tree was fitted on (training accuracy), in percent;
# this says little about how well the tree predicts unseen drafts.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

93.87

In [24]:
# Random forest of 100 trees. random_state is pinned so that re-running the
# notebook reproduces the same forest, score and predictions.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0)
random_forest.fit(X_train, Y_train)
# This is the last model cell, so these are the predictions left in Y_pred.
Y_pred = random_forest.predict(X_test)
# Accuracy on the rows the forest was fitted on (training accuracy), in percent.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

93.87

In [25]:
# Side-by-side comparison of the scores computed in the model cells above.
# Each score is the accuracy on the training rows, not on held-out data.
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
              'Random Forest', 'Naive Bayes', 'Perceptron',
              'Stochastic Gradient Descent', 'Linear SVC',
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log,
              acc_random_forest, acc_gaussian, acc_perceptron,
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,93.87
8,Decision Tree,93.87
0,Support Vector Machines,74.5
1,KNN,73.68
7,Linear SVC,68.71
2,Logistic Regression,67.72
6,Stochastic Gradient Decent,62.25
4,Naive Bayes,60.6
5,Perceptron,55.3


In [None]:
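# Y_pred currently holds the predictions of the last model cell that was run
# (the random forest, in the order the cells appear above).
# Uncomment the next line to evaluate the decision tree's predictions instead:
# Y_pred = decision_tree.predict(X_test)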

In [26]:
# Put the predictions next to the first 32 reference rows (team, year, pick and
# the unit actually drafted). Y_pred is whatever the most recently run model
# cell left behind.
submission = pd.concat(
    [pd.DataFrame({"Prediction": Y_pred}), reference.iloc[:32]],
    axis=1,
)
submission

Unnamed: 0,Prediction,ID,Position Drafted,Team,Draft Year,Pick Number,Team PF,Team Yds,Team Ply,Team Y/P,...,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
0,1.0,0,1,Bengals,2020,1,279.0,5169.0,1049.0,4.9,...,32.0,27.0,25.0,9.0,27.0,12.0,18.0,15.0,28.0,22.0
1,2.0,1,2,Redskins,2020,2,266.0,4395.0,885.0,5.0,...,31.0,17.0,26.0,4.0,15.0,31.0,31.0,29.0,29.0,30.0
2,1.0,2,2,Lions,2020,3,341.0,5549.0,1021.0,5.4,...,21.0,14.0,10.0,7.0,25.0,3.0,21.0,28.0,30.0,24.0
3,1.0,3,1,Giants,2020,4,341.0,5416.0,1012.0,5.4,...,20.0,28.0,4.0,8.0,29.0,25.0,19.0,17.0,20.0,25.0
4,1.0,4,1,Dolphins,2020,5,306.0,4960.0,1022.0,4.9,...,27.0,21.0,22.0,1.0,28.0,27.0,20.0,20.0,26.0,32.0
5,1.0,5,1,Chargers,2020,6,337.0,5879.0,997.0,5.9,...,18.0,21.0,15.0,15.0,32.0,23.0,29.0,21.0,15.0,18.0
6,1.0,6,2,Panthers,2020,7,340.0,5469.0,1077.0,5.1,...,29.0,32.0,32.0,5.0,23.0,24.0,5.0,3.0,14.0,28.0
7,2.0,7,2,Cardinals,2020,8,361.0,5467.0,1000.0,5.5,...,24.0,5.0,20.0,3.0,22.0,7.0,32.0,31.0,32.0,29.0
8,2.0,8,2,Jaguars,2020,9,300.0,5468.0,1020.0,5.4,...,28.0,31.0,31.0,13.0,24.0,21.0,9.0,10.0,22.0,23.0
9,2.0,9,1,Browns,2020,10,335.0,5455.0,973.0,5.6,...,30.0,28.0,30.0,17.0,17.0,13.0,13.0,11.0,17.0,21.0


In [27]:
def test_unit_prediction(submission):
    correct_predictions = 0
    top_10 = 0
    mid_10 = 0
    last_12 = 0
    
    prediction_list = list(submission['Prediction'])
    actual_list = list(submission['Position Drafted'])
    length = len(prediction_list)
    
    for i in range(length):
        if prediction_list[i] == actual_list[i]:
            correct_predictions += 1
            if i <= 9:
                top_10 += 1
            elif i <= 19:
                mid_10 += 1
            elif i <= 31:
                last_12 += 1
    
    correct_pred_percent = round(correct_predictions/length, 4)*100
    top_10_accuracy = round(top_10 / 10, 4)*100
    mid_10_accuracy = round(mid_10 / 10, 4)*100
    last_12_accuracy = round(last_12 / 12, 4)*100
    
    print()
    print('Correct Predictions %: ', correct_pred_percent)
    print('Correct Top 10 Predictions %: ', top_10_accuracy)
    print('Correct Mid 10 Predictions %: ', mid_10_accuracy)
    print('Correct Last 12 Predictions %: ', last_12_accuracy)
    print()
    
    return (correct_pred_percent, top_10_accuracy, mid_10_accuracy, last_12_accuracy)


In [28]:
# Score the predictions in `submission` against the units that were actually drafted
test_unit_prediction(submission)


Correct Predictions %:  50.0
Correct Top 10 Predictions %:  70.0
Correct Mid 10 Predictions %:  70.0
Correct Last 12 Predictions %:  16.669999999999998



(50.0, 70.0, 70.0, 16.669999999999998)

In [29]:
# Team stats for the upcoming (CURRENT_YEAR + 1) draft, cleaned like the historical
# data but keeping the identifying columns. The drafted position is left
# unconverted (keep_position_drafted=False) and then removed.
reference2 = clean_raw_df(
    pd.read_csv(f'Raw Data For {CURRENT_YEAR + 1} Draft.csv', encoding='utf-8'),
    save=False,
    keep_id=True,
    keep_position_drafted=False,
).drop(columns='Position Drafted')
reference2

Unnamed: 0,ID,Team,Draft Year,Pick Number,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,Team FL,...,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
0,0,Jaguars,2021,1,292.0,4935.0,932.0,5.3,24.0,8.0,...,30.0,30.0,20.0,3.0,26.0,28.0,22.0,15.0,27.0,30.0
1,1,Jets,2021,2,229.0,4129.0,886.0,4.7,17.0,5.0,...,10.0,16.0,5.0,8.0,18.0,27.0,27.0,22.0,21.0,24.0
2,2,Dolphins,2021,3,378.0,4970.0,941.0,5.3,16.0,6.0,...,17.0,14.0,23.0,30.0,1.0,5.0,8.0,7.0,14.0,3.0
3,3,Falcons,2021,4,369.0,5510.0,1007.0,5.5,16.0,5.0,...,7.0,13.0,15.0,17.0,13.0,12.0,10.0,14.0,24.0,20.0
4,4,Bengals,2021,5,308.0,4921.0,1000.0,4.9,22.0,13.0,...,25.0,4.0,29.0,12.0,25.0,20.0,4.0,9.0,18.0,17.0
5,5,Eagles,2021,6,320.0,5138.0,1001.0,5.1,26.0,8.0,...,24.0,27.0,11.0,20.0,29.0,25.0,5.0,3.0,7.0,13.0
6,6,Lions,2021,7,342.0,5186.0,937.0,5.5,19.0,7.0,...,29.0,32.0,16.0,4.0,30.0,10.0,28.0,25.0,32.0,32.0
7,7,Giants,2021,8,257.0,4458.0,911.0,4.9,20.0,10.0,...,11.0,11.0,8.0,14.0,11.0,19.0,30.0,29.0,22.0,15.0
8,8,Panthers,2021,9,343.0,5272.0,933.0,5.7,16.0,5.0,...,16.0,16.0,28.0,6.0,9.0,23.0,25.0,31.0,25.0,25.0
9,9,Broncos,2021,10,292.0,4923.0,955.0,5.2,32.0,9.0,...,26.0,27.0,30.0,7.0,31.0,30.0,15.0,11.0,10.0,21.0


In [30]:
# Features for the upcoming draft: drop the identifying columns, then standardize.
# NOTE(review): these rows are scaled with their own column means/stds, not with
# the statistics of the training data.
test_df_2 = convert_df_to_weighted_df(
    reference2.drop(columns=['Team', 'Draft Year', 'Pick Number'])
)
test_df_2

Unnamed: 0,ID,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,Team FL,Team 1stD,Team Cmp,Team Att,...,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
0,0.0,-1.114695,-0.706427,-0.8083,-0.454606,0.95524,0.064635,-1.006668,0.462338,0.863772,...,1.558015,1.66972,0.420452,-1.459168,1.1081,1.174567,0.655789,-0.066015,1.237647,1.474145
1,1.0,-2.075869,-2.220987,-1.920279,-1.818424,-0.461805,-0.969521,-2.419817,-1.711978,-1.098988,...,-0.70305,0.127381,-1.233785,-0.93687,0.25108,1.068391,1.193872,0.712271,0.568272,0.842369
2,2.0,0.197384,-0.640658,-0.590739,-0.454606,-0.664239,-0.624802,-0.171625,-0.175777,-0.468744,...,0.088323,-0.092953,0.751299,1.361237,-1.570087,-1.26747,-0.850844,-0.955486,-0.212666,-1.368849
3,3.0,0.060073,0.37406,1.004709,0.0,-0.664239,-0.969521,0.567066,0.887747,1.043841,...,-1.04221,-0.203121,-0.13096,0.003264,-0.284557,-0.524242,-0.635611,-0.177199,0.90296,0.421184
4,4.0,-0.870587,-0.732734,0.835495,-1.363818,0.55037,1.788227,-0.492796,0.580507,0.611674,...,0.992749,-1.194624,1.412994,-0.519033,1.000973,0.325163,-1.281311,-0.733118,0.233584,0.105296
5,5.0,-0.687507,-0.324968,0.859668,-0.909212,1.360109,0.064635,-0.107391,-0.483017,0.701709,...,0.879696,1.339218,-0.57209,0.316643,1.429483,0.85604,-1.173694,-1.400221,-0.993604,-0.315888
6,6.0,-0.351858,-0.234771,-0.687433,0.0,-0.056935,-0.280084,0.181662,0.2969,0.449612,...,1.444962,1.890054,-0.020678,-1.354708,1.53661,-0.736593,1.301489,1.045822,1.79546,1.684738
7,7.0,-1.648681,-1.602761,-1.315942,-1.363818,0.1455,0.754072,-1.552658,-0.884793,-0.6128,...,-0.589997,-0.423455,-0.902938,-0.310114,-0.498812,0.218987,1.516722,1.490558,0.679834,-0.105296
8,8.0,-0.336602,-0.073168,-0.784126,0.454606,-0.664239,-0.969521,-0.171625,0.320534,-0.234653,...,-0.02473,0.127381,1.302712,-1.145789,-0.713067,0.643689,0.978639,1.712925,1.014522,0.947665
9,9.0,-1.114695,-0.728976,-0.25231,-0.681909,2.574719,0.409353,-1.263604,-1.168399,-0.198639,...,1.105802,1.339218,1.523277,-1.04133,1.643738,1.386918,-0.097528,-0.51075,-0.658916,0.52648


In [39]:
# New training set: the 32 previously held-out rows stacked on top of the old
# training rows (both already standardized), without the row ID.
train_df_2 = pd.concat([test_df, train_df]).drop(columns='ID')
# test_df carries no label, so its 32 rows come out of the concat with a missing
# 'Position Drafted'; take those labels from `clean` and keep the rest as they are.
train_df_2['Position Drafted'] = (
    list(clean['Position Drafted'][:32]) + list(train_df_2['Position Drafted'])[32:]
)
train_df_2

Unnamed: 0,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,Team FL,Team 1stD,Team Cmp,Team Att,Team Yds.1,...,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts,Position Drafted
0,-1.238250,-0.796147,0.781036,-1.388433,1.056882,1.406268,-0.380443,-0.028530,0.928962,-0.369800,...,1.106330,0.761136,-0.556763,0.805546,-0.551625,0.095755,-0.289075,1.146995,0.334064,1.0
1,-1.424615,-2.166853,-2.698486,-1.175850,-0.232872,-0.608683,-2.404204,-1.585913,-1.239605,-2.138523,...,-0.017561,0.869870,-1.117027,-0.412287,1.612168,1.469349,1.271235,1.264635,1.197907,2.0
2,-0.349434,-0.123191,0.186971,-0.325518,0.053740,-0.608683,-0.348822,-0.350747,0.216659,0.152394,...,-0.354728,-0.869870,-0.780869,0.602574,-1.576579,0.412738,1.159784,1.382276,0.550025,2.0
3,-0.349434,-0.358726,-0.003978,-0.325518,1.486799,2.077918,-0.412065,0.508499,0.786501,-0.203456,...,1.218719,-1.522273,-0.668816,1.008518,0.928865,0.201416,-0.066174,0.205871,0.658005,1.0
4,-0.851185,-1.166273,0.208188,-1.388433,0.483658,-0.608683,-0.285580,0.374242,0.913133,-0.049745,...,0.431996,0.434935,-1.453186,0.907032,1.156633,0.307077,0.268178,0.911714,1.413867,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,0.757499,1.003565,-1.087917,1.781610,0.214048,-0.211270,0.404925,-0.447908,-0.773119,0.498524,...,0.910861,1.113280,-1.014275,1.526507,-1.083117,1.427450,1.397357,1.373600,1.026324,1.0
632,1.903909,0.719460,0.286596,0.657558,-0.945457,-0.480483,0.877889,-0.920336,-1.112754,-0.327701,...,-1.248323,-0.314599,0.928844,-1.104989,-1.738739,-1.285513,-0.237675,0.073201,-0.704031,2.0
633,2.756726,2.714341,0.096279,3.280346,1.228614,0.327157,2.007747,1.051535,0.789201,2.697602,...,1.126780,0.893606,-1.446079,0.539696,0.883752,-0.200328,-0.346677,0.723400,1.350765,2.0
634,1.204879,1.279992,0.350035,1.406926,0.358986,0.865583,1.403405,0.579108,0.517493,1.205371,...,-0.060772,0.673933,-0.366569,0.649342,-1.520198,0.884857,1.179352,0.831767,0.161146,1.0


In [40]:
# Labels and feature matrices for the final models: train on every labelled row,
# predict for the upcoming draft's rows.
Y_train = train_df_2["Position Drafted"]
X_train = train_df_2.drop(columns="Position Drafted")
X_test = test_df_2.drop(columns="ID").copy()
X_train.shape, Y_train.shape, X_test.shape

((636, 100), (636,), (32, 100))

In [41]:
# Refit every classifier on the enlarged training set and compare their scores.

def _fit_and_score(model):
    """Fit ``model`` on X_train/Y_train and return its accuracy on that same data, in percent."""
    model.fit(X_train, Y_train)
    return round(model.score(X_train, Y_train) * 100, 2)

# Estimators that take a random_state are seeded so that re-running this cell
# reproduces the same fits, scores and predictions.
logreg = LogisticRegression(max_iter=1000)
acc_log = _fit_and_score(logreg)

svc = SVC()
acc_svc = _fit_and_score(svc)

knn = KNeighborsClassifier(n_neighbors = 3)
acc_knn = _fit_and_score(knn)

gaussian = GaussianNB()
acc_gaussian = _fit_and_score(gaussian)

perceptron = Perceptron()
acc_perceptron = _fit_and_score(perceptron)

linear_svc = LinearSVC(random_state=0)
acc_linear_svc = _fit_and_score(linear_svc)

sgd = SGDClassifier(random_state=0)
acc_sgd = _fit_and_score(sgd)

decision_tree = DecisionTreeClassifier(random_state=0)
acc_decision_tree = _fit_and_score(decision_tree)

random_forest = RandomForestClassifier(n_estimators=100, random_state=0)
acc_random_forest = _fit_and_score(random_forest)

# Y_pred holds the random forest's predictions for X_test.
Y_pred = random_forest.predict(X_test)

# Every score below is the accuracy on the training rows, not on held-out data.
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
              'Random Forest', 'Naive Bayes', 'Perceptron',
              'Stochastic Gradient Descent', 'Linear SVC',
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log,
              acc_random_forest, acc_gaussian, acc_perceptron,
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)



Unnamed: 0,Model,Score
3,Random Forest,93.4
8,Decision Tree,93.4
0,Support Vector Machines,74.37
1,KNN,73.43
7,Linear SVC,70.44
2,Logistic Regression,67.92
4,Naive Bayes,59.43
5,Perceptron,57.7
6,Stochastic Gradient Decent,53.93


In [43]:
# Put the predictions for the upcoming draft next to each pick's team, year,
# pick number and stats. Y_pred is whatever the most recently fitted model left behind.
submission = pd.concat(
    [pd.DataFrame({"Prediction": Y_pred}), reference2],
    axis=1,
)
submission

Unnamed: 0,Prediction,ID,Team,Draft Year,Pick Number,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,...,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
0,1.0,0,Jaguars,2021,1,292.0,4935.0,932.0,5.3,24.0,...,30.0,30.0,20.0,3.0,26.0,28.0,22.0,15.0,27.0,30.0
1,1.0,1,Jets,2021,2,229.0,4129.0,886.0,4.7,17.0,...,10.0,16.0,5.0,8.0,18.0,27.0,27.0,22.0,21.0,24.0
2,1.0,2,Dolphins,2021,3,378.0,4970.0,941.0,5.3,16.0,...,17.0,14.0,23.0,30.0,1.0,5.0,8.0,7.0,14.0,3.0
3,2.0,3,Falcons,2021,4,369.0,5510.0,1007.0,5.5,16.0,...,7.0,13.0,15.0,17.0,13.0,12.0,10.0,14.0,24.0,20.0
4,1.0,4,Bengals,2021,5,308.0,4921.0,1000.0,4.9,22.0,...,25.0,4.0,29.0,12.0,25.0,20.0,4.0,9.0,18.0,17.0
5,2.0,5,Eagles,2021,6,320.0,5138.0,1001.0,5.1,26.0,...,24.0,27.0,11.0,20.0,29.0,25.0,5.0,3.0,7.0,13.0
6,1.0,6,Lions,2021,7,342.0,5186.0,937.0,5.5,19.0,...,29.0,32.0,16.0,4.0,30.0,10.0,28.0,25.0,32.0,32.0
7,1.0,7,Giants,2021,8,257.0,4458.0,911.0,4.9,20.0,...,11.0,11.0,8.0,14.0,11.0,19.0,30.0,29.0,22.0,15.0
8,2.0,8,Panthers,2021,9,343.0,5272.0,933.0,5.7,16.0,...,16.0,16.0,28.0,6.0,9.0,23.0,25.0,31.0,25.0,25.0
9,1.0,9,Broncos,2021,10,292.0,4923.0,955.0,5.2,32.0,...,26.0,27.0,30.0,7.0,31.0,30.0,15.0,11.0,10.0,21.0
