In [None]:
# OVERVIEW
# Data
# What position each team drafted and what their stats for the previous season was
# Chargers 2020 Draft QB, LB - Chargers 2019 receiving, rushing, tackles, ints, points for, etc
    
# 1st Round Draft History Data - http://www.drafthistory.com/index.php/rounds/round_1
# Team Stats and Rankings - https://www.pro-football-reference.com/teams/cin/2019.htm

In [1]:
# IMPORTS
# Data
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# Web Scraping
from requests import get
from bs4 import BeautifulSoup

# Math
import math

In [2]:
# CONSTANTS
CURRENT_YEAR = 2020
YEARS = 20
TEAM_DICT = {'Bengals':'cin','Redskins':'was','Lions':'det','Giants':'nyg','Dolphins':'mia','Chargers':'sdg',
            'Panthers':'car','Cardinals':'crd','Jaguars':'jax','Browns':'cle','Jets':'nyj','Raiders':'rai',
             'Buccaneers':'tam','49ers':'sfo','Broncos':'den','Falcons':'atl','Cowboys':'dal','Eagles':'phi',
             'Vikings':'min','Saints':'nor','Packers':'gnb','Seahawks':'sea','Ravens':'rav','Titans':'oti',
             'Chiefs':'kan','Colts':'clt','Texans':'htx','Bears':'chi','Steelers':'pit','Rams':'ram','Bills':'buf',
             'Patriots':'nwe'}
# POS_DICT = {'QB':1,'RB':2,'WR':3,'TE':4,' C':5,' G':6,' T':7,'DT':8,'DE':9,'LB':10,'DB':11}
EMPTY_COLS = ['Off. Rank Ply', 'Off. Rank Y/P', 'Off. Rank Cmp', 'Off. Rank 1stD', 'Off. Rank 1stD', 'Off. Rank Pen',
             'Off. Rank Yds', 'Off. Rank 1stPy', 'Off. Rank #Dr', 'Def. Rank Ply', 'Def. Rank Y/P', 'Def. Rank Cmp',
             'Def. Rank 1stD', 'Def. Rank 1stD', 'Def. Rank Pen', 'Def. Rank Yds', 'Def. Rank 1stPy', 'Def. Rank #Dr',
             'Off. Rank 1stD.1', 'Off. Rank 1stD.2', 'Off. Rank Yds.3', 'Def. Rank 1stD.1', 'Def. Rank 1stD.2', 'Def. Rank Yds.3']
UNIT_DICT = {'QB':'OFF','RB':'OFF','WR':'OFF','TE':'OFF',' C':'OFF',' G':'OFF',' T':'OFF','DT':'DEF','DE':'DEF','LB':'DEF',
             'DB':'DEF'}
NUM_NEIGHBORS = 5
print(len(TEAM_DICT))

32


In [None]:
# WEB SCRAPING FUNCTIONS
def get_first_rounders() -> list:
    url = 'http://www.drafthistory.com/index.php/rounds/round_1'
    response = get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find(id='main')
    row_containers = table.table.find_all('tr')
    row_containers = row_containers[2:]https://www.youtube.com/playlist?list=WL
    labels = ['Year','Round','Pick','Player','Name','Team','Position','College']
    
    all_rows = []
    should_break = False
    for i in row_containers:
        new_row = []
        cols = i.find_all('td')
        for j in range(0, len(cols)):
            if j == 0:
                try:
                    year = int(cols[j].text)
                    if year == CURRENT_YEAR - YEARS:
                        should_break=True
                        break
                    new_row.append(cols[j].text)
                except:
                    new_row.append(year)
            else:
                new_row.append(cols[j].text)
        if should_break:
            break
        if new_row:
            all_rows.append(new_row)
    
    df = pd.DataFrame(all_rows,columns=labels)
    df.drop(columns=['Round','Player','College'], inplace=True)
    return df

def get_team_stats_by_year_url(team_name: str, year) -> str:
    base_url = 'https://www.pro-football-reference.com/teams/'
    end_url = '.htm'
    return base_url + TEAM_DICT[team_name] + '/' + str(year) + end_url

def get_team_stats(url) -> 'DataFrame':
    response = get(url)
    if response.status_code == 404:
        return pd.DataFrame()
    soup = BeautifulSoup(response.text,'html.parser')
    table_container = soup.find(id='all_team_stats')
    if table_container:
        print('Scraping', url)
    else:
        print(url)
    table = table_container.find(class_='table_outer_container').div.table
    
    label_containers = table.thead.find_all('tr')[1].find_all('th')
    labels = []
    for i in label_containers:
        labels.append(str(i.text))
        
    list_of_rows = []
    row_containers = table.tbody.find_all('tr')
    for i in row_containers:
        row = []
        row.append(i.th)
        stat_containers = i.find_all('td')
        for j in stat_containers:
            try: 
                num = float(j.text)
                row.append(num)
            except:
                row.append(str(j.text).strip())
        list_of_rows.append(row)
    
    return pd.DataFrame(list_of_rows,columns=labels)

def convert_team_stats_to_one_row(df) -> 'DataFrame':
    team_columns = []
    opp_columns = []
    off_rank_columns = []
    def_rank_columns = []
    
    for i in df.columns:
        team_columns.append('Team ' + i)
        opp_columns.append('Opp. ' + i)
        off_rank_columns.append('Off. Rank ' + i)
        def_rank_columns.append('Def. Rank ' + i)
        
    row1 = pd.DataFrame([list(df.loc[0])], columns=team_columns)
    row2 = pd.DataFrame([list(df.loc[1])], columns=opp_columns)
    row3 = pd.DataFrame([list(df.loc[2])], columns=off_rank_columns)
    row4 = pd.DataFrame([list(df.loc[3])], columns=def_rank_columns)
    
    new_df = pd.concat([row1, row2, row3, row4], axis=1).reindex(row1.index)
    return new_df

def make_final_one_row(df, draft_year, pick_number, team_name, position_drafted) -> 'DataFrame':
    df.drop(columns=['Team Player', 'Opp. Player', 'Off. Rank Player', 'Def. Rank Player'], inplace=True)
    df.insert(0, 'Pick Number', [pick_number], True)
    df.insert(0, 'Draft Year', [draft_year], True)
    df.insert(0, 'Team', [team_name], True)
    df.insert(0, 'Position Drafted', [position_drafted], True)
    return df

def get_final_df() -> 'DataFrame':
    first_rounders = get_first_rounders()
    frames = []
    for i in first_rounders.index:
        row = list(first_rounders.loc[i])

        draft_year = row[0]
        pick_number = row[1]
        team_name = row[3]
        position_drafted = row[4]

        stats_url = get_team_stats_by_year_url(team_name, int(draft_year)-1)
        team_stats_df = get_team_stats(stats_url)
        if team_stats_df.empty:
            print('Not Found', stats_url)
            continue
        else:
            pass
        one_row = convert_team_stats_to_one_row(team_stats_df)
        final_row = make_final_one_row(one_row, draft_year, pick_number, team_name, position_drafted)
        frames.append(final_row)
    final_df = pd.concat(frames)
    return final_df

In [None]:
final_df = get_final_df()
final_df.to_csv('Profile For Last ' + str(YEARS) + ' Drafts.csv', encoding = 'utf-8-sig', index=False)

In [None]:
final_df

In [28]:
final_df = pd.read_csv('Profile For Last ' + str(YEARS) + ' Drafts.csv', encoding = 'utf-8')
final_df

Unnamed: 0,Position Drafted,Team,Draft Year,Pick Number,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,Team FL,...,Def. Rank Yds.3,Def. Rank 1stPy,Def. Rank #Dr,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
0,QB,Bengals,2020,1,279.0,5169.0,1049.0,4.9,30.0,14.0,...,,,,9.0,27.0,12.0,18.0,15.0,28.0,22.0
1,DE,Redskins,2020,2,266.0,4395.0,885.0,5.0,21.0,8.0,...,,,,4.0,15.0,31.0,31.0,29.0,29.0,30.0
2,DB,Lions,2020,3,341.0,5549.0,1021.0,5.4,23.0,8.0,...,,,,7.0,25.0,3.0,21.0,28.0,30.0,24.0
3,T,Giants,2020,4,341.0,5416.0,1012.0,5.4,33.0,16.0,...,,,,8.0,29.0,25.0,19.0,17.0,20.0,25.0
4,QB,Dolphins,2020,5,306.0,4960.0,1022.0,4.9,26.0,8.0,...,,,,1.0,28.0,27.0,20.0,20.0,26.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,RB,Vikings,2001,27,397.0,5961.0,958.0,6.2,28.0,10.0,...,,,,6.0,31.0,7.0,30.0,30.0,30.0,27.0
632,DB,Raiders,2001,28,479.0,5776.0,1023.0,5.6,20.0,9.0,...,,,,24.0,7.0,1.0,5.0,15.0,18.0,11.0
633,DE,Rams,2001,29,540.0,7075.0,1014.0,7.0,35.0,12.0,...,,,,2.0,22.0,25.0,15.0,14.0,24.0,30.0
634,WR,Colts,2001,30,429.0,6141.0,1026.0,6.0,29.0,14.0,...,,,,12.0,23.0,3.0,25.0,28.0,25.0,19.0


In [29]:
def clean_final_df(final_df) -> 'DataFrame':
    
    team_start_list = []
    opp_start_list = []
    for i in list(final_df['Team Start']):
        team_start_list.append(float(str(i)[4:].strip()))
    for i in list(final_df['Opp. Start']):
        opp_start_list.append(float(str(i)[4:].strip()))
    
    final_df['Team Start'] = team_start_list
    final_df['Opp. Start'] = opp_start_list
    
    team_time_list = []
    opp_time_list = []
    for i in list(final_df['Team Time']):
        team_time_list.append(float((int(str(i)[0])*60)+int(str(i)[2:])))
    for i in list(final_df['Opp. Time']):
        opp_time_list.append(float((int(str(i)[0])*60)+int(str(i)[2:])))
        
    final_df['Team Time'] = team_time_list
    final_df['Opp. Time'] = opp_time_list
    
    for i in final_df:
        if i in EMPTY_COLS:
            final_df = final_df.drop(columns=i)
    
    return final_df

In [30]:
clean_final_df = clean_final_df(final_df)
clean_final_df

Unnamed: 0,Position Drafted,Team,Draft Year,Pick Number,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,Team FL,...,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
0,QB,Bengals,2020,1,279.0,5169.0,1049.0,4.9,30.0,14.0,...,32.0,27.0,25.0,9.0,27.0,12.0,18.0,15.0,28.0,22.0
1,DE,Redskins,2020,2,266.0,4395.0,885.0,5.0,21.0,8.0,...,31.0,17.0,26.0,4.0,15.0,31.0,31.0,29.0,29.0,30.0
2,DB,Lions,2020,3,341.0,5549.0,1021.0,5.4,23.0,8.0,...,21.0,14.0,10.0,7.0,25.0,3.0,21.0,28.0,30.0,24.0
3,T,Giants,2020,4,341.0,5416.0,1012.0,5.4,33.0,16.0,...,20.0,28.0,4.0,8.0,29.0,25.0,19.0,17.0,20.0,25.0
4,QB,Dolphins,2020,5,306.0,4960.0,1022.0,4.9,26.0,8.0,...,27.0,21.0,22.0,1.0,28.0,27.0,20.0,20.0,26.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,RB,Vikings,2001,27,397.0,5961.0,958.0,6.2,28.0,10.0,...,15.0,25.0,27.0,6.0,31.0,7.0,30.0,30.0,30.0,27.0
632,DB,Raiders,2001,28,479.0,5776.0,1023.0,5.6,20.0,9.0,...,5.0,5.0,14.0,24.0,7.0,1.0,5.0,15.0,18.0,11.0
633,DE,Rams,2001,29,540.0,7075.0,1014.0,7.0,35.0,12.0,...,13.0,27.0,25.0,2.0,22.0,25.0,15.0,14.0,24.0,30.0
634,WR,Colts,2001,30,429.0,6141.0,1026.0,6.0,29.0,14.0,...,25.0,16.0,23.0,12.0,23.0,3.0,25.0,28.0,25.0,19.0


In [31]:
specialized_df = clean_final_df
reference_df = specialized_df.copy(deep=True)
reference_df

Unnamed: 0,Position Drafted,Team,Draft Year,Pick Number,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,Team FL,...,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
0,QB,Bengals,2020,1,279.0,5169.0,1049.0,4.9,30.0,14.0,...,32.0,27.0,25.0,9.0,27.0,12.0,18.0,15.0,28.0,22.0
1,DE,Redskins,2020,2,266.0,4395.0,885.0,5.0,21.0,8.0,...,31.0,17.0,26.0,4.0,15.0,31.0,31.0,29.0,29.0,30.0
2,DB,Lions,2020,3,341.0,5549.0,1021.0,5.4,23.0,8.0,...,21.0,14.0,10.0,7.0,25.0,3.0,21.0,28.0,30.0,24.0
3,T,Giants,2020,4,341.0,5416.0,1012.0,5.4,33.0,16.0,...,20.0,28.0,4.0,8.0,29.0,25.0,19.0,17.0,20.0,25.0
4,QB,Dolphins,2020,5,306.0,4960.0,1022.0,4.9,26.0,8.0,...,27.0,21.0,22.0,1.0,28.0,27.0,20.0,20.0,26.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,RB,Vikings,2001,27,397.0,5961.0,958.0,6.2,28.0,10.0,...,15.0,25.0,27.0,6.0,31.0,7.0,30.0,30.0,30.0,27.0
632,DB,Raiders,2001,28,479.0,5776.0,1023.0,5.6,20.0,9.0,...,5.0,5.0,14.0,24.0,7.0,1.0,5.0,15.0,18.0,11.0
633,DE,Rams,2001,29,540.0,7075.0,1014.0,7.0,35.0,12.0,...,13.0,27.0,25.0,2.0,22.0,25.0,15.0,14.0,24.0,30.0
634,WR,Colts,2001,30,429.0,6141.0,1026.0,6.0,29.0,14.0,...,25.0,16.0,23.0,12.0,23.0,3.0,25.0,28.0,25.0,19.0


In [32]:
def convert_final_df_to_num(final_df) -> 'DataFrame':
    years_list = []
    for i in list(final_df['Draft Year']):
        years_list.append(int(i))
    final_df.index = years_list
    
    num_df = final_df.drop(columns=['Position Drafted','Team','Draft Year'])
    # num_df['Position Drafted'] = num_df['Position Drafted'].map(POS_DICT)
    return num_df

In [33]:
num_df = convert_final_df_to_num(specialized_df)
num_df

Unnamed: 0,Pick Number,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,Team FL,Team 1stD,Team Cmp,Team Att,...,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
2020,1,279.0,5169.0,1049.0,4.9,30.0,14.0,312.0,356.0,616.0,...,32.0,27.0,25.0,9.0,27.0,12.0,18.0,15.0,28.0,22.0
2020,2,266.0,4395.0,885.0,5.0,21.0,8.0,248.0,298.0,479.0,...,31.0,17.0,26.0,4.0,15.0,31.0,31.0,29.0,29.0,30.0
2020,3,341.0,5549.0,1021.0,5.4,23.0,8.0,313.0,344.0,571.0,...,21.0,14.0,10.0,7.0,25.0,3.0,21.0,28.0,30.0,24.0
2020,4,341.0,5416.0,1012.0,5.4,33.0,16.0,311.0,376.0,607.0,...,20.0,28.0,4.0,8.0,29.0,25.0,19.0,17.0,20.0,25.0
2020,5,306.0,4960.0,1022.0,4.9,26.0,8.0,315.0,371.0,615.0,...,27.0,21.0,22.0,1.0,28.0,27.0,20.0,20.0,26.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2001,27,397.0,5961.0,958.0,6.2,28.0,10.0,319.0,307.0,495.0,...,15.0,25.0,27.0,6.0,31.0,7.0,30.0,30.0,30.0,27.0
2001,28,479.0,5776.0,1023.0,5.6,20.0,9.0,337.0,284.0,475.0,...,5.0,5.0,14.0,24.0,7.0,1.0,5.0,15.0,18.0,11.0
2001,29,540.0,7075.0,1014.0,7.0,35.0,12.0,380.0,380.0,587.0,...,13.0,27.0,25.0,2.0,22.0,25.0,15.0,14.0,24.0,30.0
2001,30,429.0,6141.0,1026.0,6.0,29.0,14.0,357.0,357.0,571.0,...,25.0,16.0,23.0,12.0,23.0,3.0,25.0,28.0,25.0,19.0


In [34]:
def split_dfs(final_df) -> list:

    train_df_years = [*range(CURRENT_YEAR-1, CURRENT_YEAR - YEARS, -1)]

    test_df = final_df.loc[[CURRENT_YEAR]]
    train_df = final_df.loc[train_df_years]
    print(train_df.shape, test_df.shape)

    combine = [train_df, test_df]
    return combine

In [35]:
combine = split_dfs(num_df)
train_df = combine[0]
test_df = combine[1]
test_df = test_df.reset_index(drop=True)
train_df = train_df.reset_index(drop=True)
train_df.head()

# test_df.to_csv('Test_DF.csv', encoding='utf-8-sig', index=True)
# train_df.to_csv('Train_DF.csv', encoding = 'utf-8-sig', index=True)

(604, 99) (32, 99)


Unnamed: 0,Pick Number,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,Team FL,Team 1stD,Team Cmp,Team Att,...,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
0,1,225.0,3865.0,902.0,4.3,28.0,10.0,239.0,283.0,495.0,...,32.0,32.0,27.0,13.0,26.0,25.0,28.0,21.0,10.0,22.0
1,2,342.0,5769.0,1003.0,5.8,32.0,12.0,344.0,331.0,532.0,...,14.0,15.0,7.0,8.0,32.0,30.0,16.0,19.0,9.0,26.0
2,3,333.0,4787.0,971.0,4.9,30.0,11.0,258.0,299.0,524.0,...,26.0,21.0,19.0,9.0,21.0,29.0,1.0,2.0,8.0,19.0
3,4,290.0,5379.0,995.0,5.4,24.0,14.0,300.0,382.0,556.0,...,30.0,21.0,22.0,2.0,23.0,27.0,18.0,17.0,29.0,32.0
4,5,396.0,6648.0,1055.0,6.3,35.0,9.0,388.0,408.0,625.0,...,24.0,29.0,20.0,6.0,28.0,22.0,10.0,7.0,26.0,29.0


In [36]:
train_df.index = train_df.index + 32
train_df.head()

Unnamed: 0,Pick Number,Team PF,Team Yds,Team Ply,Team Y/P,Team TO,Team FL,Team 1stD,Team Cmp,Team Att,...,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
32,1,225.0,3865.0,902.0,4.3,28.0,10.0,239.0,283.0,495.0,...,32.0,32.0,27.0,13.0,26.0,25.0,28.0,21.0,10.0,22.0
33,2,342.0,5769.0,1003.0,5.8,32.0,12.0,344.0,331.0,532.0,...,14.0,15.0,7.0,8.0,32.0,30.0,16.0,19.0,9.0,26.0
34,3,333.0,4787.0,971.0,4.9,30.0,11.0,258.0,299.0,524.0,...,26.0,21.0,19.0,9.0,21.0,29.0,1.0,2.0,8.0,19.0
35,4,290.0,5379.0,995.0,5.4,24.0,14.0,300.0,382.0,556.0,...,30.0,21.0,22.0,2.0,23.0,27.0,18.0,17.0,29.0,32.0
36,5,396.0,6648.0,1055.0,6.3,35.0,9.0,388.0,408.0,625.0,...,24.0,29.0,20.0,6.0,28.0,22.0,10.0,7.0,26.0,29.0


In [37]:
# DATA PREPARATION FUNCTIONS

def convert_df_to_weighted_df(df):
    std_list = list(df.std(axis = 0, skipna= True))
    mean_list = list(df.mean(axis=0, skipna= True))
    frames = []
    for i in range(df.index[0], df.index[0]+len(df.index)):
        row_list = list(df.loc[i])
        weighted_row = []
        for j in range(len(row_list)):
            weighted_row.append((row_list[j] - mean_list[j])/std_list[j])
        weighted_df_row = pd.DataFrame([weighted_row], columns=df.columns)
        frames.append(weighted_df_row)
    weighted_df = pd.concat(frames)
    weighted_df.index = df.index
    return weighted_df

def ready_dataset_for_modeling(train_df, test_df) -> list:
    weighted_train_df = convert_df_to_weighted_df(train_df)
    weighted_test_df = convert_df_to_weighted_df(test_df)
    
    train_dataset = weighted_train_df.to_numpy().tolist()
    test_dataset = weighted_test_df.to_numpy().tolist()
    
    combine = [train_dataset, test_dataset]
    return combine

In [38]:
'''def get_model_ready_combine(final_df):
    clean_final_df = clean_final_df(final_df)
    specialized_df = specialize_final_df(clean_final_df)
    reference_df = specialized_df.copy(deep=True)
    num_df = convert_final_df_to_num(specialized_df)
    combine = split_dfs(num_df)
    train_df = combine[0]
    test_df = combine[1]
    test_df = test_df.reset_index(drop=True)
    train_df = train_df.reset_index(drop=True)
    train_df.index = train_df.index + 32
    return ready_dataset_for_modeling(train_df, test_df)
    
model_ready_combine = get_model_ready_combine(final_df)
train_dataset = model_ready_combine[0]
test_dataset = model_ready_combine[1]
train_dataset
    '''

'def get_model_ready_combine(final_df):\n    clean_final_df = clean_final_df(final_df)\n    specialized_df = specialize_final_df(clean_final_df)\n    reference_df = specialized_df.copy(deep=True)\n    num_df = convert_final_df_to_num(specialized_df)\n    combine = split_dfs(num_df)\n    train_df = combine[0]\n    test_df = combine[1]\n    test_df = test_df.reset_index(drop=True)\n    train_df = train_df.reset_index(drop=True)\n    train_df.index = train_df.index + 32\n    return ready_dataset_for_modeling(train_df, test_df)\n    \nmodel_ready_combine = get_model_ready_combine(final_df)\ntrain_dataset = model_ready_combine[0]\ntest_dataset = model_ready_combine[1]\ntrain_dataset\n    '

In [39]:
model_ready_combine = ready_dataset_for_modeling(train_df, test_df)
train_dataset = model_ready_combine[0]
test_dataset = model_ready_combine[1]

In [40]:
# K NEAREST NEIGHBORS MODEL FUNCTIONS

def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1)-1):
        distance += (row1[i] - row2[i])**2
    return math.sqrt(distance)

def get_neighbors(train, test_row, num_neighbors):
    distances = list()
    for i in range(0, len(train)-1):
        index = i+32
        train_row = train[i]
        dist = euclidean_distance(test_row, train_row)
        distances.append((train_row, dist, index))
    distances.sort(key=lambda tup: tup[1])
    neighbors = list()
    for i in range(num_neighbors):
        neighbors.append((distances[i][2], round(distances[i][1], 2)))
    return neighbors

def prediction_data(train_dataset, test_dataset, test_row_index) -> 'DataFrame':
    neighbors = get_neighbors(train_dataset, test_dataset[test_row_index], NUM_NEIGHBORS)
    
    frames = []
    index_col = [test_row_index]
    distance_col = [0]
    
    for neighbor in neighbors:
        index_col.append(neighbor[0])
        distance_col.append(neighbor[1])
        frames.append(pd.DataFrame([list(reference_df.loc[int(neighbor[0])])], columns=reference_df.columns))
        
    big_df = pd.concat(frames)
    big_df = pd.DataFrame([reference_df.loc[test_row_index]], columns=reference_df.columns).append(big_df, ignore_index=True)
    big_df.insert(0, 'Distance', distance_col, True)
    big_df.insert(0, 'Index', index_col, True)
    big_df.insert(2, 'Unit', big_df['Position Drafted'].map(UNIT_DICT), True)
    
    return big_df

def unit_prediction(train_dataset, test_dataset, test_row_index):
    predict_dict = {}
    neighbors = get_neighbors(train_dataset, test_dataset[test_row_index], NUM_NEIGHBORS)
    
    sum = 0
    for neighbor in neighbors:
        position = list(reference_df.loc[int(neighbor[0])])[0]
        for i in UNIT_DICT.keys():
            if position == i:
                unit = UNIT_DICT[position]
        if unit not in predict_dict:
            predict_dict[unit] = 0
        predict_dict[unit] += 1
        sum += 1
        
    #print('Actual Unit: ' + UNIT_DICT[list(reference_df.loc[test_row_index])[0]])
    #for i in predict_dict:
        #print(i, 'Count:'+str(predict_dict[i]), str(round(predict_dict[i]/sum, 4)*100) + '%')
    
    percent_dict = {'OFF':0,'DEF':0}
    for i in predict_dict:
        percent_dict[i] = round(predict_dict[i]/sum, 4)*100
    percent_dict['Actual Unit'] = UNIT_DICT[list(reference_df.loc[test_row_index])[0]]
    return percent_dict

In [41]:
PICK_NUM = 1
test = prediction_data(train_dataset, test_dataset, PICK_NUM-1)
test

Unnamed: 0,Index,Distance,Unit,Position Drafted,Team,Draft Year,Pick Number,Team PF,Team Yds,Team Ply,...,Def. Rank Yds.2,Def. Rank TD.1,Def. Rank Y/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Start,Def. Rank Time,Def. Rank Plays,Def. Rank Yds.4,Def. Rank Pts
0,0,0.0,OFF,QB,Bengals,2020,1,279.0,5169.0,1049.0,...,32.0,27.0,25.0,9.0,27.0,12.0,18.0,15.0,28.0,22.0
1,605,7.02,OFF,QB,Falcons,2001,1,252.0,3994.0,925.0,...,27.0,24.0,24.0,7.0,20.0,10.0,20.0,21.0,26.0,25.0
2,558,7.07,OFF,WR,Cardinals,2003,17,262.0,4563.0,1003.0,...,30.0,23.0,23.0,6.0,26.0,18.0,25.0,26.0,29.0,29.0
3,559,7.1,DEF,DE,Cardinals,2003,18,262.0,4563.0,1003.0,...,30.0,23.0,23.0,6.0,26.0,18.0,25.0,26.0,29.0,29.0
4,256,7.48,OFF,QB,Redskins,2012,2,288.0,5387.0,1032.0,...,18.0,23.0,18.0,5.0,25.0,27.0,16.0,10.0,15.0,24.0
5,512,7.67,OFF,WR,Cardinals,2004,3,225.0,4490.0,981.0,...,19.0,24.0,15.0,3.0,21.0,29.0,24.0,21.0,26.0,31.0


In [42]:
unit_prediction(train_dataset, test_dataset, PICK_NUM-1)

{'OFF': 80.0, 'DEF': 20.0, 'Actual Unit': 'OFF'}

In [56]:
# TESTING FUNCTIONS

def test_unit_prediction(train_dataset, test_dataset):
    correct_predictions = 0
    length = len(test_dataset)
    for i in range(length):
        predict_dict = unit_prediction(train_dataset, test_dataset, i)
        offense = predict_dict['OFF']
        defense = predict_dict['DEF']
        if predict_dict['Actual Unit'] == 'OFF' and offense > defense:
            correct_predictions += 1
        elif predict_dict['Actual Unit'] == 'DEF' and defense > offense:
            correct_predictions += 1
    return round(correct_predictions/length, 4)

def get_influence_of_all_columns(train_df, test_df):
    remove_col_dict = {}
    for i in train_df.columns:
        new_train_df = train_df.drop(columns=i)
        test_df = test_df.drop(columns=i)
        model_ready_combine = ready_dataset_for_modeling(new_train_df, test_df)
        train_dataset = model_ready_combine[0]
        test_dataset = model_ready_combine[1]
        prediction = test_unit_prediction(train_dataset, test_dataset)
        remove_col_dict[i] = prediction
        print(i, remove_col_dict[i])
        
    return remove_col_dict

def isolate_influential_columns(columns_influence):
    good_cols = []
    for i in columns_influence:
        if columns_influence[i] < .4:
            good_cols.append(i)
    return good_cols

def isolate_noninfluential_columns(train_df, test_df, columns_influence):
    drop_cols = []
    for i in columns_influence:
        if columns_influence[i] > .5:
            drop_cols.append(i)
    new_train_df = train_df.drop(columns=drop_cols)
    test_df = test_df.drop(columns=drop_cols)
    model_ready_combine = ready_dataset_for_modeling(new_train_df, test_df)
    train_dataset = model_ready_combine[0]
    test_dataset = model_ready_combine[1]
    print(drop_cols)
    combine = [train_dataset, test_dataset]
    return combine

def test_num_correct_predictions(train_dataset, test_dataset):
    correct_predictions = 0
    length = len(test_dataset)
    for i in range(length):
        predict_dict = unit_prediction(train_dataset, test_dataset, i)
        offense = predict_dict['OFF']
        defense = predict_dict['DEF']
        if predict_dict['Actual Unit'] == 'OFF':
            correct_predictions += offense
        elif predict_dict['Actual Unit'] == 'DEF':
            correct_predictions += defense
    return round(correct_predictions/length, 4)

In [49]:
test_unit_prediction(train_dataset, test_dataset)
# All Columns Accuracy: .5312
# No Rank Columns Accuracy: .5
# No Defensive Rank Col Accuracy: .5625
# No Offensive Rank Col Accuracy: .5625
# No Columns Influence > .4:  .5312
# No Columns Influence > .5: .5

0.5312

In [50]:
test_num_correct_predictions(train_dataset, test_dataset)
# All Columns Accuracy: 53.75
# No Columns Influence > .4: 51.875
# No Columns Influence > .5: 52.1875

53.125

In [46]:
# Takes 5 minutes to run
columns_influence = get_influence_of_all_columns(train_df, test_df)

Pick Number 0.5312
Team PF 0.4688
Team Yds 0.5312
Team Ply 0.6562
Team Y/P 0.5312
Team TO 0.5312
Team FL 0.5312
Team 1stD 0.5312
Team Cmp 0.5
Team Att 0.3438
Team Yds.1 0.5
Team TD 0.4688
Team Int 0.5312
Team NY/A 0.5312
Team 1stD.1 0.4688
Team Att.1 0.4688
Team Yds.2 0.4375
Team TD.1 0.4688
Team Y/A 0.5625
Team 1stD.2 0.4375
Team Pen 0.5625
Team Yds.3 0.5312
Team 1stPy 0.5
Team #Dr 0.5625
Team Sc% 0.5938
Team TO% 0.375
Team Start 0.4375
Team Time 0.5625
Team Plays 0.6562
Team Yds.4 0.4062
Team Pts 0.5938
Opp. PF 0.5625
Opp. Yds 0.375
Opp. Ply 0.6875
Opp. Y/P 0.4062
Opp. TO 0.5
Opp. FL 0.5625
Opp. 1stD 0.5625
Opp. Cmp 0.625
Opp. Att 0.6875
Opp. Yds.1 0.5312
Opp. TD 0.5312
Opp. Int 0.5938
Opp. NY/A 0.5
Opp. 1stD.1 0.5625
Opp. Att.1 0.4375
Opp. Yds.2 0.5
Opp. TD.1 0.75
Opp. Y/A 0.4062
Opp. 1stD.2 0.5625
Opp. Pen 0.4688
Opp. Yds.3 0.4688
Opp. 1stPy 0.5312
Opp. #Dr 0.5
Opp. Sc% 0.4375
Opp. TO% 0.4062
Opp. Start 0.5625
Opp. Time 0.5312
Opp. Plays 0.4375
Opp. Yds.4 0.4062
Opp. Pts 0.4062
Off

In [57]:
isolate_influential_columns(columns_influence)

['Team Att', 'Team TO%', 'Opp. Yds', 'Off. Rank Yds.4']

In [26]:
# NORM DATASET
model_ready_combine = ready_dataset_for_modeling(train_df, test_df)
train_dataset = model_ready_combine[0]
test_dataset = model_ready_combine[1]
test_num_correct_predictions(train_dataset, test_dataset)

53.75

In [55]:
# DROP ISOLATED COLS DATASET
combine = isolate_influential_columns(train_df, test_df, columns_influence)
train_dataset = combine[0]
test_dataset = combine[1]
test_num_correct_predictions(train_dataset, test_dataset)

['Pick Number', 'Team 1stD', 'Team TD.1', 'Team Time', 'Opp. FL', 'Opp. Att', 'Opp. TD', 'Opp. Int', 'Opp. TD.1', 'Off. Rank Pts', 'Def. Rank FL']


52.5

In [75]:
# DROP ANY COLS DATASET
drop_cols = ['Off. Rank Time']
new_train_df = train_df.drop(columns=drop_cols)
new_test_df = test_df.drop(columns=drop_cols)
model_ready_combine = ready_dataset_for_modeling(new_train_df, new_test_df)
train_dataset = model_ready_combine[0]
test_dataset = model_ready_combine[1]
test_unit_prediction(train_dataset, test_dataset)

0.5312