In [None]:
# OVERVIEW
# Data
# Which position each team drafted and what its stats for the previous season were
# Example: Chargers 2020 draft (QB, LB) - Chargers 2019 receiving, rushing, tackles, interceptions, points for, etc.
    
# 1st Round Draft History Data - http://www.drafthistory.com/index.php/rounds/round_1
# Team Stats and Rankings - https://www.pro-football-reference.com/teams/cin/2019.htm

In [2]:
# IMPORTS
# Data
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# Web Scraping
from requests import get
from bs4 import BeautifulSoup

# Math
import math

In [3]:
# CONSTANTS
# Most recent draft year in the data; split_dfs holds this year out as the test set.
CURRENT_YEAR = 2020
# Number of draft classes to collect, counting back from CURRENT_YEAR.
YEARS = 20
# Team nickname (as it appears in the scraped draft table) -> team code used in
# pro-football-reference.com URLs (see get_team_stats_by_year_url).
TEAM_DICT = {
    'Bengals': 'cin', 'Redskins': 'was', 'Lions': 'det', 'Giants': 'nyg',
    'Dolphins': 'mia', 'Chargers': 'sdg', 'Panthers': 'car', 'Cardinals': 'crd',
    'Jaguars': 'jax', 'Browns': 'cle', 'Jets': 'nyj', 'Raiders': 'rai',
    'Buccaneers': 'tam', '49ers': 'sfo', 'Broncos': 'den', 'Falcons': 'atl',
    'Cowboys': 'dal', 'Eagles': 'phi', 'Vikings': 'min', 'Saints': 'nor',
    'Packers': 'gnb', 'Seahawks': 'sea', 'Ravens': 'rav', 'Titans': 'oti',
    'Chiefs': 'kan', 'Colts': 'clt', 'Texans': 'htx', 'Bears': 'chi',
    'Steelers': 'pit', 'Rams': 'ram', 'Bills': 'buf', 'Patriots': 'nwe',
}
# Position label -> integer code (applied by convert_final_df_to_num).
# NOTE(review): ' C', ' G' and ' T' carry a leading space - presumably this is how
# the scraped table pads one-letter positions; confirm before normalising the keys.
POS_DICT = {'QB': 1, 'RB': 2, 'WR': 3, 'TE': 4, ' C': 5, ' G': 6, ' T': 7,
            'DT': 8, 'DE': 9, 'LB': 10, 'DB': 11}
# Columns that clean_final_df drops (each name listed once; only membership is tested).
# NOTE(review): presumably these rank columns are blank in the scraped table, and the
# '.1'/'.2'/'.3' suffixes look like pandas' renaming of repeated headers when the
# CSV is re-read - confirm.
EMPTY_COLS = [
    'Off. Rank Ply', 'Off. Rank Y/P', 'Off. Rank Cmp', 'Off. Rank 1stD', 'Off. Rank Pen',
    'Off. Rank Yds', 'Off. Rank 1stPy', 'Off. Rank #Dr',
    'Def. Rank Ply', 'Def. Rank Y/P', 'Def. Rank Cmp', 'Def. Rank 1stD', 'Def. Rank Pen',
    'Def. Rank Yds', 'Def. Rank 1stPy', 'Def. Rank #Dr',
    'Off. Rank 1stD.1', 'Off. Rank 1stD.2', 'Off. Rank Yds.3',
    'Def. Rank 1stD.1', 'Def. Rank 1stD.2', 'Def. Rank Yds.3',
]
# Position label -> unit ('OFF' or 'DEF'); same keys as POS_DICT.
UNIT_DICT = {'QB': 'OFF', 'RB': 'OFF', 'WR': 'OFF', 'TE': 'OFF', ' C': 'OFF', ' G': 'OFF',
             ' T': 'OFF', 'DT': 'DEF', 'DE': 'DEF', 'LB': 'DEF', 'DB': 'DEF'}
# Number of nearest training rows consulted by prediction_data / unit_prediction.
NUM_NEIGHBORS = 10
print(len(TEAM_DICT))

32


In [None]:
# WEB SCRAPING FUNCTIONS
def get_first_rounders() -> 'DataFrame':
    """Scrape first-round draft picks from drafthistory.com.

    Table rows are read top to bottom. Reading stops at the first row whose
    year equals ``CURRENT_YEAR - YEARS``; that row and everything after it are
    discarded.

    Returns:
        DataFrame with columns 'Year', 'Pick', 'Name', 'Team', 'Position'
        ('Round', 'Player' and 'College' are dropped). 'Year' holds ints; the
        other columns hold the raw cell text.

    Raises:
        requests.HTTPError: if the page request returns an error status.
    """
    url = 'http://www.drafthistory.com/index.php/rounds/round_1'
    response = get(url)
    # Fail with a clear HTTP error instead of an AttributeError further down.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    main_container = soup.find(id='main')
    # The first two <tr> elements are skipped (presumably title/header rows -
    # confirm against the page).
    row_containers = main_container.table.find_all('tr')[2:]
    labels = ['Year','Round','Pick','Player','Name','Team','Position','College']
    cutoff_year = CURRENT_YEAR - YEARS

    all_rows = []
    year = None  # most recent year parsed from the first column
    for row_container in row_containers:
        cols = row_container.find_all('td')
        if not cols:
            continue
        try:
            year = int(cols[0].text)
        except ValueError:
            # Non-numeric year cell: the row belongs to the last year seen.
            if year is None:
                # No year seen yet, so the row cannot be attributed; skip it.
                continue
        if year == cutoff_year:
            break
        all_rows.append([year] + [col.text for col in cols[1:]])

    df = pd.DataFrame(all_rows, columns=labels)
    return df.drop(columns=['Round','Player','College'])

def get_team_stats_by_year_url(team_name: str, year) -> str:
    """Build the pro-football-reference.com season-page URL for a team.

    ``team_name`` must be a key of TEAM_DICT (KeyError otherwise); ``year`` is
    converted with ``str``.
    """
    season_page = '/'.join((TEAM_DICT[team_name], str(year)))
    return 'https://www.pro-football-reference.com/teams/' + season_page + '.htm'

def get_team_stats(url) -> 'DataFrame':
    """Scrape the table inside the ``all_team_stats`` element of a team page.

    Args:
        url: address of the team-season page.

    Returns:
        DataFrame with one row per ``<tr>`` of the table body. Column names
        come from the second header row. Cells that parse as numbers are
        floats, everything else is stripped text. An empty DataFrame is
        returned when the page answers 404 or the stats container is missing.
    """
    response = get(url)
    if response.status_code == 404:
        return pd.DataFrame()
    soup = BeautifulSoup(response.text, 'html.parser')
    table_container = soup.find(id='all_team_stats')
    if table_container is None:
        # Without the container there is nothing to parse; report it and let
        # the caller treat the page as "not found" instead of crashing.
        print('No team stats table at', url)
        return pd.DataFrame()
    print('Scraping', url)
    table = table_container.find(class_='table_outer_container').div.table

    header_cells = table.thead.find_all('tr')[1].find_all('th')
    labels = [str(cell.text) for cell in header_cells]

    list_of_rows = []
    for row_container in table.tbody.find_all('tr'):
        row_header = row_container.th
        # Store the header cell's text, not the bs4 Tag object itself.
        row = [row_header.text.strip() if row_header is not None else None]
        for stat_cell in row_container.find_all('td'):
            try:
                row.append(float(stat_cell.text))
            except ValueError:
                # Not a number: keep the text.
                row.append(str(stat_cell.text).strip())
        list_of_rows.append(row)

    return pd.DataFrame(list_of_rows, columns=labels)

def convert_team_stats_to_one_row(df) -> 'DataFrame':
    """Flatten the rows labelled 0-3 of ``df`` into a single-row DataFrame.

    Row 0 is prefixed 'Team ', row 1 'Opp. ', row 2 'Off. Rank ' and row 3
    'Def. Rank '; the four prefixed copies are placed side by side.
    """
    prefixes = ('Team ', 'Opp. ', 'Off. Rank ', 'Def. Rank ')

    pieces = []
    for row_label, prefix in enumerate(prefixes):
        header = [prefix + name for name in df.columns]
        pieces.append(pd.DataFrame([list(df.loc[row_label])], columns=header))

    flattened = pd.concat(pieces, axis=1)
    return flattened.reindex(pieces[0].index)

def make_final_one_row(df, draft_year, pick_number, team_name, position_drafted) -> 'DataFrame':
    """Return a copy of ``df`` with the draft metadata prepended.

    The four '... Player' columns are removed and 'Position Drafted', 'Team',
    'Draft Year' and 'Pick Number' become the first four columns, in that
    order. ``df`` itself is left untouched, so the call can be repeated on the
    same frame.

    Args:
        df: single-row frame (each inserted value is a one-element list).
        draft_year, pick_number, team_name, position_drafted: values for the
            new columns.

    Raises:
        KeyError: if any of the four '... Player' columns is missing.
    """
    final_row = df.drop(columns=['Team Player', 'Opp. Player', 'Off. Rank Player', 'Def. Rank Player'])
    # Each insert goes to position 0, so the last one inserted ends up first.
    final_row.insert(0, 'Pick Number', [pick_number], True)
    final_row.insert(0, 'Draft Year', [draft_year], True)
    final_row.insert(0, 'Team', [team_name], True)
    final_row.insert(0, 'Position Drafted', [position_drafted], True)
    return final_row

def get_final_df() -> 'DataFrame':
    """Build the full data set: one row per first-round pick.

    Each pick from get_first_rounders is combined with the drafting team's
    stats for the season before the draft (draft year - 1). Picks whose stats
    page yields an empty frame are reported and skipped.

    Returns:
        DataFrame with a fresh 0..n-1 index, or an empty DataFrame when no
        pick could be matched with stats.
    """
    first_rounders = get_first_rounders()
    # A team can hold several first-round picks in one draft; download each
    # team-season page only once.
    stats_by_url = {}
    frames = []
    for i in first_rounders.index:
        row = list(first_rounders.loc[i])

        draft_year = row[0]
        pick_number = row[1]
        team_name = row[3]
        position_drafted = row[4]

        stats_url = get_team_stats_by_year_url(team_name, int(draft_year)-1)
        if stats_url not in stats_by_url:
            stats_by_url[stats_url] = get_team_stats(stats_url)
        team_stats_df = stats_by_url[stats_url]
        if team_stats_df.empty:
            print('Not Found', stats_url)
            continue
        # convert_team_stats_to_one_row builds new frames, so the cached stats
        # frame is not modified by the steps below.
        one_row = convert_team_stats_to_one_row(team_stats_df)
        final_row = make_final_one_row(one_row, draft_year, pick_number, team_name, position_drafted)
        frames.append(final_row)

    if not frames:
        # pd.concat raises on an empty list.
        return pd.DataFrame()
    # Every per-pick frame has index 0; renumber so row labels are unique.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Run the full scrape (needs network access) and save the result as CSV so
# that later cells can reload it from disk. The file name embeds YEARS.
final_df = get_final_df()
final_df.to_csv('Profile For Last ' + str(YEARS) + ' Drafts.csv', encoding = 'utf-8-sig', index=False)

In [None]:
# Display the frame produced by the scrape cell.
final_df

In [4]:
# Reload the saved scrape (requires the CSV written by the scrape cell).
# 'utf-8-sig' matches the encoding the file is written with and also reads
# files that carry no byte-order mark.
final_df = pd.read_csv('Profile For Last ' + str(YEARS) + ' Drafts.csv', encoding = 'utf-8-sig')
final_df

FileNotFoundError: [Errno 2] No such file or directory: 'Profile For Last 20 Drafts.csv'

In [None]:
def _start_to_float(value) -> float:
    """Drop the first four characters of a 'Start' cell and parse the rest as a float."""
    return float(str(value)[4:].strip())


def _clock_to_seconds(value) -> float:
    """Convert a 'minutes:seconds' cell to seconds (minutes may have any number of digits)."""
    minutes, seconds = str(value).strip().split(':')
    return float(int(minutes) * 60 + int(seconds))


def clean_final_df(final_df) -> 'DataFrame':
    """Return a cleaned copy of the scraped frame; ``final_df`` is not modified.

    * 'Team Start' / 'Opp. Start': the first four characters are discarded
      (presumably an 'Own '-style prefix - confirm) and the rest is parsed as
      a float.
    * 'Team Time' / 'Opp. Time': 'M:SS' text becomes seconds as a float.
    * Every column named in EMPTY_COLS is dropped.

    Working on a copy keeps the raw frame intact, so the cleaning can be
    re-run on it; values that were already converted would fail to parse.

    Raises:
        KeyError: if one of the four Start/Time columns is missing.
        ValueError: if a Start or Time cell does not have the expected format.
    """
    cleaned = final_df.copy()

    for column in ('Team Start', 'Opp. Start'):
        cleaned[column] = [_start_to_float(cell) for cell in cleaned[column]]

    for column in ('Team Time', 'Opp. Time'):
        cleaned[column] = [_clock_to_seconds(cell) for cell in cleaned[column]]

    unused = [column for column in cleaned.columns if column in EMPTY_COLS]
    return cleaned.drop(columns=unused)

In [None]:
# NOTE(review): this assignment rebinds the name `clean_final_df` from the
# function to the DataFrame it returns, so running this cell a second time in
# the same kernel fails (a DataFrame is not callable). Later cells read the
# cleaned frame under this name, so renaming it means updating them as well.
clean_final_df = clean_final_df(final_df)
# Independent deep copy of the cleaned frame; prediction_data and
# unit_prediction look rows up in it by index label.
reference_df = clean_final_df.copy(deep=True)
clean_final_df

In [None]:
def convert_final_df_to_num(final_df) -> 'DataFrame':
    """Return a numeric version of the cleaned frame, indexed by draft year.

    * The index becomes the integer draft year of each row.
    * 'Position Drafted' is replaced by its POS_DICT code (labels missing from
      POS_DICT become NaN).
    * 'Team' (text) and 'Draft Year' (now carried by the index) are dropped.
      NOTE(review): the original assignment of ``num_df`` was commented out,
      which left the name undefined; this column choice is a reconstruction -
      confirm it matches the intended feature set.

    ``final_df`` itself is not modified.
    """
    num_df = final_df.drop(columns=['Team', 'Draft Year'])
    num_df.index = [int(year) for year in final_df['Draft Year']]
    num_df['Position Drafted'] = num_df['Position Drafted'].map(POS_DICT)
    return num_df

In [None]:
# Build the numeric frame from the cleaned data (at this point the name
# `clean_final_df` refers to the cleaned DataFrame, not the function).
num_df = convert_final_df_to_num(clean_final_df)
num_df

In [None]:
def split_dfs(final_df) -> list:
    """Split a frame indexed by draft year into ``[train_df, test_df]``.

    The test frame holds the rows labelled CURRENT_YEAR. The training frame
    holds the rows for CURRENT_YEAR - 1 down to CURRENT_YEAR - YEARS + 1,
    newest year first. Both shapes are printed.
    """
    oldest_excluded = CURRENT_YEAR - YEARS
    train_df_years = list(range(CURRENT_YEAR - 1, oldest_excluded, -1))

    test_df = final_df.loc[[CURRENT_YEAR]]
    train_df = final_df.loc[train_df_years]
    print(train_df.shape, test_df.shape)

    return [train_df, test_df]

In [None]:
# Split into train/test and replace the draft-year index of both frames with
# a plain 0..n-1 index.
combine = split_dfs(num_df)
train_df, test_df = (frame.reset_index(drop=True) for frame in combine)
train_df.head()

# test_df.to_csv('Test_DF.csv', encoding='utf-8-sig', index=True)
# train_df.to_csv('Train_DF.csv', encoding = 'utf-8-sig', index=True)

In [None]:
# Number the training rows so they start right after the test rows. This
# assumes reference_df lists the test rows first, followed by the training
# rows in the same order - the layout the neighbour lookups rely on.
# Assigning an absolute range (instead of adding to the current index) makes
# the cell safe to re-run. With 32 test rows it gives the same labels as the
# former "+ 32"; get_neighbors uses that same offset, so keep them in sync.
train_df.index = pd.RangeIndex(start=len(test_df), stop=len(test_df) + len(train_df))
train_df.head()

In [None]:
# DATA PREPARATION FUNCTIONS

def convert_df_to_weighted_df(df):
    """Standardise every column of ``df`` to z-scores.

    Each value becomes ``(value - column mean) / column std`` using pandas'
    ``mean`` and ``std`` (NaNs skipped). The computation is vectorised and
    positional, so it works for any index, not only a contiguous integer
    range. A column with zero standard deviation maps to 0.0 instead of NaN.

    Args:
        df: all-numeric DataFrame.

    Returns:
        Float DataFrame with the same index and columns as ``df``.
    """
    mean = df.mean(axis=0, skipna=True).to_numpy(dtype=float)
    std = df.std(axis=0, skipna=True).to_numpy(dtype=float)
    # A constant column carries no information; dividing by 1 leaves it at 0.
    safe_std = np.where(std == 0, 1.0, std)
    scaled = (df.to_numpy(dtype=float) - mean) / safe_std
    return pd.DataFrame(scaled, index=df.index, columns=df.columns)

def _standardize_rows(df, mean, std) -> list:
    """Z-score ``df`` column by column with the given statistics; return the rows as lists."""
    scaled = (df.to_numpy(dtype=float) - mean) / std
    return scaled.tolist()


def ready_dataset_for_modeling(train_df, test_df) -> list:
    """Standardise both splits and return ``[train_rows, test_rows]``.

    Both splits are scaled with the TRAINING mean and standard deviation, so
    train and test rows share one coordinate system when distances between
    them are computed. (Scaling the test split with its own statistics would
    put the two splits on different scales.) Columns with zero training
    standard deviation are divided by 1 instead.

    Args:
        train_df, test_df: all-numeric DataFrames with the same columns in the
            same order.

    Returns:
        Two-element list; each element is a list of rows (lists of floats).
    """
    mean = train_df.mean(axis=0, skipna=True).to_numpy(dtype=float)
    std = train_df.std(axis=0, skipna=True).to_numpy(dtype=float)
    # Avoid 0/0 = NaN for columns that are constant in the training split.
    std = np.where(std == 0, 1.0, std)

    train_dataset = _standardize_rows(train_df, mean, std)
    test_dataset = _standardize_rows(test_df, mean, std)
    return [train_dataset, test_dataset]

In [None]:
# Standardise both splits and unpack them into plain lists of rows.
model_ready_combine = ready_dataset_for_modeling(train_df, test_df)
train_dataset, test_dataset = model_ready_combine

In [None]:
# K NEAREST NEIGHBORS MODEL FUNCTIONS

def euclidean_distance(row1, row2):
    """Euclidean distance between two rows, ignoring the last element of ``row1``.

    Only positions ``0 .. len(row1) - 2`` are compared.
    NOTE(review): skipping the final position looks inherited from a layout
    where the label is stored last; confirm that this matches the column
    order of the rows passed in here.
    """
    squared_total = 0.0
    for position, left in enumerate(row1[:-1]):
        squared_total += (left - row2[position]) ** 2
    return math.sqrt(squared_total)

def get_neighbors(train, test_row, num_neighbors, index_offset=32):
    """Find the training rows closest to ``test_row``.

    Every row of ``train`` is considered, including the last one (the loop
    previously stopped one row short).

    Args:
        train: list of training rows.
        test_row: row to compare against.
        num_neighbors: maximum number of neighbours to return; fewer are
            returned when ``train`` is shorter.
        index_offset: added to a row's position in ``train`` to form the
            label that is reported. Defaults to 32, the offset that was
            previously hard-coded.

    Returns:
        List of ``(label, distance rounded to 2 decimals)`` tuples, nearest
        first; equal distances keep their order in ``train``.
    """
    distances = [
        (position + index_offset, euclidean_distance(test_row, train_row))
        for position, train_row in enumerate(train)
    ]
    distances.sort(key=lambda pair: pair[1])
    wanted = max(num_neighbors, 0)
    return [(label, round(dist, 2)) for label, dist in distances[:wanted]]

def prediction_data(test_row_index) -> 'DataFrame':
    """Show a test row together with its nearest training rows.

    Rows are taken from ``reference_df`` by label: first the test row, then
    its NUM_NEIGHBORS nearest neighbours, nearest first.

    Returns:
        DataFrame with a fresh 0..n index whose first three columns are
        'Index' (label in ``reference_df``), 'Distance' (0 for the test row)
        and 'Unit' (UNIT_DICT value for 'Position Drafted'), followed by the
        ``reference_df`` columns.
    """
    neighbors = get_neighbors(train_dataset, test_dataset[test_row_index], NUM_NEIGHBORS)

    index_col = [test_row_index] + [neighbor[0] for neighbor in neighbors]
    distance_col = [0] + [neighbor[1] for neighbor in neighbors]

    # One label-based selection replaces the row-by-row frames and the
    # DataFrame.append call, which no longer exists in pandas 2.x.
    big_df = reference_df.loc[[int(label) for label in index_col]].reset_index(drop=True)
    big_df.insert(0, 'Distance', distance_col, True)
    big_df.insert(0, 'Index', index_col, True)
    big_df.insert(2, 'Unit', big_df['Position Drafted'].map(UNIT_DICT), True)

    return big_df

def unit_prediction(test_row_index):
    """Share of offensive vs. defensive picks among a test row's neighbours.

    Returns:
        dict with one percentage (0-100, two decimals) per unit in UNIT_DICT -
        always present, 0.0 when no neighbour belongs to that unit - plus
        'Actual Unit', the unit of the test row's own 'Position Drafted'
        (None when that label is not in UNIT_DICT).

    Neighbours whose position label is not in UNIT_DICT are left out of the
    counts rather than being attributed to whichever unit was seen last.
    """
    neighbors = get_neighbors(train_dataset, test_dataset[test_row_index], NUM_NEIGHBORS)

    # Start every known unit at zero so callers can always read 'OFF' and 'DEF'.
    unit_counts = dict.fromkeys(UNIT_DICT.values(), 0)
    for neighbor in neighbors:
        position = reference_df.loc[int(neighbor[0]), 'Position Drafted']
        unit = UNIT_DICT.get(position)
        if unit is None:
            continue
        unit_counts[unit] += 1

    total = sum(unit_counts.values())
    percent_dict = {}
    for unit, count in unit_counts.items():
        # Round after scaling to avoid artefacts such as 30.000000000000004.
        percent_dict[unit] = round(100 * count / total, 2) if total else 0.0
    percent_dict['Actual Unit'] = UNIT_DICT.get(reference_df.loc[test_row_index, 'Position Drafted'])
    return percent_dict

In [None]:
# Pick to inspect, 1-based; the functions take a 0-based test row index,
# hence the "- 1".
PICK_NUM = 1
test = prediction_data(PICK_NUM-1)
test

In [None]:
# Offence/defence split among the neighbours of the pick selected by PICK_NUM.
unit_prediction(PICK_NUM-1)

In [None]:
# TESTING

def test_unit_prediction():
    """Fraction of test rows whose unit is predicted correctly.

    A prediction is correct when the strictly larger share among the
    neighbours ('OFF' vs. 'DEF') matches the row's 'Actual Unit'; a tie counts
    as incorrect. A unit missing from the prediction counts as a share of 0.

    Returns:
        Accuracy rounded to 4 decimals, or NaN when ``test_df`` has no rows.
    """
    length = len(test_df.index)
    if length == 0:
        # Accuracy is undefined without test rows.
        return float('nan')

    correct_predictions = 0
    for i in range(length):
        predict_dict = unit_prediction(i)
        offense = predict_dict.get('OFF', 0)
        defense = predict_dict.get('DEF', 0)
        if offense > defense:
            predicted_unit = 'OFF'
        elif defense > offense:
            predicted_unit = 'DEF'
        else:
            predicted_unit = None  # tie: no prediction
        if predicted_unit is not None and predict_dict['Actual Unit'] == predicted_unit:
            correct_predictions += 1
    return round(correct_predictions/length, 4)

In [None]:
# Accuracy of the unit prediction over the whole test split; the notes below
# record results from earlier runs.
test_unit_prediction()
# All Columns Accuracy: .5312
# No Rank Columns Accuracy: 