In [1]:
# CONCLUSION

# From the KNN evaluation, we know that the k values with the fewest errors are 10, 11, 19, 21, and 27-32.
# After comparing the predictions for these k values, I decided to use k = 10 to limit the number of "Unsure" picks
# and maximize the number of "Likely" picks, since the higher k became, the closer the pick percentages drifted to "Unsure".

# COMPARISON METRICS
# Expected Value = sum of the winning percentages that are .6 or greater (otherwise the prediction is irrelevant and adds 0)
# When k = 10, I expect to get about 16 picks right (which coincidentally is half of the 32 total picks)

# NOTE
# When comparing KNN 10 vs 19 vs 32, there were no complete switches (picks only moved to or from "Unsure"; none flipped from 1 to 2).
# That is to say, if KNN 10 predicts the 1st pick to be "somewhat likely" OFF, then KNN 19 and 32 will not predict it
# to be "somewhat likely" DEF.
# Basically, the KNNs will most likely predict the same unit.

In [18]:
# DataFrame
import pandas as pd
import numpy as np
from collections import defaultdict

# ML Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors

# Train Test Split
from sklearn import model_selection

# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Model Evaluation
from sklearn.metrics import classification_report, confusion_matrix

# Visualization
import matplotlib.pyplot as plt
from pprint import pprint

In [3]:
# First and current year covered by the data; both are used to build the
# CSV file names that the loading cell reads.
FIRST_YEAR = 2000
CURRENT_YEAR = 2021

In [4]:
# Load the reference tables and the prepared train/test matrices from CSV.
def _load_csv(file_name):
    """Read one UTF-8 encoded CSV file into a DataFrame."""
    return pd.read_csv(file_name, encoding='utf-8')


# Year-range prefixes shared by the file names.
past_years = str(FIRST_YEAR) + '-' + str(CURRENT_YEAR-1)
all_years = str(FIRST_YEAR) + '-' + str(CURRENT_YEAR)

past_reference = _load_csv(past_years + ' Reference Data.csv')
current_reference = _load_csv(str(CURRENT_YEAR) + ' Raw Test Data.csv')
current_reference = current_reference.drop(columns='Round')
X_train = _load_csv(all_years + ' X Train.csv')
Y_train = _load_csv(all_years + ' Y Train.csv')['Position']
X_test = _load_csv(all_years + ' X Test.csv')

# Display the shapes as a quick sanity check.
past_reference.shape, current_reference.shape, X_train.shape, Y_train.shape, X_test.shape

((667, 108), (32, 104), (667, 103), (667,), (32, 103))

In [5]:
# Give current-year and past rows disjoint index labels so that a label
# identifies the same row in the reference table and in the feature matrix:
#   current_reference / X_test  -> 1 .. n_current
#   past_reference    / X_train -> n_current + 1 .. n_current + n_past
# get_neighbors() stacks an X_test row on top of X_train and looks rows up by
# label, so the two ranges must not overlap.
#
# The labels are assigned explicitly (instead of `index + offset`) so that
# re-running this cell does not shift them a second time.
assert len(X_test) == len(current_reference)
assert len(X_train) == len(past_reference)

n_current = len(current_reference)
n_past = len(past_reference)

current_reference.index = pd.RangeIndex(1, n_current + 1)
X_test.index = pd.RangeIndex(1, n_current + 1)
past_reference.index = pd.RangeIndex(n_current + 1, n_current + 1 + n_past)
X_train.index = pd.RangeIndex(n_current + 1, n_current + 1 + n_past)
current_reference

Unnamed: 0,Pick,Team,Team PF,Team Yds,Team Tot Yds & TOPly,Team Tot Yds & TOY/P,Team Tot Yds & TOTO,Team FL,Team 1stD,Team PassingCmp,...,Def. Rank RushingYds,Def. Rank RushingTD,Def. Rank RushingY/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Average DriveStart,Def. Rank Average DriveTime,Def. Rank Average DrivePlays,Def. Rank Average DriveYds,Def. Rank Average DrivePts
1,1,Jaguars,306,5218,997.0,5.2,25,9,310,387.0,...,30,29,27.0,4.0,26.0,29,22,13.0,26.0,29.0
2,2,Jets,243,4479,948.0,4.7,19,5,269,292.0,...,12,14,7.0,8.0,20.0,26,26,21.0,23.0,24.0
3,3,49ers,376,5922,1046.0,5.7,31,14,350,371.0,...,7,4,6.0,20.0,23.0,25,3,5.0,4.0,8.0
4,4,Falcons,396,5895,1078.0,5.5,18,7,366,408.0,...,6,13,14.0,14.0,16.0,15,11,16.0,27.0,21.0
5,5,Bengals,311,5116,1040.0,4.9,24,13,318,372.0,...,29,7,31.0,11.0,25.0,18,7,8.0,20.0,20.0
6,6,Dolphins,404,5424,1021.0,5.3,20,7,345,370.0,...,16,18,18.0,29.0,1.0,6,4,6.0,14.0,5.0
7,7,Lions,377,5603,991.0,5.7,21,8,350,374.0,...,28,32,15.0,3.0,31.0,11,28,26.0,32.0,32.0
8,8,Panthers,350,5592,993.0,5.6,21,5,335,373.0,...,20,18,28.0,6.0,11.0,23,24,28.0,24.0,26.0
9,9,Broncos,323,5369,1030.0,5.2,32,9,308,317.0,...,25,28,29.0,9.0,30.0,30,11,11.0,11.0,19.0
10,10,Cowboys,395,5949,1113.0,5.3,26,13,371,413.0,...,31,25,30.0,12.0,15.0,28,17,12.0,19.0,25.0


In [6]:
def get_neighbors(pick_num, num_neighbors):
    """Return a current-year pick together with its nearest past picks.

    Parameters
    ----------
    pick_num
        Index label of the pick in ``X_test`` / ``current_reference``.
    num_neighbors : int
        Number of past rows to return in addition to the pick itself.

    Returns
    -------
    pandas.DataFrame
        ``num_neighbors + 1`` rows rounded to 2 decimals. The first row is
        taken from ``current_reference``, the rest from ``past_reference``.
        Columns are ``Distance``, ``Name``, ``Position``, ``Year`` followed by
        every column of the combined frame except its last four.
    """
    # Stack the query row on top of the past rows; the fitted data therefore
    # contains the query itself.
    all_data = pd.concat([X_test.loc[[pick_num]], X_train])

    # Ask for one extra neighbour because the query row is part of the fitted data.
    neigh = NearestNeighbors(n_neighbors=num_neighbors + 1)
    neigh.fit(all_data)

    # A single query yields both the distances and the row positions.
    distances, positions = neigh.kneighbors(all_data.loc[[pick_num]])
    distances, positions = distances[0], positions[0]
    neighbors_indexes = [all_data.index[i] for i in positions]

    # NOTE(review): assumes the query row comes back as the first neighbour
    # (distance 0); its label is looked up in current_reference, all others
    # in past_reference.
    new = pd.concat([current_reference.loc[[neighbors_indexes[0]]],
                     past_reference.loc[neighbors_indexes[1:]]])

    # Move the identifying columns to the front and drop the last four columns
    # of the combined frame (presumably the columns only past_reference has,
    # which concat appends at the end; verify against the CSV schemas).
    listy = ['Name', 'Position', 'Year'] + list(new.columns)[:-4]
    new = new[listy]
    new.insert(0, 'Distance', distances, True)

    return round(new, 2)

In [7]:
# Pick 1 of the current year followed by its 5 nearest past rows.
get_neighbors(1, 5)

Unnamed: 0,Distance,Name,Position,Year,Pick,Team,Team PF,Team Yds,Team Tot Yds & TOPly,Team Tot Yds & TOY/P,...,Def. Rank RushingYds,Def. Rank RushingTD,Def. Rank RushingY/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Average DriveStart,Def. Rank Average DriveTime,Def. Rank Average DrivePlays,Def. Rank Average DriveYds,Def. Rank Average DrivePts
1,0.0,,,,1,Jaguars,306.0,5218.0,997.0,5.2,...,30.0,29.0,27.0,4.0,26.0,29.0,22.0,13.0,26.0,29.0
129,8.06,Myles Garrett,DE,2017.0,1,Browns,264.0,4976.0,983.0,5.1,...,31.0,25.0,29.0,6.0,31.0,26.0,30.0,28.0,30.0,31.0
37,8.12,Tua Tagovailoa,QB,2020.0,5,Dolphins,306.0,4960.0,1022.0,4.9,...,27.0,21.0,22.0,1.0,28.0,27.0,20.0,20.0,26.0,32.0
294,8.15,Mark Barron,DB,2012.0,7,Buccaneers,287.0,5108.0,966.0,5.3,...,32.0,32.0,31.0,1.0,16.0,30.0,25.0,20.0,29.0,32.0
50,8.32,Austin Jackson,T,2020.0,18,Dolphins,306.0,4960.0,1022.0,4.9,...,27.0,21.0,22.0,1.0,28.0,27.0,20.0,20.0,26.0,32.0
153,8.47,Jabrill Peppers,DB,2017.0,25,Browns,264.0,4976.0,983.0,5.1,...,31.0,25.0,29.0,6.0,31.0,26.0,30.0,28.0,30.0,31.0


In [8]:
def _likelihood_label(confidence):
    """Map the winning class probability to its likelihood bucket."""
    if confidence < .6:
        return 'Unsure'
    if confidence < .7:
        return 'Somewhat Likely'
    if confidence < .8:
        return 'Likely'
    if confidence < .9:
        return 'More Likely'
    return 'Very Likely'


def get_final_df(num):
    """Fit a KNN classifier with ``num`` neighbours and tabulate its predictions.

    The classifier is fitted on ``X_train`` / ``Y_train`` and applied to
    ``X_test``. The global ``current_reference`` is only read, never modified,
    so other cells see the same frame no matter how often this function runs.

    Parameters
    ----------
    num : int
        Value of ``n_neighbors`` for ``KNeighborsClassifier``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``current_reference`` with the columns
        ``OFF %``, ``DEF %``, ``Prediction``, ``Likelihood``, ``Expected``
        followed by the remaining reference columns.

        * ``Likelihood`` buckets the larger of the two probabilities:
          below .6 'Unsure', below .7 'Somewhat Likely', below .8 'Likely',
          below .9 'More Likely', otherwise 'Very Likely'.
        * ``Expected`` is that probability, or 0 for 'Unsure' rows.
    """
    knn = KNeighborsClassifier(n_neighbors=num)
    knn.fit(X_train, Y_train)

    # Attach the predictions to a copy so the shared reference table keeps its
    # original columns.
    reference = current_reference.copy()
    predicted = knn.predict(X_test)
    if 'Prediction' in reference.columns:
        reference['Prediction'] = predicted
    else:
        reference.insert(0, 'Prediction', predicted, True)

    # predict_proba columns follow the classifier's class order; column 0 is
    # labelled OFF and column 1 DEF (assumes two classes with the offensive
    # label first; confirm against Y_train).
    prob_df = pd.DataFrame(knn.predict_proba(X_test)).rename(
        columns={0: 'OFF %', 1: 'DEF %'})
    # Use the reference labels directly so the side-by-side concat lines up
    # row for row.
    prob_df.index = reference.index
    final = pd.concat([prob_df, reference], axis=1)

    # Probability of the winning class for every row.
    winning = [off if off > deff else deff
               for off, deff in zip(final['OFF %'], final['DEF %'])]
    listy = [_likelihood_label(p) for p in winning]
    # 'Unsure' rows contribute nothing to the expected number of correct picks.
    expected_correct = [0 if p < .6 else p for p in winning]

    final.insert(3, 'Likelihood', listy, True)
    final.insert(4, 'Expected', expected_correct, True)

    return final

In [16]:
# Predictions for k = 10, kept in `testy` so the likelihood counts can be computed from it.
testy = get_final_df(10)
testy

Unnamed: 0,OFF %,DEF %,Prediction,Likelihood,Expected,Pick,Team,Team PF,Team Yds,Team Tot Yds & TOPly,...,Def. Rank RushingYds,Def. Rank RushingTD,Def. Rank RushingY/A,Def. Rank Sc%,Def. Rank TO%,Def. Rank Average DriveStart,Def. Rank Average DriveTime,Def. Rank Average DrivePlays,Def. Rank Average DriveYds,Def. Rank Average DrivePts
1,0.5,0.5,1,Unsure,0.0,1,Jaguars,306,5218,997.0,...,30,29,27.0,4.0,26.0,29,22,13.0,26.0,29.0
2,0.6,0.4,1,Somewhat Likely,0.6,2,Jets,243,4479,948.0,...,12,14,7.0,8.0,20.0,26,26,21.0,23.0,24.0
3,0.3,0.7,2,Likely,0.7,3,49ers,376,5922,1046.0,...,7,4,6.0,20.0,23.0,25,3,5.0,4.0,8.0
4,0.4,0.6,2,Somewhat Likely,0.6,4,Falcons,396,5895,1078.0,...,6,13,14.0,14.0,16.0,15,11,16.0,27.0,21.0
5,0.5,0.5,1,Unsure,0.0,5,Bengals,311,5116,1040.0,...,29,7,31.0,11.0,25.0,18,7,8.0,20.0,20.0
6,0.8,0.2,1,More Likely,0.8,6,Dolphins,404,5424,1021.0,...,16,18,18.0,29.0,1.0,6,4,6.0,14.0,5.0
7,0.6,0.4,1,Somewhat Likely,0.6,7,Lions,377,5603,991.0,...,28,32,15.0,3.0,31.0,11,28,26.0,32.0,32.0
8,0.2,0.8,2,More Likely,0.8,8,Panthers,350,5592,993.0,...,20,18,28.0,6.0,11.0,23,24,28.0,24.0,26.0
9,0.6,0.4,1,Somewhat Likely,0.6,9,Broncos,323,5369,1030.0,...,25,28,29.0,9.0,30.0,30,11,11.0,11.0,19.0
10,0.5,0.5,1,Unsure,0.0,10,Cowboys,395,5949,1113.0,...,31,25,30.0,12.0,15.0,28,17,12.0,19.0,25.0


In [10]:
def get_likelihood_dict(final):
    """Count how many rows fall into each likelihood bucket.

    Parameters
    ----------
    final
        Object whose ``final['Likelihood']`` yields one label per row
        (for example the frame returned by ``get_final_df``).

    Returns
    -------
    dict
        Label -> count. The five known labels are always present and always
        appear in the same order, from 'Unsure' to 'Very Likely', regardless
        of the order in which they occur in the data. Any other label found
        in the data is counted too and placed after them.
    """
    listy = ['Unsure', 'Somewhat Likely', 'Likely', 'More Likely', 'Very Likely']
    # Seed with zeros so the key order is fixed and empty buckets still show up.
    likelihood_dict = dict.fromkeys(listy, 0)
    for label in final['Likelihood']:
        likelihood_dict[label] = likelihood_dict.get(label, 0) + 1
    return likelihood_dict

In [11]:
# Number of k = 10 predictions in each likelihood bucket.
get_likelihood_dict(testy)

{'Somewhat Likely': 15,
 'Unsure': 11,
 'Likely': 6,
 'More Likely': 0,
 'Very Likely': 0}

In [12]:
# Compare Differences in Predictions Between 2 K Values

def compare_k_predictions(i, j):
    """Compare the likelihood bucket counts of two k values.

    Prints, for every bucket, the pair ``((label, count for i), (label, count for j))``
    and returns how the count changes when going from ``i`` to ``j``.

    Parameters
    ----------
    i, j : int
        The two ``n_neighbors`` values to compare.

    Returns
    -------
    dict
        Label -> (count for ``j``) - (count for ``i``).
    """
    x = get_likelihood_dict(get_final_df(i))
    y = get_likelihood_dict(get_final_df(j))
    # Pair the counts by label rather than by dict position, so the two sides
    # always refer to the same bucket even if the dicts are ordered differently.
    print([((key, x[key]), (key, y.get(key, 0))) for key in x])
    return {key: y.get(key, 0) - x[key] for key in x}

# Change in each bucket count going from k = 10 to k = 11 (k = 11 count minus k = 10 count).
compare_k_predictions(10, 11)

[(('Unsure', 8), ('Unsure', 11)), (('Somewhat Likely', 11), ('Somewhat Likely', 10)), (('Likely', 6), ('Likely', 4)), (('More Likely', 7), ('More Likely', 7)), (('Very Likely', 0), ('Very Likely', 0))]


{'Unsure': 3,
 'Somewhat Likely': -1,
 'Likely': -2,
 'More Likely': 0,
 'Very Likely': 0}

In [13]:
# See distribution of Likelihoods over multiple k values

def test_multiple_ks(k_list):
    """Summarise the likelihood distribution for several k values.

    Parameters
    ----------
    k_list : iterable of int
        ``n_neighbors`` values to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per k (indexed by k) with the count of each likelihood bucket
        and ``Expected``, the sum of the ``Expected`` column of that k's
        prediction table.
    """
    # Materialise once so any iterable can be used for both the loop and the index.
    k_values = list(k_list)

    # Collect one record per k and build the frame in a single step instead of
    # concatenating inside the loop.
    records = []
    for k in k_values:
        final = get_final_df(k)
        dicty = get_likelihood_dict(final)
        dicty['Expected'] = final['Expected'].sum()
        records.append(dicty)

    return pd.DataFrame(records, index=k_values)

# Bucket counts and summed Expected value for every k from 8 through 34.
test_multiple_ks(list(range(8,35)))

Unnamed: 0,Unsure,Likely,Somewhat Likely,More Likely,Very Likely,Expected
8,8,10,12,2,0,16.75
9,16,8,7,1,0,11.777778
10,8,6,11,7,0,16.4
11,11,4,10,7,0,15.0
12,14,4,9,5,0,13.166667
13,9,5,14,4,0,16.461538
14,11,10,9,2,0,14.928571
15,6,4,19,3,0,17.533333
16,6,4,20,2,0,17.5625
17,12,9,10,1,0,13.823529


In [45]:
def create_final_df(k_list, label=False):
    """Build a side-by-side table of predictions for several k values.

    Parameters
    ----------
    k_list : iterable of int
        ``n_neighbors`` values; each produces one column named ``'KNN <k>'``.
    label : bool, default False
        When True, append the initials of the likelihood bucket to each
        prediction (' SL', ' L', ' ML', ' VL').

    Returns
    -------
    pandas.DataFrame
        Column ``Pick`` (1, 2, ... one per row) followed by one column per k.
        A cell holds 'Unsure' when the likelihood is 'Unsure', otherwise the
        predicted class as a string, optionally followed by the initials.
    """
    # Initials appended to a prediction when `label` is set.
    initials_by_likelihood = {
        'Somewhat Likely': ' SL',
        'Likely': ' L',
        'More Likely': ' ML',
        'Very Likely': ' VL',
    }

    columns = {}
    for k in k_list:
        df = get_final_df(k)
        predictions = []
        for prediction, likelihood in zip(df['Prediction'], df['Likelihood']):
            if likelihood == 'Unsure':
                predictions.append('Unsure')
            else:
                initials = initials_by_likelihood[likelihood] if label else ''
                predictions.append(str(prediction) + initials)
        columns['KNN ' + str(k)] = predictions

    final = pd.DataFrame(columns)
    # Number the picks from the actual row count instead of assuming 32 picks.
    final.insert(0, 'Pick', range(1, len(final) + 1), True)
    return final

In [None]:
# k values whose predictions are placed side by side in the exported tables.
final_ks = [10, 19, 32]

In [46]:
# Build the prediction table once, save it, then display the same frame
# (avoids fitting every model a second time just to show the result).
knn_predictions = create_final_df(final_ks)
knn_predictions.to_csv(str(CURRENT_YEAR) + ' KNN Predictions.csv', encoding='utf-8-sig')
knn_predictions

Unnamed: 0,Pick,KNN 10,KNN 19,KNN 32
0,1,Unsure,1,Unsure
1,2,1,1,Unsure
2,3,2,Unsure,Unsure
3,4,2,2,2
4,5,Unsure,Unsure,Unsure
5,6,1,1,1
6,7,1,Unsure,Unsure
7,8,2,2,Unsure
8,9,1,1,Unsure
9,10,Unsure,Unsure,2


In [None]:
# Same table with the likelihood initials appended; built once, saved, then displayed.
knn_predictions_labeled = create_final_df(final_ks, True)
knn_predictions_labeled.to_csv(str(CURRENT_YEAR) + ' KNN Predictions Labeled.csv', encoding='utf-8-sig')
knn_predictions_labeled