In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import explained_variance_score


dataFrame = pd.read_csv("NFLDraftDataCleaned.csv")

In [34]:
#Create new dataframes to hold independent and dependent variables
X = dataFrame[['position', 'age', 'height', 'weight', 'collegeAttended', 'rushingAttempts', 'rushingYards',
                           'avgYardsPerRush', 'rushingTouchdowns', 'receptions', 'receivingYards', 'avgYardsPerReception',
                           'receivingTouchdowns', 'fortyYardDash', 'verticalJump', 'benchPress', 'broadJump', 'threeCone',
                           'shuttle']]
# X = dataFrame[['age', 'height', 'weight', 'rushingAttempts', 'rushingYards',
#                            'avgYardsPerRush', 'rushingTouchdowns', 'receptions', 'receivingYards', 'avgYardsPerReception',
#                            'receivingTouchdowns', 'fortyYardDash', 'verticalJump', 'benchPress', 'broadJump', 'threeCone',
#                            'shuttle']]
y = dataFrame[['draftRound']]

In [35]:
#Some of the dependent variables (position and collegeAttended) are stored as categorical data
#This needs to be converted to numerical data using OneHotEncoder
onehot = pd.get_dummies(X['position'])
X = X.drop('position', axis = 1)
X = X.join(onehot)

onehot = pd.get_dummies(X['collegeAttended'])
X = X.drop('collegeAttended', axis = 1)
X = X.join(onehot)

In [36]:
#Split the dataset into some training data and testing data
X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=0.2, shuffle = False)

#Create a linear regression model and fit it to the train data
linearModel = LinearRegression(fit_intercept=True).fit(X_Train, y_Train)

#Predict the draft round for the test set
unformattedPredictions = linearModel.predict(X_Test)

predictions = []

#There are some errors with our predictions, they are giving wildly higher scores
for prediction in unformattedPredictions:
    if(prediction < 1.5):
        predictions.append(1)
    if(prediction >= 1.5 and prediction < 2.5):
        predictions.append(2)
    if(prediction >= 2.5 and prediction < 3.5):
        predictions.append(3)
    if(prediction >= 3.5 and prediction < 4.5):
        predictions.append(4)
    if(prediction >= 4.5 and prediction < 5.5):
        predictions.append(5)
    if(prediction >= 5.5 and prediction < 6.5):
        predictions.append(6)
    if(prediction >= 6.5):
        predictions.append(7)

print(unformattedPredictions)    

explainedVarianceScore = explained_variance_score(y_Test, predictions)
print("Explained variance score = ", explainedVarianceScore)

comparisonFrame = pd.DataFrame({'Actual': y_Test['draftRound'], 'predicted': predictions})
comparisonFrame

[[2.71944945]
 [2.71448438]
 [3.29700085]
 [3.18851672]
 [2.05716684]
 [3.22718877]
 [3.93262393]
 [2.69008765]
 [2.2336514 ]
 [2.53877511]
 [1.75234961]
 [2.4637391 ]
 [4.06210095]
 [2.27525615]
 [3.68726208]
 [3.47345428]
 [4.42759152]
 [4.86769145]
 [3.98861097]
 [4.48294058]
 [5.18461168]
 [4.30886701]
 [4.83600506]
 [4.44181882]
 [3.589805  ]
 [3.17789327]
 [3.68518753]
 [2.52473719]
 [4.22379525]
 [5.01083583]
 [2.8630679 ]
 [3.26614679]
 [5.36677935]
 [6.17515626]
 [3.86777979]
 [4.86416395]
 [4.65786299]
 [2.48271398]
 [5.38549817]
 [4.00671017]
 [6.89599372]
 [5.92829569]
 [4.8957603 ]
 [4.226585  ]
 [5.18130839]
 [1.61373619]
 [1.95427247]
 [2.57868234]
 [2.38698096]
 [3.52986534]
 [3.58266333]
 [2.93266362]
 [1.41178368]
 [3.83485764]
 [3.06317443]
 [3.99088103]
 [3.28480207]
 [4.80502252]
 [2.39609717]
 [3.54269742]
 [4.59543064]
 [3.11042563]
 [3.77592468]
 [4.24836515]
 [2.58512457]
 [3.01844918]
 [3.72882969]
 [4.17384396]
 [1.59459459]
 [4.90859545]
 [4.23099367]
 [4.06

Unnamed: 0,Actual,predicted
748,2,3
749,2,3
750,2,3
751,2,3
752,2,2
753,2,3
754,3,4
755,3,3
756,3,2
757,3,3
