## Making Predictions with KNN with NBA Dataset

In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_columns= 90
nba = pd.read_csv("nba_2013.csv")

# The names of the columns in the data
print(nba.columns.values)

['player' 'pos' 'age' 'bref_team_id' 'g' 'gs' 'mp' 'fg' 'fga' 'fg.' 'x3p'
 'x3pa' 'x3p.' 'x2p' 'x2pa' 'x2p.' 'efg.' 'ft' 'fta' 'ft.' 'orb' 'drb'
 'trb' 'ast' 'stl' 'blk' 'tov' 'pf' 'pts' 'season' 'season_end']


In [2]:
nba.columns

Index(['player', 'pos', 'age', 'bref_team_id', 'g', 'gs', 'mp', 'fg', 'fga',
       'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft',
       'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf',
       'pts', 'season', 'season_end'],
      dtype='object')

In [3]:
nba.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,x2p,x2pa,x2p.,efg.,ft,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,4,15,0.266667,62,126,0.492063,0.482,35,53,0.66,72,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,0,0,,93,185,0.502703,0.503,79,136,0.581,142,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,0,0,,143,275,0.52,0.52,76,119,0.639,102,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,128,300,0.426667,336,711,0.472574,0.522,274,336,0.815,32,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,0,1,0.0,136,248,0.548387,0.546,56,67,0.836,94,183,277,40,23,46,63,187,328,2013-2014,2013


In [4]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 31 columns):
player          481 non-null object
pos             481 non-null object
age             481 non-null int64
bref_team_id    481 non-null object
g               481 non-null int64
gs              481 non-null int64
mp              481 non-null int64
fg              481 non-null int64
fga             481 non-null int64
fg.             479 non-null float64
x3p             481 non-null int64
x3pa            481 non-null int64
x3p.            414 non-null float64
x2p             481 non-null int64
x2pa            481 non-null int64
x2p.            478 non-null float64
efg.            479 non-null float64
ft              481 non-null int64
fta             481 non-null int64
ft.             461 non-null float64
orb             481 non-null int64
drb             481 non-null int64
trb             481 non-null int64
ast             481 non-null int64
stl             481 non-null int64
blk    

Looking for the closest neighbor to LeBron James

In [5]:
selected_player = nba[nba["player"] == "LeBron James"].iloc[0]

In [6]:
selected_player

player          LeBron James
pos                       PF
age                       29
bref_team_id             MIA
g                         77
gs                        77
mp                      2902
fg                       767
fga                     1353
fg.                    0.567
x3p                      116
x3pa                     306
x3p.                0.379085
x2p                      651
x2pa                    1047
x2p.                0.621777
efg.                    0.61
ft                       439
fta                      585
ft.                     0.75
orb                       81
drb                      452
trb                      533
ast                      488
stl                      121
blk                       26
tov                      270
pf                       126
pts                     2089
season             2013-2014
season_end              2013
Name: 225, dtype: object

In [7]:
distance_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 
                    'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']

In [8]:
import math

In [9]:
def euclidean_distance(row):
    inner_value = 0
    for k in distance_columns:
        inner_value += (row[k] - selected_player[k]) ** 2
    return math.sqrt(inner_value)

lebron_distance = nba.apply(euclidean_distance, axis=1)

In [10]:
lebron_distance.sort_values()

225      0.000000
17     485.856006
123    543.179606
179    568.833040
162    607.684190
          ...    
456           NaN
460           NaN
461           NaN
468           NaN
473           NaN
Length: 481, dtype: float64

Larger values in column within a dataset can have larger impact in the Euclidean distance calculations. This can be bad, 
because having larger values does not necessarily make a variable better at predicting which rows are similar. A simple 
way to deal with this problem is to normalize all of the columns to have a mean of 0 and a standard deviation of 1. 
This ensures that no single column has a dominant impact on the Euclidean distance calculations

In [11]:
nba_numeric = nba[distance_columns]
nba_normalized = (nba_numeric - nba_numeric.mean()) / nba_numeric.std()

In [12]:
nba_normalized

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,x2p,x2pa,x2p.,efg.,ft,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,-0.835906,0.384886,-0.862207,-0.435088,-0.738401,-0.768505,0.319884,-0.700282,-0.716608,-0.117009,-0.619931,-0.640207,0.240468,0.012541,-0.542173,-0.515408,-0.389712,0.260690,-0.129462,-0.013116,-0.645220,-0.468056,0.061410,-0.667650,0.226515,-0.734621
1,-1.550487,1.095711,-0.187863,-0.045011,-0.581271,-0.649215,0.674593,-0.778936,-0.829601,,-0.409366,-0.439646,0.342330,0.223485,-0.117740,0.117019,-0.882950,1.387883,0.187020,0.565852,-0.530733,0.020680,1.065446,-0.013760,1.363938,-0.534801
2,0.116868,-0.010016,-0.457600,-0.308035,-0.290291,-0.405214,0.846880,-0.778936,-0.829601,,-0.069746,-0.133705,0.507937,0.394250,-0.146678,-0.012515,-0.520826,0.743773,0.283340,0.436083,-0.568895,-0.439307,0.385292,-0.524113,0.029924,-0.328603
3,0.355062,0.779789,1.599148,1.465144,1.577804,1.590172,0.228673,1.737992,1.430256,0.898007,1.241189,1.348408,0.053871,0.414340,1.763270,1.640937,0.578033,-0.383420,0.462221,0.216475,1.033919,-0.123066,-0.683520,1.182380,0.423107,1.729123
4,-0.359519,0.108454,0.149309,-0.319180,-0.331028,-0.475703,1.110379,-0.778936,-0.822068,-1.808704,-0.117293,-0.225487,0.779721,0.655420,-0.339603,-0.408733,0.709147,0.614951,0.138859,0.291341,-0.553630,-0.468056,0.709175,-0.141348,1.139262,-0.400878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476,-1.550487,0.740298,-0.322732,0.588028,0.885271,1.039814,-0.095633,0.007604,0.586576,-0.458948,1.030625,1.039068,0.239243,-0.288809,1.136267,1.564741,-0.508339,0.212382,-0.026261,0.046777,0.797313,1.113149,-0.262473,2.107395,0.633741,0.897955
477,0.355062,0.424376,-0.558752,0.638181,1.129694,1.259415,-0.014557,1.875637,1.806899,0.638211,0.670627,0.763721,0.005604,0.303845,1.387068,1.252338,0.640468,-0.431728,-0.177622,-0.262671,-0.133846,0.193175,-0.392026,0.369005,0.703952,1.333733
478,-0.359519,1.016730,1.767734,1.650152,2.264518,2.327598,0.178000,0.990779,1.369994,0.146594,2.300805,2.300224,0.282640,0.082855,0.692541,0.825641,-0.065049,1.774349,1.012624,1.284571,0.530177,3.671827,0.385292,1.485402,1.504361,1.914063
479,-1.312293,1.135201,-0.761055,0.199066,-0.121522,-0.055479,-0.105767,-0.778936,-0.822068,-1.808704,0.127234,0.301411,-0.384386,-0.549979,0.528556,0.604672,0.047334,1.001417,0.496621,0.670665,-0.156743,0.020680,0.547234,0.241416,0.900544,-0.056507


Now we know enough to find the nearest neighbor of a given row. Instead of the Euclidean distance formula, we can use the distance.euclidean function from scipy.spatial, which is a much faster way to calculate Euclidean distance.

In [13]:
from scipy.spatial import distance

# Fill in the NA values in nba_normalized
nba_normalized.fillna(0, inplace=True)

# Find the normalized vector for Lebron James
lebron_normalized = nba_normalized[nba["player"] == "LeBron James"]

Find the player who is most similar to LeBron James by our distance metric. You can accomplish this by finding the second 
lowest value in the euclidean_distances series (the lowest value will correspond to Lebron, as he is most similar to himself), and then cross-referencing the NBA dataframe with the same index

In [14]:
# Find the distance between Lebron James and everyone else.
euclidean_distances = nba_normalized.apply(lambda row: distance.euclidean(row, lebron_normalized), axis=1)
distance_frame = pd.DataFrame(data={"dist": euclidean_distances, "idx": euclidean_distances.index})
distance_frame.sort_values("dist", inplace=True)
second_smallest = distance_frame.iloc[1]["idx"]
most_similar_to_lebron = nba.loc[int(second_smallest)]["player"]

You can accomplish this by finding the second lowest value in the euclidean_distances series (the lowest value will correspond to Lebron, as he is most similar to himself), and then cross-referencing the NBA dataframe with the same index

In [15]:
'{} is the closest to Lebron James with a distance of {}'.format(most_similar_to_lebron, second_smallest)

'Carmelo Anthony is the closest to Lebron James with a distance of 17.0'

Now that we know how to find the nearest neighbors, we can make predictions on a test set. First, we have to generate testing and training sets. We'll use random sampling to do this. We'll randomly shuffle the index of the nba dataframe, and then pick rows using the randomly shuffled values. If we didn't do this, we'd end up predicting and training on the same data set, which would overfit. We could do cross-validation also, which would be slightly better, but also slightly more complex.

In [16]:
import random
from numpy.random import permutation

# Randomly shuffle the index of nba
random_indices = permutation(nba_normalized.index)
# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items)
test_cutoff = math.floor(len(nba_normalized)/3)
# Generate the test set by taking the first 1/3 of the randomly shuffled indices
test = nba_normalized.loc[random_indices[1:test_cutoff]]
# Generate the train set with the rest of the data
train = nba_normalized.loc[random_indices[test_cutoff:]]

In [17]:
train

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,x2p,x2pa,x2p.,efg.,ft,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
99,-0.121325,0.779789,1.599148,1.347006,1.571985,1.641684,0.137462,1.285731,1.362461,0.480316,1.390622,1.443590,0.167464,0.193350,1.290606,1.183761,0.578033,-0.238495,0.063179,-0.028089,2.506982,2.033124,-0.359638,1.246174,0.493318,1.571817
455,1.784224,-1.155233,-0.862207,-1.199639,-1.105036,-1.120951,-1.656354,-0.739609,-0.776871,0.003826,-1.034268,-1.054927,-2.077092,-1.172769,-0.841205,-0.888767,1.733084,-0.818194,-1.051387,-1.016328,-0.645220,-0.985541,-0.748297,-0.874981,-1.023245,-1.072615
42,-0.359519,0.068964,-0.794772,-0.452920,-0.092424,-0.188324,0.634055,-0.366003,-0.422827,0.658351,0.018556,-0.045322,0.491397,0.474610,-0.291372,-0.317298,0.309562,-0.399522,-0.150102,-0.232724,-0.538365,-0.468056,-0.100532,-0.237040,-0.180710,-0.171297
179,-0.597712,1.056220,1.835168,1.811755,3.055985,2.533643,0.927957,-0.542974,-0.498155,-0.078562,3.754381,3.401611,0.669570,0.524835,3.769680,4.216359,-0.046318,2.193021,2.767032,2.687071,1.499498,1.515638,0.871116,2.426365,2.234558,3.004573
219,1.307836,-2.024019,-0.862207,-1.373502,-1.122495,-1.145351,-4.423087,-0.778936,-0.822068,-1.808704,-1.041060,-1.065125,-4.470637,-4.829148,-0.879790,-0.919246,0.000000,-0.898708,-1.113307,-1.086203,-0.858928,-1.129287,-0.780685,-1.146106,-1.472598,-1.098124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32,-1.312293,0.977240,-0.052994,1.077296,0.454619,0.690079,-0.379400,0.518855,0.601642,0.394951,0.351384,0.593754,-0.456921,-0.328989,0.374217,0.459900,-0.027588,0.164074,0.565422,0.461039,0.026436,0.739410,-0.132920,0.193571,0.732037,0.470679
51,0.116868,0.779789,-0.626186,0.424196,0.745600,0.676524,0.400960,-0.483984,-0.422827,-0.046522,1.037417,1.039068,0.254685,0.062766,0.702187,0.764684,0.122256,1.033623,0.737422,0.855336,-0.004094,1.026902,0.450069,0.608233,0.844375,0.647116
148,2.975192,1.095711,-0.862207,0.212440,-0.301930,-0.172058,-0.460477,0.951452,0.895424,0.629109,-0.681063,-0.619811,-0.626458,0.313890,-0.349249,-0.378255,0.328292,-0.705475,-0.363383,-0.482280,0.034068,0.883156,-0.683520,-0.364628,0.717994,-0.194681
27,-0.835906,-1.550136,-0.862207,-1.231960,-1.075938,-1.069439,-1.717161,-0.660955,-0.626214,-0.398958,-1.027476,-1.058326,1.912151,-1.142634,-0.879790,-0.919246,0.000000,-0.834297,-1.030746,-1.006345,-0.836031,-0.956792,-0.780685,-1.098260,-1.304091,-1.051358


Scikit-learn makes a regressor and a classifier available, we'll be using the regressor, as we have continuous values to predict on. Sklearn performs the normalization and distance finding automatically, and lets us specify how many neighbors we want to look at.

In [18]:
# The columns that we'll be using to make predictions
x_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta',
             'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']
# The column we want to predict
y_column = ["pts"]

from sklearn.neighbors import KNeighborsRegressor
# Create the kNN model
knn = KNeighborsRegressor(n_neighbors=5)
# Fit the model on the training data
knn.fit(train[x_columns], train[y_column])
# Make predictions on the test set using the fit model
predictions = knn.predict(test[x_columns])

In [19]:
actual = test[y_column]
mse = (((predictions - actual) ** 2).sum()) / len(predictions)
rmse = np.sqrt(mse)

In [20]:
print(mse)
print(rmse)

pts    0.031431
dtype: float64
pts    0.177288
dtype: float64
