In [25]:

import re
import pandas as pd
import numpy as np
import math
from scipy.spatial import distance
import random
from numpy.random import permutation
from sklearn.neighbors import KNeighborsRegressor

import warnings
warnings.filterwarnings("ignore")

In [26]:
nba_raw = pd.read_csv('nba_2013.csv')

In [27]:
nba = nba_raw.fillna(0)

# Convert strings to NaN and drop.
nba = nba.convert_objects(convert_numeric=True).dropna()
    
# The names of the columns in the data.
print("nba.columns.values:", nba.columns.values)

nba.head(5)

nba.columns.values: ['Rk' 'Player' 'Pos' 'Age' 'Tm' 'G' 'GS' 'MP' 'FG' 'FGA' 'FG%' '3P' '3PA'
 '3P.1' '2P' '2PA' '2P.1' 'eFG%' 'FT' 'FTA' 'FT%' 'ORB' 'DRB' 'TRB' 'AST'
 'STL' 'BLK' 'TOV' 'PF' 'PTS']


Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1.0,Quincy Acy,PF,22.0,TOR,29.0,0.0,342.0,42.0,75.0,...,0.816,30.0,47.0,77.0,11.0,13.0,15.0,17.0,53.0,116.0
1,2.0,Jeff Adrien,PF,26.0,CHA,52.0,5.0,713.0,72.0,168.0,...,0.65,68.0,128.0,196.0,36.0,18.0,27.0,32.0,80.0,209.0
2,3.0,Arron Afflalo,SF,27.0,ORL,64.0,64.0,2307.0,397.0,905.0,...,0.857,29.0,210.0,239.0,206.0,40.0,11.0,138.0,137.0,1057.0
3,4.0,Josh Akognon,PG,26.0,DAL,3.0,0.0,9.0,2.0,4.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,3.0,5.0
4,5.0,Cole Aldrich,C,24.0,TOT,45.0,0.0,388.0,44.0,80.0,...,0.6,30.0,90.0,120.0,9.0,5.0,23.0,23.0,60.0,100.0


In [28]:

selected_player = nba[nba["Player"] == "LeBron James"].iloc[0]
distance_columns = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA',
 '3P.1', '2P', '2PA', '2P.1', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST',
 'STL', 'BLK', 'TOV', 'PF', 'PTS']

def euclidean_distance(row):
    inner_value = 0
    for k in distance_columns:
        inner_value += (row[k] - selected_player[k]) ** 2
    return math.sqrt(inner_value)

lebron_distance = nba.apply(euclidean_distance, axis=1)
print("lebron_distance[:5]:\n", lebron_distance[:5])

lebron_distance[:5]:
 0    3878.055056
1    3485.609452
2    1561.897265
3    4237.902735
4    3845.353715
dtype: float64


In [29]:
nba_numeric = nba[distance_columns]
nba_numeric.head(5)

nba_normalized = (nba_numeric - nba_numeric.mean()) / nba_numeric.std()
nba_normalized.head(5)

Unnamed: 0,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P.1,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,-1.07209,-0.779918,-0.788082,-0.877945,-0.786532,-0.86484,1.275708,-0.692923,-0.744648,1.442072,...,0.623629,-0.345612,-0.684541,-0.60083,-0.716027,-0.658966,-0.254608,-0.773548,-0.56334,-0.764147
1,-0.126638,0.107234,-0.612946,-0.455442,-0.601922,-0.597891,0.051613,-0.714468,-0.744648,-1.283917,...,-0.158543,0.284652,-0.096167,0.020484,-0.5197,-0.513235,0.117009,-0.526808,-0.167436,-0.550369
2,0.109725,0.570095,1.45366,1.359842,1.398023,1.517615,0.145056,0.836742,1.215335,0.351676,...,0.816816,-0.362198,0.49947,0.244992,0.815321,0.127978,-0.378481,1.216818,0.668362,1.398917
3,-0.126638,-1.782785,-0.788082,-1.257173,-1.032679,-1.068641,0.715054,-0.692923,-0.744648,1.442072,...,-3.221262,-0.843189,-1.018679,-0.997636,-0.794557,-1.037865,-0.71913,-1.053186,-1.296497,-1.019301
4,-0.599364,-0.162769,-0.788082,-0.825559,-0.774225,-0.850488,1.182265,-0.714468,-0.761119,-1.283917,...,-0.394137,-0.345612,-0.372194,-0.376322,-0.731733,-0.892134,-0.006864,-0.674852,-0.460699,-0.800926


In [30]:


# Fill in NA values in nba_normalized.
nba_normalized.fillna(0, inplace=True)

# Find the normalized vector for lebron james.
lebron_normalized = nba_normalized[nba["Player"] == "LeBron James"]

# Find the distance between lebron james and everyone else.
euclidean_distances = nba_normalized.apply(lambda row: distance.euclidean(row, lebron_normalized), axis=1)
distance_frame = pd.DataFrame(data={"dist": euclidean_distances, "idx": euclidean_distances.index})
distance_frame.sort_values("dist", inplace=True)

second_smallest = distance_frame.iloc[1]["idx"]

most_similar_to_lebron = nba.loc[int(second_smallest)]["Player"]
print("most_similar_to_lebron:", most_similar_to_lebron)

most_similar_to_lebron: Russell Westbrook


In [31]:


# Randomly shuffle the index of nba.
random_indices = permutation(nba.index)

# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items).
test_cutoff = math.floor(len(nba)/3)

# Generate the test set by taking the first 1/3 of the randomly shuffled indices.
test = nba.loc[random_indices[1:test_cutoff]]

# Generate the train set with the rest of the data.
train = nba.loc[random_indices[test_cutoff:]]

In [32]:



# The columns that we will be making predictions with.
x_columns = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA',
 '3P.1', '2P', '2PA', '2P.1', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST',
 'STL', 'BLK', 'TOV', 'PF']
# The column that we want to predict.
y_column = ['PTS']

# Create the knn model.
knn = KNeighborsRegressor(n_neighbors=5)

# Fit the model on the training data.
knn.fit(train[x_columns], train[y_column])

# Make predictions on the test set using the fit model.
predictions = knn.predict(test[x_columns])

print("predictions:", predictions[:5])

predictions: [[1.4000e+00]
 [3.1000e+01]
 [1.8920e+02]
 [1.7436e+03]
 [7.3200e+01]]


In [33]:
# Computing error
actual = test[y_column]

mse = (((predictions - actual) ** 2).sum()) / len(predictions)

print("actual:", actual[:20])
print("mse:", mse)

actual:         PTS
3       5.0
410    33.0
51    181.0
573  1903.0
361    51.0
388   164.0
430  1430.0
66   1281.0
507   270.0
533   253.0
433     8.0
456   377.0
439   105.0
117   568.0
258   170.0
261   341.0
374   118.0
513    99.0
399   822.0
512  1166.0
mse: PTS    6382.735368
dtype: float64
