## Importing Libraries

In [1]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Data

In [2]:
# Location of the NBA 2013 season statistics CSV.
# The default is the original author's local path; set the NBA_2013_CSV
# environment variable to point at the file on another machine instead of
# editing this cell.
DATA_PATH = os.environ.get(
    "NBA_2013_CSV",
    "D:/Work/Society For Health And Medical Technology/nba_2013.csv",
)

# Load the raw data and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


In [3]:
# Show every column label of the loaded frame (the head() preview elides the middle columns).
df.columns

Index(['player', 'pos', 'age', 'bref_team_id', 'g', 'gs', 'mp', 'fg', 'fga',
       'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft',
       'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf',
       'pts', 'season', 'season_end'],
      dtype='object')

## Checking Null Values

In [4]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

player           0
pos              0
age              0
bref_team_id     0
g                0
gs               0
mp               0
fg               0
fga              0
fg.              2
x3p              0
x3pa             0
x3p.            67
x2p              0
x2pa             0
x2p.             3
efg.             2
ft               0
fta              0
ft.             20
orb              0
drb              0
trb              0
ast              0
stl              0
blk              0
tov              0
pf               0
pts              0
season           0
season_end       0
dtype: int64

## Replacing Null Values with Mean

In [5]:
# Replace missing values with the mean of their column.
#
# The result is assigned back to df[col] instead of calling
# fillna(inplace=True) on the df[col] selection: that form is a chained
# in-place update, which emits a FutureWarning on pandas 2.x and, with
# Copy-on-Write enabled (the default from pandas 3.0), modifies a temporary
# object and leaves df untouched.
for col in df.columns[df.isnull().any()]:
    df[col] = df[col].fillna(df[col].mean())


In [6]:
# Re-count missing values per column to confirm the imputation left none behind.
df.isna().sum()

player          0
pos             0
age             0
bref_team_id    0
g               0
gs              0
mp              0
fg              0
fga             0
fg.             0
x3p             0
x3pa            0
x3p.            0
x2p             0
x2pa            0
x2p.            0
efg.            0
ft              0
fta             0
ft.             0
orb             0
drb             0
trb             0
ast             0
stl             0
blk             0
tov             0
pf              0
pts             0
season          0
season_end      0
dtype: int64

## Selecting Numeric and Categorical Columns

In [7]:
# Partition the column labels by dtype: object-dtype columns are treated as
# categorical, every other dtype as numeric. Column order is preserved.
category_col = [name for name in df if df[name].dtype == "O"]
numeric_col = [name for name in df if df[name].dtype != "O"]

# Numeric-only view of the data, used as input to the normalisation step.
numeric_df = df.loc[:, numeric_col]

## Normalizing Numeric Columns

In [8]:
def _min_max_scale(column):
    """Rescale a numeric Series to the [0, 1] range with min-max scaling.

    A column whose minimum equals its maximum has zero range; plain min-max
    scaling would compute 0 / 0 and fill the column with NaN, so such a
    column is mapped to all zeros instead.
    """
    low = column.min()
    span = column.max() - low
    if span == 0:
        # Constant column (season_end looks like one in the preview): avoid 0 / 0.
        return (column - low).astype(float)
    return (column - low) / span


# Scale every numeric column independently to [0, 1].
df_normalized = numeric_df.apply(_min_max_scale)


In [9]:
# Keep the identifying text columns (player, team, season) so they can be
# re-attached to the normalised numeric data.
df_category = df.loc[:, ['player', 'bref_team_id', 'season']]


## Concatenating Categorical and Numeric Columns

In [10]:
# Place the identifier columns and the normalised numeric columns side by side,
# aligned on the shared row index.
final_df = pd.concat(
    [df_category, df_normalized],
    axis="columns",
)

## Selecting Dependent and Independent Columns

In [11]:
# Columns that must not be used as features: the text identifiers, the
# target itself ("pts"), and "season_end".
non_feature_columns = ['player', 'bref_team_id', 'season', "pts", "season_end"]

# Feature matrix (all remaining normalised columns) and target (normalised points).
X = final_df.drop(non_feature_columns, axis=1)
y = final_df["pts"]



## Splitting Into Train and Test Sets

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=0,
)

## Checking Which Value of K Gives the Highest Score

In [13]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics

# Fit one k-nearest-neighbours regressor per k in 1..10 and report its R^2.
# NOTE(review): each k is scored on the held-out test split, so the k chosen
# from this sweep is tuned on the same data later used to report performance.
score_message = "Regression score for {} k_value is : {}"
for k in range(1, 11):
    knn = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    y_predict = knn.predict(X_test)
    r2 = metrics.r2_score(y_test, y_predict)
    print(score_message.format(k, r2))


Regression score for 1 k_value is : 0.9317393903620022
Regression score for 2 k_value is : 0.957721974848286
Regression score for 3 k_value is : 0.963204226605198
Regression score for 4 k_value is : 0.9638733750401692
Regression score for 5 k_value is : 0.9632497237642577
Regression score for 6 k_value is : 0.9629952220059651
Regression score for 7 k_value is : 0.9603490791203279
Regression score for 8 k_value is : 0.9586363144284761
Regression score for 9 k_value is : 0.9641247642522749
Regression score for 10 k_value is : 0.9606068045874263


## K=9 Gives the Highest Score

In [14]:
# Refit with the selected neighbourhood size and predict the held-out rows.
knn = KNeighborsRegressor(n_neighbors=9).fit(X_train, y_train)
y_predict = knn.predict(X_test)

# The target was min-max scaled together with the other numeric columns, so
# the error below is expressed in scaled units rather than raw points.
mse = metrics.mean_squared_error(y_test, y_predict)
r2 = metrics.r2_score(y_test, y_predict)
print("Mean Squared Error is: {}".format(mse))
print("Regression score is: {}".format(r2))

Mean Squared Error is: 0.001082942720195031
Regression score is: 0.9641247642522749


## Comparing Actual and Predicted Data 

In [15]:
# Line up the true and predicted (normalised) point totals for the test rows.
comparison_columns = {
    'Actual Points': y_test.tolist(),
    'Predicted Points': y_predict.tolist(),
}
prediction_comperession = pd.DataFrame(comparison_columns)

# Preview up to the first 50 test rows.
prediction_comperession.head(50)

Unnamed: 0,Actual Points,Predicted Points
0,0.168145,0.122466
1,0.276514,0.296825
2,0.422676,0.361743
3,0.007327,0.011955
4,0.381026,0.360029
5,0.160432,0.155933
6,0.100656,0.106098
7,0.271115,0.262502
8,0.032009,0.031752
9,0.009641,0.012727
