In [1]:
import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score

In [2]:
cwd = os.getcwd()
data_dir = os.path.join(cwd, 'data')
season = '2017-18'
np.random.seed(2)

In [3]:
df = pd.read_csv(os.path.join(data_dir,'Dataframes','clean', 'df_{}.csv'.format(season)))

In [4]:
columns = ['PTS','3P','AST','DRB','ORB','STL','BLK', 'PF']

In [5]:
df['2P'] = (df['PTS'] - 3*df['3P'] - df['FT'])/2
df['2P'] = df['2P'].astype(int)

In [6]:
columns = ['AST','BLK','DRB','ORB','2P','3P','FT','PF','STL']

In [21]:
data_mean = {'Name':[], 'AST':[], 'BLK':[], 'DRB':[], 'ORB':[], '2P':[], '3P':[], 'FT':[], 'PF':[], 'STL':[]}

for name in tqdm(set(df['Name'])):
    df_cluster = df.loc[df['Name']==name]
    data_mean['Name'].append(name)
    
    for column in columns:
        data_mean[column].append(df_cluster[column].mean())
        
df_mean = pd.DataFrame(data_mean).loc[:, ['Name']+columns]
scaler = MinMaxScaler()
scaler.fit_transform(df_mean.loc[:, columns])
df_mean.head()




Unnamed: 0,Name,AST,BLK,DRB,ORB,2P,3P,FT,PF,STL
0,Dewayne Dedmon,1.366667,0.766667,6.0,1.6,3.1,0.8,0.983333,2.516667,0.65
1,Jusuf Nurkic,1.786667,1.386667,6.493333,2.266667,6.026667,0.0,2.253333,3.12,0.813333
2,Treveon Graham,0.797297,0.027027,1.067568,0.567568,0.756757,0.540541,0.554054,1.472973,0.445946
3,Udonis Haslem,0.064935,0.025974,0.077922,0.038961,0.025974,0.012987,0.0,0.025974,0.0
4,Kevon Looney,0.573529,0.75,1.779412,1.088235,1.382353,0.014706,0.617647,1.411765,0.441176


In [23]:
for i in range(df_mean.shape[0]):
    div = sum([df_mean.loc[i, column] for column in columns])
    
    for column in columns:
        if div != 0:
            df_mean.loc[i, column] = (df_mean.loc[i, column]/div)*100
        else:
            df_mean.loc[i, column] = np.nan

In [25]:
df_mean[df_mean.isnull().any(axis=1)]
df_mean = df_mean.dropna().reset_index(drop=True)

In [26]:
X = df_mean.loc[:,columns]

k_means = KMeans(n_clusters=5, verbose=0)
k_means.fit(X)

new_pos = k_means.labels_

In [29]:
df_mean

Unnamed: 0,Name,AST,BLK,DRB,ORB,2P,3P,FT,PF,STL
0,Dewayne Dedmon,7.685098,4.311153,33.739456,8.997188,17.432052,4.498594,5.529522,14.151828,3.655108
1,Jusuf Nurkic,7.399227,5.742684,26.891220,9.387079,24.958586,0.000000,9.331861,12.921038,3.368305
2,Treveon Graham,12.798265,0.433839,17.136659,9.110629,12.147505,8.676790,8.893709,23.644252,7.158351
3,Udonis Haslem,23.809524,9.523810,28.571429,14.285714,9.523810,4.761905,0.000000,9.523810,0.000000
4,Kevon Looney,7.116788,9.306569,22.080292,13.503650,17.153285,0.182482,7.664234,17.518248,5.474453
5,Royce O'Neale,14.518760,2.446982,30.668842,4.078303,11.419250,5.872757,9.298532,15.823817,5.872757
6,Nikola Jokic,18.970917,2.595078,25.145414,8.322148,16.375839,4.653244,11.275168,8.903803,3.758389
7,Emmanuel Mudiay,25.982256,1.394170,15.842839,3.295311,19.898606,5.956907,12.420786,10.012674,5.196451
8,Shelvin Mack,31.818182,1.038961,17.402597,2.857143,15.584416,5.714286,8.441558,10.519481,6.623377
9,Pau Gasol,15.474553,5.226960,32.324622,8.253095,16.231087,2.819807,10.453920,7.702889,1.513067


In [28]:
silhouette_score(X, new_pos, sample_size=200)

0.153547701571111

In [None]:
df['New_Pos'] = new_pos

In [75]:
df['New_Pos'].value_counts()

3    8662
0    7452
2    5593
4    3596
1    2500
Name: New_Pos, dtype: int64

In [77]:
for i in range(5):
    print(df.loc[df['New_Pos']==i, columns].mean(),'\n')

AST    0.069946
BLK    0.042512
DRB    0.130032
ORB    0.069981
2P     0.109066
3P     0.065432
FT     0.052354
PF     0.236536
STL    0.064221
dtype: float64 

AST    0.122064
BLK    0.123880
DRB    0.342655
ORB    0.244567
2P     0.310133
3P     0.061320
FT     0.178914
PF     0.481400
STL    0.140286
dtype: float64 

AST    0.089326
BLK    0.045557
DRB    0.141004
ORB    0.069939
2P     0.124824
3P     0.108046
FT     0.069296
PF     0.621014
STL    0.102654
dtype: float64 

AST    0.016707
BLK    0.008035
DRB    0.026784
ORB    0.012786
2P     0.019906
3P     0.018979
FT     0.011000
PF     0.020396
STL    0.017663
dtype: float64 

AST    0.163426
BLK    0.043548
DRB    0.179505
ORB    0.060044
2P     0.184703
3P     0.259066
FT     0.130688
PF     0.249027
STL    0.257389
dtype: float64 

