In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import matplotlib.ticker as mtick
import seaborn as sns
import pickle

from nba_stats.read_write.basic_stats import ReadDatabase

In [None]:
from clustering import plot_cluster, run_clustering, NormaliseFeatures

In [None]:
%matplotlib notebook
# Notebook-wide pandas display settings: floats are rendered with one decimal
# place and at most 30 columns are shown before pandas truncates the view.
pd.set_option('display.float_format', '{:.1f}'.format)
pd.set_option('display.max_columns', 30)

In [None]:
# Instantiate the project's database reader (nba_stats.read_write.basic_stats).
# The cells below call basic_summary() and get_summary() on this object.
sql_conn = ReadDatabase()

In [None]:
# Request two regular-season summaries under the names that the next cell
# retrieves with sql_conn.get_summary():
#   - 'career'    : groupby=None is passed explicitly
#   - 'by_season' : groupby is left at basic_summary's default
# NOTE(review): presumably groupby=None aggregates over all seasons and the
# default groups per season -- confirm against ReadDatabase.basic_summary.
sql_conn.basic_summary(playoffs='regular', groupby=None, summary_name='career')
sql_conn.basic_summary(playoffs='regular', summary_name='by_season')

In [None]:
# Thresholds and season markers used to filter the summaries below.
MIN_GAMES_PER_SEASON = 10      # a season row must have MORE than this many games
MIN_GAMES_CAREER = 4 * 82      # presumably four full 82-game regular seasons
FOCUS_SEASON = 2019
# Careers whose max_season equals this value are excluded from the "finished"
# set (presumably the latest season in the database -- confirm).
CURRENT_SEASON = 2020

# Fetch each summary once; the original called get_summary() twice per frame
# (once for the data and once more just to build the boolean mask).
by_season_summary = sql_conn.get_summary('by_season')
career_summary = sql_conn.get_summary('career')

# Season rows with more than MIN_GAMES_PER_SEASON games.
stats_season = by_season_summary[
    by_season_summary.loc[:, 'game_count'] > MIN_GAMES_PER_SEASON
].reset_index(drop=True)

# The FOCUS_SEASON slice of the filtered season rows.
stats_2019 = stats_season[
    stats_season.loc[:, 'season'] == FOCUS_SEASON
].reset_index(drop=True)

# Career rows with more than MIN_GAMES_CAREER games.
stats_career = career_summary[
    career_summary.loc[:, 'game_count'] > MIN_GAMES_CAREER
].reset_index(drop=True)

# Qualifying careers whose last recorded season is not CURRENT_SEASON.
stats_career_finished = stats_career[
    stats_career.loc[:, 'max_season'] != CURRENT_SEASON
].reset_index(drop=True)

# Clustering Algorithm

In [None]:
# Work on a copy: the next cell adds fg2* columns and fills missing
# percentages on `stats`, and stats_career should stay untouched by that.
stats = stats_career.copy()

In [None]:
# Derive two-point shooting columns as (all field goals) - (three-pointers).
stats['fg2a'] = stats['fga'] - stats['fg3a']
stats['fg2'] = stats['fg'] - stats['fg3']
stats['fg2_pct'] = stats['fg2'] / stats['fg2a']

# Replace missing shooting percentages with the column mean.
# Original author's note: 1 player has no fta, 20 have no 3pa.
for pct_column in ('ft_pct', 'fg2_pct', 'fg3_pct'):
    column_mean = stats[pct_column].mean()
    stats[pct_column] = stats[pct_column].fillna(column_mean)

# Feature columns fed to the clustering, in the column order of X.
desired_stats = [
    'pts', 'trb', 'ast', 'blk', 'stl', 'tov',
    'fg3_pct', 'fg2_pct', 'ft_pct',
    'fg3a', 'fg2a', 'fta',
]

# Identifying columns, restricted to those actually present in `stats`.
info_stats = [
    name
    for name in ['last_name', 'first_name', 'season', 'min_season', 'max_season']
    if name in stats.columns
]

# `data` keeps identifiers next to the features; X is the bare feature matrix.
data = stats[info_stats + desired_stats]
X = data[desired_stats].to_numpy()

In [None]:
# Cluster the feature matrix X. The four returned values are reused below:
# final_cost is plotted, final_clusters/final_closest are passed to
# plot_cluster, and all four are pickled in the last cell.
# NOTE(review): the positional arguments 15, 100, 50 are opaque from this
# notebook; presumably 15 is the number of clusters (the elbow sweep below
# varies that same position) -- confirm against clustering.run_clustering.
# NOTE(review): if run_clustering initialises randomly, results will differ
# between runs unless a seed is set -- confirm.
final_clusters, final_closest, final_cost, close_min_max = run_clustering(X, 15, 100, 50)

In [None]:
# Draw the cost values returned by the clustering run on a fresh figure.
# The trailing semicolon suppresses the Line2D repr in the cell output.
cost_fig, cost_ax = plt.subplots()
cost_ax.plot(final_cost);

In [None]:
# Render the clustered points, then label the three axes of the current plot.
# NOTE(review): the labels presumably match the first three columns of X
# (pts, trb, ast) -- confirm that plot_cluster plots those columns.
plot_cluster(X, closest_cluster=final_closest, clusters=final_clusters)
cluster_ax = plt.gca()
cluster_ax.set_xlabel('Points')
cluster_ax.set_ylabel('Rebounds')
cluster_ax.set_zlabel('Assists');

In [None]:
# Elbow sweep: re-run the clustering with the second argument going from 2 to
# 30 (presumably the number of clusters -- confirm) and record, for each value,
# the first entry of the last row of the returned cost history.
elbow_cost = []
for k in range(2, 31):
    # Index 2 of the result is the cost history (third of four return values).
    cost_history = run_clustering(X, k, 50, 50, suppress=True)[2]
    elbow_cost.append([k, cost_history[-1][0]])

In [None]:
# Elbow plot: final cost recorded by the sweep against the swept value.
# Each elbow_cost row is [swept value, final cost].
plt.figure()
swept_values = [row[0] for row in elbow_cost]
final_costs = [row[1] for row in elbow_cost]
plt.plot(swept_values, final_costs)
# Axis labels let the figure stand alone when the notebook is skimmed.
# NOTE(review): the x label assumes the swept run_clustering argument is the
# number of clusters, as in a standard elbow plot -- confirm.
plt.xlabel('Number of clusters')
# The trailing semicolon keeps the matplotlib object repr out of the output.
plt.ylabel('Final cost');

In [None]:
# Optional: run this cell to save a copy of the results.
# The pickle holds a list, in this order: the stats frame, the clusters, the
# closest-cluster assignment, the cost history and close_min_max.
pickle_file = 'career.pickle'
results_to_save = [stats, final_clusters, final_closest, final_cost, close_min_max]
with open(pickle_file, mode="wb") as handle:
    pickle.dump(results_to_save, handle)