In [353]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial import distance
import warnings
from sklearn.decomposition import PCA
warnings.filterwarnings("ignore")
import plotly.express as px



class NBA_Roster_Analysis(object):
    
    
    def __init__(self):
        """Load the data tables and pre-fitted objects from the working directory.

        Reads ``mapping.csv``, ``NBA_Player_Data_1999-2020.csv``,
        ``minutes_played.csv`` and ``NBA_Team_Data_1999-2020.csv`` together
        with the pickled objects ``kde.pkl`` and ``knn.pkl``, then derives
        the player-stat correlation matrix and the min-max normalized team
        statistics used by the other methods.
        """
        mapping = pd.read_csv('mapping.csv', index_col=0)
        player_table = pd.read_csv('NBA_Player_Data_1999-2020.csv', index_col=0)
        self.players = pd.merge(player_table, mapping, on='TEAM')

        # Per-player columns, in the order the sampling methods emit them.
        self.player_stats_required = ['POS', 'MIN', 'PTS', 'FGM', 'FGA',
       'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'REB', 'AST', 'STL',
       'BLK', 'TO', 'DD2', 'TD3', 'PER', 'AGE', 'OFFRTG', 'DEFRTG', 'NETRTG',
       'AST%', 'AST/TO', 'AST RATIO', 'OREB%', 'DREB%', 'REB%', 'TO RATIO',
       'EFG%', 'TS%', 'USG%', 'PACE', 'PIE', 'POSS']

        # NOTE(security): unpickling executes arbitrary code; only load
        # kde.pkl / knn.pkl from a trusted source.
        self.kde = pd.read_pickle('kde.pkl')

        self.minutes = pd.read_csv('minutes_played.csv', index_col=0)

        self.teams = pd.read_csv('NBA_Team_Data_1999-2020.csv', index_col=0).replace('LA Clippers', 'Los Angeles Clippers')

        self.knn = pd.read_pickle('knn.pkl')

        # Keep only numeric/bool columns before correlating: the remaining
        # frame still holds text columns such as 'POS', and DataFrame.corr()
        # raises on non-numeric data in pandas >= 2.0 instead of silently
        # skipping it.
        player_features = self.players.drop(['PLAYER', 'YEAR', 'TEAM NAME', 'TEAM'], axis=1)
        self.corr = player_features.select_dtypes(include=['number', 'bool']).corr()

        self.team_stats = self.teams.drop(['TEAM', 'YEAR', 'GP', 'W', 'L', 'MIN'], axis=1)

        self.normalized_team_stats = self.normalize_team_stats(self.team_stats)

        # Team-level statistics; one model file per entry is loaded by
        # predict_team_stats.
        self.team_stats_required = ['WIN%', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA',
       'FT%', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF',
       'PFD', '+/-', 'OFFRTG', 'DEFRTG', 'NETRTG', 'AST%', 'AST/TO',
       'AST\nRATIO', 'OREB%', 'DREB%', 'REB%', 'TOV%', 'EFG%', 'TS%', 'PACE',
       'PIE', 'POSS']
    
    
    
    def visualize_teams(self,color='Clusters',hover_data=None):
        """Show a 2-component PCA scatter plot of the normalized team statistics.

        Args:
            color: Column of the plotted frame used to color the points.
            hover_data: Columns shown on hover; defaults to ``['WIN%']``.

        Returns:
            The plotted DataFrame (team label, cluster label and the raw team
            statistics) without the ``PC1``/``PC2`` columns.
        """
        # A None sentinel avoids sharing one mutable default list across calls.
        if hover_data is None:
            hover_data = ['WIN%']

        reducer = PCA(n_components=2)
        components = reducer.fit_transform(self.normalized_team_stats)
        projected = pd.DataFrame(data=components, columns=["PC1", "PC2"])

        projected['TEAM'] = self.teams['TEAM'] + " " + self.teams['YEAR']
        projected['Clusters'] = self.knn.labels_
        # Object dtype makes plotly treat the cluster ids as discrete categories.
        projected['Clusters'] = projected['Clusters'].astype('O')

        projected = pd.concat([projected, self.team_stats], axis=1)

        fig = px.scatter(projected, x='PC1', y='PC2', color=color, hover_data=hover_data)
        fig.show()

        return projected.drop(['PC1', 'PC2'], axis=1)
    
    
    def normalize_team_stats(self,team_stats):
        """Min-max scale every column of ``team_stats`` with a freshly fitted scaler.

        Returns a new DataFrame that keeps the input's column labels but has
        a default integer index (the input index is not carried over).
        """
        scaled_values = MinMaxScaler().fit_transform(team_stats)
        return pd.DataFrame(scaled_values, columns=team_stats.columns)

    
    def normalize_player_data(self):
        """Return the player table with its feature columns min-max scaled.

        The identifier columns are passed through untouched; ``POS`` is
        one-hot encoded and placed ahead of the remaining feature columns
        before scaling.
        """
        id_columns = ['PLAYER','YEAR','GP','TEAM NAME','TEAM']
        identifiers = self.players[id_columns]

        features = self.players.drop(id_columns, axis=1)
        position_dummies = pd.get_dummies(features['POS'])
        features = pd.concat([position_dummies, features.drop(['POS'], axis=1)], axis=1)

        scaled = pd.DataFrame(MinMaxScaler().fit_transform(features), columns=features.columns)

        return pd.concat([identifiers, scaled], axis=1)
    
    
    def view_available_players(self):
        return list(self.players['PLAYER'].unique())
    
    
    
    def adjust_to_minutes(self,minutes:float, player_stats:pd.DataFrame):
        adjusted_player_stat=[]

        for i in self.player_stats_required:
            if i=='POS':

                adjusted_player_stat.append(max(set(list(player_stats[i])), key=list(player_stats[i]).count))
            elif i=='MIN':
                adjusted_player_stat.append(minutes)
                
            elif (i in self.corr[(self.corr['MIN']>0.5) & (self.corr['MIN']!=1)][['MIN']].index):
                adjusted_player_stat.append(np.mean((player_stats[i])/np.mean(player_stats['MIN']))*minutes)

            else:
                adjusted_player_stat.append(np.mean((player_stats[i])))

        return adjusted_player_stat



    def player_stats_sampling(self,player: str, minutes: float,stats_selection_method:str = 'prime' , prime_window=None):

        if player not in self.players['PLAYER'].unique():
            raise Exception("Invalid player: '{}' - Please select from player list shown at .view_available_players()".format(player))

        player_stats=self.players[self.players['PLAYER']==player]

        if stats_selection_method=='best':
            return [player]+self.adjust_to_minutes(minutes,player_stats.sort_values('PIE',ascending=False).head(1)[self.player_stats_required])

        elif stats_selection_method=='prime':
            if prime_window==None:
                years=5
            else:
                years=prime_window

            return [player]+self.adjust_to_minutes(minutes,player_stats.sort_values('PIE',ascending=False).head(years)[self.player_stats_required])

        else:
            return [player]+self.adjust_to_minutes(minutes,player_stats[self.player_stats_required])




    def team_stats_sampling(self,team:list, minutes_selection_method:str ='sample', stats_selection_method:str = 'prime' , prime_window=None):

        if len(team)!=8:
            raise Exception("Team must contain 8 players, contains '{}' players".format(len(team)))

        if minutes_selection_method not in ['sample','average']:
            raise Exception("Invalid minutes_selection_method: '{}' - Please select from ['sample','average']".format(minutes_selection_method))

        if stats_selection_method not in ['prime','average','best']:
             raise Exception("stats_selection_method: '{}' - Please select from ['prime','average','best']".format(stats_selection_method))

        if stats_selection_method!='prime':
            if prime_window!=None:
                raise Exception("prime_window requires stats_selection_method='prime'".format(stats_selection_method))


        if minutes_selection_method=='sample':
            minutes=self.kde.sample(1)[0]
        else:
            minutes=[np.mean(self.minutes[self.minutes['player']==i]['minutes']) for i in sorted(self.minutes['player'].unique())]


        team_stats=[]

        for player,minute in zip(team,minutes):

            team_stats.append(self.player_stats_sampling(player,minutes=minute,stats_selection_method=stats_selection_method,prime_window=prime_window))

        return team_stats

    
    def normalize_sampled_stats(self,team:list, minutes_selection_method:str ='sample', stats_selection_method:str = 'prime' , prime_window=None):
        """Sample stat lines for ``team`` and min-max scale them alongside all players.

        The sampled rows are stacked on top of the full player table, ``POS``
        is one-hot encoded, every feature is min-max scaled over the combined
        data, and only the first eight rows (the sampled team) are returned
        together with their ``PLAYER`` names.
        """
        sampled_rows = self.team_stats_sampling(team, minutes_selection_method, stats_selection_method, prime_window)
        wanted_columns = ['PLAYER'] + self.player_stats_required

        sampled_players = pd.DataFrame(sampled_rows, columns=wanted_columns)
        all_players = self.players[wanted_columns]
        combined = pd.concat([sampled_players, all_players])

        features = combined[self.player_stats_required]
        position_dummies = pd.get_dummies(features['POS'])
        features = pd.concat([position_dummies, features.drop(['POS'], axis=1)], axis=1)

        scaled = pd.DataFrame(MinMaxScaler().fit_transform(features), columns=features.columns)

        # Reset the name index so it lines up with the scaled frame's default index.
        names = combined[['PLAYER']].reset_index(drop=True)
        return pd.concat([names, scaled], axis=1).head(8)
        
        
    
    def predict_roster_cluster(self,team:list, minutes_selection_method:str ='sample', stats_selection_method:str = 'prime' , prime_window=None):
        """Return the index of the highest output of the ``models/CLUSTER.h5`` model for ``team``.

        The normalized roster (without the ``PLAYER`` column) is reshaped to
        ``(1, rows, columns, 1)`` before being passed to the model.
        """
        normalized = self.normalize_sampled_stats(team, minutes_selection_method, stats_selection_method, prime_window)
        features = np.asarray(normalized.drop(['PLAYER'], axis=1))
        model_input = features.reshape((1,) + features.shape + (1,))

        classifier = load_model('models/CLUSTER.h5')
        scores = list(classifier.predict(model_input)[0])

        return scores.index(max(scores))
        
    
    def predict_team_stats(self,team:list, minutes_selection_method:str ='sample', stats_selection_method:str = 'prime' , prime_window=None):
        """Predict every statistic in ``self.team_stats_required`` for ``team``.

        One model per statistic is loaded from ``models/<stat>.h5`` (with
        ``/`` in the stat name replaced by ``_``) and applied to the
        normalized roster reshaped to ``(1, rows, columns, 1)``.

        Returns:
            A Series mapping each statistic name to its predicted value.
        """
        normalized = self.normalize_sampled_stats(team, minutes_selection_method, stats_selection_method, prime_window)
        features = np.asarray(normalized.drop(['PLAYER'], axis=1))
        model_input = features.reshape((1,) + features.shape + (1,))

        def r2_keras(y_true, y_pred):
            # R^2 metric; passed to load_model under the name "r2_keras".
            residual = K.sum(K.square(y_true - y_pred))
            total = K.sum(K.square(y_true - K.mean(y_true)))
            return (1 - residual / (total + K.epsilon()))

        custom_objects = {"r2_keras": r2_keras}
        predictions = {}

        for stat_name in self.team_stats_required:
            model_path = "models/{}.h5".format(stat_name.replace("/", "_"))
            regressor = load_model(model_path, custom_objects=custom_objects)
            predictions[stat_name] = regressor.predict(model_input)[0][0]

        return pd.Series(predictions)
    
    
    def k_nearest_neighbors(self,team:list, minutes_selection_method:str ='sample', stats_selection_method:str = 'prime' , prime_window=None,k=5,visualize=False):
        """Find the ``k`` historical teams closest to the predicted stats of ``team``.

        The predicted team statistics are appended to the historical team
        statistics, the combined table is min-max normalized, and Euclidean
        distances are measured between the new row and
        ``self.normalized_team_stats``. The cluster assigned by ``self.knn``
        is printed.

        Args:
            team: Eight player names.
            minutes_selection_method: Forwarded to the sampling methods.
            stats_selection_method: Forwarded to the sampling methods.
            prime_window: Forwarded to the sampling methods.
            k: Number of nearest teams to return.
            visualize: When truthy, also plot the roster via
                ``visualize_new_roster``.

        Returns:
            The ``TEAM``, ``YEAR`` and ``WIN%`` columns of the ``k`` nearest
            teams, closest first.
        """
        predicted_team_stats = self.predict_team_stats(team, minutes_selection_method, stats_selection_method, prime_window)

        # DataFrame.append was removed in pandas 2.0; concat with a one-row
        # frame is the supported equivalent.
        combined_stats = pd.concat([self.team_stats, predicted_team_stats.to_frame().T], ignore_index=True)

        # Normalize once and reuse for the distances, the cluster and the plot.
        normalized_combined = self.normalize_team_stats(combined_stats)
        normalized_predicted_team_stats = normalized_combined.iloc[-1]

        euclidean_distances = self.normalized_team_stats.apply(lambda row: distance.euclidean(row, normalized_predicted_team_stats), axis=1)

        k_nearest = euclidean_distances.sort_values().head(k).index.to_numpy()

        cluster = self.knn.predict(np.asarray(normalized_predicted_team_stats).reshape(1, -1))[0]
        print("Cluster: {}".format(cluster))

        if visualize:
            self.visualize_new_roster(normalized_combined, predicted_team_stats, cluster=cluster)

        # normalize_team_stats returns a default integer index, so k_nearest
        # holds row positions; select by position rather than by label.
        return self.teams.iloc[k_nearest][['TEAM', 'YEAR', 'WIN%']]
    
    
    def visualize_new_roster(self,normalized_stats,predicted_stats,cluster):
        """Plot the historical teams plus one theoretical roster in PCA space.

        Two scatter plots are shown: one colored by real vs. theoretical,
        one colored by cluster with the marker symbol marking real vs.
        theoretical.

        Args:
            normalized_stats: Normalized statistics to project; assumed to
                hold one row per team in ``self.teams`` followed by a final
                row for the new roster, as passed by ``k_nearest_neighbors``.
            predicted_stats: Predicted statistics of the new roster, indexed
                by column name.
            cluster: Cluster label to attach to the new roster.

        Returns:
            The plotted DataFrame without the ``PC1``/``PC2`` columns.
        """
        # Build the extra row: predicted values where available, otherwise a
        # neutral filler matching the column's dtype.
        new_team = {}
        for column in self.teams.columns:
            if column == 'TEAM':
                new_team[column] = 'Theoretical Roster'
                continue
            try:
                new_team[column] = predicted_stats[column]
            except KeyError:
                if pd.api.types.is_numeric_dtype(self.teams[column]):
                    new_team[column] = 0
                else:
                    new_team[column] = ""

        # DataFrame.append was removed in pandas 2.0; concat builds a new
        # frame and leaves self.teams untouched.
        teams_with_new = pd.concat([self.teams, pd.DataFrame([new_team])], ignore_index=True)

        reducer = PCA(n_components=2)
        components = reducer.fit_transform(normalized_stats)
        projected = pd.DataFrame(data=components, columns=["PC1", "PC2"])

        projected['TEAM'] = teams_with_new['TEAM'] + " " + teams_with_new['YEAR']
        projected['Clusters'] = np.concatenate((self.knn.labels_, np.array([cluster])), axis=0)

        # Object dtype makes plotly treat the cluster ids as discrete categories.
        projected['Clusters'] = projected['Clusters'].astype('O')

        projected = pd.concat([projected, teams_with_new.drop(['TEAM', 'YEAR', 'GP', 'W', 'L', 'MIN'], axis=1)], axis=1)

        projected['IS_REAL'] = projected['TEAM'].map(lambda name: 'Theoretical' not in name)

        fig = px.scatter(projected, x='PC1', y='PC2', color='IS_REAL', hover_data=['WIN%'])
        fig.show()
        fig = px.scatter(projected, x='PC1', y='PC2', symbol='IS_REAL', color='Clusters', hover_data=['WIN%'])
        fig.show()

        return projected.drop(['PC1', 'PC2'], axis=1)
    
    
    
    
    





In [357]:
# Build the analysis object; the constructor reads its CSV and pickle inputs
# from the current working directory.
nba_analysis=NBA_Roster_Analysis()



In [359]:
# Example roster: team_stats_sampling requires exactly eight players, and
# minutes are assigned in this order.
team = [
    'Kevin Durant',
    'LeBron James',
    'Kyle Kuzma',
    'Dwight Howard',
    'David West',
    'Carmelo Anthony',
    'Draymond Green',
    'Lonzo Ball',
]

In [360]:
# Print the roster's cluster, show the PCA plots, and list the five (default k)
# historical teams nearest to the roster's predicted statistics.
nba_analysis.k_nearest_neighbors(team,minutes_selection_method='average',stats_selection_method='best',visualize=True)

Cluster: 2


Unnamed: 0,TEAM,YEAR,WIN%
495,Oklahoma City Thunder,2015-16,0.671
247,Los Angeles Lakers,2007-08,0.695
425,Houston Rockets,2013-14,0.659
587,Philadelphia 76ers,2018-19,0.622
512,Denver Nuggets,2016-17,0.488


In [361]:
# Cluster index taken from the models/CLUSTER.h5 model's highest output.
nba_analysis.predict_roster_cluster(team,minutes_selection_method='average',stats_selection_method='best')

3

In [362]:
# Predicted team-level statistics for the roster, one model per statistic.
nba_analysis.predict_team_stats(team,minutes_selection_method='average',stats_selection_method='best')

WIN%             0.640847
PTS            110.781120
FGM             39.017609
FGA             86.677368
FG%             47.599209
3PM              7.871988
3PA             23.394127
3P%             36.015114
FTM             21.931206
FTA             31.286158
FT%             76.375916
OREB            12.112515
DREB            34.336731
REB             46.511227
AST             24.267252
TOV             15.364017
STL              6.909541
BLK              5.787009
BLKA             5.292335
PF              21.582930
PFD             21.995998
+/-              4.335916
OFFRTG         111.961197
DEFRTG         107.352562
NETRTG           6.262749
AST%            60.307384
AST/TO           1.495610
AST\nRATIO      17.763536
OREB%           33.897758
DREB%           74.155594
REB%            53.806049
TOV%            15.693519
EFG%            52.551853
TS%             56.609928
PACE           100.046974
PIE             55.606682
POSS          8458.561523
dtype: float32

In [366]:

# Tabulate the minutes-adjusted stat line sampled for each player on the roster.
pd.DataFrame(nba_analysis.team_stats_sampling(team,minutes_selection_method='average',stats_selection_method='best'),columns=['PLAYER']+nba_analysis.player_stats_required)

Unnamed: 0,PLAYER,POS,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,...,OREB%,DREB%,REB%,TO RATIO,EFG%,TS%,USG%,PACE,PIE,POSS
0,Kevin Durant,PF,36.595675,30.417184,9.980639,19.77117,50.3,2.281289,5.798276,39.1,...,2.0,16.4,9.6,10.3,56.0,63.5,32.2,97.54,19.581062,6028.305694
1,LeBron James,SF,34.399654,25.913798,8.850839,18.157908,48.9,1.459932,4.288551,34.4,...,3.7,16.8,10.4,8.7,53.0,59.1,33.0,90.44,20.621543,5263.968271
2,Kyle Kuzma,SF,31.934083,16.478806,6.243523,13.817632,45.0,2.149409,5.731758,36.6,...,3.5,15.4,9.5,10.0,52.7,54.9,21.9,101.81,10.644694,5207.712004
3,Dwight Howard,C,29.616609,18.037775,6.222639,10.554855,59.3,0.0,0.078768,0.0,...,11.3,26.8,19.3,15.1,59.3,61.6,27.2,91.89,14.493234,4434.614592
4,David West,F,27.221626,13.936821,5.623629,11.247259,49.8,0.081502,0.244506,21.1,...,6.2,16.7,11.6,10.3,50.0,54.5,24.3,91.95,12.306783,3806.137569
5,Carmelo Anthony,PF,24.622837,17.433223,6.107991,13.552104,45.2,1.399748,3.435745,40.2,...,4.9,17.1,10.8,8.6,50.3,56.1,31.8,91.35,10.307234,3619.620718
6,Draymond Green,PF,22.170588,8.944906,3.194609,6.453111,49.0,0.766706,2.04455,38.8,...,5.0,19.8,13.1,14.2,55.1,58.7,18.4,102.13,8.689337,3816.919139
7,Lonzo Ball,PG,19.970588,7.341213,2.737402,6.78129,40.3,1.430914,3.919461,37.5,...,3.2,14.2,8.7,14.3,51.1,51.7,18.1,104.82,6.283581,2752.95492
