In [62]:
import pandas as pd
import requests
import numpy as np
import unidecode
from sklearn.linear_model import LinearRegression
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns
from pathlib import Path


In [3]:

class GraphqlQuery:
    def __init__(self, contents : dict[str : list[str]], url : str):
        self._contents = {}
        self.add(contents)
        self.url = url
    
    #method to add further tables and/or columns to the current query
    def add(self, new_contents : dict[str : list[str]]):
        
        #validate passed arguments
        if not isinstance(new_contents, dict):
            raise ValueError("Contents must be passed as dictionary")
        
        for key, val in new_contents.items() :
            if not isinstance(key, str):
                raise ValueError("Table arguments must be passed as strings")
            if not isinstance(val, list):
                raise ValueError("Set of columns must be a list")
            if not all(isinstance(column, str) for column in val):
                raise ValueError("Column arguments must be passed as strings")
            
        for table, columns in new_contents.items():
            if table not in self._contents:
                self._contents[table] = columns
            else:
                self._contents[table].extend(columns)

    #property to store current contents od query in dict format
    @property
    def contents(self):
        return self._contents
    
    #property to store finished query in graphql format
    @property
    def query(self):
        query_str =  ["query Myquery {"]
        for index, (table, columns) in enumerate(self._contents.items()):
            subquery = []
            subquery.append(table)
            subquery.append('{')
            subquery.extend(columns)
            query_str.extend(subquery)
            if index != 0: query_str +="}"
        query_str +="}}"
        
        return " ".join(query_str)
    
    def __str__(self):
        return self._query
     
    @classmethod
    def select(cls, contents : dict[str : list[str]]):
        return cls(contents)

    #method to perform get request to graphql api
    def get_data(self):
        body = {"query" : self.query}
        response = requests.post(self.url, json=body)
        if response.status_code == 200:
            data = response.json()
        else:
            with Exception as e:
                print(f"Error fetching data: {e}")
    
        return pd.json_normalize(data['data'][list(self._contents)[0]])


In [4]:
url = 'https://nbaapi.com/graphql/'

#set down tables and columns to query from nbaapi
query_teams = {
    'team' :
    ['teamName','teamAbbr', 'season', 'wins']
    }

query_players = {
    'playerPerGame' : 
    ['playerName', 'team', 'age', 'season', 'games', 'minutesPg', 'points', 'ftPercent', 'assists', 'totalRb', 'steals', 'turnovers', 'blocks']
    }

#define Graphqlquery objects
player_data = GraphqlQuery(query_players, url)
team_data = GraphqlQuery(query_teams, url)

#get data from the api and store it in respective dataframes
player_stats = player_data.get_data()
team_stats = team_data.get_data()

#replace "*" and handle string format of player names
team_stats.teamName = team_stats.teamName.str.replace('*', '')

player_stats.playerName = player_stats.playerName.str.replace('*', '')
player_stats.playerName = player_stats.playerName.apply(unidecode.unidecode)

player_stats = player_stats[player_stats.team != 'TOT']

#set down team abbreviations that need to be changed
team_names = {
    'BRK' : 'NJN',
    'NOP' : 'NOH',
    'CHO' : 'CHA',
    'SEA' : 'OKC',
    'NOK' : 'CHA',
    'CHH' : 'CHA',
    'VAN' : 'MEM',
    'WSB' : 'WAS'
}
player_stats = player_stats.replace({'team' : team_names})

#merge player and team statistics into one dataframe
player_stats = player_stats.merge(
    team_stats,
    how='inner',
    left_on=['season', 'team'],
    right_on=['season', 'teamAbbr']
).drop(['teamAbbr'], axis=1)

player_stats.head()

Unnamed: 0,playerName,team,age,season,games,minutesPg,points,ftPercent,assists,totalRb,steals,turnovers,blocks,teamName,wins
0,Precious Achiuwa,TOR,23,2023,55,20.7,9.2,0.702,0.9,6.0,0.6,1.1,0.5,Toronto Raptors,41
1,OG Anunoby,TOR,25,2023,67,35.6,16.8,0.838,2.0,5.0,1.9,2.0,0.7,Toronto Raptors,41
2,Dalano Banton,TOR,23,2023,31,9.0,4.6,0.708,1.2,1.5,0.4,0.6,0.4,Toronto Raptors,41
3,Scottie Barnes,TOR,21,2023,77,34.8,15.3,0.772,4.8,6.6,1.1,2.0,0.8,Toronto Raptors,41
4,Will Barton,TOR,32,2023,16,13.2,4.5,1.0,1.1,1.6,0.7,0.2,0.2,Toronto Raptors,41


In [5]:
#read mvp data and perform data clean up
current_folder = Path.cwd()

mvp_files = [f for f in current_folder.glob("*.csv") if "MVP Data" in f.name]

mvps = pd.concat(
    [pd.read_csv(file, sep=',', usecols=['Rank', 'Player', 'year', 'Tm'])
     for file in mvp_files]
)

mvps = mvps[(~mvps['Rank'].str.contains('T')) & mvps.Rank == 1]
mvps.Player = mvps.Player.apply(unidecode.unidecode)

#add data to our main dataframe
player_stats = player_stats.merge(
    mvps,
    how='left',
    left_on=['playerName','season'],
    right_on=['Player', 'year']
).drop(['Player', 'Tm','year'], axis=1)

#fill na fields with zero to prepare for forecasts 
player_stats.Rank = player_stats.Rank.fillna(0)
player_stats.ftPercent = player_stats.ftPercent.fillna(0)

#read all start data and again perform clean up
df_allstar = pd.read_csv('1980-2022_AllStar_Data.csv',
    sep=',',
    usecols=['first', 'last', 'team', 'year']
)

df_allstar['PlayerName'] = df_allstar['first'].astype(str) + ' ' + df_allstar['last'].astype(str)
#make seasons match
df_allstar['year'] = df_allstar['year'] + 1
df_allstar = df_allstar[['PlayerName', 'team', 'year']]
#mark players that made the All Star Team in preparation for forecast
df_allstar['AllStar'] = 1
df_allstar = df_allstar.replace({'team' : team_names})

#add data to main dataframe
player_stats = player_stats.merge(
    df_allstar,
    how='outer',
    left_on=['playerName','season', 'team'],
    right_on=['PlayerName', 'year', 'team']
).drop(['PlayerName','year'], axis=1)

player_stats = player_stats[~player_stats.season.isna()]
player_stats.AllStar = player_stats.AllStar.fillna(0)

columns = ['points', 'assists', 'turnovers', 'games', 'ftPercent', 'blocks', 'totalRb', 'wins']
player_stats[columns] = player_stats[columns].apply(pd.to_numeric, errors='coerce')

player_stats.head()

Unnamed: 0,playerName,team,age,season,games,minutesPg,points,ftPercent,assists,totalRb,steals,turnovers,blocks,teamName,wins,Rank,AllStar
0,Precious Achiuwa,TOR,23.0,2023.0,55.0,20.7,9.2,0.702,0.9,6.0,0.6,1.1,0.5,Toronto Raptors,41.0,0,0.0
1,OG Anunoby,TOR,25.0,2023.0,67.0,35.6,16.8,0.838,2.0,5.0,1.9,2.0,0.7,Toronto Raptors,41.0,0,0.0
2,Dalano Banton,TOR,23.0,2023.0,31.0,9.0,4.6,0.708,1.2,1.5,0.4,0.6,0.4,Toronto Raptors,41.0,0,0.0
3,Scottie Barnes,TOR,21.0,2023.0,77.0,34.8,15.3,0.772,4.8,6.6,1.1,2.0,0.8,Toronto Raptors,41.0,0,0.0
4,Will Barton,TOR,32.0,2023.0,16.0,13.2,4.5,1.0,1.1,1.6,0.7,0.2,0.2,Toronto Raptors,41.0,0,0.0


In [14]:
df_train = player_stats[player_stats.season.between(2000, 2015)]
df_test = player_stats[player_stats.season > 2015]

In [26]:
from xgboost import XGBRegressor, XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from numpy import absolute
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import RepeatedKFold

X = df_train[columns]
y_mvp = df_train.Rank.astype('int')
y_allstar = df_train.AllStar.astype('int')

model_xgb_mpv = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
# fit model
model_xgb_mpv.fit(X, y_mvp)

model_xgb_allstar = XGBClassifier()
model_xgb_allstar.fit(X, y_allstar)

#Train Logistic Regression classifier
model_log_mvp = LogisticRegression(solver='liblinear', C=1.0).fit(X, y_mvp)
model_log_allstar = LogisticRegression(solver='liblinear', C=1.0).fit(X, y_allstar)

# Train Decision Tree Classifer
model_clf_mvp = DecisionTreeClassifier().fit(X,y_mvp)
model_clf_allstar = DecisionTreeClassifier().fit(X,y_allstar)

# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# evaluate model
#scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

# # force scores to be positive
# scores = absolute(scores)

# y_pred = cross_val_predict(model, X, y, cv=5, n_jobs=-1)

In [27]:
cross_val_score(model_clf_allstar, X, y_allstar, scoring='f1', cv=5).mean()

0.5938072019911658

In [54]:
df_score = df_test.copy()

df_score['Log_Prediction'] = model_xgb_allstar.predict(df_score[columns])
df_score['XGB_Prediction'] = model_log_allstar.predict(df_score[columns])
df_score['Tree_Prediction'] = model_clf_allstar.predict(df_score[columns])

f1 = (
    df_score.groupby('season')
    .apply(lambda x: pd.Series({
        'F1_Log': f1_score(x['AllStar'], x['Log_Prediction']),
        'F1_XGB': f1_score(x['AllStar'], x['XGB_Prediction']),
        'F1_Tree': f1_score(x['AllStar'], x['Tree_Prediction'])
    }))
    .reset_index()
)

auc = (
    df_score.groupby('season')
    .apply(lambda x: pd.Series({
        'AUC_Log': roc_auc_score(x['AllStar'], x['Log_Prediction']),
        'AUC_XGB': roc_auc_score(x['AllStar'], x['XGB_Prediction']),
        'AUC_Tree': roc_auc_score(x['AllStar'], x['Tree_Prediction'])
    }))
    .reset_index()
)

In [69]:
def score_evolution_plot(df_data, column_list, label):

    x_axis = df_data.season

    fig = go.Figure(data=[
    go.Bar(name='Log Regression', x=x_axis, y=df_data[column_list[0]], marker_color='green'),
    go.Bar(name='XGB Classifier', x=x_axis, y=df_data[column_list[1]], marker_color='chocolate'),
    go.Bar(name='Decision Tree', x=x_axis, y=df_data[column_list[2]], marker_color='blue')
    
    ])

    fig.update_layout(barmode='group',
                      title=label)
    fig.show()

In [70]:
score_evolution_plot(f1, ['F1_Log', 'F1_XGB', 'F1_Tree'], 'F1 Score Evolution')
score_evolution_plot(auc, ['AUC_Log', 'AUC_XGB', 'AUC_Tree'], 'AUC Score Evolution')

In [67]:
# df = df_test[columns].copy()
df = df_test.copy()

df['Log_Prediction'] = model_xgb_allstar.predict_proba(df[columns])[:,1]
df['XGB_Prediction'] = model_log_allstar.predict_proba(df[columns])[:,1]
df['Tree_Prediction'] = model_clf_allstar.predict_proba(df[columns])[:,1]

df_long = df.melt(
    id_vars = ['season', 'playerName', 'AllStar'],
    value_vars = ['Log_Prediction', 'XGB_Prediction', 'Tree_Prediction'],
    var_name='Model',
    value_name='Prediction'
)

top_preds = (
    df_long
        .sort_values(by=['season', 'Model', 'Prediction'], ascending=[True, True, False])
        .groupby(by=['season', 'Model'])
        .head(24)
)

allstars = df_test.groupby(by='season')['AllStar'].sum()

correct = (
    top_preds
        .groupby(by=['season', 'Model'])['AllStar'].sum()
        .reset_index(name="Correct")
        .merge(allstars, on="season")
        .assign(pct=lambda x : x.Correct / x.AllStar)
        .pivot(index='season', columns='Model', values='pct')
        .reset_index()
)

correct

Model,season,Log_Prediction,Tree_Prediction,XGB_Prediction
0,2016.0,0.75,0.666667,0.791667
1,2017.0,0.75,0.625,0.75
2,2018.0,0.695652,0.565217,0.652174
3,2019.0,0.692308,0.423077,0.769231
4,2020.0,0.708333,0.625,0.75
5,2021.0,0.636364,0.636364,0.727273
6,2022.0,0.73913,0.391304,0.695652
7,2023.0,0.666667,0.333333,0.666667


In [72]:
score_evolution_plot(correct, ['Log_Prediction', 'XGB_Prediction', 'Tree_Prediction'], 'Accuracy Score Evolution')