<a href="https://colab.research.google.com/github/COGS118A/Group014-Wi23/blob/main/KNN_Model_118A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_fscore_support, make_scorer

In [2]:
url = "https://raw.githubusercontent.com/COGS118A/Group014-Wi23/main/nba_api_merged_injuries"
data = pd.read_csv(url)

In [3]:
data.loc[:,["SEVERE_INJURY", "MINOR_INJURY"]] = data[["MINOR_INJURY", "SEVERE_INJURY"]].fillna(False) #replaces NAs with falses, because 


In [4]:
cols_list = ['AGE', 'GP', 'W_PCT', 'MIN', 'E_OFF_RATING', 'E_DEF_RATING', 'AST_PCT',
       'AST_TO', 'AST_RATIO', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'E_TOV_PCT',
       'USG_PCT', 'E_USG_PCT', 'PACE_PER40', 'PIE', 'POSS', 'FGA_PG',
       'FG_PCT'] # importing the columns that we went through for manual feature selection

In [5]:
data.head()

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,sp_work_PACE_RANK,PIE_RANK,FGM_RANK,FGA_RANK,FGM_PG_RANK,FGA_PG_RANK,FG_PCT_RANK,SEASON_YEAR,SEVERE_INJURY,MINOR_INJURY
0,201985,AJ Price,AJ,1610612754,IND,24.0,50,22,28,0.44,...,153,278,270,240,253,210,408,2010,True,False
1,201166,Aaron Brooks,Aaron,1610612756,PHX,26.0,59,26,33,0.441,...,133,259,179,143,151,109,390,2010,True,False
2,201189,Aaron Gray,Aaron,1610612740,NOH,26.0,41,21,20,0.512,...,420,332,338,361,347,378,23,2010,True,False
3,201151,Acie Law,Acie,1610612744,GSW,26.0,51,20,31,0.392,...,100,307,305,304,326,326,264,2010,True,False
4,1733,Al Harrington,Al,1610612743,DEN,31.0,73,45,28,0.616,...,50,258,130,112,141,119,319,2010,True,False


In [6]:
X = data.loc[:,cols_list]
y = data.iloc[:,-2:]
y['INJURY'] = (y['SEVERE_INJURY'] | y['MINOR_INJURY'])
y = y[['INJURY']]
y = np.ravel(y.values)
X = X.values

y

array([ True,  True,  True, ..., False, False, False])

In [7]:
sum(y)/len(y) # sanity check to make sure there wasn't a logic error

0.5613529500985894

In [8]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42) # as per TA feedback, we implement Group Shuffle Split to ensure each
# player is either all in the train set or all in the test set, even if we have multiple years of data for them. 
train_idx, test_idx = next(gss.split(X=X, y=y, groups=data['PLAYER_NAME']))
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

In [15]:
def fbeta_score(y_true, y_pred, beta = 2):
    precision, recall, fbeta, _ = precision_recall_fscore_support(y_true, y_pred, beta=beta, average="binary") #error function for our F beta which weights recall more than precision
    return precision, recall, fbeta
    
def fbeta_score_gridsearchcv(y_true, y_pred, beta = 2):
    precision, recall, fbeta, _ = precision_recall_fscore_support(y_true, y_pred, beta=beta, average="binary") #slight modification of error function for the gridsearch
    return fbeta

In [10]:
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),   # Step 1: Preprocessing with StandardScaler
    ('knn', KNeighborsClassifier()) # Step 2: KNN classification
])

In [11]:
np.sqrt(X_train.shape[0])

72.42237223399962

In [12]:
param_grid = {'knn__n_neighbors': range(1, 73, 2)}

In [13]:
grid_search = GridSearchCV(estimator=knn_pipeline, param_grid=param_grid, scoring=make_scorer(fbeta_score_gridsearchcv), cv=5)

In [17]:
grid_search.fit(X_train, y_train)

best_k = grid_search.best_params_['knn__n_neighbors']
precision, recall, fbeta = fbeta_score(y_test, grid_search.predict(X_test), 2)
print(f"The best value of k is {best_k}.")
print(f"The precision is {precision}, the recall is {recall}, and the fbeta with beta=2 is {fbeta}")

Traceback (most recent call last):
  File "C:\Users\kyra-\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\kyra-\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\kyra-\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
TypeError: fbeta_score_gridsearchcv() missing 1 required positional argument: 'beta'

Traceback (most recent call last):
  File "C:\Users\kyra-\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\kyra-\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\

The best value of k is 1.
The precision is 0.6514131897711979, the recall is 0.622107969151671, and the fbeta with beta=2 is 0.6277561608300908


Traceback (most recent call last):
  File "C:\Users\kyra-\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\kyra-\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\kyra-\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
TypeError: fbeta_score_gridsearchcv() missing 1 required positional argument: 'beta'

 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


Unfortunately, our effort at manual feature selection actually resulted in a lower F-beta score than the original/preliminary result model. Realistically, with 80 features, we have 2^80 possibilities for the number of possible combinations of features, so a grid search for feature selection will never work. We could try a random search, but I think a better choice is to use a Random Forest model to do the feature selection for us .