In [1]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support, make_scorer
import numpy as np

In [2]:
dataset = pd.read_csv('nba_api_merged_injuries.csv')

In [3]:
features = dataset[['AGE', 'GP', 'W_PCT', 'MIN', 'E_OFF_RATING', 'E_DEF_RATING', 'AST_PCT',
       'AST_TO', 'AST_RATIO', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'E_TOV_PCT',
       'USG_PCT', 'E_USG_PCT', 'PACE_PER40', 'PIE', 'POSS', 'FGA_PG',
       'FG_PCT']]
features = features.values
print(features)

[[2.400e+01 5.000e+01 4.400e-01 ... 1.579e+03 6.400e+00 3.560e-01]
 [2.600e+01 5.900e+01 4.410e-01 ... 2.571e+03 9.900e+00 3.750e-01]
 [2.600e+01 4.100e+01 5.120e-01 ... 9.970e+02 2.400e+00 5.660e-01]
 ...
 [2.200e+01 4.400e+01 6.590e-01 ... 1.199e+03 3.500e+00 5.720e-01]
 [2.100e+01 3.100e+01 6.130e-01 ... 1.054e+03 5.300e+00 4.110e-01]
 [2.200e+01 2.900e+01 5.860e-01 ... 2.031e+03 1.620e+01 6.080e-01]]


In [4]:
labels = pd.read_csv('https://raw.githubusercontent.com/COGS118A/Group014-Wi23/main/nba_api_merged_injuries').iloc[:,-2:]
labels = labels.fillna(False)
labels['INJ'] = (labels['SEVERE_INJURY'] | labels['MINOR_INJURY'])
labels = labels[['INJ']]
labels = np.ravel(labels.values)
print(labels)

[ True  True  True ... False False False]


In [5]:
print(labels.shape)
print(features.shape)

(6593,)
(6593, 20)


In [6]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42) # as per TA feedback, we implement Group Shuffle Split to ensure each
# player is either all in the train set or all in the test set, even if we have multiple years of data for them. 
train_idx, test_idx = next(gss.split(X=features, y=labels, groups=dataset['PLAYER_NAME']))
X_train, y_train = features[train_idx], labels[train_idx]
X_test, y_test = features[test_idx], labels[test_idx]

In [7]:
def fbeta_score(y_true, y_pred):
    precision, recall, fbeta, _ = precision_recall_fscore_support(y_true, y_pred, beta=2, average="binary") #error function for our F beta which weights recall more than precision
    return precision, recall, fbeta
    
def fbeta_score_gridsearchcv(estimator, X, y):
    y_pred = estimator.predict(X)
    precision, recall, fbeta, _ = precision_recall_fscore_support(y, y_pred, beta=2, average="binary") #slight modification of error function for the gridsearch

    return {'precision': precision, 'recall': recall, 'fbeta' : fbeta}

In [8]:
SVC_pipeline = Pipeline([
    ('scaler', StandardScaler()),   # Step 1: Preprocessing with StandardScaler
    ('svm', SVC()) # Step 2: SVC classification
])

In [9]:
param_grid = {'svm__kernel': ('poly', 'rbf'),
              'svm__C' : (1,10),
              'svm__degree' : (2,3,7)
             }

In [18]:
grid_search = GridSearchCV(estimator=SVC_pipeline, param_grid=param_grid, scoring=fbeta_score_gridsearchcv, cv=5, verbose=3, refit='recall')

In [19]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END svm__C=1, svm__degree=2, svm__kernel=poly; fbeta: (test=0.847) precision: (test=0.568) recall: (test=0.966) total time=   0.6s
[CV 2/5] END svm__C=1, svm__degree=2, svm__kernel=poly; fbeta: (test=0.849) precision: (test=0.585) recall: (test=0.957) total time=   0.7s
[CV 3/5] END svm__C=1, svm__degree=2, svm__kernel=poly; fbeta: (test=0.838) precision: (test=0.579) recall: (test=0.944) total time=   0.7s
[CV 4/5] END svm__C=1, svm__degree=2, svm__kernel=poly; fbeta: (test=0.821) precision: (test=0.586) recall: (test=0.913) total time=   0.7s
[CV 5/5] END svm__C=1, svm__degree=2, svm__kernel=poly; fbeta: (test=0.812) precision: (test=0.590) recall: (test=0.896) total time=   0.7s
[CV 1/5] END svm__C=1, svm__degree=2, svm__kernel=rbf; fbeta: (test=0.838) precision: (test=0.575) recall: (test=0.947) total time=   0.7s
[CV 2/5] END svm__C=1, svm__degree=2, svm__kernel=rbf; fbeta: (test=0.871) precision: (test=0.661) r

In [20]:
best_params = grid_search.best_params_
print(f"The best params is {best_params}.")
best_est = grid_search.best_estimator_
print(f"The best estimator is {best_est}.")
best_score = grid_search.best_score_
print(f"The best score is {best_score}.")

precision, recall, fbeta = fbeta_score(y_test, grid_search.predict(X_test))
print(f"The precision is {precision}, the recall is {recall}, and the f-score {fbeta}")

The best params is {'svm__C': 1, 'svm__degree': 2, 'svm__kernel': 'poly'}.
The best estimator is Pipeline(steps=[('scaler', StandardScaler()),
                ('svm', SVC(C=1, degree=2, kernel='poly'))]).
The best score is 0.9349771689497718.
The precision is 0.5944849959448499, the recall is 0.9421593830334191, and the f-score 0.8434982738780208


# Note

If you want you can remove the last line: <br>
precision, recall, fbeta = fbeta_score(y_test, grid_search.predict(X_test))<br>
print(f"The precision is {precision}, the recall is {recall}, and the f-score {fbeta}")<br>

This is because I configured GridSearchCV to calculate the highest score with respect to the *accuracy* metric! Specifically the best_score_ attribute is defined as: Mean cross-validated score of the best_estimator.<br>
Thus the score of the SVM as calculated by GridSearchCV uses the highest average accuracy of cross-validation on the test set. <br>
<br>
In this case the best estimator is the SVM with a polynomial kernel of degree 2, with no regularization (ie C = 1). <br>