In [10]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [11]:
# Fetch the merged data set (NBA stats joined with injury flags) from the
# project's GitHub repository and parse it as CSV into a DataFrame.
url = "https://raw.githubusercontent.com/COGS118A/Group014-Wi23/main/nba_api_merged_injuries"
data1 = pd.read_csv(filepath_or_buffer=url)

In [12]:
# A notebook cell only renders its final expression, so a bare `data1.shape`
# on its own line is evaluated and then silently discarded. Print it
# explicitly so both the dimensions and the preview are visible.
print(data1.shape)

# Preview the first rows using the rich DataFrame display.
data1.head()

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,sp_work_PACE_RANK,PIE_RANK,FGM_RANK,FGA_RANK,FGM_PG_RANK,FGA_PG_RANK,FG_PCT_RANK,SEASON_YEAR,SEVERE_INJURY,MINOR_INJURY
0,201985,AJ Price,AJ,1610612754,IND,24.0,50,22,28,0.44,...,153,278,270,240,253,210,408,2010,True,False
1,201166,Aaron Brooks,Aaron,1610612756,PHX,26.0,59,26,33,0.441,...,133,259,179,143,151,109,390,2010,True,False
2,201189,Aaron Gray,Aaron,1610612740,NOH,26.0,41,21,20,0.512,...,420,332,338,361,347,378,23,2010,True,False
3,201151,Acie Law,Acie,1610612744,GSW,26.0,51,20,31,0.392,...,100,307,305,304,326,326,264,2010,True,False
4,1733,Al Harrington,Al,1610612743,DEN,31.0,73,45,28,0.616,...,50,258,130,112,141,119,319,2010,True,False


In [13]:
# Print the column Index of data1 to see which fields are available.
print(data1.columns)

Index(['PLAYER_ID', 'PLAYER_NAME', 'NICKNAME', 'TEAM_ID', 'TEAM_ABBREVIATION',
       'AGE', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'E_OFF_RATING', 'OFF_RATING',
       'sp_work_OFF_RATING', 'E_DEF_RATING', 'DEF_RATING',
       'sp_work_DEF_RATING', 'E_NET_RATING', 'NET_RATING',
       'sp_work_NET_RATING', 'AST_PCT', 'AST_TO', 'AST_RATIO', 'OREB_PCT',
       'DREB_PCT', 'REB_PCT', 'TM_TOV_PCT', 'E_TOV_PCT', 'EFG_PCT', 'TS_PCT',
       'USG_PCT', 'E_USG_PCT', 'E_PACE', 'PACE', 'PACE_PER40', 'sp_work_PACE',
       'PIE', 'POSS', 'FGM', 'FGA', 'FGM_PG', 'FGA_PG', 'FG_PCT', 'GP_RANK',
       'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK', 'E_OFF_RATING_RANK',
       'OFF_RATING_RANK', 'sp_work_OFF_RATING_RANK', 'E_DEF_RATING_RANK',
       'DEF_RATING_RANK', 'sp_work_DEF_RATING_RANK', 'E_NET_RATING_RANK',
       'NET_RATING_RANK', 'sp_work_NET_RATING_RANK', 'AST_PCT_RANK',
       'AST_TO_RANK', 'AST_RATIO_RANK', 'OREB_PCT_RANK', 'DREB_PCT_RANK',
       'REB_PCT_RANK', 'TM_TOV_PCT_RANK', 'E_TOV_PCT_

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Binary target: a row counts as injured when either injury indicator is True.
# This writes the INJURY column onto data1 itself.
data1['INJURY'] = data1['MINOR_INJURY'].eq(True) | data1['SEVERE_INJURY'].eq(True)

# Feature matrix (a fixed subset of the stat columns) and target vector.
X = data1[[
    'AGE', 'GP', 'W_PCT', 'MIN', 'E_OFF_RATING', 'E_DEF_RATING', 'AST_PCT',
    'AST_TO', 'AST_RATIO', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'E_TOV_PCT',
    'USG_PCT', 'E_USG_PCT', 'PACE_PER40', 'PIE', 'POSS', 'FGA_PG',
    'FG_PCT',
]]
y = data1['INJURY']

# One random hold-out split with 20% of the rows reserved for testing.
# Only a single split is requested, so the first (and only) pair of index
# arrays is taken directly from the generator.
# NOTE(review): the split is purely row-wise; if the same PLAYER_ID appears in
# several seasons, that player can land in both train and test -- confirm
# whether a group-aware split is wanted.
shuffle_split = ShuffleSplit(n_splits=1, test_size=0.2, random_state=142)
train_index, test_index = next(shuffle_split.split(X))
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

# Build a 100-tree random forest and fit it on the training rows
# (`fit` returns the estimator, so the fitted model is bound to `clf`).
clf = RandomForestClassifier(n_estimators=100, random_state=142).fit(X_train, y_train)

# Predict the injury flag for the held-out rows.
y_pred = clf.predict(X_test)

# Report hold-out metrics; each heading is followed by its value on a new line.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Accuracy Score:", accuracy_score(y_test, y_pred), sep="\n")

Classification Report:
              precision    recall  f1-score   support

       False       0.73      0.66      0.69       573
        True       0.76      0.81      0.78       746

    accuracy                           0.74      1319
   macro avg       0.74      0.73      0.74      1319
weighted avg       0.74      0.74      0.74      1319

Confusion Matrix:
[[378 195]
 [143 603]]
Accuracy Score:
0.7437452615617892


In [33]:
from sklearn.metrics import fbeta_score

# F-beta on the hold-out predictions, computed with two averaging modes:
# 'weighted' (per-class scores weighted by support) and 'binary'
# (score of the positive class only). With beta = 1 this is the F1 score.
beta = 1
fbeta_w, fbeta_b = (
    fbeta_score(y_test, y_pred, beta=beta, average=averaging)
    for averaging in ('weighted', 'binary')
)

# Each heading is printed on its own line, followed by the score.
print("F-beta Score weighted:", fbeta_w, sep="\n")
print("F-beta Score binary:", fbeta_b, sep="\n")

F-beta Score weighted:
0.7419702828509411
F-beta Score binary:
0.7810880829015543
