In [1]:
import pandas as pd
from tabulate import tabulate
from scipy.stats import ttest_ind
import warnings
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
pd.set_option('display.max_columns', None)



In [2]:
play2024_df = pd.read_csv("data/play_by_play_2024.csv")
injury2024_df = pd.read_csv("data/injuries_2024.csv")

  play2024_df = pd.read_csv("data/play_by_play_2024.csv")


In [3]:
def first_last_a(name):
    parts = name.split(" ", 1)
    return f"{parts[0][0]}.{parts[1]}"

In [4]:
def first_last_b(name):
    parts = name.split(" ", 1)
    return f"{parts[0][0:2]}.{parts[1]}"

In [5]:
warnings.filterwarnings('ignore')

def merge_play_injury_dfs(play_df, injury_df):

    # preprocessing
    injury_df["date"] = pd.to_datetime(injury_df['date_modified'])
    play_df["date"] = pd.to_datetime(play_df['game_date'])
    injury_df['date'] = injury_df['date'].dt.tz_localize(None)
    play_df['date'] = play_df['date'].dt.tz_localize(None)

    # filtering
    plays_with_injuries = play_df[play_df['desc'].str.contains("was injured", na=False)]
    pattern = r'(\w+\.(?:\w|-|\.|\')+(?: \w+)*) was injured'
    # Extract the injured player's name from the desc column
    injured_players = plays_with_injuries.loc[:, "desc"].str.extract(pattern)

    # concatenation
    plays_with_injuries = pd.concat([plays_with_injuries, injured_players], axis=1)
    plays_with_injuries.rename(columns={0: "injured_player"}, inplace=True)
    plays_with_injuries = plays_with_injuries.reset_index(drop=True)
    
    # merging
    injuries = []
    for (week, team), group_injury_df in injury_df.groupby(['week', 'team']):
        group_play_df = plays_with_injuries[(plays_with_injuries['week'] == week) & ((plays_with_injuries['home_team'] == team) | (plays_with_injuries['away_team'] == team))]

        group_injury_df = group_injury_df[group_injury_df.date >= group_play_df.date.max()]

        group_injury_df["first_type"] = group_injury_df['full_name'].apply(first_last_a)
        group_injury_df["second_type"] = group_injury_df['full_name'].apply(first_last_b)

        x = pd.merge(group_play_df, group_injury_df, left_on="injured_player", right_on="first_type", how="inner")
        y = pd.merge(group_play_df, group_injury_df, left_on="injured_player", right_on="second_type", how="inner")

        injuries.append(pd.concat([x, y], axis = 0, ignore_index=True))

    plays_with_injuries_and_injury_record = (pd.concat(injuries, axis=0, ignore_index=True)).drop(columns=["first_type", "second_type"])
    plays_with_injuries_and_injury_record = plays_with_injuries_and_injury_record.sort_values('play_id', ascending=False).drop_duplicates(subset=['week_x', 'full_name', "team"], keep='first')

    return plays_with_injuries, plays_with_injuries_and_injury_record
#returns (plays where injuries occurred, plays were injuries occurred and missed time)



In [6]:
def populate_cols_in_play_df(play_df, plays_with_injuries, plays_with_injuries_and_injury_record):
    columns_to_check = ['play_id', 'game_id']
    play_df["was_injured"] = 0
    play_df["missed_time"] = 0
    play_df.loc[play_df[columns_to_check].apply(tuple, 1).isin(plays_with_injuries[columns_to_check].apply(tuple, 1)), 'was_injured'] = 1
    play_df.loc[play_df[columns_to_check].apply(tuple, 1).isin(plays_with_injuries_and_injury_record[columns_to_check].apply(tuple, 1)), 'missed_time'] = 1

    return play_df

In [7]:
plays_with_injuries_2024, plays_with_injuries_and_injury_record_2024 = merge_play_injury_dfs(play2024_df, injury2024_df)


In [8]:
# Print number of rows for each returned table
print("Rows in plays_with_injuries_2024:", len(plays_with_injuries_2024))
print("Rows in plays_with_injuries_and_injury_record_2024:", len(plays_with_injuries_and_injury_record_2024))


Rows in plays_with_injuries_2024: 457
Rows in plays_with_injuries_and_injury_record_2024: 185


In [9]:
play2024_df = populate_cols_in_play_df(play2024_df, plays_with_injuries_2024, plays_with_injuries_and_injury_record_2024)

### MODELING 

In [10]:
# dropping meta data columns
play2024_df = play2024_df.drop(columns=["play_id","game_id","old_game_id"])

In [11]:
numeric_columns = play2024_df.select_dtypes(include=['number']).columns

In [12]:
play2024_df = play2024_df.fillna("missing")

In [13]:
# converting categorical data into numeric

categorical_cols = play2024_df.select_dtypes(include=['object']).columns.tolist()
label_encoder = LabelEncoder()

for col in categorical_cols:
    if play2024_df[col].dtype == 'object':
        play2024_df[col] = label_encoder.fit_transform(play2024_df[col].astype(str))

play2024_df['date'] = pd.to_datetime(play2024_df['date'])

play2024_df['game_month'] = play2024_df['date'].dt.month
play2024_df['game_day'] = play2024_df['date'].dt.day

# Drop the original 'game_date' column if not needed
play2024_df = play2024_df.drop(columns=['date'])

In [14]:
# dimensionality reduction

# Apply PCA to reduce dimensions
pca = PCA(n_components=0.95, svd_solver="full") 
# Separate features (X) and target (y)
X = play2024_df.drop(columns=['was_injured', "missed_time"])  # Drop the target column
y = play2024_df['was_injured']  # Target column
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Check explained variance to understand how much information is retained
print(f"Explained Variance Ratio: {pca.explained_variance_ratio_}")
print(f"Total Explained Variance: {sum(pca.explained_variance_ratio_)}")

components_df = pd.DataFrame(pca.components_, columns=X.columns)

# Print out the components dataframe
print("PCA Component Contributions:")
print(components_df)



Explained Variance Ratio: [0.19002688 0.15583773 0.13990312 0.10048059 0.0830114  0.05997082
 0.05451246 0.04865525 0.0331084  0.02957479 0.02040134 0.01655902
 0.01565574 0.01150382]
Total Explained Variance: 0.9592013814699233
PCA Component Contributions:
       home_team     away_team   season_type      week   posteam  \
0  -3.998781e-06 -7.429437e-07 -0.000000e+00  0.000039  0.000056   
1   7.408963e-06 -2.063335e-06 -5.551115e-17 -0.000020  0.000002   
2  -1.311545e-05  8.914307e-06  6.661338e-16 -0.000187 -0.000126   
3   2.489114e-06 -1.326907e-05  2.706169e-16 -0.000186  0.000096   
4  -3.492052e-05  4.033114e-05 -2.775558e-16  0.000040 -0.000017   
5  -1.387533e-05  1.674427e-05 -5.898060e-16 -0.000057  0.000114   
6  -1.406475e-05  1.753410e-05 -5.568462e-16 -0.000034  0.000022   
7   1.708387e-05 -2.231561e-06  5.288738e-16  0.000012 -0.000032   
8  -1.591382e-05 -2.784620e-06 -1.110223e-16  0.000082 -0.000206   
9  -4.103848e-07 -7.438283e-06 -4.542807e-17 -0.000011 -0.0000

In [15]:
# Calculate scale_pos_weight (Ratio of class 0 to class 1)
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
print(f"Calculated scale_pos_weight: {scale_pos_weight}")

# Initialize XGBoost model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos_weight)


# Fit the model on training data
model.fit(X_train_pca, y_train)

# Predict on the test set
y_pred = model.predict(X_test_pca)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

Calculated scale_pos_weight: 57.91693290734824


Accuracy: 0.98
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      7760
           1       0.04      0.01      0.02       144

    accuracy                           0.98      7904
   macro avg       0.51      0.50      0.50      7904
weighted avg       0.96      0.98      0.97      7904

