In [56]:
import pandas as pd
from tabulate import tabulate
from scipy.stats import ttest_ind
import warnings
pd.set_option('display.max_columns', None)

In [57]:
play2024_df = pd.read_csv("data/play_by_play_2024.csv")
injury2024_df = pd.read_csv("data/injuries_2024.csv")

In [58]:
def first_last_a(name):
    parts = name.split(" ", 1)
    return f"{parts[0][0]}.{parts[1]}"

In [59]:
def first_last_b(name):
    parts = name.split(" ", 1)
    return f"{parts[0][0:2]}.{parts[1]}"

In [60]:
warnings.filterwarnings('ignore')

def merge_play_injury_dfs(play_df, injury_df):

    # preprocessing
    injury_df["date"] = pd.to_datetime(injury_df['date_modified'])
    play_df["date"] = pd.to_datetime(play_df['game_date'])
    injury_df['date'] = injury_df['date'].dt.tz_localize(None)
    play_df['date'] = play_df['date'].dt.tz_localize(None)

    # filtering
    plays_with_injuries = play_df[play_df['desc'].str.contains("was injured", na=False)]
    pattern = r'(\w+\.(?:\w|-|\.|\')+(?: \w+)*) was injured'
    # Extract the injured player's name from the desc column
    injured_players = plays_with_injuries.loc[:, "desc"].str.extract(pattern)

    # concatenation
    plays_with_injuries = pd.concat([plays_with_injuries, injured_players], axis=1)
    plays_with_injuries.rename(columns={0: "injured_player"}, inplace=True)
    plays_with_injuries = plays_with_injuries.reset_index(drop=True)
    
    # merging
    injuries = []
    for (week, team), group_injury_df in injury_df.groupby(['week', 'team']):
        group_play_df = plays_with_injuries[(plays_with_injuries['week'] == week) & ((plays_with_injuries['home_team'] == team) | (plays_with_injuries['away_team'] == team))]

        group_injury_df = group_injury_df[group_injury_df.date >= group_play_df.date.max()]

        group_injury_df["first_type"] = group_injury_df['full_name'].apply(first_last_a)
        group_injury_df["second_type"] = group_injury_df['full_name'].apply(first_last_b)

        x = pd.merge(group_play_df, group_injury_df, left_on="injured_player", right_on="first_type", how="inner")
        y = pd.merge(group_play_df, group_injury_df, left_on="injured_player", right_on="second_type", how="inner")

        injuries.append(pd.concat([x, y], axis = 0, ignore_index=True))

    plays_with_injuries_and_injury_record = (pd.concat(injuries, axis=0, ignore_index=True)).drop(columns=["first_type", "second_type"])
    plays_with_injuries_and_injury_record = plays_with_injuries_and_injury_record.sort_values('play_id', ascending=False).drop_duplicates(subset=['week_x', 'full_name', "team"], keep='first')

    return plays_with_injuries, plays_with_injuries_and_injury_record
#returns (plays where injuries occurred, plays were injuries occurred and missed time)



In [61]:
def populate_cols_in_play_df(play_df, plays_with_injuries, plays_with_injuries_and_injury_record):
    columns_to_check = ['play_id', 'game_id']
    play_df["was_injured"] = 0
    play_df["missed_time"] = 0
    play_df.loc[play_df[columns_to_check].apply(tuple, 1).isin(plays_with_injuries[columns_to_check].apply(tuple, 1)), 'was_injured'] = 1
    play_df.loc[play_df[columns_to_check].apply(tuple, 1).isin(plays_with_injuries_and_injury_record[columns_to_check].apply(tuple, 1)), 'missed_time'] = 1

    return play_df

In [62]:
plays_with_injuries_2024, plays_with_injuries_and_injury_record_2024 = merge_play_injury_dfs(play2024_df, injury2024_df)


In [63]:
# Print number of rows for each returned table
print("Rows in plays_with_injuries_2024:", len(plays_with_injuries_2024))
print("Rows in plays_with_injuries_and_injury_record_2024:", len(plays_with_injuries_and_injury_record_2024))


Rows in plays_with_injuries_2024: 457
Rows in plays_with_injuries_and_injury_record_2024: 185


In [64]:
play2024_df = populate_cols_in_play_df(play2024_df, plays_with_injuries_2024, plays_with_injuries_and_injury_record_2024)
print(len(play2024_df))
print(len(play2024_df.columns.tolist()))
print(play2024_df.columns.tolist())


26345
375
['play_id', 'game_id', 'old_game_id', 'home_team', 'away_team', 'season_type', 'week', 'posteam', 'posteam_type', 'defteam', 'side_of_field', 'yardline_100', 'game_date', 'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining', 'game_half', 'quarter_end', 'drive', 'sp', 'qtr', 'down', 'goal_to_go', 'time', 'yrdln', 'ydstogo', 'ydsnet', 'desc', 'play_type', 'yards_gained', 'shotgun', 'no_huddle', 'qb_dropback', 'qb_kneel', 'qb_spike', 'qb_scramble', 'pass_length', 'pass_location', 'air_yards', 'yards_after_catch', 'run_location', 'run_gap', 'field_goal_result', 'kick_distance', 'extra_point_result', 'two_point_conv_result', 'home_timeouts_remaining', 'away_timeouts_remaining', 'timeout', 'timeout_team', 'td_team', 'td_player_name', 'td_player_id', 'posteam_timeouts_remaining', 'defteam_timeouts_remaining', 'total_home_score', 'total_away_score', 'posteam_score', 'defteam_score', 'score_differential', 'posteam_score_post', 'defteam_score_post', 'score_di

### MODELING 

In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE

In [66]:
# Drop unnecessary columns from X
X = play2024_df.drop(columns=[
    'was_injured', 'missed_time', 'play_id', 'game_id', 'game_date', 'desc', 
    'td_player_name', 'passer_player_name', 'rusher_player_name', 'receiver_player_name', 
    'nfl_api_id', 'fantasy_player_name', 'fantasy_player_id', 'passer_jersey_number', 
    'rusher_jersey_number', 'receiver_jersey_number', 'jersey_number'
])

# Drop all columns with datetime64 data type from X
X = X.select_dtypes(exclude=['datetime64'])

# Define target variables
y_injury = play2024_df['was_injured']

In [67]:
print(play2024_df['was_injured'].value_counts(normalize=True))


was_injured
0    0.982653
1    0.017347
Name: proportion, dtype: float64


In [68]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import lightgbm as lgb
from collections import Counter
import time


# Step 1: Handle Categorical and Missing Values
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing Pipeline
numeric_transformer = SimpleImputer(strategy='mean')  # Replace NaN with column mean
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Replace NaN with mode
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)

# Apply Preprocessing
X_processed = preprocessor.fit_transform(X)

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_injury, stratify=y_injury, random_state=42, test_size=0.1
)

# Step 3: Check Class Distribution
print("Original class distribution:", Counter(y_injury))

# 2. Undersampling
undersampler = RandomUnderSampler(random_state=42)
X_train_undersampled, y_train_undersampled = undersampler.fit_resample(X_train, y_train)
print("Undersampled class distribution:", Counter(y_train_undersampled))

# Step 4: Model Training and Evaluation
def train_and_evaluate_model(model, model_name, X_train, y_train, X_test, y_test):
    print(f"\n=== {model_name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")
    
    # Confusion Matrix
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

# Models to Evaluate
models = [
    ("Random Forest (Balanced)", RandomForestClassifier(class_weight="balanced", random_state=42)),
    ("LightGBM", lgb.LGBMClassifier(is_unbalance=True, random_state=42)),
]

# Evaluate Models with Undersampling
print("\n*** Using Undersampling ***")
for model_name, model in models:
    start = time.time()
    train_and_evaluate_model(model, model_name, X_train_undersampled, y_train_undersampled, X_test, y_test)
    end = time.time()
    print(f"Time taken: {end - start:.2f} seconds")


Original class distribution: Counter({0: 25888, 1: 457})
Undersampled class distribution: Counter({0: 411, 1: 411})

*** Using Undersampling ***

=== Random Forest (Balanced) ===
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.54      0.70      2589
           1       0.02      0.63      0.05        46

    accuracy                           0.54      2635
   macro avg       0.51      0.58      0.37      2635
weighted avg       0.97      0.54      0.69      2635

ROC-AUC Score: 0.6101
Confusion Matrix:
[[1395 1194]
 [  17   29]]
Time taken: 0.53 seconds

=== LightGBM ===
[LightGBM] [Info] Number of positive: 411, number of negative: 411
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009254 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17839
[LightGBM] [Info] Number of data points in the train set: 822, number of used features: 540
[Ligh

In [69]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import lightgbm as lgb
from collections import Counter
import time


# Step 1: Handle Categorical and Missing Values
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing Pipeline
numeric_transformer = SimpleImputer(strategy='mean')  # Replace NaN with column mean
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Replace NaN with mode
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)

# Apply Preprocessing
X_processed = preprocessor.fit_transform(X)

# Convert sparse matrix to dense
X_dense = X_processed.toarray()

# Step 2: PCA Dimensionality Reduction
pca = PCA(n_components=50)  # Retain 50 principal components (adjust as needed)
X_pca = pca.fit_transform(X_dense)

# Calculate and display explained variance
total_variance_explained = sum(pca.explained_variance_ratio_)
print(f"Total explained variance by PCA: {total_variance_explained:.4f}")

# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y_injury, stratify=y_injury, random_state=42, test_size=0.2
)

# Step 4: Undersampling
undersampler = RandomUnderSampler(random_state=42)
X_train_undersampled, y_train_undersampled = undersampler.fit_resample(X_train, y_train)
print("Undersampled class distribution:", Counter(y_train_undersampled))

# Step 5: Model Training and Evaluation
def train_and_evaluate_model(model, model_name, X_train, y_train, X_test, y_test):
    print(f"\n=== {model_name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")
    
    # Confusion Matrix
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

# Models to Evaluate
models = [
    ("Random Forest (Balanced)", RandomForestClassifier(class_weight="balanced", random_state=42)),
    ("LightGBM", lgb.LGBMClassifier(is_unbalance=True, random_state=42)),
]

# Evaluate Models with PCA and Undersampling
print("\n*** Using PCA and Undersampling ***")
for model_name, model in models:
    start = time.time()
    train_and_evaluate_model(model, model_name, X_train_undersampled, y_train_undersampled, X_test, y_test)
    end = time.time()
    print(f"Time taken: {end - start:.2f} seconds")


Total explained variance by PCA: 1.0000
Undersampled class distribution: Counter({0: 366, 1: 366})

*** Using PCA and Undersampling ***

=== Random Forest (Balanced) ===
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.56      0.72      5178
           1       0.02      0.63      0.05        91

    accuracy                           0.56      5269
   macro avg       0.51      0.59      0.38      5269
weighted avg       0.97      0.56      0.70      5269

ROC-AUC Score: 0.6409
Confusion Matrix:
[[2905 2273]
 [  34   57]]
Time taken: 0.26 seconds

=== LightGBM ===
[LightGBM] [Info] Number of positive: 366, number of negative: 366
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12213
[LightGBM] [Info] Number of data points in the train set: 732, number of used features: 50
[LightGBM] [Inf