In [85]:
import pandas as pd
from tabulate import tabulate
from scipy.stats import ttest_ind
import warnings
pd.set_option('display.max_columns', None)

In [86]:
play2024_df = pd.read_csv("data/play_by_play_2024.csv")
injury2024_df = pd.read_csv("data/injuries_2024.csv")

In [87]:
def first_last_a(name):
    parts = name.split(" ", 1)
    return f"{parts[0][0]}.{parts[1]}"

In [88]:
def first_last_b(name):
    parts = name.split(" ", 1)
    return f"{parts[0][0:2]}.{parts[1]}"

In [89]:
warnings.filterwarnings('ignore')

def merge_play_injury_dfs(play_df, injury_df):

    # preprocessing
    injury_df["date"] = pd.to_datetime(injury_df['date_modified'])
    play_df["date"] = pd.to_datetime(play_df['game_date'])
    injury_df['date'] = injury_df['date'].dt.tz_localize(None)
    play_df['date'] = play_df['date'].dt.tz_localize(None)

    # filtering
    plays_with_injuries = play_df[play_df['desc'].str.contains("was injured", na=False)]
    pattern = r'(\w+\.(?:\w|-|\.|\')+(?: \w+)*) was injured'
    # Extract the injured player's name from the desc column
    injured_players = plays_with_injuries.loc[:, "desc"].str.extract(pattern)

    # concatenation
    plays_with_injuries = pd.concat([plays_with_injuries, injured_players], axis=1)
    plays_with_injuries.rename(columns={0: "injured_player"}, inplace=True)
    plays_with_injuries = plays_with_injuries.reset_index(drop=True)
    
    # merging
    injuries = []
    for (week, team), group_injury_df in injury_df.groupby(['week', 'team']):
        group_play_df = plays_with_injuries[(plays_with_injuries['week'] == week) & ((plays_with_injuries['home_team'] == team) | (plays_with_injuries['away_team'] == team))]

        group_injury_df = group_injury_df[group_injury_df.date >= group_play_df.date.max()]

        group_injury_df["first_type"] = group_injury_df['full_name'].apply(first_last_a)
        group_injury_df["second_type"] = group_injury_df['full_name'].apply(first_last_b)

        x = pd.merge(group_play_df, group_injury_df, left_on="injured_player", right_on="first_type", how="inner")
        y = pd.merge(group_play_df, group_injury_df, left_on="injured_player", right_on="second_type", how="inner")

        injuries.append(pd.concat([x, y], axis = 0, ignore_index=True))

    plays_with_injuries_and_injury_record = (pd.concat(injuries, axis=0, ignore_index=True)).drop(columns=["first_type", "second_type"])
    plays_with_injuries_and_injury_record = plays_with_injuries_and_injury_record.sort_values('play_id', ascending=False).drop_duplicates(subset=['week_x', 'full_name', "team"], keep='first')

    return plays_with_injuries, plays_with_injuries_and_injury_record
#returns (plays where injuries occurred, plays were injuries occurred and missed time)



In [90]:
def populate_cols_in_play_df(play_df, plays_with_injuries, plays_with_injuries_and_injury_record):
    columns_to_check = ['play_id', 'game_id']
    play_df["was_injured"] = 0
    play_df["missed_time"] = 0
    play_df.loc[play_df[columns_to_check].apply(tuple, 1).isin(plays_with_injuries[columns_to_check].apply(tuple, 1)), 'was_injured'] = 1
    play_df.loc[play_df[columns_to_check].apply(tuple, 1).isin(plays_with_injuries_and_injury_record[columns_to_check].apply(tuple, 1)), 'missed_time'] = 1

    return play_df

In [91]:
plays_with_injuries_2024, plays_with_injuries_and_injury_record_2024 = merge_play_injury_dfs(play2024_df, injury2024_df)


In [92]:
# Print number of rows for each returned table
print("Rows in plays_with_injuries_2024:", len(plays_with_injuries_2024))
print("Rows in plays_with_injuries_and_injury_record_2024:", len(plays_with_injuries_and_injury_record_2024))


Rows in plays_with_injuries_2024: 457
Rows in plays_with_injuries_and_injury_record_2024: 185


In [93]:
play2024_df = populate_cols_in_play_df(play2024_df, plays_with_injuries_2024, plays_with_injuries_and_injury_record_2024)
len(play2024_df)
print(len(play2024_df.columns.tolist()))
print(play2024_df.columns.tolist())


375
['play_id', 'game_id', 'old_game_id', 'home_team', 'away_team', 'season_type', 'week', 'posteam', 'posteam_type', 'defteam', 'side_of_field', 'yardline_100', 'game_date', 'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining', 'game_half', 'quarter_end', 'drive', 'sp', 'qtr', 'down', 'goal_to_go', 'time', 'yrdln', 'ydstogo', 'ydsnet', 'desc', 'play_type', 'yards_gained', 'shotgun', 'no_huddle', 'qb_dropback', 'qb_kneel', 'qb_spike', 'qb_scramble', 'pass_length', 'pass_location', 'air_yards', 'yards_after_catch', 'run_location', 'run_gap', 'field_goal_result', 'kick_distance', 'extra_point_result', 'two_point_conv_result', 'home_timeouts_remaining', 'away_timeouts_remaining', 'timeout', 'timeout_team', 'td_team', 'td_player_name', 'td_player_id', 'posteam_timeouts_remaining', 'defteam_timeouts_remaining', 'total_home_score', 'total_away_score', 'posteam_score', 'defteam_score', 'score_differential', 'posteam_score_post', 'defteam_score_post', 'score_differen

### MODELING 

In [94]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.utils import class_weight

In [95]:
X = play2024_df.drop(columns=[
    'was_injured', 'missed_time', 'play_id', 'game_id', 'game_date', 'desc', 
    'td_player_name', 'passer_player_name', 'rusher_player_name', 'receiver_player_name', 
    'nfl_api_id', 'fantasy_player_name', 'fantasy_player_id', 'passer_jersey_number', 'rusher_jersey_number',
    'receiver_jersey_number', 'jersey_number'
])

# Drop all columns with datetime64 data type from X
X = X.select_dtypes(exclude=['datetime64'])

y_injury = play2024_df['was_injured']
y_severe_injury = play2024_df['missed_time']

# Separate numerical and categorical columns
numerical_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Fill missing values for numerical columns with the mean
X[numerical_cols] = X[numerical_cols].fillna(X[numerical_cols].mean())

# Fill missing values for categorical columns with 'Unknown'
X[categorical_cols] = X[categorical_cols].fillna('Unknown')

# Encode categorical variables
label_encoders = {}
for column in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column].astype(str))
    label_encoders[column] = le  # Store encoders for future use

In [96]:
# Split the data into training and test sets
X_train, X_test, y_train_injury, y_test_injury = train_test_split(X, y_injury, test_size=0.3, random_state=42)
_, _, y_train_severe, y_test_severe = train_test_split(X, y_severe_injury, test_size=0.3, random_state=42)

# Compute class weights based on the training labels, not the entire dataset
class_weights_injury = class_weight.compute_sample_weight('balanced', y_train_injury)
class_weights_severe = class_weight.compute_sample_weight('balanced', y_train_severe)


In [97]:
print(len(class_weights_injury))
print(len(X_train))

18441
18441


In [98]:
# Train the Random Forest for predicting injury
rf_injury = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_injury.fit(X_train, y_train_injury, sample_weight=class_weights_injury)


# Train the Random Forest for predicting severe injury
rf_severe = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_severe.fit(X_train, y_train_severe, sample_weight=class_weights_severe)


In [99]:
# Predict on the test set
y_pred_injury = rf_injury.predict(X_test)
y_pred_severe = rf_severe.predict(X_test)

# Print classification report for both models
print("Injury Prediction Report")
print(classification_report(y_test_injury, y_pred_injury, target_names=['No Injury', 'Injury']))

print("\nSevere Injury Prediction Report")
print(classification_report(y_test_severe, y_pred_severe, target_names=['No Missed Time', 'Missed Time']))

# Feature importance
injury_importances = pd.Series(rf_injury.feature_importances_, index=X.columns).sort_values(ascending=False)
severe_importances = pd.Series(rf_severe.feature_importances_, index=X.columns).sort_values(ascending=False)

# Display the top important features for both models
print("\nTop Features for Injury Prediction:")
print(injury_importances.head(10))

print("\nTop Features for Severe Injury Prediction:")
print(severe_importances.head(10))


Injury Prediction Report
              precision    recall  f1-score   support

   No Injury       0.98      1.00      0.99      7760
      Injury       0.00      0.00      0.00       144

    accuracy                           0.98      7904
   macro avg       0.49      0.50      0.50      7904
weighted avg       0.96      0.98      0.97      7904


Severe Injury Prediction Report
                precision    recall  f1-score   support

No Missed Time       0.99      1.00      1.00      7850
   Missed Time       0.00      0.00      0.00        54

      accuracy                           0.99      7904
     macro avg       0.50      0.50      0.50      7904
  weighted avg       0.99      0.99      0.99      7904


Top Features for Injury Prediction:
touchback          0.034169
opp_safety_prob    0.022136
td_prob            0.021894
fg_prob            0.019104
safety_prob        0.018856
no_score_prob      0.018683
opp_td_prob        0.017249
opp_fg_prob        0.016630
rushing_yards  