In [25]:
import pandas as pd
from tabulate import tabulate
from scipy.stats import ttest_ind
import warnings
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
pd.set_option('display.max_columns', None)

In [1]:
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import ADASYN

In [None]:
# Load the 2024 play-by-play and injuries CSV files.
# Paths are relative to the notebook's working directory.
play2024_df = pd.read_csv("data/play_by_play_2024.csv")
injury2024_df = pd.read_csv("data/injuries_2024.csv")

In [3]:
def first_last_a(name):
    """Abbreviate a full name to ``<first initial>.<rest of name>``.

    Example: ``"Patrick Mahomes"`` -> ``"P.Mahomes"``.

    Parameters
    ----------
    name : str
        Full name; the first whitespace-separated token is treated as the
        first name and everything after it is kept verbatim.

    Returns
    -------
    str
        The abbreviated name. A name with no second token (single word or
        blank) is returned stripped and otherwise unchanged, instead of
        raising ``IndexError``.
    """
    # strip() + split(None, 1) tolerates leading/trailing/repeated whitespace,
    # which would otherwise produce an empty first token.
    parts = name.strip().split(None, 1)
    if len(parts) < 2:
        return name.strip()
    return f"{parts[0][0]}.{parts[1]}"

In [4]:
def first_last_b(name):
    """Abbreviate a full name to ``<first two letters>.<rest of name>``.

    Example: ``"Patrick Mahomes"`` -> ``"Pa.Mahomes"``.

    Parameters
    ----------
    name : str
        Full name; the first whitespace-separated token is treated as the
        first name and everything after it is kept verbatim.

    Returns
    -------
    str
        The abbreviated name. A name with no second token (single word or
        blank) is returned stripped and otherwise unchanged, instead of
        raising ``IndexError``.
    """
    # strip() + split(None, 1) tolerates leading/trailing/repeated whitespace.
    parts = name.strip().split(None, 1)
    if len(parts) < 2:
        return name.strip()
    return f"{parts[0][0:2]}.{parts[1]}"

In [5]:
# NOTE(review): this silences all Python warnings for the rest of the kernel
# session, not just for this cell.
warnings.filterwarnings('ignore')

def merge_play_injury_dfs(play_df, injury_df):
    """Find plays where a player was injured and join them to injury records.

    Plays whose ``desc`` contains "was injured" are selected and the injured
    player's abbreviated name is parsed from the description. For every
    (week, team) group of ``injury_df``, the injury plays of that week in
    which the team was home or away are matched, by abbreviated name, to the
    group's injury rows dated on or after the latest of those plays.

    Parameters
    ----------
    play_df : pandas.DataFrame
        Play-by-play rows. Columns read: ``game_date``, ``desc``, ``week``,
        ``home_team``, ``away_team``, ``play_id``. A timezone-naive ``date``
        column is added to this frame in place.
    injury_df : pandas.DataFrame
        Injury rows. Columns read: ``date_modified``, ``week``, ``team``,
        ``full_name``. A timezone-naive ``date`` column is added to this
        frame in place.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(plays where injuries occurred, plays where injuries occurred and
        missed time)``. The first frame is the injury plays plus an
        ``injured_player`` column. The second is the subset matched to an
        injury record, reduced to one row per (``week_x``, ``full_name``,
        ``team``), keeping the row with the highest ``play_id``.
    """
    # preprocessing: timezone-naive timestamps on both frames so they compare
    injury_df["date"] = pd.to_datetime(injury_df["date_modified"]).dt.tz_localize(None)
    play_df["date"] = pd.to_datetime(play_df["game_date"]).dt.tz_localize(None)

    # filtering
    plays_with_injuries = play_df[play_df["desc"].str.contains("was injured", na=False)]
    pattern = r'(\w+\.(?:\w|-|\.|\')+(?: \w+)*) was injured'
    # Extract the injured player's name from the desc column
    injured_players = plays_with_injuries["desc"].str.extract(pattern)

    # concatenation (no inplace mutation; column 0 is the regex capture group)
    plays_with_injuries = (
        pd.concat([plays_with_injuries, injured_players], axis=1)
        .rename(columns={0: "injured_player"})
        .reset_index(drop=True)
    )

    # merging
    injuries = []
    for (week, team), group_injury_df in injury_df.groupby(["week", "team"]):
        same_week = plays_with_injuries["week"] == week
        team_involved = (
            (plays_with_injuries["home_team"] == team)
            | (plays_with_injuries["away_team"] == team)
        )
        group_play_df = plays_with_injuries[same_week & team_involved]

        # Keep only records dated on/after the latest injury play. With no
        # plays the max is NaT, the comparison is all False and the group is
        # emptied. Rows without a name cannot be matched and are skipped.
        latest_play_date = group_play_df["date"].max()
        usable = (
            (group_injury_df["date"] >= latest_play_date)
            & group_injury_df["full_name"].notna()
        )
        # .copy(): the columns below are added to an independent frame rather
        # than to a slice of injury_df.
        group_injury_df = group_injury_df[usable].copy()
        group_injury_df["first_type"] = group_injury_df["full_name"].apply(first_last_a)
        group_injury_df["second_type"] = group_injury_df["full_name"].apply(first_last_b)

        # Plays whose player name could not be parsed carry NaN; pandas merges
        # NaN keys with each other, so keep them out of the join.
        named_plays = group_play_df[group_play_df["injured_player"].notna()]

        x = pd.merge(named_plays, group_injury_df, left_on="injured_player", right_on="first_type", how="inner")
        y = pd.merge(named_plays, group_injury_df, left_on="injured_player", right_on="second_type", how="inner")

        injuries.append(pd.concat([x, y], axis=0, ignore_index=True))

    if injuries:
        matched = pd.concat(injuries, axis=0, ignore_index=True)
    else:
        # injury_df produced no (week, team) groups; pd.concat([]) would raise.
        # Build an empty frame with the same merged column layout instead.
        matched = pd.merge(
            plays_with_injuries.iloc[:0],
            injury_df.iloc[:0].assign(first_type="", second_type=""),
            left_on="injured_player",
            right_on="first_type",
            how="inner",
        )

    plays_with_injuries_and_injury_record = (
        matched
        .drop(columns=["first_type", "second_type"])
        .sort_values("play_id", ascending=False)
        .drop_duplicates(subset=["week_x", "full_name", "team"], keep="first")
    )

    return plays_with_injuries, plays_with_injuries_and_injury_record



In [6]:
def populate_cols_in_play_df(play_df, plays_with_injuries, plays_with_injuries_and_injury_record):
    """Add 0/1 ``was_injured`` and ``missed_time`` flags to ``play_df``.

    A play is flagged when its (``play_id``, ``game_id``) pair appears in the
    corresponding lookup frame.

    Parameters
    ----------
    play_df : pandas.DataFrame
        Play-by-play rows with ``play_id`` and ``game_id``. Modified in place.
    plays_with_injuries : pandas.DataFrame
        Rows whose (``play_id``, ``game_id``) mark ``was_injured = 1``.
    plays_with_injuries_and_injury_record : pandas.DataFrame
        Rows whose (``play_id``, ``game_id``) mark ``missed_time = 1``.

    Returns
    -------
    pandas.DataFrame
        The same ``play_df`` object, with the two integer flag columns set.
    """
    columns_to_check = ['play_id', 'game_id']

    # Build the composite key once and test membership vectorised, rather than
    # materialising a Python tuple per row with apply(tuple, 1) twice.
    play_keys = pd.MultiIndex.from_frame(play_df[columns_to_check])

    def _flag(lookup_df):
        lookup_keys = pd.MultiIndex.from_frame(lookup_df[columns_to_check])
        return play_keys.isin(lookup_keys).astype("int64")

    play_df["was_injured"] = _flag(plays_with_injuries)
    play_df["missed_time"] = _flag(plays_with_injuries_and_injury_record)

    return play_df

In [7]:
# Build the 2024 injury-play table and its join against the injuries table.
# Note: merge_play_injury_dfs assigns a "date" column on both input frames.
plays_with_injuries_2024, plays_with_injuries_and_injury_record_2024 = merge_play_injury_dfs(play2024_df, injury2024_df)


In [None]:
# Sanity check: report the row count of each table returned by merge_play_injury_dfs
print("Rows in plays_with_injuries_2024:", len(plays_with_injuries_2024))
print("Rows in plays_with_injuries_and_injury_record_2024:", len(plays_with_injuries_and_injury_record_2024))


In [9]:
# Add the was_injured / missed_time flag columns to the play-by-play table
# (populate_cols_in_play_df writes them onto play2024_df and returns it).
play2024_df = populate_cols_in_play_df(play2024_df, plays_with_injuries_2024, plays_with_injuries_and_injury_record_2024)

In [10]:
# Columns kept for the first look at the data: game context, play context,
# venue/weather fields, and the two flag columns added above.
# The commented-out entries are deliberately excluded from the selection.
columns_needed = [
    # "play_id",
    # "game_id",
    "home_team",
    "away_team",
    "season_type",
    "game_date",
    "down",
    "play_type",
    "score_differential_post",
    # "order_sequence",
    # "time_of_day",
    "stadium",
    "weather",
    "roof",
    "surface",
    "temp",
    "wind",
    "was_injured",
    "missed_time"
]

In [11]:
# Column subset of the play table, in the order listed in columns_needed
final_df = play2024_df.loc[:, columns_needed]

In [None]:
# Preview the first rows with the notebook's rich table display instead of
# printing the whole frame as plain text.
final_df.head()

In [None]:
# Inspect the distinct raw weather strings before parsing them in the next cell
print(play2024_df['weather'].unique())

In [14]:
import pandas as pd
import numpy as np
import re

# Assuming play2024_df is your DataFrame

# Step 1: Extract and categorize weather condition
def extract_weather_condition(description):
    """Map a free-text weather description to a coarse condition label.

    Parameters
    ----------
    description : str or missing value
        Raw weather text. It may also carry labelled fields such as
        ``Temp:``, ``Humidity:`` and ``Wind:`` (parsed separately by the
        ``extract_*`` helpers in this cell).

    Returns
    -------
    str
        One of ``'Sunny'``, ``'Cloudy'``, ``'Clear'``, ``'Rain'``, ``'Foggy'``,
        ``'Windy'``, ``'Controlled Climate'``, ``'Fair'``, ``'Other'``, or
        ``'Unknown'`` for missing / non-string input or text containing
        ``'N/A'``. Categories are tested in that order; the first match wins.
    """
    if not isinstance(description, str) or 'N/A' in description:
        return 'Unknown'

    # Remove the field labels before keyword matching. Otherwise the "Wind:"
    # label itself matches the 'wind' keyword, and any description carrying a
    # wind field can never reach 'Controlled Climate', 'Fair' or 'Other'.
    text = re.sub(r'\b(?:temp|humidity|wind):', ' ', description.lower())

    # (label, substrings) in priority order. 'sun' also covers
    # 'sunny'/'mostly sunny'/'partly sunny', 'cloudy' covers its variants,
    # 'rain' covers 'rainy' and 'fog' covers 'foggy'.
    keyword_categories = (
        ('Sunny', ('sun',)),
        ('Cloudy', ('cloudy', 'overcast')),
        ('Clear', ('clear',)),
        ('Rain', ('rain',)),
        ('Foggy', ('fog',)),
        ('Windy', ('wind', 'blustery')),
        ('Controlled Climate', ('controlled climate', 'indoors')),
        ('Fair', ('fair',)),
    )
    for category, keywords in keyword_categories:
        if any(keyword in text for keyword in keywords):
            return category
    return 'Other'

# Label every play with the coarse condition derived from its weather text
play2024_df['weather_condition'] = play2024_df['weather'].map(extract_weather_condition)

# Step 2: Extract and categorize temperature
def extract_temperature(description):
    """Parse the integer that follows ``"Temp: "`` in a weather description.

    Parameters
    ----------
    description : str or missing value
        Raw weather text.

    Returns
    -------
    int or float
        The temperature as an ``int`` (a leading minus sign is honoured), or
        ``np.nan`` when the input is not a string or has no ``Temp:`` value.
    """
    if not isinstance(description, str):
        return np.nan
    # -? so that sub-zero readings such as "Temp: -3" are parsed, not dropped
    match = re.search(r'Temp: (-?\d+)', description)
    return int(match.group(1)) if match else np.nan

# Numeric temperature parsed out of the weather text (NaN when absent)
play2024_df['temperature'] = play2024_df['weather'].map(extract_temperature)

def categorize_temperature(temp):
    """Bucket a temperature reading.

    Returns ``'Unknown'`` for missing values, ``'Cold'`` below 50,
    ``'Moderate'`` from 50 through 75 inclusive, and ``'Hot'`` above 75.
    """
    if pd.isna(temp):
        return 'Unknown'
    if temp < 50:
        return 'Cold'
    # temp is at least 50 here, so only the upper bound is left to test
    return 'Moderate' if temp <= 75 else 'Hot'

# Bucket the parsed temperature into Cold / Moderate / Hot / Unknown
play2024_df['temperature_category'] = play2024_df['temperature'].map(categorize_temperature)

# Step 3: Extract and categorize humidity
def extract_humidity(description):
    """Parse the integer percentage that follows ``"Humidity: "``.

    Returns the value as an ``int``, or ``np.nan`` when the input is not a
    string or contains no ``Humidity: <digits>%`` field.
    """
    if not isinstance(description, str):
        return np.nan
    found = re.search(r'Humidity: (\d+)%', description)
    if not found:
        return np.nan
    return int(found.group(1))

# Numeric humidity percentage parsed out of the weather text (NaN when absent)
play2024_df['humidity'] = play2024_df['weather'].map(extract_humidity)

def categorize_humidity(humidity):
    """Bucket a humidity percentage.

    Returns ``'Unknown'`` for missing values, ``'Low'`` below 40,
    ``'Moderate'`` from 40 through 70 inclusive, and ``'High'`` above 70.
    """
    if pd.isna(humidity):
        return 'Unknown'
    if humidity < 40:
        return 'Low'
    if humidity <= 70:
        return 'Moderate'
    return 'High'

# Bucket the parsed humidity into Low / Moderate / High / Unknown
play2024_df['humidity_category'] = play2024_df['humidity'].map(categorize_humidity)

# Step 4: Extract and categorize wind
def extract_wind_speed(description):
    """Parse the integer wind speed that follows ``"Wind:"``.

    An optional direction token (letters and ``/``, e.g. ``SSE`` or ``N/NW``)
    may sit between the label and the number.

    Parameters
    ----------
    description : str or missing value
        Raw weather text.

    Returns
    -------
    int or float
        The wind speed as an ``int``, or ``np.nan`` when the input is not a
        string or no number follows the ``Wind:`` label.
    """
    if not isinstance(description, str):
        return np.nan
    # The direction group is optional so that "Wind: 0 mph" (no direction) is
    # parsed too; the previous pattern required a space before the digits and
    # therefore missed it.
    match = re.search(r'Wind:\s*(?:[A-Za-z/]+\s+)?(\d+)', description)
    return int(match.group(1)) if match else np.nan

# Numeric wind speed parsed out of the weather text (NaN when absent)
play2024_df['wind_speed'] = play2024_df['weather'].map(extract_wind_speed)

def categorize_wind_speed(wind_speed):
    """Bucket a wind speed.

    Returns ``'Unknown'`` for missing values, ``'Calm'`` below 5,
    ``'Breezy'`` from 5 through 15 inclusive, and ``'Windy'`` above 15.
    """
    if pd.isna(wind_speed):
        return 'Unknown'
    if wind_speed < 5:
        return 'Calm'
    # at least 5 from here on
    return 'Breezy' if wind_speed <= 15 else 'Windy'

# Bucket the parsed wind speed into Calm / Breezy / Windy / Unknown
play2024_df['wind_category'] = play2024_df['wind_speed'].map(categorize_wind_speed)


In [None]:
# Display the raw weather text next to every column derived from it, to
# check the parsing by eye.
play2024_df[['weather', 'weather_condition', 'temperature', 'temperature_category', 
                   'humidity', 'humidity_category', 'wind_speed', 'wind_category']]

In [None]:
# Number of plays per derived weather_condition label
play2024_df['weather_condition'].value_counts()

### MODELING 

In [None]:
# dropping meta data columns
# errors="ignore": play_id / game_id are commented out of the columns_needed
# list that final_df was built from, so a plain drop raises KeyError.
modeling = final_df.drop(columns=["play_id","game_id"], errors="ignore")

In [None]:
# Numeric predictors plus the injury flag
numeric_features = ["down", "score_differential_post", "temp", "wind"]
data = play2024_df[[*numeric_features, "was_injured"]]

# Pairwise correlations between those columns
corr_matrix = data.corr()

# Draw the matrix as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Analysis for Numeric Features")
plt.show()


In [None]:
# Broader column set for modelling: game/play context, venue fields, the two
# flag columns, and the weather categories derived earlier.
columns_needed = [
    "home_team", "away_team", "season_type",
    "down", "play_type", "score_differential_post",
    "stadium", "roof", "surface", "temp", "wind", "was_injured", "missed_time",
    "weather_condition", "temperature_category", "humidity_category", "wind_category"
]

# Features are every listed column except the two flag columns (order kept);
# the target is the was_injured flag.
target_columns = ["was_injured", "missed_time"]
X = play2024_df[[col for col in columns_needed if col not in target_columns]]
y = play2024_df['was_injured']

# Columns routed to the numeric vs. the categorical preprocessing branch
numeric_features = ["down", "score_differential_post", "temp", "wind"]
categorical_features = [
    "home_team", "away_team", "season_type", "play_type", "stadium", "roof",
    "surface", "weather_condition", "temperature_category", "humidity_category",
    "wind_category",
]

In [None]:
# Preprocessing for numerical and categorical features
# NOTE(review): the import cell imports Pipeline from sklearn.pipeline and then
# from imblearn.pipeline, so this name presumably resolves to the imblearn
# class — confirm against the cell execution order.

# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps: each branch is applied to its own column list
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [None]:
# Train-test split, stratified on the target so both sets keep its class ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# ADASYN needs a numeric, NaN-free matrix. X_train still holds raw string
# categories and missing values, so resample the preprocessed features rather
# than the raw frame.
# NOTE(review): X_train_resampled / y_train_resampled are not used by the
# cells below (the pipeline resamples with SMOTE during fit); kept in case
# other code reads them.
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(
    preprocessor.fit_transform(X_train), y_train
)

In [None]:
# Define the Logistic Regression model with adjusted regularization
# NOTE(review): C=1.0 and penalty='l2' are scikit-learn's defaults, so the
# regularisation is not actually changed here. class_weight='balanced' is
# applied in addition to the SMOTE oversampling below.
model = LogisticRegression(class_weight='balanced', max_iter=1000, C=1.0, penalty='l2')

# Apply SMOTE with a lower sampling strategy to avoid overcompensation
# (0.2 = oversample the minority class to 20% of the majority class size)
smote = SMOTE(sampling_strategy=0.2, random_state=42)

# Create the pipeline with preprocessing, SMOTE, and the classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', model)
])

In [None]:
# Fit the pipeline on the training data
# (the un-resampled X_train / y_train are passed in; the 'smote' step inside
# the pipeline does the resampling)
pipeline.fit(X_train, y_train)

In [None]:
# precision_recall_fscore_support is not imported by the import cells at the
# top of the notebook, so bring it into scope here before it is used.
from sklearn.metrics import precision_recall_fscore_support

# The pipeline is already fitted by the cell directly above, so it is not
# refitted here.

# Adjust decision threshold
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]  # Get the probability of the positive class
threshold = 0.5  # Adjust this threshold as needed (e.g., 0.35, 0.4, etc.)
y_pred = (y_pred_proba >= threshold).astype(int)  # Convert probabilities to binary predictions

# Evaluate performance with the adjusted threshold
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from imblearn.pipeline import Pipeline
import pandas as pd
import numpy as np

# Define columns to use
columns_needed = [
    "home_team", "away_team", "season_type",
    "down", "play_type", "score_differential_post", 
    "stadium", "roof", "surface", "temp", "wind", "was_injured", "missed_time",
    "weather_condition", "temperature_category", "humidity_category", "wind_category"
]

# Select relevant columns from the DataFrame
# .copy() so the encoding below writes to an independent frame rather than to
# a slice of play2024_df.
data = play2024_df[columns_needed].copy()

# converting categorical data into numeric
# Only object-dtype columns are selected, so no per-column dtype check is
# needed. astype(str) turns missing values into the string 'nan', which gets
# its own integer code, so these columns have no NaN for dropna() to remove.
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
label_encoder = LabelEncoder()

for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col].astype(str))

# Drop rows that still have a missing value
data = data.dropna()

# Separate majority and minority classes
injury_class = data[data['was_injured'] == 1]
no_injury_class = data[data['was_injured'] == 0]

# Bootstrapping the injury class to add more samples with replacement
# (disabled: kept for reference only)
# bootstrap_injury = injury_class.sample(n=8000, replace=True, random_state=42)

# Combine bootstrapped minority samples with majority class
# (with bootstrapping disabled this only re-stacks the same rows, no-injury first)
# augmented_data = pd.concat([no_injury_class, injury_class, bootstrap_injury])
augmented_data = pd.concat([no_injury_class, injury_class])

# Separate features and target
X = augmented_data.drop(columns=["was_injured", "missed_time"])
y = augmented_data['was_injured']

# Train-test split (stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# NOTE(review): the ADASYN-resampled arrays are not used by the rest of this
# cell; the pipeline below is fitted on X_train / y_train.
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Define numerical and categorical features
# NOTE(review): object-dtype columns were replaced by integer codes earlier in
# this cell, so the one-hot step below presumably encodes those codes rather
# than the original strings — confirm this is intended.
numeric_features = ["down", "score_differential_post", "temp", "wind"]
categorical_features = ["home_team", "away_team", "season_type", "play_type", "stadium", "roof", 
                        "surface", "weather_condition", "temperature_category", "humidity_category", "wind_category"]

# Preprocessing for numerical and categorical features
# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps: each branch is applied to its own column list
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Balanced Random Forest model with SMOTE
model = BalancedRandomForestClassifier(n_estimators=500, max_depth=3, random_state=42)

# Define the sampler in this cell. The pipeline below previously picked up
# whatever `smote` an earlier cell had left in the kernel (the logistic
# regression cell's sampling_strategy=0.2), so the same setting is made
# explicit here. A 0.5 strategy was tried before and left disabled.
smote = SMOTE(sampling_strategy=0.2, random_state=42)

# Create the pipeline with preprocessing, SMOTE, and the classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', model)
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Adjust decision threshold
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]  # Get the probability of the positive class
threshold = 0.5  # Adjust this threshold as needed (e.g., 0.35, 0.4, etc.)
y_pred = (y_pred_proba >= threshold).astype(int)  # Convert probabilities to binary predictions

# Evaluate performance with the adjusted threshold
accuracy = accuracy_score(y_test, y_pred)
# average='binary' reports the scores for the positive class (label 1) only
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
# Per-class precision / recall / F1 table
print(classification_report(y_test, y_pred))


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the thresholded predictions on the test set
cm = confusion_matrix(y_test, y_pred)

# Print confusion matrix
print("Confusion Matrix:")
print(cm)

# Visualize confusion matrix as a heatmap
# `model` is the classifier object that was passed into the pipeline; its
# classes_ attribute supplies the axis labels.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(cmap="Blues")  # You can customize the color map if needed