In [None]:
import pandas as pd
import numpy as np

# Load the capstone dataset; the cells that follow read from `df`.
df = pd.read_csv("final_clarkson_mens_hockey_capstone_dataset.csv")

# df.info() prints its report to stdout, so it can run mid-cell.
# df.head() must be the LAST expression: Jupyter only renders the value of a
# cell's final expression, so in the original order the preview was silently
# discarded and only the info() report appeared.
df.info()
df.head()


In [None]:
# Descriptive statistics from DataFrame.describe() with its default settings.
numeric_summary = df.describe()
numeric_summary


In [None]:
# Order every row by PEI, highest first, then renumber the index 0..n-1 so the
# index reflects the sorted position.
pei_sorted = df.sort_values("PEI", ascending=False)
pei_rank = pei_sorted.reset_index(drop=True)

# Show the ten highest-PEI rows with the identifying and scoring columns.
leaderboard_cols = ['Player', 'Position', 'Year', 'G', 'A', 'PEI']
pei_rank[leaderboard_cols].head(10)


In [None]:
# PEI summarised per position: central tendency, spread, and group size.
pei_by_position = df.groupby("Position")["PEI"]
pei_by_position.agg(['mean', 'median', 'std', 'count'])


In [None]:
# Pairwise correlations (DataFrame.corr defaults) between PEI and the two
# injury-related columns.
injury_corr_cols = ['PEI', 'Injury_Burden', 'Return Timeline (days)']
df[injury_corr_cols].corr()


In [None]:
import statsmodels.api as sm

# Ordinary least squares: PEI regressed on ice time, injury burden, and sleep.
predictor_cols = ['Time on Ice (min/game)', 'Injury_Burden', 'Sleep Hours/Day']
y = df['PEI']

# add_constant supplies the intercept column that sm.OLS does not add itself.
X = sm.add_constant(df[predictor_cols])

# missing='drop' tells statsmodels to discard observations containing NaN
# before fitting.
ols_spec = sm.OLS(y, X, missing='drop')
model = ols_spec.fit()
model.summary()


In [None]:
import scipy.stats as stats

# Bucket players into three quantile-based ice-time tiers.
tier_labels = ['Low', 'Medium', 'High']
df['Ice_Time_Tier'] = pd.qcut(df['Time on Ice (min/game)'], q=3, labels=tier_labels)

# PEI values belonging to each tier.
low = df.loc[df['Ice_Time_Tier'] == 'Low', 'PEI']
med = df.loc[df['Ice_Time_Tier'] == 'Medium', 'PEI']
high = df.loc[df['Ice_Time_Tier'] == 'High', 'PEI']

# One-way ANOVA across the three tiers; missing PEI values are removed from
# each group before the test.
pei_groups = [tier_pei.dropna() for tier_pei in (low, med, high)]
stats.f_oneway(*pei_groups)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load dataset. Note: this rebinds `df` to a different file than the CSV read
# at the top of the notebook, so every later cell works on this workbook.
df = pd.read_excel("Full_Expanded_Clarkson_Hockey_Dataset.xlsx")

# Variables for histograms
hist_vars = [
    "Training Hours/Week",
    "Sleep Hours/Day",
    "Games Missed",
    "Time on Ice (min/game)",
]

# One standalone 10-bin histogram per variable, each on its own figure.
for col in hist_vars:
    fig, ax = plt.subplots()
    ax.hist(df[col], bins=10)
    ax.set(
        title=f"Distribution of {col}",
        xlabel=col,
        ylabel="Frequency",
    )
    plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Columns to include in the correlation matrix
corr_cols = [
    "Training Hours/Week",
    "Weight Room Hours/Week",
    "On-Ice Conditioning Score",
    "Sleep Hours/Day",
    "Games Missed",
    "Time on Ice (min/game)",
    "Faceoff %",
    "Blocked Shots"
]

corr_matrix = df[corr_cols].corr()

# Plot correlation matrix.
# Correlation coefficients lie in [-1, 1]. Without vmin/vmax, imshow stretches
# the colour scale to the min/max actually present, so the same colour means a
# different value from one run to the next and 0 has no fixed colour. Pinning
# the range to [-1, 1] and using a diverging colormap makes positive and
# negative correlations visually distinguishable and comparable.
plt.figure(figsize=(10, 8))
plt.imshow(corr_matrix, cmap="coolwarm", vmin=-1, vmax=1)
plt.colorbar(label="Correlation coefficient")
plt.xticks(range(len(corr_cols)), corr_cols, rotation=45, ha="right")
plt.yticks(range(len(corr_cols)), corr_cols)
plt.title("Correlation Matrix – Clarkson Hockey Dataset")
plt.tight_layout()
plt.show()


In [None]:
# Injury frequency: number of rows per distinct value of the "Injury" column.
# value_counts() leaves missing values out of the tally by default.
injury_counts = df["Injury"].value_counts()

fig, ax = plt.subplots()
ax.bar(injury_counts.index, injury_counts.values)
ax.set(
    title="Injury Severity Distribution – Clarkson Hockey",
    xlabel="Injury Severity",
    ylabel="Number of Players",
)
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load data
df = pd.read_excel("Full_Expanded_Clarkson_Hockey_Dataset.xlsx")

# Create binary injury target: 0 = no injury, 1 = any recorded injury.
#
# pandas' readers treat the literal text "None" as a missing value by default
# (pandas >= 2.0), so "no injury" cells can arrive as NaN rather than the
# string "None". The original per-row test `x == "None"` is False for NaN and
# therefore flagged every such row as injured, which can collapse the target
# to a single class. Normalise missing values back to "None" before comparing.
# NOTE(review): this assumes a blank/NaN "Injury" cell means "no injury" —
# confirm against the workbook.
injury_label = df["Injury"].fillna("None")
df["Injury_Flag"] = injury_label.ne("None").astype(int)

# Predictor columns used by the models in this notebook.
features = [
    "Training Hours/Week",
    "Weight Room Hours/Week",
    "On-Ice Conditioning Score",
    "Sleep Hours/Day",
    "Time on Ice (min/game)",
    "Faceoff %",
    "Blocked Shots"
]

X = df[features]
y = df["Injury_Flag"]

# Train-test split (75/25). stratify=y keeps the class ratio the same in both
# partitions; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Scale for Logistic Regression: fit the scaler on the training partition only
# and reuse those statistics for the test partition, so no test information
# leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Diagnostics: distribution of games missed and the current class balance of
# the injury target.
games_missed_summary = df["Games Missed"].describe()
injury_flag_counts = df["Injury_Flag"].value_counts()

print(games_missed_summary)
print(injury_flag_counts)


In [None]:
# Redefine the target from games missed: 1 when a player missed more than one
# game, otherwise 0. This overwrites the earlier "Injury"-based flag.
missed_multiple_games = df["Games Missed"].gt(1)
df["Injury_Flag"] = missed_multiple_games.astype(int)


In [None]:
# Class balance of the injury target as currently defined.
injury_flag_counts = df["Injury_Flag"].value_counts()
print(injury_flag_counts)


In [None]:
from sklearn.model_selection import train_test_split

# Rebuild the design matrix and target from the current Injury_Flag column.
X = df[features]
y = df["Injury_Flag"]

# 75/25 split. stratify=y preserves the injured/non-injured ratio in both
# partitions, consistent with the earlier split in this notebook. Without it,
# a small or imbalanced sample can leave the test partition with a single
# class, which makes ROC-AUC undefined and the classification report
# uninformative.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y
)

print("Training classes:\n", y_train.value_counts())


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# Standardise the features using statistics learned from the training
# partition only, then apply the same transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the logistic regression on the scaled training data (fit returns the
# estimator itself, so construction and fitting can be chained).
log_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Evaluate on the held-out partition.
y_pred = log_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))


Injury occurrence was defined with a threshold on games missed: players who missed more than one game were classified as injured (`Injury_Flag = 1`), and players who missed one game or fewer were classified as non-injured (`Injury_Flag = 0`). This definition reflects common practice in hockey, where minor absences do not constitute reportable injuries, and it ensures that both classes are sufficiently represented for predictive modeling.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Fit logistic regression on the scaled training data.
log_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Hard class predictions, plus the second column of predict_proba (the
# probability assigned to the second class in log_model.classes_), which the
# ROC analysis uses as the score.
y_pred_log = log_model.predict(X_test_scaled)
test_probabilities = log_model.predict_proba(X_test_scaled)
y_prob_log = test_probabilities[:, 1]

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_log))


In [None]:
# ROC curve for the logistic regression scores on the test partition.
fpr, tpr, _ = roc_curve(y_test, y_prob_log)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="Logistic Regression")
ax.plot([0, 1], [0, 1], linestyle="--")  # diagonal reference line
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curve – Logistic Regression",
)
ax.legend()
plt.show()


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree trained on the unscaled features; random_state
# fixes the estimator's randomness for reproducibility.
dt_model = DecisionTreeClassifier(max_depth=4, random_state=42).fit(X_train, y_train)

# Class predictions and the second predict_proba column (used as the ROC score).
y_pred_dt = dt_model.predict(X_test)
dt_probabilities = dt_model.predict_proba(X_test)
y_prob_dt = dt_probabilities[:, 1]

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_dt))


In [None]:
# ROC curve for the decision tree scores on the test partition.
fpr, tpr, _ = roc_curve(y_test, y_prob_dt)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="Decision Tree")
ax.plot([0, 1], [0, 1], linestyle="--")  # diagonal reference line
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curve – Decision Tree",
)
ax.legend()
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 depth-limited trees trained on the unscaled features;
# random_state fixes the estimator's randomness for reproducibility.
rf_settings = dict(n_estimators=200, max_depth=6, random_state=42)
rf_model = RandomForestClassifier(**rf_settings).fit(X_train, y_train)

# Class predictions and the second predict_proba column (used as the ROC score).
y_pred_rf = rf_model.predict(X_test)
rf_probabilities = rf_model.predict_proba(X_test)
y_prob_rf = rf_probabilities[:, 1]

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_rf))


In [None]:
# ROC curve for the random forest scores on the test partition.
fpr, tpr, _ = roc_curve(y_test, y_prob_rf)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="Random Forest")
ax.plot([0, 1], [0, 1], linestyle="--")  # diagonal reference line
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curve – Random Forest",
)
ax.legend()
plt.show()


Random Forest is expected to perform best because it can capture nonlinear relationships between training load, recovery, and injury risk; the accuracy and ROC-AUC values reported for each model should be compared to confirm this expectation.


**Logistic Regression**

- Provides interpretability
- Shows the direction of injury risk factors
- Used as a baseline model

**Decision Tree**

- Easy to explain to coaches
- Shows decision paths (training load → injury risk)

**Random Forest**

- Best predictive performance
- Captures complex interactions
- Recommended for operational use