# 01_EDA_Sports.ipynb — Sports Performance & Injury Risk Analytics

**Goal:** Perform comprehensive exploratory analysis on football (soccer) player, training, match, wellness, and injury data to understand workload dynamics, wellness correlations, and injury risk drivers.

**Data sources (synthetic):**
- `Players.csv`, `Teams.csv`, `Matches.csv`
- `PlayerMatchStats.csv`, `TrainingSessions.csv`, `WellnessSnapshots.csv`, `Injuries.csv`

**Outputs:**
- Cleaned analytical summaries saved to `../data/` for modeling
- Visuals covering demographics, workload, performance, injuries, and integrated insights

*Note:* This notebook uses **matplotlib only** for plotting.


In [7]:
# --- 1. Imports & setup ---
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Display defaults
pd.set_option("display.max_columns", 120)
plt.rcParams.update({
    "figure.figsize": (10, 5),
    "axes.grid": True
})

# Paths
BASE = Path(".").resolve()
CSV_DIR = (BASE / ".." / "Sports Performance & Injury Risk Analytics" / "DataSet").resolve()   # adjust if needed
DATA_OUT = (BASE / ".." / "data").resolve()
DATA_OUT.mkdir(parents=True, exist_ok=True)

print("CSV_DIR:", CSV_DIR)
print("DATA_OUT:", DATA_OUT)


CSV_DIR: D:\Sports Performance & Injury Risk Analytics\Notebooks\Sports Performance & Injury Risk Analytics\DataSet
DATA_OUT: D:\Sports Performance & Injury Risk Analytics\Notebooks\data


In [None]:
# --- 2. Load data ---
players = pd.read_csv(CSV_DIR / "Players.csv")
teams = pd.read_csv(CSV_DIR / "Teams.csv")
matches = pd.read_csv(CSV_DIR / "Matches.csv", parse_dates=["Date"])
pms = pd.read_csv(CSV_DIR / "PlayerMatchStats.csv")
train = pd.read_csv(CSV_DIR / "TrainingSessions.csv", parse_dates=["Date"])
well = pd.read_csv(CSV_DIR / "WellnessSnapshots.csv", parse_dates=["SnapshotDate"])
inj = pd.read_csv(CSV_DIR / "Injuries.csv", parse_dates=["InjuryDate","RecoveryDate"])

shapes = {
    "Players": players.shape,
    "Teams": teams.shape,
    "Matches": matches.shape,
    "PlayerMatchStats": pms.shape,
    "TrainingSessions": train.shape,
    "WellnessSnapshots": well.shape,
    "Injuries": inj.shape
}
print("Shapes:", shapes)

date_span = f"{train['Date'].min().date()} to {train['Date'].max().date()}" if len(train) else "N/A"
print("Training date span:", date_span)

# Basic NA check
na_summary = {
    "Players": players.isna().sum().to_dict(),
    "Teams": teams.isna().sum().to_dict(),
    "Matches": matches.isna().sum().to_dict(),
    "PlayerMatchStats": pms.isna().sum().to_dict(),
    "TrainingSessions": train.isna().sum().to_dict(),
    "WellnessSnapshots": well.isna().sum().to_dict(),
    "Injuries": inj.isna().sum().to_dict(),
}
na_summary


FileNotFoundError: [Errno 2] No such file or directory: 'D:\\Sports Performance & Injury Risk Analytics\\Notebooks\\Sports Performance & Injury Risk Analytics\\DataSet\\Players.csv'

## 3. Player Demographics & Team Composition

In [None]:
# Position distribution
pos_counts = players["Position"].value_counts().sort_index()
plt.figure()
pos_counts.plot(kind="bar")
plt.title("Players by Position"); plt.xlabel("Position"); plt.ylabel("Count")
plt.show()

# Age distribution
plt.figure()
players["Age"].plot(kind="hist", bins=20)
plt.title("Age Distribution"); plt.xlabel("Age"); plt.ylabel("Frequency")
plt.show()

# Height vs Weight scatter (colored by position via markers)
pos_markers = {"GK":"o","DF":"s","MF":"^","FW":"x"}
plt.figure()
for pos, mk in pos_markers.items():
    sub = players[players["Position"]==pos]
    plt.scatter(sub["HeightCM"], sub["WeightKG"], label=pos, marker=mk, alpha=0.7)
plt.title("Height vs Weight by Position"); plt.xlabel("Height (cm)"); plt.ylabel("Weight (kg)")
plt.legend(); plt.show()

# BMI & InjuryProneScore
players["BMI"] = players["WeightKG"] / (players["HeightCM"]/100.0)**2
plt.figure()
plt.scatter(players["BMI"], players["InjuryProneScore"], alpha=0.6)
plt.title("InjuryProneScore vs BMI"); plt.xlabel("BMI"); plt.ylabel("InjuryProneScore")
plt.show()

print("Average BMI:", round(players["BMI"].mean(), 2))


## 4. Training Load & Wellness Patterns

In [None]:
# Prepare training features
train = train.sort_values(["PlayerID","Date"]).copy()
train["Load"] = train["DistanceKM"] + 0.5*(train["DurationMinutes"]/60.0)

# Rolling loads
train["Acute7"] = train.groupby("PlayerID")["Load"].rolling(7, min_periods=1).sum().reset_index(0,drop=True)
train["Chronic28"] = train.groupby("PlayerID")["Load"].rolling(28, min_periods=7).sum().reset_index(0,drop=True)
train["ACWR"] = train["Acute7"] / train["Chronic28"].replace({0: np.nan})

# Plot ACWR distribution
plt.figure()
train["ACWR"].plot(kind="hist", bins=40)
plt.title("ACWR Distribution"); plt.xlabel("ACWR"); plt.ylabel("Frequency")
plt.show()

# Wellness correlations (numeric only)
well_num = well[["SleepHours","StressLevel","MuscleSoreness","HydrationLevel","RecoveryScore"]].copy()
corr = well_num.corr(numeric_only=True)
print("Wellness correlations:\n", corr.round(2))

# Visualize correlation matrix with matplotlib (no seaborn)
fig, ax = plt.subplots(figsize=(6,5))
cax = ax.imshow(corr.values, cmap="viridis")
ax.set_xticks(range(len(corr.columns))); ax.set_yticks(range(len(corr.columns)))
ax.set_xticklabels(corr.columns, rotation=45, ha="right"); ax.set_yticklabels(corr.columns)
fig.colorbar(cax, ax=ax)
ax.set_title("Wellness Correlation Matrix")
plt.tight_layout(); plt.show()

# Weekly averages of wellness
well["WeekStart"] = well["SnapshotDate"] - pd.to_timedelta(well["SnapshotDate"].dt.weekday, unit="D")
weekly_well = (well.groupby("WeekStart")[["SleepHours","StressLevel","RecoveryScore"]].mean().reset_index())
plt.figure()
plt.plot(weekly_well["WeekStart"], weekly_well["SleepHours"], label="SleepHours")
plt.plot(weekly_well["WeekStart"], weekly_well["StressLevel"], label="StressLevel")
plt.plot(weekly_well["WeekStart"], weekly_well["RecoveryScore"], label="RecoveryScore")
plt.title("Weekly Wellness Trends"); plt.xlabel("WeekStart"); plt.ylabel("Value"); plt.legend()
plt.show()


## 5. Match Performance Analysis

In [None]:
# Merge position onto match stats
pms_pos = pms.merge(players[["PlayerID","Position"]], on="PlayerID", how="left")

# Avg distance by position
avg_dist_pos = pms_pos.groupby("Position")["DistanceCoveredKM"].mean().reindex(["GK","DF","MF","FW"])
plt.figure()
avg_dist_pos.plot(kind="bar")
plt.title("Avg Match Distance by Position (KM)"); plt.xlabel("Position"); plt.ylabel("KM")
plt.show()

# Pass accuracy vs match rating (scatter)
plt.figure()
plt.scatter(pms["PassAccuracyPct"], pms["MatchRating"], alpha=0.4)
plt.title("Pass Accuracy vs Match Rating"); plt.xlabel("PassAccuracy (%)"); plt.ylabel("Match Rating")
plt.show()

# Minutes played trend (seasonal)
matches["Season"] = matches["Date"].dt.year
pms_season = pms.merge(matches[["MatchID","Season"]], on="MatchID", how="left")
mins_season = pms_season.groupby("Season")["MinutesPlayed"].sum()
plt.figure()
mins_season.plot(kind="bar")
plt.title("Total Minutes Played by Season"); plt.xlabel("Season"); plt.ylabel("Minutes")
plt.show()


## 6. Injury Analysis

In [None]:
# Injuries by severity and season
inj["Season"] = inj["InjuryDate"].dt.year
cnt_sev = inj["Severity"].value_counts()
plt.figure()
cnt_sev.plot(kind="bar")
plt.title("Injuries by Severity"); plt.xlabel("Severity"); plt.ylabel("Count")
plt.show()

inj_season = inj.groupby("Season").size()
plt.figure()
inj_season.plot(kind="bar")
plt.title("Injuries per Season"); plt.xlabel("Season"); plt.ylabel("Count")
plt.show()

# Days out distribution
plt.figure()
inj["DaysOut"].plot(kind="hist", bins=30)
plt.title("Days Out Distribution"); plt.xlabel("Days"); plt.ylabel("Frequency")
plt.show()

# Injury incidence per 1000 training hours
total_hours = train["DurationMinutes"].sum()/60.0 if len(train) else 0.0
rate = (len(inj) / total_hours * 1000) if total_hours>0 else np.nan
print(f"Injury rate: {rate:.2f} per 1000 training hours")

# Injury vs ACWR band incidence
def acwr_band(x):
    if pd.isna(x): return "NA"
    if x < 0.8: return "<0.8"
    if x < 1.3: return "0.8–1.3"
    if x < 1.5: return "1.3–1.5"
    return ">=1.5"

train["ACWRBand"] = train["ACWR"].apply(acwr_band)
inj_daily = inj[["PlayerID","InjuryDate"]].copy().rename(columns={"InjuryDate":"Date"})
inj_daily["Inj"] = 1
merged = pd.merge(train[["PlayerID","Date","ACWRBand"]], inj_daily, on=["PlayerID","Date"], how="left")
incidence = merged.groupby("ACWRBand")["Inj"].mean().reindex(["<0.8","0.8–1.3","1.3–1.5",">=1.5","NA"])
plt.figure()
incidence.plot(kind="bar")
plt.title("Injury Incidence by ACWR Band"); plt.xlabel("ACWR Band"); plt.ylabel("Incidence")
plt.show()


## 7. Integrated Insights & Summary Dataset

In [None]:
# Player-level summary for modeling & BI
summary = (
    pms.groupby("PlayerID")
       .agg(TotalMinutes=("MinutesPlayed","sum"),
            AvgMatchRating=("MatchRating","mean"),
            AvgDistanceKM=("DistanceCoveredKM","mean"),
            AvgPassAcc=("PassAccuracyPct","mean"),
            TotalSprints=("Sprints","sum"))
       .reset_index()
       .merge(players[["PlayerID","TeamID","Position","InjuryProneScore","Age"]], on="PlayerID", how="left")
)

inj_counts = inj.groupby("PlayerID").size().reset_index(name="InjuryCount")
summary = summary.merge(inj_counts, on="PlayerID", how="left").fillna({"InjuryCount":0})

summary["MinutesPerInjury"] = summary["TotalMinutes"] / (summary["InjuryCount"] + 1)

print("Summary shape:", summary.shape)
summary.head()


## 8. Save Analytical Outputs

In [None]:
# Save cleaned/engineered datasets for modeling
summary_path = DATA_OUT / "player_summary_for_model.csv"
train_out_path = DATA_OUT / "train_features_prepared.csv"

summary.to_csv(summary_path, index=False)
train.to_csv(train_out_path, index=False)

print("Saved:", summary_path)
print("Saved:", train_out_path)


## 9. Next Steps
- Build `02_InjuryRisk_Model.ipynb` (feature engineering for rolling loads, wellness trends; train Logistic/RandomForest/XGBoost).
- Create Power BI report with pages: **Squad Overview**, **Load vs Fatigue**, **Injury Risk & Alerts**.
- Write SQL views for model serving (`InjuryRiskScores`) and integrate back into the BI layer.
