In [1]:
import pandas as pd
import numpy as np

# Load the three raw CSV exports; each file's date column is parsed to datetime on read.
match = pd.read_csv(
    "../data/raw/match_data.csv",
    parse_dates=["match_date"],
)
train = pd.read_csv(
    "../data/raw/training_load.csv",
    parse_dates=["date"],
)
inj = pd.read_csv(
    "../data/raw/injury_history.csv",
    parse_dates=["injury_date"],
)

# Preview the first rows of every frame (the cell output is a tuple of three DataFrames).
(match.head(), train.head(), inj.head())


(      season       team match_id match_date player_id player_name position  \
 0  2025-2026  TR_Club_A      M01 2025-08-11       P01   Player_01       AM   
 1  2025-2026  TR_Club_A      M01 2025-08-11       P02   Player_02        W   
 2  2025-2026  TR_Club_A      M01 2025-08-11       P03   Player_03       CM   
 3  2025-2026  TR_Club_A      M01 2025-08-11       P04   Player_04       ST   
 4  2025-2026  TR_Club_A      M01 2025-08-11       P05   Player_05       AM   
 
    minutes  distance_km  sprints  high_intensity_runs     xg     xa  \
 0       45         4.07        5                   15  0.063  0.109   
 1       75         6.39       21                   15  0.180  0.000   
 2       89         9.17        3                   25  0.000  0.118   
 3       27         1.71        2                    6  0.032  0.036   
 4       65         4.32        1                   10  0.000  0.137   
 
    passes_attempted  pass_accuracy  duels  duels_won  yellow  red  
 0                18 

In [2]:
# Recent training load: mean `training_load` over each player's 14 most recent rows.
# NOTE(review): this keeps the last 14 *rows* per player, not a 14-day calendar
# window; the two coincide only if there is exactly one row per player per day —
# confirm against the training data.
latest_sessions = train.sort_values("date").groupby("player_id").tail(14)
load_by_player = latest_sessions.groupby("player_id")["training_load"].mean()
recent_load = load_by_player.reset_index(name="recent_load")

# Both injury aggregates share the same per-player grouping.
injuries_by_player = inj.groupby("player_id")

# Total days out through injury, per player.
injury_days = injuries_by_player["days_out"].sum().reset_index(name="total_injury_days")

# Number of recorded injuries, per player.
injury_count = injuries_by_player.size().reset_index(name="injury_count")


In [3]:
# Baseline injury-risk score: a weighted blend of recent training load and
# injury history, with every component min-max scaled before weighting.

# Left joins keep every player present in `recent_load`; players with no rows in
# the injury aggregates come back with NaN in the injury columns.
risk = (
    recent_load
    .merge(injury_days, on="player_id", how="left")
    .merge(injury_count, on="player_id", how="left")
    # Zero-fill ONLY the injury columns ("no record" means zero injuries). A
    # blanket fillna(0) would also turn a missing recent_load into "zero load",
    # silently skewing the min-max scaling below; a missing load now stays NaN
    # and propagates to the score where it is visible.
    .fillna({"total_injury_days": 0, "injury_count": 0})
)

# Added to the denominator so the division stays defined when a column is
# constant (max == min); as a side effect the scaled maximum is just under 1.
NORM_EPS = 1e-6

# raw column -> min-max scaled column
NORM_COLUMNS = {
    "recent_load": "load_norm",
    "total_injury_days": "injury_days_norm",
    "injury_count": "injury_count_norm",
}
for raw_col, norm_col in NORM_COLUMNS.items():
    values = risk[raw_col]
    low, high = values.min(), values.max()
    risk[norm_col] = (values - low) / (high - low + NORM_EPS)

# Component weights of the baseline score (they sum to 1.0).
LOAD_WEIGHT = 0.5
INJURY_DAYS_WEIGHT = 0.3
INJURY_COUNT_WEIGHT = 0.2

risk["injury_risk_score"] = (
    LOAD_WEIGHT * risk["load_norm"]
    + INJURY_DAYS_WEIGHT * risk["injury_days_norm"]
    + INJURY_COUNT_WEIGHT * risk["injury_count_norm"]
)

# Ten highest-scoring players.
risk.sort_values("injury_risk_score", ascending=False).head(10)


Unnamed: 0,player_id,recent_load,total_injury_days,injury_count,load_norm,injury_days_norm,injury_count_norm,injury_risk_score
13,P14,469.385714,27.0,2.0,0.710853,0.490909,1.0,0.702699
8,P09,512.2,14.0,1.0,0.968493,0.254545,0.5,0.66061
16,P17,467.3,21.0,1.0,0.698302,0.381818,0.5,0.563696
11,P12,427.385714,22.0,2.0,0.458113,0.4,1.0,0.549056
10,P11,357.185714,55.0,2.0,0.035676,1.0,1.0,0.517838
15,P16,454.535714,19.0,1.0,0.621492,0.345455,0.5,0.514382
2,P03,440.507143,25.0,1.0,0.537073,0.454545,0.5,0.5049
5,P06,517.435714,0.0,0.0,1.0,0.0,0.0,0.5
12,P13,507.078571,0.0,0.0,0.937675,0.0,0.0,0.468837
4,P05,442.928571,11.0,1.0,0.551644,0.2,0.5,0.435822


In [4]:
# Bucket the score into three bands. pd.cut intervals are right-closed by
# default, so the bands are (-1, 0.33], (0.33, 0.66] and (0.66, 1.1]; the outer
# edges sit outside 0 and 1 so that scores of exactly 0 or 1 still get a label.
RISK_BIN_EDGES = [-1, 0.33, 0.66, 1.1]
RISK_LABELS = ["LOW", "MEDIUM", "HIGH"]

risk["risk_label"] = pd.cut(
    risk["injury_risk_score"],
    bins=RISK_BIN_EDGES,
    labels=RISK_LABELS,
)

# Show every player, highest score first, with only the columns of interest.
ranked = risk.sort_values("injury_risk_score", ascending=False)
ranked[["player_id", "injury_risk_score", "risk_label"]]


Unnamed: 0,player_id,injury_risk_score,risk_label
13,P14,0.702699,HIGH
8,P09,0.66061,HIGH
16,P17,0.563696,MEDIUM
11,P12,0.549056,MEDIUM
10,P11,0.517838,MEDIUM
15,P16,0.514382,MEDIUM
2,P03,0.5049,MEDIUM
5,P06,0.5,MEDIUM
12,P13,0.468837,MEDIUM
4,P05,0.435822,MEDIUM
