# 03_Performance_Optimization.ipynb — Training Load Optimization (LP)

**Objective:** Optimize next-week training loads per player to minimize **expected injury risk** while maintaining **performance readiness**.

**Method:** Prescriptive analytics using **Linear Programming (PuLP)** with three planning scenarios:
- **Balanced** — standard constraints
- **Recovery** — stricter capacity & penalties for high-risk players
- **HighPerformance** — looser capacity to push readiness

**Inputs (from previous notebooks):**
- `../reports/InjuryRiskScores.csv` — `PlayerID`, `PredictedRisk`, (optional: `Position`, `Age`)
- `../data/player_summary_for_model.csv` — `PlayerID`, `TeamID`, `Position`, `Age`, `InjuryProneScore`, `MinutesPerInjury`

**Outputs:**
- `../reports/optimized_training_plan.csv` — per-player load plan across scenarios
- `../reports/optimization_kpis.csv` — team-level totals per scenario
- Narrative text summary printed at the end of the notebook and saved to `../reports/optimization_summary.md`


In [None]:
# --- 1) Imports & Setup ---
import os, json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pulp

# Default figure size and grid for every matplotlib chart in this notebook.
plt.rcParams['figure.figsize'] = (9, 5)
plt.rcParams['axes.grid'] = True

# Resolve the data/report folders relative to the current working directory.
BASE = Path('.').resolve()
DATA_DIR = BASE.joinpath('..', 'data').resolve()
REPORTS_DIR = BASE.joinpath('..', 'reports').resolve()
REPORTS_DIR.mkdir(parents=True, exist_ok=True)  # later cells write their outputs here

# Input files read by the next cell.
risk_path = REPORTS_DIR.joinpath('InjuryRiskScores.csv')
summary_path = DATA_DIR.joinpath('player_summary_for_model.csv')

for label, path in (('risk_path:', risk_path), ('summary_path:', summary_path)):
    print(label, path)


## 2) Load Inputs

In [None]:
risk = pd.read_csv(risk_path)
summary = pd.read_csv(summary_path)

# Columns pulled from the summary file. Every one of them is selected in the
# merge below, so all of them (not only the identifiers) must be present.
summary_cols = ['PlayerID','TeamID','Position','Age','InjuryProneScore','MinutesPerInjury']

# Ensure required columns exist. Explicit raises are used instead of `assert`
# so the checks still run when Python is started with -O.
req_risk = {'PlayerID','PredictedRisk'}
req_sum = set(summary_cols)
missing_risk = req_risk - set(risk.columns)
if missing_risk:
    raise ValueError(f"Missing columns in risk file: {missing_risk}")
missing_sum = req_sum - set(summary.columns)
if missing_sum:
    raise ValueError(f"Missing columns in summary file: {missing_sum}")

# The risk file may itself carry some of the summary columns (e.g. Position,
# Age). Left as-is, the merge would emit `_x`/`_y` suffixed duplicates and the
# plain column names used below would not exist, so rename the risk-side
# copies up front.
overlap = [c for c in summary_cols if c != 'PlayerID' and c in risk.columns]
risk_side = risk.rename(columns={c: f'{c}_risk' for c in overlap})

# Merge: keep every scored player, attach summary attributes where available.
df = risk_side.merge(summary[summary_cols], on='PlayerID', how='left')

# Summary values take precedence; the risk file's copy only fills the gaps
# (e.g. for players that have no row in the summary file).
for c in overlap:
    df[c] = df[c].fillna(df[f'{c}_risk'])
df = df.drop(columns=[f'{c}_risk' for c in overlap])

# Fallbacks / cleaning
df['MinutesPerInjury'] = df['MinutesPerInjury'].fillna(3000.0)
df['InjuryProneScore'] = df['InjuryProneScore'].fillna(0.3)
df['PredictedRisk'] = df['PredictedRisk'].clip(0,1)
df['Position'] = df['Position'].fillna('MF')  # neutral default
df['Age'] = df['Age'].fillna(df['Age'].median())

print('Players loaded:', len(df))
df.head()


## 3) Define Optimization Parameters

In [None]:
# Per-position min/max LoadUnits (weekly microcycle proxy)
POS_MIN = {'GK': 30, 'DF': 60, 'MF': 70, 'FW': 60}
POS_MAX = {'GK':120, 'DF':180, 'MF':200, 'FW':180}

def player_caps(row):
    """Return the (minimum load, maximum load) bounds for one player.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with 'Position' and
            'Age', and optionally 'InjuryProneScore'.

    Returns:
        Tuple ``(base_min, cap)``. ``base_min`` is the position minimum
        (60 for positions not in POS_MIN). ``cap`` is the position maximum
        (180 for positions not in POS_MAX) reduced by 15% for players aged
        30 or more and by 40% of the injury-prone score, but never lower
        than ``base_min + 20``.
    """
    position = row['Position']
    base_min = POS_MIN.get(position, 60)
    base_max = POS_MAX.get(position, 180)

    # A missing (NaN/None) score would otherwise turn the cap into NaN, and
    # max(nan, x) returns nan, so the floor below would not rescue it.
    # Fall back to the same 0.3 default used when the key is absent.
    prone = row.get('InjuryProneScore', 0.3)
    if pd.isna(prone):
        prone = 0.3

    age_penalty = 0.15 if row['Age'] >= 30 else 0.0
    prone_penalty = 0.4 * float(prone)
    cap = base_max * (1 - age_penalty - prone_penalty)

    # Feasibility headroom: keep the upper bound at least 20 above the lower.
    return base_min, max(cap, base_min + 20)

# Per-player load bounds: column 0 is the minimum, column 1 the capped maximum.
minmax = df.apply(player_caps, axis=1, result_type='expand')
df['MinLoad'] = minmax[0].astype(float)
df['MaxLoad'] = minmax[1].astype(float)

# Risk coefficient — blend model risk with prone score & resilience proxy.
# A MinutesPerInjury of 0 is treated as missing (3000) to avoid dividing by zero.
minutes_per_injury = df['MinutesPerInjury'].replace(0, np.nan).fillna(3_000.0)
low_minutes_factor = 1.0 + 1_500.0 / minutes_per_injury
df['RiskCoeff'] = (
    df['PredictedRisk'] * (1 + 0.6 * df['InjuryProneScore']) * low_minutes_factor
).clip(lower=0.05)

# Team capacity baseline: 55% of the summed (MinLoad + MaxLoad) of the team's
# players, which equals mean bound width * headcount * 0.55. Computed as a
# grouped sum rather than groupby.apply over the whole frame, which emits a
# deprecation warning on recent pandas because it also passes the grouping column.
bound_total = df['MinLoad'] + df['MaxLoad']
TEAM_CAPACITY = (bound_total.groupby(df['TeamID']).sum() * 0.55).to_dict()

print('Sample caps & risk coeff:')
df[['PlayerID','TeamID','Position','Age','MinLoad','MaxLoad','PredictedRisk','InjuryProneScore','RiskCoeff']].head()


## 4) LP Optimizer (PuLP)

In [None]:
def optimize_plan(players_df, mode='Balanced'):
    """Solve the training-load LP for one planning scenario.

    Minimizes ``sum((RiskCoeff_i + adjustment_i) * load_i)`` where each load is
    bounded by the player's ``[MinLoad, MaxLoad]`` and each team's total load is
    capped at ``TEAM_CAPACITY[team] * capacity multiplier`` (1000.0 is used as
    the base capacity for teams missing from TEAM_CAPACITY).

    Modes:
      - Balanced: capacity x1.0, no coefficient adjustment.
      - Recovery: capacity x0.9; +0.5 on High-risk players, -0.25 on Low-risk.
      - HighPerformance: capacity x1.1; -0.2 on High-risk players, +0.1 on Low-risk.
      Any other value behaves like Balanced.

    Risk bands are taken from ``PredictedRisk``: Low [0, 0.3], Med (0.3, 0.6],
    High (0.6, 1.0].

    Args:
        players_df: DataFrame with columns TeamID, PredictedRisk, RiskCoeff,
            MinLoad and MaxLoad. It is not modified.
        mode: Scenario name (see above).

    Returns:
        A copy of ``players_df`` with a fresh 0..n-1 index plus ``LoadUnits``
        (optimal load) and ``ExpRiskContribution`` (LoadUnits * RiskCoeff,
        i.e. without the scenario adjustment).

    Raises:
        RuntimeError: If the solver status is anything other than 'Optimal'.
    """
    pdata = players_df.copy().reset_index(drop=True)
    prob = pulp.LpProblem(f"train_plan_{mode}", pulp.LpMinimize)

    # Decision variables: one continuous load per player, bounded individually.
    L = [
        pulp.LpVariable(f"L_{i}", lowBound=float(lo), upBound=float(hi))
        for i, (lo, hi) in enumerate(zip(pdata['MinLoad'], pdata['MaxLoad']))
    ]

    # Scenario modifiers
    cap_mult = 1.0
    penalty_highrisk = 0.0
    if mode == 'Recovery':
        cap_mult = 0.9
        penalty_highrisk = 0.5
    elif mode == 'HighPerformance':
        cap_mult = 1.1
        penalty_highrisk = -0.2

    # Objective
    risk_coeff = pdata['RiskCoeff'].to_numpy(dtype=float)
    # include_lowest=True so a PredictedRisk of exactly 0 lands in 'Low'
    # instead of falling outside every bin and silently skipping the
    # Low-risk adjustment.
    risk_cat = pd.cut(pdata['PredictedRisk'], bins=[0,0.3,0.6,1.0],
                      labels=['Low','Med','High'], include_lowest=True)
    extra = (np.where(risk_cat == 'High', penalty_highrisk, 0.0)
             + np.where(risk_cat == 'Low', -penalty_highrisk / 2, 0.0))
    # NOTE(review): nothing enforces a minimum total load, so every player
    # whose coefficient is positive is driven to MinLoad — confirm this is
    # the intended baseline behaviour.
    prob += pulp.lpSum(float(coeff) * var for coeff, var in zip(risk_coeff + extra, L))

    # Team capacities. groupby skips rows whose TeamID is NaN by default, so
    # such players get no team-level cap.
    for team_id, g in pdata.groupby('TeamID'):
        cap = TEAM_CAPACITY.get(team_id, 1000.0) * cap_mult
        prob += pulp.lpSum(L[i] for i in g.index) <= cap, f"TeamCap_{team_id}"

    # Solve
    prob.solve(pulp.PULP_CBC_CMD(msg=False))
    status = pulp.LpStatus[prob.status]
    if status != 'Optimal':
        raise RuntimeError("LP failed: " + status)

    pdata['LoadUnits'] = [var.value() for var in L]
    pdata['ExpRiskContribution'] = pdata['LoadUnits'] * pdata['RiskCoeff']
    return pdata


## 5) Run Optimization Scenarios

In [None]:
# Solve each scenario and keep both the per-player plan and per-team totals.
kpi_spec = {
    'TotalLoad': ('LoadUnits', 'sum'),
    'ExpectedRisk': ('ExpRiskContribution', 'sum'),
}
scenarios = {}
for mode in ('Balanced', 'Recovery', 'HighPerformance'):
    plan = optimize_plan(df, mode=mode)
    team_kpis = plan.groupby('TeamID').agg(**kpi_spec).reset_index()
    scenarios[mode] = {'solution': plan, 'kpis': team_kpis}

    # One-line recap summed over all teams.
    load_total = team_kpis['TotalLoad'].sum()
    risk_total = team_kpis['ExpectedRisk'].sum()
    print(f"{mode}: TotalLoad={load_total:.1f}, ExpectedRisk={risk_total:.2f}")


## 6) Visualize Scenario KPIs (matplotlib)

In [None]:
# Scenario names in insertion order, with each KPI summed over all teams.
labels = list(scenarios)
tot_load = [scenarios[name]['kpis']['TotalLoad'].sum() for name in labels]
tot_risk = [scenarios[name]['kpis']['ExpectedRisk'].sum() for name in labels]

# One bar chart per KPI total.
for values, title, ylabel in (
    (tot_load, 'Total Team Load by Scenario', 'Load Units'),
    (tot_risk, 'Total Expected Risk by Scenario', 'Risk-weighted Load'),
):
    plt.figure()
    plt.bar(labels, values)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.show()

# Team-wise comparison: one line per scenario, teams ordered by TeamID.
for metric in ('TotalLoad', 'ExpectedRisk'):
    plt.figure(figsize=(10, 5))
    for mode in labels:
        per_team = scenarios[mode]['kpis'].set_index('TeamID')[metric].sort_index()
        plt.plot(per_team.index, per_team.values, marker='o', label=mode)
    plt.title(f'{metric} by Team across Scenarios')
    plt.xlabel('TeamID')
    plt.ylabel(metric)
    plt.legend()
    plt.tight_layout()
    plt.show()


## 7) Save Plans & KPIs

In [None]:
# Per-player plan: stack every scenario's solution, tagged with its scenario name.
plan_columns = ['PlayerID','TeamID','Position','Age','PredictedRisk','RiskCoeff',
                'MinLoad','MaxLoad','LoadUnits','ExpRiskContribution']
out_all = [
    obj['solution'][plan_columns].assign(Scenario=mode)
    for mode, obj in scenarios.items()
]
final_out = pd.concat(out_all, ignore_index=True)

plan_path = REPORTS_DIR / 'optimized_training_plan.csv'
final_out.to_csv(plan_path, index=False)
print('Saved optimized plans to:', plan_path)

# Team-level KPIs: same stacking, one block of rows per scenario.
kpi_rows = [obj['kpis'].assign(Scenario=mode) for mode, obj in scenarios.items()]
kpi_df = pd.concat(kpi_rows, ignore_index=True)

kpi_path = REPORTS_DIR / 'optimization_kpis.csv'
kpi_df.to_csv(kpi_path, index=False)
print('Saved scenario KPIs to:', kpi_path)


## 8) Narrative Summary (auto-generated)

In [None]:
def summarize_scenarios(kpi_df):
    """Describe how Recovery and HighPerformance differ from Balanced.

    Args:
        kpi_df: DataFrame with columns Scenario, TotalLoad and ExpectedRisk
            (one row per team and scenario; rows are summed per scenario).

    Returns:
        Tuple ``(text, piv)``. ``text`` is a Markdown bullet list with one
        line per comparison scenario present in ``kpi_df``, giving the
        percentage change in total load and expected risk relative to
        Balanced. ``piv`` is the per-scenario totals table.

    Raises:
        ValueError: If ``kpi_df`` contains no 'Balanced' rows to act as baseline.
    """
    piv = kpi_df.groupby('Scenario').agg(TotalLoad=('TotalLoad','sum'),
                                         ExpectedRisk=('ExpectedRisk','sum')).reset_index()
    totals = piv.set_index('Scenario')
    if 'Balanced' not in totals.index:
        raise ValueError("kpi_df has no 'Balanced' rows to use as the baseline")
    base = totals.loc['Balanced']

    def pct_change(value, reference):
        # Guard the denominator so a zero baseline does not divide by zero.
        return (value - reference) / max(reference, 1e-6) * 100

    lines = []
    for scen in ['Recovery','HighPerformance']:
        if scen not in totals.index:
            # Scenario was not run; leave it out rather than failing the summary.
            continue
        row = totals.loc[scen]
        load_delta = pct_change(row['TotalLoad'], base['TotalLoad'])
        risk_delta = pct_change(row['ExpectedRisk'], base['ExpectedRisk'])
        # The '+' format flag prints an explicit sign for non-negative values.
        lines.append(f"- **{scen}** vs Balanced: Total Load {load_delta:+.1f}%, Expected Risk {risk_delta:+.1f}%.")
    return "\n".join(lines), piv

# Build the scenario comparison text and echo it in the notebook output.
summary_text, piv = summarize_scenarios(kpi_df)
print("Scenario deltas (vs Balanced):\n" + summary_text)

# Assemble the markdown report: heading, intro, comparison bullets, notes.
md = [
    "# Optimization Scenario Summary\n",
    "**Balanced** is the baseline weekly plan. Below compares other scenarios to it:\n\n",
    summary_text,
    "\n\n",
    "**Notes:**\n",
    "- *Recovery* enforces stricter team capacity and penalizes high-risk player loads.\n",
    "- *HighPerformance* allows more load to increase readiness, accepting slightly higher risk.\n",
]

# Save markdown summary
sum_path = REPORTS_DIR / 'optimization_summary.md'
sum_path.write_text("".join(md), encoding='utf-8')
print('Saved narrative summary to:', sum_path)


---

### Next
- (Optional) Connect these outputs to Power BI:
  - Use `optimized_training_plan.csv` for **per-player** plans and scenario slicers.
  - Use `optimization_kpis.csv` for **team-level** KPIs by scenario.
- Add business rules (e.g., minimum rest days post-injury, position-specific doubling days).

*This notebook intentionally uses **matplotlib only** for charts.*
