# Rifampicin Market Opportunity: Executive Summary
**German Hospital Market Analysis for PJI Treatment**

Compact analysis notebook optimized for 6 content slides + backup slides.

---

In [None]:
# Cell 1: Technical Setup
import sqlite3
from pathlib import Path
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import HTML, display
import warnings
warnings.filterwarnings('ignore')

# PNG export for PPT
def export_fig_png(fig, filename, width=1200, height=600):
    """Write ``fig`` to ``filename`` as a PNG for use in PowerPoint.

    The image is rendered at the requested pixel ``width`` and ``height``
    with a fixed scale factor of 2; a confirmation line is printed afterwards.
    """
    render_options = {'width': width, 'height': height, 'scale': 2}
    fig.write_image(filename, **render_options)
    print(f"Exported: {filename}")

# Display helper
def show_fig(fig):
    """Render a Plotly figure inline as an HTML fragment.

    The figure is converted with ``fig.to_html`` (plotly.js referenced from
    the CDN, no full HTML page) and shown through ``IPython.display.HTML``
    to avoid nbformat issues.
    """
    html_fragment = fig.to_html(include_plotlyjs='cdn', full_html=False)
    display(HTML(html_fragment))

# Display settings: show every column, cap printed rows at 100,
# and format floats with thousands separators and one decimal place.
for _option_name, _option_value in {
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.float_format': '{:,.1f}'.format,
}.items():
    pd.set_option(_option_name, _option_value)

# Database connection: the SQLite file is expected in the working directory
DB_FILE = Path.cwd().joinpath("all_data_2011-2023.db")

def get_db_connection() -> sqlite3.Connection:
    """Open and return a new SQLite connection to ``DB_FILE``.

    The connection is created with ``check_same_thread=False``. Closing it
    is left to the caller.
    """
    connection = sqlite3.connect(DB_FILE, check_same_thread=False)
    return connection

def run_sql(stmt: str) -> pd.DataFrame:
    """Execute a SQL statement against the analysis database.

    Args:
        stmt: Complete SQL text to run.

    Returns:
        The query result as a DataFrame (one row per result row).
    """
    from contextlib import closing

    # A sqlite3 connection used directly as a context manager only commits or
    # rolls back the transaction; it does NOT close the connection. Wrapping
    # it in closing() releases the file handle after every query.
    with closing(get_db_connection()) as con:
        return pd.read_sql_query(stmt, con=con)

# "OK" only means the database file exists; no query is attempted here.
db_status = 'OK' if DB_FILE.exists() else 'FAILED'
for status_line in (
    f"Database: {DB_FILE}",
    f"Connection: {db_status}",
    "Setup complete.",
):
    print(status_line)

In [None]:
# Cell 2: Configuration Parameters (CORRECTED per plan)

# --- OPS procedure code prefixes (hip / knee endoprosthesis) ----------------
OPS_HIP_PRIMARY, OPS_HIP_REVISION = ['5-820'], ['5-821']      # hip: primary / revision
OPS_KNEE_PRIMARY, OPS_KNEE_REVISION = ['5-822'], ['5-823']    # knee: primary / revision

# --- Department codes (corrected: 1513, 1518, 1519, 1520 removed as irrelevant)
DEPT_SURGERY = ['1500', '1516', '1523']              # Allgemeine/Unfall/Ortho-Chirurgie
DEPT_ORTHOPEDICS = ['2300', '2309', '2315', '2316']  # all Orthopaedie codes
DEPT_RELEVANT = [*DEPT_SURGERY, *DEPT_ORTHOPEDICS]

# --- ICD surrogate: primary osteoarthritis hip/knee, simplified (no exclusions per briefing)
ICD_SURROGATE = ['M16.0', 'M16.1', 'M17.0', 'M17.1']

# --- Infection rate assumptions from the briefing (1-2%): conservative / mid / upper
INFECTION_RATE_LOW, INFECTION_RATE_MID, INFECTION_RATE_HIGH = 0.01, 0.015, 0.02

# --- Scoring weights (updated: no OAU/gap component)
WEIGHT_VOLUME = 0.60  # previously 0.40
WEIGHT_DEPT = 0.25    # previously 0.15
WEIGHT_TYPE = 0.15    # unchanged

# --- Reporting years used for the trend (first, last)
YEARS = (2017, 2023)

# --- Output directory for PNG/CSV exports; created on first run
OUTPUT_DIR = Path.cwd() / "ppt_exports"
OUTPUT_DIR.mkdir(exist_ok=True)

config_summary = [
    "Configuration loaded (CORRECTED methodology):",
    f"  - Department codes: {DEPT_RELEVANT}",
    f"  - ICD surrogate: {ICD_SURROGATE} (simple sum, no exclusions)",
    f"  - Scoring weights: Volume={WEIGHT_VOLUME}, Dept={WEIGHT_DEPT}, Type={WEIGHT_TYPE}",
    f"  - Data years: {YEARS[0]}-{YEARS[1]}",
]
for summary_line in config_summary:
    print(summary_line)

In [None]:
# Cell 3: Data Loading - National & Hospital Level

REPORT_YEAR = 2023  # reporting year for the national and hospital-level snapshots

# Output column -> OPS code prefixes. Taken from the configuration cell so the
# SQL below and the configured codes cannot drift apart.
OPS_COLUMN_CODES = {
    'hip_primary': OPS_HIP_PRIMARY,
    'hip_revision': OPS_HIP_REVISION,
    'knee_primary': OPS_KNEE_PRIMARY,
    'knee_revision': OPS_KNEE_REVISION,
}


def _ops_like(codes):
    """Return an OR-joined SQL condition matching any of the given OPS prefixes."""
    return " OR ".join(f"OPS_301_Category LIKE '{code}%'" for code in codes)


# SQL fragments shared by all three procedure queries (previously copy-pasted).
OPS_SUM_SQL = ",\n    ".join(
    f"SUM(CASE WHEN {_ops_like(codes)} THEN Anzahl ELSE 0 END) AS {column}"
    for column, codes in OPS_COLUMN_CODES.items()
)
OPS_FILTER_SQL = "(" + " OR ".join(_ops_like(codes) for codes in OPS_COLUMN_CODES.values()) + ")"


def add_procedure_totals(df, total_col='total'):
    """Return ``df`` with ``total_primary``, ``total_revision`` and a grand total added.

    Args:
        df: Frame holding hip_primary, hip_revision, knee_primary, knee_revision.
        total_col: Name of the grand-total column (primary + revision).
    """
    with_totals = df.assign(
        total_primary=df['hip_primary'] + df['knee_primary'],
        total_revision=df['hip_revision'] + df['knee_revision'],
    )
    with_totals[total_col] = with_totals['total_primary'] + with_totals['total_revision']
    return with_totals


def load_yearly_ops(first_year, last_year):
    """Load national procedure counts per reporting year (inclusive year range)."""
    yearly = run_sql(f"""
SELECT 
    Berichtsjahr,
    {OPS_SUM_SQL},
    COUNT(DISTINCT IK) AS hospital_count
FROM VIEW_Krankenhaus_Prozedur
WHERE Berichtsjahr BETWEEN {first_year} AND {last_year}
  AND {OPS_FILTER_SQL}
GROUP BY Berichtsjahr
ORDER BY Berichtsjahr
""")
    return add_procedure_totals(yearly)


# 2023 National totals
ops_2023_national = load_yearly_ops(REPORT_YEAR, REPORT_YEAR)
if ops_2023_national.empty:
    # Fail here with a clear message instead of an IndexError on `.values[0]` later.
    raise ValueError(f"No joint replacement procedures found for reporting year {REPORT_YEAR}")

# Multi-year trend (2017-2023)
trend_df = load_yearly_ops(*YEARS)

# Hospital-level data (2023)
hospital_ops_df = add_procedure_totals(
    run_sql(f"""
SELECT 
    IK,
    Name,
    MIN(Ort) AS Ort,
    MIN(Postleitzahl) AS Postleitzahl,
    MIN(geo_Bundesland) AS Bundesland,
    AVG(geo_Lat) AS Latitude,
    AVG(geo_Lon) AS Longitude,
    {OPS_SUM_SQL}
FROM VIEW_Krankenhaus_Prozedur
WHERE Berichtsjahr = {REPORT_YEAR}
  AND {OPS_FILTER_SQL}
GROUP BY IK, Name
HAVING (hip_primary + hip_revision + knee_primary + knee_revision) > 0
"""),
    total_col='total_procedures',
)

# Department validation (CORRECTED codes): one row per IK that reports at least
# one department whose 4-character key prefix is in DEPT_RELEVANT.
dept_codes_str = ", ".join([f"'{c}'" for c in DEPT_RELEVANT])
hospital_depts_df = run_sql(f"""
SELECT DISTINCT
    vkf.IK,
    1 AS has_relevant_dept
FROM VIEW_Krankenhaus_Fachabteilung vkf
JOIN REL_Organisationseinheit_Fachabteilung_Fachabteilungsschluessel rof
    ON rof.Organisationseinheit_Fachabteilung_ID = vkf.ID_OE
JOIN Fachabteilungsschluessel fs
    ON fs.ID = rof.Fachabteilungsschluessel_ID
WHERE vkf.Berichtsjahr = {REPORT_YEAR}
  AND SUBSTR(fs.FA_Schluessel, 1, 4) IN ({dept_codes_str})
""")

print(f"Data loaded:")
print(f"  - 2023 National: {ops_2023_national['total'].values[0]:,} procedures")
print(f"  - Trend data: {len(trend_df)} years ({YEARS[0]}-{YEARS[1]})")
print(f"  - Hospitals: {len(hospital_ops_df)} with joint replacement procedures")
print(f"  - Hospitals with relevant depts: {len(hospital_depts_df)}")

---
# SLIDE 1: Executive Summary

In [None]:
# Slide 1: Executive Summary - Key Metrics

# Headline figures from the single-row 2023 national frame
primary_implants, revisions, hospital_count = (
    ops_2023_national[column].iloc[0]
    for column in ('total_primary', 'total_revision', 'hospital_count')
)

# EII calculation (based on primary implants only), truncated to whole cases
eii_low, eii_mid, eii_high = (
    int(primary_implants * rate)
    for rate in (INFECTION_RATE_LOW, INFECTION_RATE_MID, INFECTION_RATE_HIGH)
)

# COVID recovery calculation: 2019 total minus 2023 total.
# NOTE(review): the text below frames this as a backlog, which presumes the
# 2023 volume is below 2019 - confirm against the printed numbers.
pre_covid_2019 = trend_df.loc[trend_df['Berichtsjahr'] == 2019, 'total'].iloc[0]
current_2023 = trend_df.loc[trend_df['Berichtsjahr'] == 2023, 'total'].iloc[0]
backlog = pre_covid_2019 - current_2023

banner = "=" * 70
section_rule = '─' * 50
summary_text = f"""
KEY METRICS (2023)
{section_rule}
  Total Joint Replacements:     {primary_implants + revisions:>10,}
  - Primary Implants (TEP):     {primary_implants:>10,}
  - Revision Procedures:        {revisions:>10,}
  Hospitals Performing:         {hospital_count:>10,}

EXPECTED INFECTION INDEX (EII)
{section_rule}
  @ 1.0% infection rate:        {eii_low:>10,} cases/year
  @ 1.5% infection rate:        {eii_mid:>10,} cases/year
  @ 2.0% infection rate:        {eii_high:>10,} cases/year

COVID RECOVERY OPPORTUNITY
{section_rule}
  Pre-COVID volume (2019):      {pre_covid_2019:>10,}
  Current volume (2023):        {current_2023:>10,}
  Procedure backlog:            {backlog:>10,} (~{backlog/pre_covid_2019*100:.0f}% gap)

STRATEGIC INSIGHT: COVID recovery incomplete represents ~{backlog:,} procedure
backlog = untapped market opportunity for Rifampicin in PJI treatment.
"""

print(banner)
print("EXECUTIVE SUMMARY: Rifampicin Market Opportunity in Germany")
print(banner)
print(summary_text)

---
# SLIDE 2: Market Overview

In [None]:
# Slide 2: Market Overview - COVID Recovery Framing

# Year-over-year change of the total volume (percent and absolute)
yearly_totals = trend_df['total']
trend_df['yoy_change'] = yearly_totals.pct_change() * 100
trend_df['yoy_abs'] = yearly_totals.diff()

report_years = trend_df['Berichtsjahr']

# Two stacked panels: total volume on top, primary vs revision below
fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=(
        'Joint Replacement Procedures: COVID Impact & Recovery',
        'Primary Implants vs Revisions',
    ),
    vertical_spacing=0.15,
    row_heights=[0.55, 0.45],
)

# Reference values read from the trend frame
pre_covid_level = trend_df.loc[report_years == 2019, 'total'].iloc[0]
covid_year_total = trend_df.loc[report_years == 2020, 'total'].iloc[0]

# Panel 1: total volume, each point labelled with its value
total_trace = go.Scatter(
    x=report_years,
    y=trend_df['total'],
    mode='lines+markers+text',
    name='Total Procedures',
    line=dict(color='#2E86AB', width=3),
    marker=dict(size=10),
    text=trend_df['total'].map(lambda value: f'{value:,.0f}'),
    textposition='top center',
)
fig.add_trace(total_trace, row=1, col=1)

# Dashed horizontal line at the 2019 level
fig.add_hline(
    y=pre_covid_level,
    line_dash="dash",
    line_color="gray",
    annotation_text=f"Pre-COVID (2019): {pre_covid_level:,}",
    row=1,
    col=1,
)

# Arrow annotations: the 2020 dip and the remaining gap in 2023
panel1_notes = (
    dict(x=2020, y=covid_year_total, text="COVID Impact",
         ax=40, ay=-40, font=dict(color='red')),
    dict(x=2023, y=current_2023, text=f"Recovery Gap: ~{backlog:,}",
         ax=0, ay=40, font=dict(color='orange')),
)
for note in panel1_notes:
    fig.add_annotation(showarrow=True, arrowhead=2, row=1, col=1, **note)

# Panel 2: primary implants vs revisions
panel2_series = (
    ('total_primary', 'Primary Implants', '#2E86AB'),
    ('total_revision', 'Revisions', '#E94F37'),
)
for column, label, color in panel2_series:
    fig.add_trace(
        go.Scatter(
            x=report_years,
            y=trend_df[column],
            mode='lines+markers',
            name=label,
            line=dict(color=color, width=2),
        ),
        row=2,
        col=1,
    )

fig.update_layout(
    height=650,
    showlegend=True,
    legend=dict(orientation='h', yanchor='bottom', y=1.02),
)
fig.update_xaxes(title_text='Year', dtick=1)
for panel_row in (1, 2):
    fig.update_yaxes(title_text='Procedures', row=panel_row, col=1)

show_fig(fig)
export_fig_png(fig, OUTPUT_DIR / 'slide2_market_overview.png', width=1200, height=650)

print("\nKEY INSIGHT: Market shows incomplete COVID recovery.")
print(f"Backlog of ~{backlog:,} procedures = opportunity for Rifampicin positioning.")

---
# SLIDE 3: TAM Funnel

In [None]:
# Slide 3: TAM Funnel Visualization

# Funnel stages as (label, value, colour), widest stage first
funnel_stages = [
    ('Primary Implants (TEP)', primary_implants, '#2E86AB'),
    ('EII @ 2.0%', eii_high, '#E94F37'),
    ('EII @ 1.5%', eii_mid, '#F39C12'),
    ('EII @ 1.0%', eii_low, '#27AE60'),
]
stage_labels, stage_values, stage_colors = (list(column) for column in zip(*funnel_stages))

funnel_trace = go.Funnel(
    y=stage_labels,
    x=stage_values,
    textinfo='value+percent initial',
    texttemplate='%{value:,.0f}<br>(%{percentInitial:.1%})',
    marker=dict(color=stage_colors),
    connector=dict(line=dict(color='lightgray', width=2)),
)

fig = go.Figure()
fig.add_trace(funnel_trace)

# Footnote box in the lower-right corner stating the EII basis
basis_note = dict(
    x=0.95, y=0.05,
    xref='paper', yref='paper',
    text=f'EII based on {primary_implants:,} primary implants<br>Infection rate: 1-2% (literature)',
    showarrow=False,
    font=dict(size=11),
    align='right',
    bgcolor='lightyellow',
    borderpad=4,
)
chart_title = dict(
    text='Total Addressable Market: From Implants to Expected Infections',
    font=dict(size=18),
)
fig.update_layout(title=chart_title, height=450, annotations=[basis_note])

show_fig(fig)
export_fig_png(fig, OUTPUT_DIR / 'slide3_tam_funnel.png', width=1000, height=450)

print("\nTAM SUMMARY:")
print(f"  Primary Implants (2023):  {primary_implants:,}")
print(f"  EII Range:                {eii_low:,} - {eii_high:,} cases/year")
print(f"  Midpoint estimate:        {eii_mid:,} potential Rifampicin patients")

---
# SLIDE 4: Hospital Target List (Top 20)

In [None]:
# Slide 4: Hospital Ranking (CORRECTED scoring - no OAU component)

# Attach the relevant-department flag to every hospital (left join on IK);
# hospitals without a matching department row get 0.
hospital_df = pd.merge(hospital_ops_df, hospital_depts_df, how='left', on='IK')
hospital_df = hospital_df.assign(
    has_relevant_dept=hospital_df['has_relevant_dept'].fillna(0).astype(int)
)

# Hospital type classification
def classify_hospital_type(ik):
    """Derive a coarse hospital-type label from the IK number.

    Returns 'Sonstiges' when the IK does not start with '26'. Otherwise
    characters 3-4 are read as a two-digit code ('00' when the IK has fewer
    than four characters): codes '01'-'05' give 'Universitaetsklinik', any
    other code gives 'Krankenhaus'.

    NOTE(review): this is only an approximation - confirm that the
    '01'-'05' rule really identifies university hospitals before relying
    on the label.
    """
    digits = str(ik)
    if not digits.startswith('26'):
        return 'Sonstiges'

    region_code = digits[2:4] if len(digits) >= 4 else '00'
    if region_code in ('01', '02', '03', '04', '05'):
        return 'Universitaetsklinik'
    return 'Krankenhaus'

hospital_df['hospital_type'] = hospital_df['IK'].map(classify_hospital_type)
# "Vollversorger" is used here purely as a high-volume flag (>= 500 procedures)
hospital_df['is_vollversorger'] = hospital_df['total_procedures'].ge(500)

# CORRECTED Scoring (no OAU/gap component)
peak_volume = hospital_df['total_procedures'].max()
hospital_df['volume_score'] = hospital_df['total_procedures'].div(peak_volume)  # 1.0 = largest hospital
hospital_df['dept_score'] = hospital_df['has_relevant_dept']
hospital_df['type_score'] = hospital_df['is_vollversorger'].astype(int)

score_weights = (
    ('volume_score', WEIGHT_VOLUME),
    ('dept_score', WEIGHT_DEPT),
    ('type_score', WEIGHT_TYPE),
)
hospital_df['opportunity_score'] = sum(
    weight * hospital_df[component] for component, weight in score_weights
)

# EII per hospital (based on primary implants only)
eii_scenarios = (
    ('EII_low', INFECTION_RATE_LOW),
    ('EII_mid', INFECTION_RATE_MID),
    ('EII_high', INFECTION_RATE_HIGH),
)
for eii_column, infection_rate in eii_scenarios:
    hospital_df[eii_column] = hospital_df['total_primary'] * infection_rate

# Rank hospitals: best score first, rank starts at 1
hospital_ranked = hospital_df.sort_values(by='opportunity_score', ascending=False).reset_index(drop=True)
hospital_ranked['rank'] = hospital_ranked.index + 1

# Top 20 display
top20 = hospital_ranked.iloc[:20]

header_rule = "=" * 80
print(header_rule)
print("TOP 20 TARGET HOSPITALS (Corrected Scoring)")
print(header_rule)
print(f"Scoring: Volume={WEIGHT_VOLUME:.0%}, Dept={WEIGHT_DEPT:.0%}, Type={WEIGHT_TYPE:.0%}")
print("(No OAU/gap component - data too sparse)\n")

display_cols = ['rank', 'Name', 'Bundesland', 'total_procedures', 'EII_mid', 'opportunity_score']
print(top20[display_cols].to_string(index=False))

# Export for PPT
top20_csv_path = OUTPUT_DIR / 'top20_hospitals.csv'
export_cols = ['rank', 'Name', 'Bundesland', 'Ort', 'total_procedures', 'total_primary',
               'total_revision', 'EII_mid', 'opportunity_score']
top20[export_cols].to_csv(top20_csv_path, index=False)
print(f"\nExported: {top20_csv_path}")

---
# SLIDE 5: Regional Analysis

In [None]:
# Slide 5: Regional Analysis by Bundesland

# One row per Bundesland: summed volumes and EII, hospital count, mean score
regional_df = (
    hospital_df
    .groupby('Bundesland', as_index=False)
    .agg(
        total_procedures=('total_procedures', 'sum'),
        total_primary=('total_primary', 'sum'),
        total_revision=('total_revision', 'sum'),
        EII_mid=('EII_mid', 'sum'),
        hospital_count=('IK', 'count'),
        opportunity_score=('opportunity_score', 'mean'),
    )
)
regional_df['procedures_per_hospital'] = regional_df['total_procedures'].div(regional_df['hospital_count'])
regional_df = regional_df.sort_values(by='total_procedures', ascending=False)

# Bar chart: bar height = procedures, bar colour = expected infections at 1.5%
axis_labels = {'total_procedures': 'Total Procedures', 'EII_mid': 'Expected Infections (1.5%)'}
fig = px.bar(
    regional_df,
    x='Bundesland',
    y='total_procedures',
    color='EII_mid',
    title='Joint Replacement Procedures by Bundesland (2023)',
    labels=axis_labels,
    color_continuous_scale='Reds',
    text='total_procedures',
)
fig.update_traces(texttemplate='%{text:,.0f}', textposition='outside')
fig.update_layout(
    xaxis_tickangle=-45,
    height=500,
    yaxis_title='Total Procedures',
    coloraxis_colorbar_title='EII',
)

show_fig(fig)
export_fig_png(fig, OUTPUT_DIR / 'slide5_regional_analysis.png', width=1200, height=500)

# Regional summary table
summary_cols = ['Bundesland', 'total_procedures', 'hospital_count', 'EII_mid']
print("\nREGIONAL SUMMARY (Top 5 Bundeslaender):")
print(regional_df[summary_cols].head(5).to_string(index=False))

In [None]:
# Slide 5b: Hospital Map

# Only hospitals with a latitude can be plotted
map_df = hospital_ranked.loc[hospital_ranked['Latitude'].notna()].copy()

if map_df.empty:
    print("No geographic coordinates available for map visualization.")
else:
    # Marker size = procedure volume, marker colour = opportunity score
    germany_center = {'lat': 51.1657, 'lon': 10.4515}
    fig = px.scatter_mapbox(
        map_df,
        lat='Latitude',
        lon='Longitude',
        size='total_procedures',
        color='opportunity_score',
        hover_name='Name',
        hover_data=['Bundesland', 'total_procedures', 'EII_mid'],
        color_continuous_scale='RdYlGn_r',
        size_max=25,
        zoom=5,
        center=germany_center,
        title='Hospital Opportunity Map - Germany',
    )
    fig.update_layout(
        mapbox_style='carto-positron',
        height=600,
        coloraxis_colorbar_title='Score',
    )
    show_fig(fig)
    export_fig_png(fig, OUTPUT_DIR / 'slide5_hospital_map.png', width=1000, height=600)

---
# SLIDE 6: Recommendations

In [None]:
# Slide 6: Recommendations & Next Steps

# Tiering: both tiers require a relevant department.
#   Tier 1 = at least 500 procedures, Tier 2 = 200 up to (but excluding) 500.
procedure_volume = hospital_ranked['total_procedures']
has_dept = hospital_ranked['has_relevant_dept'].eq(1)

tier1_hospitals = hospital_ranked[procedure_volume.ge(500) & has_dept]
tier2_hospitals = hospital_ranked[procedure_volume.ge(200) & procedure_volume.lt(500) & has_dept]

# regional_df is sorted by volume, so head(3) yields the three largest regions
top3_regions = ', '.join(regional_df['Bundesland'].head(3).tolist())
top3_share = regional_df['total_procedures'].head(3).sum() / regional_df['total_procedures'].sum() * 100

header_rule = "=" * 80
section_rule = '─' * 60

print(header_rule)
print("RECOMMENDATIONS & NEXT STEPS")
print(header_rule)

recommendations_text = f"""
MARKET OPPORTUNITY
{section_rule}
  - {eii_mid:,} expected PJI cases/year (@ 1.5% infection rate)
  - ~{backlog:,} procedure backlog from COVID = growth opportunity
  - {len(tier1_hospitals)} high-volume target hospitals (>500 procedures)

PRIORITY ACTIONS
{section_rule}
1. TIER 1 TARGETS ({len(tier1_hospitals)} hospitals)
   - High-volume centers with relevant departments
   - Focus on Orthopaedie and Unfallchirurgie departments
   - Key regions: {top3_regions}

2. TIER 2 TARGETS ({len(tier2_hospitals)} hospitals)
   - Medium-volume centers (200-500 procedures)
   - Regional coverage expansion

3. REGIONAL FOCUS
   - Top 3 Bundeslaender account for {top3_share:.0f}% of procedures
   - Consider regional medical education events

VALIDATION NEEDED
{section_rule}
  - Cross-reference with IQVIA/Insight Health for actual Rx data
  - Hospital pharmacy consultations for consumption patterns
  - KOL identification in high-volume centers

DATA LIMITATIONS (see backup slides)
{section_rule}
  - No DRG I44/I47 codes available (gold standard)
  - No actual antibiotic consumption data
  - OAU proxy too sparse for scoring (excluded)
"""
print(recommendations_text)

---
# BACKUP SLIDES

In [None]:
# Backup 1: Data Limitations

# One record per briefing requirement:
# (requirement, availability, proxy used in this notebook, potential data source)
limitation_columns = ['Briefing Requirement', 'Available', 'Proxy Used', 'Potential Data Source']
limitation_records = [
    ('DRG codes I44/I47', 'No', 'OPS 5-820 to 5-823', 'InEK DRG database'),
    ('Patient age filter (>=50/55)', 'No', 'None (all ages included)', 'Destatis (Genesis 23141)'),
    ('Actual antibiotic consumption (DDD)', 'No', 'None', 'IQVIA / Insight Health'),
    ('Rifampicin therapy days', 'No', 'None', 'Hospital pharmacy data'),
    ('Outpatient follow-up', 'No', 'None (inpatient only)', 'AOK/Barmer claims'),
    ('Case-level infection linkage', 'No', 'EII estimation (1-2%)', 'Hospital infection registries'),
    ('Hospital case mix adjustment', 'Partial', 'Volume as proxy', 'Case mix index from InEK'),
]

# Pivot the records into the column-oriented mapping used to build the frame
limitations_data = {
    column: [record[position] for record in limitation_records]
    for position, column in enumerate(limitation_columns)
}

limitations_df = pd.DataFrame(limitations_data)

wide_rule = "=" * 90
print(wide_rule)
print("BACKUP SLIDE B1: Data Limitations")
print(wide_rule)
print("\nBriefing Requirements vs. Available Data:\n")
print(limitations_df.to_string(index=False))

In [None]:
# Backup 2: ICD vs OPS Comparison (Order of Magnitude)

# Quote each surrogate ICD code for the SQL IN list (simple sum, no exclusions per plan)
icd_codes_str = ", ".join(f"'{code}'" for code in ICD_SURROGATE)
icd_2023 = run_sql(f"""
SELECT 
    Berichtsjahr,
    SUM(Fallzahl) AS total_cases
FROM VIEW_Krankenhaus_Hauptdiagnosen
WHERE ICD_10 IN ({icd_codes_str})
  AND Berichtsjahr = 2023
GROUP BY Berichtsjahr
""")

# Fall back to 0 cases when the query returns no row
if len(icd_2023) > 0:
    icd_total = icd_2023['total_cases'].iloc[0]
else:
    icd_total = 0
ops_total = ops_2023_national['total'].iloc[0]

banner = "=" * 70
section_rule = '─' * 50
comparison_text = f"""
ORDER OF MAGNITUDE CHECK
{section_rule}
  ICD-10 Surrogate (M16.0/1, M17.0/1):  {icd_total:>12,}
  OPS Procedures (5-820 to 5-823):      {ops_total:>12,}
  
  Ratio (ICD/OPS):                      {icd_total/ops_total:>12.2f}x

INTERPRETATION
{section_rule}
  ICD codes capture primary osteoarthritis diagnoses.
  OPS codes capture actual joint replacement procedures.
  
  Ratio > 1: More diagnoses than procedures
  (Not all diagnosed patients receive surgery)
  
  NOTE: Simple sum, no exclusions applied per briefing.
"""

print(banner)
print("BACKUP SLIDE B2: ICD vs OPS Comparison (2023)")
print(banner)
print(comparison_text)

In [None]:
# Backup 3: OAU Proxy Explanation & Sparsity Warning

# Keywords searched (case-insensitively) in the name and explanation of each
# listed medical service offering.
OAU_KEYWORDS = ['rifamp', 'biofilm', 'periprothetisch', 'antibiot']
keyword_clause = " OR ".join([
    f"LOWER(COALESCE(mla.Bezeichnung, '')) LIKE '%{kw}%'"
    f" OR LOWER(COALESCE(mla.Erlaeuterungen, '')) LIKE '%{kw}%'"
    for kw in OAU_KEYWORDS
])

# All hospitals (any specialty) with at least one keyword match in 2023
oau_all_df = run_sql(f"""
SELECT
    v.IK,
    v.Name,
    COUNT(DISTINCT mla.ID) AS antibiotic_mention_count
FROM VIEW_Krankenhaus_GEO v
JOIN REL_Qualitaetsbericht_Organisationseinheit_Fachabteilung rqo
  ON rqo.Qualitaetsbericht_ID = v.Qualitaetsbericht_ID
JOIN REL_Organisationseinheit_Fachabteilung_Medizinisches_Leistungsangebot rom
  ON rom.Organisationseinheit_Fachabteilung_ID = rqo.Organisationseinheit_Fachabteilung_ID
JOIN Medizinisches_Leistungsangebot mla
  ON mla.ID = rom.Medizinisches_Leistungsangebot_ID
WHERE v.Berichtsjahr = 2023
  AND ({keyword_clause})
GROUP BY v.IK, v.Name
""")

# The share below is reported relative to joint replacement hospitals, so the
# numerator must come from the same population. IKs are compared as strings so
# that a dtype difference between the two views cannot hide matches.
joint_ik = hospital_ops_df['IK'].astype(str)
oau_ik = oau_all_df['IK'].astype(str)
oau_df = oau_all_df[oau_ik.isin(set(joint_ik))].reset_index(drop=True)

total_hospitals = len(hospital_ops_df)
oau_hospitals = int(joint_ik.isin(set(oau_ik)).sum())
total_mentions = int(oau_df['antibiotic_mention_count'].sum()) if len(oau_df) > 0 else 0
# Guard the percentage against an empty hospital list
oau_share_pct = oau_hospitals / total_hospitals * 100 if total_hospitals else 0.0

print("=" * 70)
print("BACKUP SLIDE B3: OAU Proxy - Data Quality Warning")
print("=" * 70)
print(f"""
OAU PROXY METHODOLOGY
{'─' * 50}
  Source: Medizinisches_Leistungsangebot (Bezeichnung / Erlaeuterungen fields)
  Keywords searched: {OAU_KEYWORDS}
  
SPARSITY WARNING
{'─' * 50}
  Total hospitals with joint replacement:  {total_hospitals:,}
  Hospitals with ANY keyword match:        {oau_hospitals:,} ({oau_share_pct:.1f}%)
  Total keyword mentions found:            {total_mentions:,}
  
DECISION: EXCLUDED FROM SCORING
{'─' * 50}
  - Only ~{oau_share_pct:.0f}% of hospitals have mentions
  - Too sparse for meaningful differentiation
  - Would bias scoring toward false negatives
  - Recommend: Direct hospital surveys for validation
""")

if len(oau_df) > 0:
    print("\nTop 10 hospitals by keyword mentions:")
    print(oau_df.sort_values('antibiotic_mention_count', ascending=False).head(10).to_string(index=False))

In [None]:
# Backup 4: Methodology Notes

banner = "=" * 70
section_rule = '─' * 50

# Static methodology text; only the department list and the scoring weights
# are filled in from the configuration cell.
methodology_text = f"""
DATA SOURCE
{section_rule}
  G-BA Strukturierte Qualitaetsberichte (2017-2023)
  Mandatory hospital quality reports in Germany
  
OPS PROCEDURE CODES
{section_rule}
  5-820: Hip endoprosthesis, primary
  5-821: Hip endoprosthesis, revision
  5-822: Knee endoprosthesis, primary
  5-823: Knee endoprosthesis, revision
  
ICD-10 SURROGATE (validation only)
{section_rule}
  M16.0: Primary coxarthrosis, bilateral
  M16.1: Primary coxarthrosis, unilateral
  M17.0: Primary gonarthrosis, bilateral
  M17.1: Primary gonarthrosis, unilateral
  NOTE: Simple sum, no exclusions applied
  
DEPARTMENT CODES (corrected)
{section_rule}
  Included: {DEPT_RELEVANT}
  - Surgery: 1500, 1516, 1523 (Allgemein/Unfall/Ortho)
  - Orthopaedie: 2300, 2309, 2315, 2316
  
  Excluded (not relevant for TEP):
  - 1513: Kinderchirurgie
  - 1518: Gefaesschirurgie
  - 1519: Plastische Chirurgie
  - 1520: Thoraxchirurgie
  
EXPECTED INFECTION INDEX (EII)
{section_rule}
  EII = Primary Implants x Infection Rate
  Infection rate: 1-2% (from literature)
  NOTE: EII based on PRIMARY implants only (not revisions)
  
HOSPITAL SCORING (corrected)
{section_rule}
  Score = {WEIGHT_VOLUME:.0%} x Volume + {WEIGHT_DEPT:.0%} x Dept + {WEIGHT_TYPE:.0%} x Type
  - Volume: Normalized procedure count
  - Dept: Has relevant department (binary)
  - Type: High-volume proxy (>=500 procedures)
  
  NOTE: OAU/gap component REMOVED due to data sparsity
"""

for output_line in (banner, "BACKUP SLIDE B4: Methodology Notes", banner, methodology_text):
    print(output_line)

---
# PPT Generation

In [None]:
# PPT Generation Cell
from pptx import Presentation
from pptx.util import Inches, Pt
# The colour class is spelled RGBColor; importing "RgbColor" raises ImportError.
from pptx.dml.color import RGBColor
from pptx.enum.text import PP_ALIGN

def create_executive_ppt(top_n: int = 10):
    """Build the executive deck (title, 6 content slides, backup section) and save it.

    The slide content is read from names computed by the earlier notebook
    cells (``primary_implants``, ``revisions``, ``eii_mid``, ``backlog``,
    ``tier1_hospitals``, ``hospital_ranked``, ``regional_df``, ``icd_total``,
    ``ops_total``, ``oau_hospitals``, ``total_hospitals``) and from the PNG
    exports in ``OUTPUT_DIR``. A chart slide whose PNG is missing is still
    created, with its title only.

    Args:
        top_n: Number of top-ranked hospitals listed in the slide-4 table.

    Returns:
        Path of the saved ``rifampicin_executive.pptx`` in the working directory.
    """
    prs = Presentation()
    prs.slide_width = Inches(13.333)  # 16:9 widescreen
    prs.slide_height = Inches(7.5)

    title_layout = prs.slide_layouts[5]  # Title only
    left_margin = Inches(0.5)
    content_top = Inches(1.5)

    def add_title_slide(title, subtitle=""):
        """Add a title slide; the subtitle placeholder is filled only when given."""
        slide = prs.slides.add_slide(prs.slide_layouts[0])
        slide.shapes.title.text = title
        if subtitle:
            slide.placeholders[1].text = subtitle
        return slide

    def add_content_slide(title, image_path=None):
        """Add a titled slide and, if the image file exists, place it below the title."""
        slide = prs.slides.add_slide(title_layout)
        slide.shapes.title.text = title
        if image_path and Path(image_path).exists():
            picture = slide.shapes.add_picture(
                str(image_path), left_margin, content_top, width=Inches(12.333)
            )
            # At full width a chart taller than ~2:1 would run past the bottom
            # edge of the slide; shrink proportionally and re-centre if so.
            max_height = prs.slide_height - content_top - Inches(0.25)
            if picture.height > max_height:
                shrink = max_height / picture.height
                picture.width = int(picture.width * shrink)
                picture.height = int(max_height)
                picture.left = int((prs.slide_width - picture.width) / 2)
        return slide

    def add_text_slide(title, box_height=5.0):
        """Add a titled slide with one wrapped text box; return its text frame."""
        slide = prs.slides.add_slide(title_layout)
        slide.shapes.title.text = title
        box = slide.shapes.add_textbox(left_margin, content_top, Inches(12), Inches(box_height))
        frame = box.text_frame
        frame.word_wrap = True  # keep long lines inside the box
        return frame

    def write_line(paragraph, text, size, bold=None, space_before=None):
        """Set text and font size (optionally bold / spacing) on a paragraph."""
        paragraph.text = text
        paragraph.font.size = Pt(size)
        if bold is not None:
            paragraph.font.bold = bold
        if space_before is not None:
            paragraph.space_before = Pt(space_before)

    # Title Slide
    add_title_slide(
        "Rifampicin Market Opportunity",
        "German Hospital Market Analysis for PJI Treatment"
    )

    # Slide 1: Executive Summary
    tf = add_text_slide("Executive Summary", box_height=5.5)
    write_line(tf.paragraphs[0], "Key Metrics (2023)", 24, bold=True)
    metrics = [
        f"Total Joint Replacements: {primary_implants + revisions:,}",
        f"Primary Implants: {primary_implants:,}",
        f"Expected Infections (EII @ 1.5%): {eii_mid:,} cases/year",
        f"COVID Recovery Backlog: ~{backlog:,} procedures",
        f"Target Hospitals: {len(tier1_hospitals)} high-volume centers"
    ]
    for metric in metrics:
        write_line(tf.add_paragraph(), f"  {metric}", 18, space_before=12)

    # Slide 2: Market Overview
    add_content_slide("Market Overview: COVID Impact & Recovery",
                      OUTPUT_DIR / 'slide2_market_overview.png')

    # Slide 3: TAM Funnel
    add_content_slide("Total Addressable Market",
                      OUTPUT_DIR / 'slide3_tam_funnel.png')

    # Slide 4: Hospital Target List
    # The title states the number of rows actually shown (it used to say
    # "Top 20" above a 10-row table), and the table has exactly one row per
    # listed hospital plus the header.
    top_hospitals = hospital_ranked.head(top_n)
    slide = prs.slides.add_slide(title_layout)
    slide.shapes.title.text = f"Top {len(top_hospitals)} Target Hospitals"

    headers = ['Rank', 'Hospital', 'Bundesland', 'Procedures']
    table = slide.shapes.add_table(
        len(top_hospitals) + 1, len(headers),
        left_margin, content_top, Inches(12), Inches(5)
    ).table

    for col_idx, header in enumerate(headers):
        cell = table.cell(0, col_idx)
        cell.text = header
        header_paragraph = cell.text_frame.paragraphs[0]
        header_paragraph.font.bold = True
        header_paragraph.font.size = Pt(12)

    for row_idx, (_, row) in enumerate(top_hospitals.iterrows(), start=1):
        name = str(row['Name'])  # tolerate missing / non-string names
        cell_texts = [
            str(row['rank']),
            name[:40] + ('...' if len(name) > 40 else ''),
            str(row['Bundesland']),
            f"{row['total_procedures']:,.0f}",
        ]
        for col_idx, cell_text in enumerate(cell_texts):
            cell = table.cell(row_idx, col_idx)
            cell.text = cell_text
            cell.text_frame.paragraphs[0].font.size = Pt(10)

    # Slide 5: Regional Analysis
    add_content_slide("Regional Analysis by Bundesland",
                      OUTPUT_DIR / 'slide5_regional_analysis.png')

    # Slide 6: Recommendations
    tf = add_text_slide("Recommendations", box_height=5.5)
    recommendations = [
        ("Priority Targets", f"{len(tier1_hospitals)} high-volume hospitals with relevant departments"),
        ("Regional Focus", f"Top 3 Bundeslaender: {', '.join(regional_df['Bundesland'].head(3).tolist())}"),
        ("Market Opportunity", f"{eii_mid:,} expected infections/year + {backlog:,} procedure backlog"),
        ("Validation Needed", "Cross-reference with IQVIA for actual Rx data")
    ]
    for heading, desc in recommendations:
        write_line(tf.add_paragraph(), f"{heading}", 18, bold=True, space_before=16)
        write_line(tf.add_paragraph(), f"   {desc}", 14)

    # Backup Slides
    add_title_slide("Backup Slides", "")

    # B1: Data Limitations
    tf = add_text_slide("B1: Data Limitations")
    write_line(tf.paragraphs[0],
               "Key data gaps: No DRG codes, no pharmacy data, no patient demographics", 14)
    write_line(tf.add_paragraph(),
               "Proxies used: OPS procedures, EII estimation, volume-based scoring", 14)
    write_line(tf.add_paragraph(),
               "Validation sources: IQVIA, Insight Health, hospital surveys", 14)

    # B2: ICD vs OPS
    # The relation in brackets is derived from the ratio instead of being
    # asserted, and a zero procedure count no longer raises ZeroDivisionError.
    if ops_total:
        icd_ops_ratio = icd_total / ops_total
        relation = "diagnoses > procedures" if icd_ops_ratio > 1 else "diagnoses <= procedures"
        ratio_text = f"Ratio: {icd_ops_ratio:.2f}x ({relation})"
    else:
        ratio_text = "Ratio: n/a (no OPS procedures)"
    tf = add_text_slide("B2: ICD vs OPS Comparison")
    write_line(tf.paragraphs[0], f"ICD-10 Surrogate (M16.0/1, M17.0/1): {icd_total:,}", 16)
    write_line(tf.add_paragraph(), f"OPS Procedures (5-820 to 5-823): {ops_total:,}", 16)
    write_line(tf.add_paragraph(), ratio_text, 16)

    # B3: OAU Proxy
    oau_share = oau_hospitals / total_hospitals * 100 if total_hospitals else 0.0
    tf = add_text_slide("B3: OAU Proxy - Data Quality Warning")
    write_line(
        tf.paragraphs[0],
        f"Only {oau_hospitals} of {total_hospitals} hospitals ({oau_share:.0f}%) have keyword matches",
        16,
    )
    write_line(tf.add_paragraph(), "Decision: EXCLUDED from scoring due to sparsity", 16, bold=True)

    # B4: Methodology
    tf = add_text_slide("B4: Methodology Notes")
    write_line(
        tf.paragraphs[0],
        f"Scoring: {WEIGHT_VOLUME:.0%} Volume + {WEIGHT_DEPT:.0%} Dept + {WEIGHT_TYPE:.0%} Type",
        16,
    )
    write_line(tf.add_paragraph(), f"Departments: {', '.join(DEPT_RELEVANT)}", 14)
    write_line(tf.add_paragraph(), "EII = Primary Implants x Infection Rate (1-2%)", 14)

    # Save
    output_path = Path.cwd() / 'rifampicin_executive.pptx'
    prs.save(output_path)
    print(f"PowerPoint saved: {output_path}")
    return output_path

# Generate PPT and list everything this notebook wrote to disk
ppt_path = create_executive_ppt()

deliverables = (
    ppt_path,
    f"{OUTPUT_DIR}/ (PNG exports)",
    OUTPUT_DIR / 'top20_hospitals.csv',
)
print("\nDeliverables created:")
for deliverable in deliverables:
    print(f"  - {deliverable}")

In [None]:
# Final Summary & Verification
print("=" * 70)
print("VERIFICATION CHECKLIST")
print("=" * 70)

# Expected number of trend rows follows from the configured year range
expected_year_count = YEARS[1] - YEARS[0] + 1
# PNGs that the deck embeds
deck_pngs = ['slide2_market_overview.png', 'slide3_tam_funnel.png', 'slide5_regional_analysis.png']

checks = [
    (f"Data years: {YEARS[0]}-{YEARS[1]}", len(trend_df) == expected_year_count),
    ("2019 data exists", 2019 in trend_df['Berichtsjahr'].values),
    ("Department codes corrected (7 total)", len(DEPT_RELEVANT) == 7),
    ("ICD surrogate: simple sum (4 codes)", len(ICD_SURROGATE) == 4),
    # Recompute the mid EII from primary implants instead of asserting True
    ("EII uses primary implants only", eii_mid == int(primary_implants * INFECTION_RATE_MID)),
    # Compare with a tolerance: exact float equality on a sum is fragile
    ("Scoring weights sum to 100% (no OAU component)",
     abs(WEIGHT_VOLUME + WEIGHT_DEPT + WEIGHT_TYPE - 1.0) < 1e-9),
    ("PNG exports created", all((OUTPUT_DIR / png).exists() for png in deck_pngs)),
    ("PPT generated", Path('rifampicin_executive.pptx').exists()),
]

for check, passed in checks:
    status = "PASS" if passed else "FAIL"
    print(f"  [{status}] {check}")

print("\n" + "=" * 70)
print("NOTEBOOK COMPLETE")
print("=" * 70)