<a href="https://colab.research.google.com/github/24p11/recode-scenario/blob/main/scenario_oncology_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Create fictive clinical notes from Code set (DRG + ICD)

Code set are the raw classification data, we can extract from National database (Base nationale PMSI en France). They are made of 
* classification profile made of grouping variables from DRG records which are prepared with their frequency in the national database
    - age (class)
    - sexe
    - DRG (racine GHM)
    - Main diagnosis (ICD10) : cf
    - Hospitalization management type : cf
* diagnosis associated to each classification profile, extracted with their frequencies
* procedures associated to each classification profile, specialy for surgery and technical gestures, extracted with their frequencies

From thoses raw information we produce a coded clinical scenario which will be uses a seed.

This scenario is transformed into a detail prompt that will be given to a LLM for generation.
From the combinaision of primary and related diagnosis in French discharge abstract, we derived two notions :
* Primary diagnosis : host the notion of principal pathology, it is rather the primary diagnosis of the discharge abstract or the related diagnosis when it exists and that the primary diagnosis of the discharge abstract is from the chapter "Facteurs influant sur l’état de santé" of ICD10
* The Hospitalization management type is rather the term "Primary diagnosis" or the ICD-10 code of the related diagnosis when it exists


In [301]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [302]:
import pandas as pd
import numpy as np
import datetime as dt

In [303]:
from utils import *

In [304]:
gs = generate_scenario()
# Load official dictionaries
# col_names option allow you to algin your column names the project dictionary.
gs.load_offical_icd("cim_2024.xlsx",col_names={"code" : "icd_code","libelle":"icd_code_description"} )
gs.load_offical_procedures("ccam_actes_2024.xlsx",col_names={"code":"procedure","libelle_long":"procedure_description"} )
col_names={"Code CIM":"icd_parent_code","Localisation":"primary_site","Type Histologique":"histological_type",
	"Stade":"stage","Marqueurs Tumoraux":"biomarkers","Traitement":"treatment_recommandation","Protocole de Chimiothérapie":"chemotherapy_regimen"}
gs.load_cancer_treatement_recommandations("Tableau récapitulatif traitement cancer.xlsx",col_names ) 

In [305]:
# Load data from BN  PMSI
col_names={"racine":"drg_parent_code","das": "icd_secondary_code","diag":"icd_primary_code","categ_cim":"icd_primary_parent_code",
            "mdp":"case_management_type","nb_situations":"nb","acte":"procedure",
            "mode_entree":"admission_mode",
            "mode_sortie":"discharge_disposition",
            "mode_hospit":"admission_type"}
gs.load_classification_profile("bn_pmsi_cases_20250819.csv", col_names)
gs.load_secondary_icd("bn_pmsi_related_diag_20250818.csv",col_names)
gs.load_procedures("bn_pmsi_procedures_20250818.csv",col_names)

In [306]:
cols_scenario = ["first_name","last_name","cage2","cage","sexe",
                "last_name_med","icd_primary_code",
                "admission_type","admission_mode","discharge_disposition",
                'drg_parent_code','icd_primary_code','icd_secondaray_code','cd_md_pec']
                
cols_cancer = ["cancer_stage","TNM_score","histological_type","treatment_recommandation","chemotherapy_regimen"]

In [307]:
#Prepare cases
df_profile = gs.df_classification_profile.drop(columns="nb")
df_profile = df_profile[df_profile.icd_primary_code.isin(gs.icd_codes_cancer)]

In [308]:
# test_profile = df_profile[(df_profile["icd_primary_code"]=="C50") & (df_profile["case_management_type"]=="DP")].iloc[0].copy()
# test_profile

In [309]:
# test_scenario = gs.generate_scenario_from_profile(test_profile)
# test_scenario

In [310]:
def create_prompt(scenario):
    if scenario['admission_type'] == "Inpatient" and scenario['drg_parent_code'][2:3]=="C" :
        template_name = "surgery_complete.txt"
    elif scenario['admission_type'] == "Outpatient" and scenario['drg_parent_code'][2:3]=="C" :
        template_name = "surgery_outpatient.txt"
    else:
        template_name = "scenario_onco_v1.txt"
    
    case  = gs.make_prompts_marks_from_scenario(scenario)
    prompt = prepare_prompt("templates/" + template_name, case=case)
    return prompt

In [311]:
# test_prompt = create_prompt(test_scenario)
# print (test_prompt)

In [312]:
# col_names = ["icd_primary_code", "case_management_type", "drg_parent_code", "cage2","cage", "sexe", "admission_type","admission_mode", "discharge_disposition",
#              "dms", "los_mean", "los_sd", "drg_parent_description"]
# test_generation_v1 = df_profile.sample(20)[col_names].reset_index(drop=True)
test_generation_v1 = df_profile.sample(20).reset_index(drop=True)
test_generation_v1

Unnamed: 0,icd_primary_code,case_management_type,drg_parent_code,age,cage,cage2,sexe,admission_type,admission_mode,discharge_disposition,...,da,libelle_da,gp_cas,libelle_gp_cas,ga,libelle_ga,da_gp,da_gp_ga,anseqta,aso
0,C447,DP,09C25,ge_18,[60-70[,[50-[,2,Outpatient,DOMICILE,DOMICILE,...,D20,Tissu cutané et tissu sous-cutané,C26,Chirurgie de la peau,G218,chirurgie tumeurs malignes de la peau,D20C26,D20C26G218,2024,C
1,C73,DP,10C11,ge_18,[40-50[,[18-50[,1,Inpatient,DOMICILE,DOMICILE,...,D19,Endocrinologie,C22,"Chirurgie de la Thyroide, Parathyroide, du Tra...",G158,Chirurgie de la thyroide,D19C22,D19C22G158,2024,C
2,C220,Z515,23Z02,ge_18,[60-70[,[50-[,1,Outpatient,DOMICILE,DOMICILE,...,D24,"Douleurs chroniques, Soins palliatifs",X22,Douleur et soins palliatifs,G176,Soins palliatifs,D24X22,D24X22G176,2024,M
3,C250,DP,07M06,ge_18,[60-70[,[50-[,2,Inpatient,DOMICILE,DECES,...,D01,Digestif,X02,Hépato-Gastro-Entérologie,G019,Affections hépatiques sévères et affections du...,D01X02,D01X02G019,2024,M
4,C435,DP,09C25,ge_18,[40-50[,[18-50[,1,Outpatient,DOMICILE,DOMICILE,...,D20,Tissu cutané et tissu sous-cutané,C26,Chirurgie de la peau,G218,chirurgie tumeurs malignes de la peau,D20C26,D20C26G218,2024,C
5,C64,Z452,05K14,ge_18,[80-[,[50-[,1,Outpatient,DOMICILE,DOMICILE,...,D07,Cardio-vasculaire (hors cathétérismes vasculai...,K06,Mise en place d'accès vasculaire,G199,Mise en place d'accès vasculaire,D07K06,D07K06G199,2024,C
6,C50,Z088,09M13,ge_18,[60-70[,[50-[,2,Outpatient,DOMICILE,DOMICILE,...,D12,Gynécologie - sein,X11,"Gynécologie, Sénologie (hors Obstétrique)",G115,Explorations et surveillance gynécologiques et...,D12X11,D12X11G115,2024,M
7,C20,Z452,05K14,ge_18,[50-60[,[50-[,1,Outpatient,DOMICILE,DOMICILE,...,D07,Cardio-vasculaire (hors cathétérismes vasculai...,K06,Mise en place d'accès vasculaire,G199,Mise en place d'accès vasculaire,D07K06,D07K06G199,2024,C
8,C549,Z5100,28Z19,ge_18,[70-80[,[50-[,2,Outpatient,DOMICILE,DOMICILE,...,D27,Séances,S04,Radiothérapie,G189,Séances : radiothérapie,D27S04,D27S04G189,2024,M
9,C05,DP,03M07,ge_18,[70-80[,[50-[,2,Inpatient,DOMICILE,DOMICILE,...,D10,"ORL, Stomatologie",X09,"ORL, Stomato",G092,Prise en charge médicale des tumeurs malignes ...,D10X09,D10X09G092,2024,M


In [313]:
list_scenario = []

for i in range(len(test_generation_v1)):
    profile = test_generation_v1.iloc[i].copy()
    scenario = gs.generate_scenario_from_profile(profile)
    row = {k:scenario[k] for k in scenario.keys()}
    prompt = create_prompt(scenario)
    row["prompt"] = prompt
    list_scenario.append(row)

In [314]:
keep_cols = ['age', 'sexe', 'date_entry', 'date_discharge', 'date_of_birth',
       'first_name', 'last_name', 'icd_primary_code', 'case_management_type',
       'icd_secondaray_code', 'admission_mode', 'discharge_disposition',
       'cancer_stage', 'score_TNM', 'histological_type',
       'treatment_recommandation', 'chemotherapy_regimen', 'drg_parent_code',
       'cage', 'cage2', 'admission_type', 'dms', 'los_mean', 'los_sd',
       'drg_parent_description', 'icd_parent_code', 'icd_primary_description',
       'case_management_type_description', 'first_name_med', 'last_name_med',
       'text_secondary_icd_official', 'procedure', 'text_procedure',
       'case_management_type_text', 'cd_md_pec', 'prompt', 'biomarkers']
df_scenario = pd.DataFrame(list_scenario)[keep_col]
df_scenario

NameError: name 'keep_col' is not defined

In [None]:
df_scenario.to_csv("test_generation_v1.")

In [None]:
# df_scenario =[]
# for i in range(0,5):

#     current_profile = df_profile.iloc[i,:]

#     scenario = gs.generate_scenario_from_profile(current_profile)
#     row = {k:scenario[k] for k in scenario if k in cols_scenario }
#     cancer = [scenario[k] for k in scenario if k in cols_cancer ]

#     row.update({"cancer":cancer})
    
#     case  = gs.make_prompts_marks_from_scenario(scenario)
    
#     row.update({'case': case})


#     if row['admission_type'] == "Inpatient" and row['drg_parent_code'][2:3]=="C" :
#         template_name = "surgery_complete.txt"
#     elif row['admission_type'] == "Outpatient" and row['drg_parent_code'][2:3]=="C" :
#         template_name = "surgery_outpatient.txt"
#     elif row['drg_parent_code'][2:3]=="K" :
#         template_name = "interventionnel.txt"
#     elif row['cd_md_pec']==17 :
#         template_name = "bilan.txt"
#     else:
#         template_name = "scenario_onco_v1.txt"
        
#     prompt =  prepare_prompt("templates/" + template_name ,case =case)
#     row.update({'prompt': prompt})

#     df_scenario.append(row)

In [226]:
# df_scenario= pd.DataFrame(df_scenario)

In [227]:
# df_scenario.to_csv(gs.path_data + "test_scenario_v1.csv")

In [None]:
# df_scenario[]