In [None]:
import ast
import json
import re
import zipfile

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the evidence and condition lookup tables used by the cells below.
# The encoding is passed explicitly so that loading does not depend on the
# platform's locale default (assumes the release files are UTF-8, as the JSON
# interchange format requires -- TODO confirm against the dataset).
with open('or_dataset/release_evidences.json', 'r', encoding='utf-8') as f:
    evidence_dict = json.load(f)

with open('or_dataset/release_conditions.json', 'r', encoding='utf-8') as f:
    condition_dict = json.load(f)

# Load patients data
def load_patients(file_path):
    """Read the first member of a zip archive as a CSV.

    Args:
        file_path: Path to a zip archive whose first listed member is a CSV.

    Returns:
        The ``pandas.DataFrame`` parsed from that member.
    """
    with zipfile.ZipFile(file_path, 'r') as archive:
        # Only the first entry in the archive listing is read.
        first_member = archive.namelist()[0]
        with archive.open(first_member) as member_stream:
            patients = pd.read_csv(member_stream)
    return patients

# Alternative source: the full release archives, read through load_patients.
# train_patients = load_patients('or_dataset/release_train_patients.zip')
# val_patients = load_patients('or_dataset/release_validate_patients.zip')
# test_patients = load_patients('or_dataset/release_test_patients.zip')

# Currently the sampled CSV files are used for all three splits.
train_patients, val_patients, test_patients = (
    pd.read_csv(csv_path)
    for csv_path in (
        'sampled_combined_data.csv',
        'sampled_validate_combined_data.csv',
        'sampled_test_combined_data.csv',
    )
)

In [8]:
def create_text_representation(row, output_path):
    """Append one patient record to a JSONL chat file.

    Builds a text description from the row's age, sex and evidence codes
    (looked up in the module-level ``evidence_dict``) and appends one JSON line
    of the form ``{"messages": [user, assistant]}`` to ``output_path``. The
    user content is the description and the assistant content is the row's
    ``PATHOLOGY``.

    Args:
        row: Mapping-like record (for example a DataFrame row) with the keys
            ``AGE``, ``SEX``, ``PATHOLOGY``, ``INITIAL_EVIDENCE`` and
            ``EVIDENCES``. ``EVIDENCES`` must be the string form of a Python
            list literal of evidence codes.
        output_path: Path of the JSONL file. It is opened in append mode, so
            calling this again for the same rows adds duplicate lines.

    Returns:
        None. The record is only written to ``output_path``.

    Raises:
        ValueError, SyntaxError: If ``EVIDENCES`` is not a plain Python literal.
        KeyError: If an evidence code is missing from ``evidence_dict``.
    """
    age = row['AGE']
    sex = row['SEX']
    pathology = row['PATHOLOGY']
    initial_evidence = row['INITIAL_EVIDENCE']
    # SECURITY: this cell comes from a CSV file. It used to be passed to
    # eval(), which executes arbitrary expressions; literal_eval only accepts
    # plain literals and parses valid list strings to the same value.
    evidences = [initial_evidence] + ast.literal_eval(row['EVIDENCES'])

    symptom_texts = []
    antecedents = []
    for evidence_code in evidences:
        if "_@_" in evidence_code:
            # Multi-choice evidence is encoded as "<evidence>_@_<value>".
            # maxsplit=1 keeps a value that itself contains the separator
            # intact instead of failing to unpack.
            evidence, value = evidence_code.split('_@_', 1)
            entry = evidence_dict[evidence]
            meaning = entry['value_meaning'].get(value)
            # Fall back to the raw value code when no meaning is registered.
            value_text = meaning['en'] if meaning is not None else value
            text = f"{entry['question_en']}: {value_text}"
        else:
            # Binary evidence: the question followed by a 'Y' marker.
            entry = evidence_dict[evidence_code]
            text = entry['question_en'] + 'Y'

        if entry['is_antecedent']:
            antecedents.append(text)
        else:
            symptom_texts.append(text)

    description = (
        f"Age: {age}, Sex: {sex}. "
        + "History:" + "; ".join(antecedents)
        + ". Symptoms: " + "; ".join(symptom_texts) + "."
    )

    chat_format = {
        "messages": [
            {"role": "user", "content": str(description)},
            {"role": "assistant", "content": str(pathology)},
        ]
    }
    with open(output_path, 'a', encoding='utf-8') as f:
        json.dump(chat_format, f, ensure_ascii=False)
        f.write('\n')
    

def create_json_representation(output_path, df):
    """Write a DataFrame of text/label pairs as a JSONL chat file.

    Each row becomes one JSON line of the form ``{"messages": [user,
    assistant]}``, with ``str(row['text'])`` as the user content and
    ``str(row['label'])`` as the assistant content.

    Args:
        output_path: Path of the JSONL file. It is opened in write mode, so any
            existing content is replaced.
        df: DataFrame with ``text`` and ``label`` columns.

    Returns:
        None. The records are only written to ``output_path``.
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        for _, row in df.iterrows():
            record = {
                "messages": [
                    {"role": "user", "content": str(row['text'])},
                    {"role": "assistant", "content": str(row['label'])},
                ]
            }
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

# Apply to datasets

# create_text_representation opens its output file in append mode, so start
# each file empty; otherwise re-running this cell duplicates every record.
for jsonl_path in ('val_results.jsonl', 'test_results.jsonl', 'train_results.jsonl'):
    with open(jsonl_path, 'w', encoding='utf-8'):
        pass

# The function writes each record to disk and returns nothing, so each
# *_results object below holds only None values; the JSONL files are the
# real output of this cell.
val_results = val_patients.apply(create_text_representation, output_path='val_results.jsonl', axis=1)
test_results = test_patients.apply(create_text_representation, output_path='test_results.jsonl', axis=1)
train_results = train_patients.apply(create_text_representation, output_path='train_results.jsonl', axis=1)

print('done')


done


In [None]:
# NOTE(review): create_text_representation has no return statement, so the
# *_results objects carry only None values. The same three file names are also
# written again from the *_patients DataFrames in the last cell of this
# notebook -- confirm whether this cell is still needed.
for split_results, csv_name in (
    (val_results, 'val_patients_with_text.csv'),
    (test_results, 'test_patients_with_text.csv'),
    (train_results, 'train_patients_with_text.csv'),
):
    split_results.to_csv(csv_name, index=False)

In [None]:
# Collect the pathology labels of all three splits so the encoder is fitted on
# every class that appears in any of them.
all_labels = pd.concat(
    [
        train_patients['PATHOLOGY'],
        val_patients['PATHOLOGY'],
        test_patients['PATHOLOGY'],
    ]
)

# Fit the encoder once on the combined labels (fit returns the encoder itself).
label_encoder = LabelEncoder().fit(all_labels)

# Map each split's pathologies to their integer codes.
train_patients_label, val_patients_label, test_patients_label = (
    label_encoder.transform(split['PATHOLOGY'])
    for split in (train_patients, val_patients, test_patients)
)

In [None]:
train_file_path = 'train_patients_with_text.csv'
val_file_path = 'val_patients_with_text.csv'
test_file_path = 'test_patients_with_text.csv'

# Write each split's DataFrame to its CSV file, without the index column.
for patients_df, csv_path in (
    (train_patients, train_file_path),
    (val_patients, val_file_path),
    (test_patients, test_file_path),
):
    patients_df.to_csv(csv_path, index=False)