# Setting

In [None]:
import pandas as pd
import os
import numpy as np
import re 
import json
import openai  
import asyncio
import warnings
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import notebook
from tqdm.notebook import tqdm
from openai import AsyncOpenAI
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter 
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from rank_bm25 import BM25Okapi   # lexical search; sparse vector filter
from sklearn.metrics import classification_report, precision_recall_fscore_support, confusion_matrix, cohen_kappa_score 




warnings.filterwarnings("ignore")



In [None]:
# Register the OpenAI API key in the process environment and keep a copy in
# `openai_api_key` for code that takes the key as an explicit argument.
# NOTE: the value below is a placeholder; do not commit a real key.
os.environ['OPENAI_API_KEY'] = 'your_api_key_here'
openai_api_key = os.environ['OPENAI_API_KEY']

In [None]:
# Load the source spreadsheet into `df`, the working DataFrame that the
# following cells clean, annotate with predictions and split into groups.
df=pd.read_excel('Data.xlsx')

In [None]:
# Disease of interest, inserted into every prompt (Korean for "acute stroke").
doi = '급성 뇌졸중'

# One (report, base_name) pair per imaging modality. The first item is used
# both as the DataFrame column holding the report text and as the report name
# given to the prompt; the second is the stem of the result column names.
imaging_types = [('MRI', 'MRI'), ('CT', 'CT'), ('ANG', 'ANG')]


In [None]:
# Function to remove special characters
def remove_special_characters(text):
    """Strip everything except Latin letters, digits, Hangul syllables and whitespace.

    Args:
        text: A report cell. Non-string values are converted with ``str``.

    Returns:
        The cleaned string. Missing values (``None``, ``NaN``, ``pd.NA``) are
        returned unchanged instead of becoming the literal strings ``'None'``
        or ``'nan'``, so a later ``dropna()`` still recognises them as missing.
    """
    # Guard with is_scalar so pd.isna is never asked for an element-wise answer.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return text
    return re.sub(r'[^a-zA-Z0-9가-힣\s]', '', str(text))


In [None]:
# Clean the text of every report column in place.
report_columns = [report_column for report_column, _ in imaging_types]
for report_column in report_columns:
    cleaned = df[report_column].apply(remove_special_characters)
    df[report_column] = cleaned

# Notebook display of the (whole) cleaned DataFrame for a visual check.
df

# Stage 1 : Zero-Shot + Ensemble


In [None]:
# Zero-shot prompt for Stage 1.
# Placeholders filled with str.format: {imaging_report} (report name),
# {disease} (disease of interest) and {input_text} (the report text).
# The model is asked to answer with "<판단> : <probability between 0 and 1>"
# followed by "<결정근거> : <reasoning>".
# NOTE(review): the instruction line spells the tag "<결정 근거>" (with a space)
# while the output format spells it "<결정근거>"; left untouched because the
# prompt text is runtime behaviour.
zeroshot_prompt = """
다음과 같은 {imaging_report}이/가 Input으로 주어졌을 때, 
차트 리뷰를 하여 환자가 {disease} 환자인지 판단합니다.

<판단> 에서 {disease} 환자일 확률을 0과 1사이의 값으로 나타냅니다.
논리적으로 생각하고 <결정 근거>에서 자신의 결정에 대한 추론을 제공합니다.

Output에는 다음 형식을 사용합니다.
<판단> : (0과 1사이의 값만 해당) 
<결정근거> : 

Input : {input_text}
Output :
"""

# Few-shot prompt for Stage 2.
# Same placeholders as the zero-shot prompt plus {fewshot_retrieval}, the
# retrieved example reports with their judgements.
fewshot_prompt = """
다음과 같은 {imaging_report}이/가 Input으로 주어졌을 때, 
예시들을 참고해 차트 리뷰를 하여 환자가 {disease} 환자인지 판단합니다.
예시들은 {imaging_report}과/와 판단입니다. 

예시 : {fewshot_retrieval}

<판단> 에서 {disease} 환자일 확률을 0과 1사이의 값으로 나타냅니다.
논리적으로 생각하고 <결정 근거>에서 자신의 결정에 대한 추론을 제공합니다.

Output에는 다음 형식을 사용합니다.
<판단> : (0과 1사이의 값만 해당) 
<결정근거> : 

Input : {input_text}
Output :
"""

In [None]:
async def chat_completion(input_prompt, model='gpt-4-turbo-preview'):
    """Send one prompt to the OpenAI chat completions API and return the raw response.

    The conversation is primed with a fixed system prompt and one
    user/assistant exchange before the actual prompt is appended.

    Args:
        input_prompt: Fully formatted user prompt.
        model: Name of the chat model to query.

    Returns:
        The response object returned by ``client.chat.completions.create``;
        callers read the text from ``response.choices[0].message.content``.
    """
    SYSTEM_PROMPT = "당신은 의료 AI 언어 모델입니다."
    USER_PROMPT_1 = """당신의 역할에 대해 명확합니까?"""
    ASSISTANT_PROMPT_1 = """네. 저는 환자를 직접 치료하지 않습니다. 하지만 의료 전문가를 도와드릴 준비가 되어있습니다. 시작하는 데 필요한 정보를 알려주세요."""

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT_1},
        {"role": "assistant", "content": ASSISTANT_PROMPT_1},
        {"role": "user", "content": input_prompt},
    ]

    # A client is created per call, so close it deterministically instead of
    # leaving its HTTP connection pool open until garbage collection.
    async with AsyncOpenAI() as client:
        response = await client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0,
        )
    return response

In [None]:

async def zeroshot_run_async(main_prompt, information, imaging_report, doi):
    """Run the zero-shot prompt concurrently for a batch of reports.

    Args:
        main_prompt: Template with ``{imaging_report}``, ``{disease}`` and
            ``{input_text}`` placeholders.
        information: Iterable of report texts, one request per item.
        imaging_report: Report name inserted into the prompt.
        doi: Disease of interest inserted into the prompt.

    Returns:
        The chat completion responses, in the same order as ``information``.
    """
    pending = [
        chat_completion(
            main_prompt.format(imaging_report=imaging_report, disease=doi, input_text=report)
        )
        for report in information
    ]
    return await asyncio.gather(*pending)




In [None]:
def detect_prob(a_value):
    """Extract the probability stated in a model reply.

    The number that follows the ``<판단>`` tag on the same line is preferred,
    so digits appearing earlier in the reply (for example a numbered list) are
    not mistaken for the answer. If the tag is absent, the first number found
    anywhere in the text is used.

    Args:
        a_value: Model reply; any non-string value is converted with ``str``.

    Returns:
        The parsed number as ``float``, or ``np.nan`` when the text contains
        no number. The value is not clamped to the range 0 to 1.
    """
    text = str(a_value)
    # Accept "0.85", "1" and a bare leading-dot form such as ".5".
    number = r'(?:\d+(?:\.\d+)?|\.\d+)'

    match = re.search(r'<\s*판단\s*>[^\d\n]*?(' + number + r')', text)
    if match is None:
        match = re.search(r'(' + number + r')', text)
    if match is None:
        return np.nan
    return float(match.group(1))
        
        
def calculate_entropy(p):
    """Return the binary (Shannon) entropy, in nats, of probability ``p``.

    ``p`` may be a scalar or an array; it is clipped to
    ``[1e-15, 1 - 1e-15]`` first so that ``log(0)`` never occurs.
    """
    p = np.clip(p, 1e-15, 1 - 1e-15)
    q = 1 - p
    return - (p * np.log(p) + q * np.log(q))

def uc_grouping(entropy_list, cutoff):
    """Split positions of ``entropy_list`` into high and low uncertainty groups.

    Args:
        entropy_list: Iterable of entropy values.
        cutoff: Values greater than or equal to this go to the high group.

    Returns:
        ``(high_idx, low_idx)``: two lists of 0-based positions. Values for
        which ``entropy >= cutoff`` is false (including NaN) land in the low
        group.
    """
    high_idx, low_idx = [], []
    for position, value in enumerate(entropy_list):
        target = high_idx if value >= cutoff else low_idx
        target.append(position)
    return high_idx, low_idx

In [None]:
# Initialize dataframe columns for each imaging type.
for imaging_type, base_column_name in imaging_types:
    for i in range(1, 4):  # Process each report three times
        df[f'Stage1_{base_column_name}_predict_{i}'] = np.nan

# Loop over each imaging type and count to process each three times.
for imaging_type, base_column_name in imaging_types:
    for i in range(1, 4):   # 3 times repeat
        # print(f"Processing {imaging_type}, Round {i}")
        
        # Get indices and values of non-NaN entries.
        non_nan_indices = df[imaging_type].dropna().index
        non_nan_values = df[imaging_type].dropna().values
                
        # Split the data into batches of 15.
        batches = [non_nan_values[j:j + 15].tolist() for j in range(0, len(non_nan_values), 15)]
        
        outputs = []
        for batch in tqdm(batches):
            output = await zeroshot_run_async(zeroshot_prompt, batch, imaging_type, doi)
            outputs.extend(output)
        
        # Extract results from the output
        results = [output.choices[0].message.content for output in outputs]
        
        # Store results in the corresponding column.
        result_column = f'Stage1_{base_column_name}_predict_{i}'
        for idx, result in zip(non_nan_indices, results):
            df.at[idx, result_column] = result



In [None]:
# Per imaging type, collect one Series of parsed probabilities per repetition;
# they are averaged row-wise below.
temp_results = {f'Stage1_{base}_prob': [] for _, base in imaging_types}

for _, base_column_name in imaging_types:
    bucket = temp_results[f'Stage1_{base_column_name}_prob']
    for i in range(1, 4):
        # Only rows that actually received a reply in this repetition.
        answered = df[f'Stage1_{base_column_name}_predict_{i}'].dropna()
        parsed = [detect_prob(reply) for reply in answered.values]
        bucket.append(pd.Series(parsed, index=answered.index))

# Row-wise mean over the repetitions (NaN parses are skipped), written to
# '<key>_mean'. Rows without any reply stay NaN after index alignment.
for key, runs in temp_results.items():
    if not runs:
        continue
    df[key + '_mean'] = pd.concat(runs, axis=1).mean(axis=1, skipna=True)

In [None]:
# Stage 1 probability of a row = highest mean probability across the imaging
# types. Rows with no probability at all are set to 0.
# The result of fillna is assigned back explicitly: calling
# `df[col].fillna(..., inplace=True)` acts on an intermediate object and does
# not update `df` under pandas Copy-on-Write.
mean_columns = [f'Stage1_{t[1]}_prob_mean' for t in imaging_types]
df['Stage1_prob'] = df[mean_columns].max(axis=1, skipna=True).fillna(0)

# Binary entropy of the Stage 1 probability, used as the uncertainty score.
probs = df['Stage1_prob'].values
entropies = calculate_entropy(probs)
df['Stage1_entropy'] = entropies

In [None]:
# Split the rows at the median Stage 1 entropy: entropy >= median goes to the
# high-uncertainty group, the rest to the low-uncertainty group.
Stage1_entropy_list = df['Stage1_entropy'].values
cutoff_value = df['Stage1_entropy'].median()
high, low = uc_grouping(Stage1_entropy_list, cutoff=cutoff_value)

# uc_grouping returns 0-based positions, so select by position (iloc) rather
# than by index label; the two only coincide for a default RangeIndex.
Stage1_high = df.iloc[high].reset_index(drop=True)
Stage1_low = df.iloc[low].reset_index(drop=True)

Stage1_high.to_excel('Stage1_high_entropygroup.xlsx', index=False)   # Results for the high entropy group in Stage 1
Stage1_low.to_excel('Stage1_low_entropygroup.xlsx', index=False)    # Results for the low entropy group in Stage 1



In [None]:
# Save the full DataFrame (all rows, Stage 1 predictions, probability and
# entropy) as the Stage-1-only result; it is read back in the evaluation cells.
df.to_excel('Stage1_result.xlsx', index=False)            # Results adopted from Stage 1 only

# Stage 2 : Few-Shot + RAG

In [None]:
# For each imaging type: (column with the report text, column with the mean
# Stage 1 probability). The pair is exported as the few-shot example pool.
# NOTE(review): the report columns named here ('MRI 판독문', ...) differ from
# the 'MRI' / 'CT' / 'ANG' columns used in the other cells - confirm that
# these columns exist in the data.
RAG_datasets = {
    'MRI': ('MRI 판독문', 'Stage1_MRI_prob_mean'),
    'CT': ('CT 판독문', 'Stage1_CT_prob_mean'),
    'ANG': ('Cerebral Angiography 판독문', 'Stage1_ANG_prob_mean'),
}

# Write '<key>.csv' with columns Input / Output from the low-entropy Stage 1
# rows, dropping rows where either value is missing.
for key, (input_col, output_col) in RAG_datasets.items():
    data = (
        Stage1_low[[input_col, output_col]]
        .dropna()
        .rename(columns={input_col: 'Input', output_col: 'Output'})
    )
    data.to_csv(f'{key}.csv', index=False)

In [None]:
def setup_retrieval_system(imaging_type):
    """Build the hybrid (BM25 + FAISS) retriever over ``<imaging_type>.csv``.

    Args:
        imaging_type: Stem of the CSV file holding the few-shot examples.

    Returns:
        ``(ensemble_retriever, k)`` where ``k`` is the number of documents
        each underlying retriever is configured to return.
    """
    k = 6  # Number of top similar documents to retrieve

    # Each CSV row becomes one document; no chunking is applied.
    docs = CSVLoader(f'{imaging_type}.csv', source_column='Output').load()

    # Dense (semantic) retriever: OpenAI embeddings indexed with FAISS.
    # NOTE(review): no distance strategy is passed, so the vector store's
    # default metric applies - confirm whether cosine similarity is intended.
    embeddings = OpenAIEmbeddings(
        model='text-embedding-ada-002',
        openai_api_key=openai_api_key,
    )
    faiss_retriever = FAISS.from_documents(docs, embeddings).as_retriever(
        search_type="similarity",
        search_kwargs={'k': k},
    )

    # Sparse (lexical) retriever over the same document texts.
    bm25_retriever = BM25Retriever.from_texts([doc.page_content for doc in docs])
    bm25_retriever.k = k

    # Weighted combination of both rankings: 0.6 lexical, 0.4 semantic.
    ensemble_retriever = EnsembleRetriever(
        retrievers=[bm25_retriever, faiss_retriever],
        weights=[0.6, 0.4],
    )
    return ensemble_retriever, k




In [None]:
def retrieve_shots(query, retriever, k):
    """Return the top ``k`` retrieved examples as one prompt-ready string.

    Each document's text is split on ``"Output: "``; the part before it and
    the part after it are stripped and re-joined as
    ``"<input part>\\nOutput: <output part>"``. Examples are separated by a
    newline. A document without the marker raises ``IndexError``.
    """
    retrieved = retriever.get_relevant_documents(query)

    lines = []
    for doc in retrieved[:k]:
        pieces = doc.page_content.split("Output: ")
        lines.append(f'{pieces[0].strip()}\nOutput: {pieces[1].strip()}')
    return '\n'.join(lines)


async def fewshot_run_async(main_prompt, information, imaging_report, doi, retriever, k):
    """Run the few-shot prompt concurrently for a batch of reports.

    For every report the examples are retrieved first (one report after the
    other), inserted into the prompt, and the resulting requests are then
    awaited together.

    Args:
        main_prompt: Template with ``{imaging_report}``, ``{disease}``,
            ``{fewshot_retrieval}`` and ``{input_text}`` placeholders.
        information: Iterable of report texts, one request per item.
        imaging_report: Report name inserted into the prompt.
        doi: Disease of interest inserted into the prompt.
        retriever: Retriever passed on to ``retrieve_shots``.
        k: Number of examples to include per report.

    Returns:
        The chat completion responses, in the same order as ``information``.
    """
    pending = [
        chat_completion(
            main_prompt.format(
                imaging_report=imaging_report,
                disease=doi,
                fewshot_retrieval=retrieve_shots(report, retriever, k),
                input_text=report,
            )
        )
        for report in information
    ]
    return await asyncio.gather(*pending)


In [None]:
for imaging_type, base_column_name in imaging_types:
    retriever, k = setup_retrieval_system(base_column_name)
        
    # Get indices and values of non-NaN entries.
    non_nan_indices = Stage1_high[imaging_type].dropna().index
    non_nan_values = Stage1_high[imaging_type].dropna().values
                
    # Split the data into batches of 15.
    batches = [non_nan_values[j:j + 15].tolist() for j in range(0, len(non_nan_values), 15)]
        
    outputs = []
    for batch in tqdm(batches):
        output = await fewshot_run_async(fewshot_prompt, batch, imaging_type, doi, retriever, k)
        outputs.extend(output)
        
    # Extract results from the output
    results = [output.choices[0].message.content for output in outputs]
        
    # Store results in the corresponding column.
    result_column = f'Stage2_{base_column_name}_predict'
    for idx, result in zip(non_nan_indices, results):
        Stage1_high.at[idx, result_column] = result

In [None]:
# Per imaging type, collect the parsed Stage 2 probabilities. Stage 2 runs each
# report once, so every list ends up holding a single Series.
temp_results = {f'Stage2_{base}_prob': [] for _, base in imaging_types}

for _, base_column_name in imaging_types:
    # Only rows that actually received a Stage 2 reply.
    answered = Stage1_high[f'Stage2_{base_column_name}_predict'].dropna()
    parsed = [detect_prob(reply) for reply in answered.values]
    temp_results[f'Stage2_{base_column_name}_prob'].append(
        pd.Series(parsed, index=answered.index)
    )

# Row-wise mean (NaN parses are skipped) written to '<key>_mean'; the same
# column naming as Stage 1 is kept.
for key, runs in temp_results.items():
    if not runs:
        continue
    Stage1_high[key + '_mean'] = pd.concat(runs, axis=1).mean(axis=1, skipna=True)



In [None]:
# Stage 2 probability of a row = highest mean probability across the imaging
# types. Rows with no probability at all are set to 0.
# The result of fillna is assigned back explicitly: calling
# `frame[col].fillna(..., inplace=True)` acts on an intermediate object and
# does not update the frame under pandas Copy-on-Write.
mean_columns = [f'Stage2_{t[1]}_prob_mean' for t in imaging_types]
Stage1_high['Stage2_prob'] = Stage1_high[mean_columns].max(axis=1, skipna=True).fillna(0)

# Binary entropy of the Stage 2 probability, used as the uncertainty score.
probs = Stage1_high['Stage2_prob'].values
entropies = calculate_entropy(probs)
Stage1_high['Stage2_entropy'] = entropies

In [None]:
# Split the Stage 2 cases at their median entropy: entropy >= median goes to
# the high-uncertainty group, the rest to the low-uncertainty group.
Stage2_entropy_list = Stage1_high['Stage2_entropy'].values
cutoff_value = Stage1_high['Stage2_entropy'].median()
high, low = uc_grouping(Stage2_entropy_list, cutoff=cutoff_value)

# uc_grouping returns 0-based positions, so select by position (iloc) rather
# than by index label; the two only coincide for a default RangeIndex.
Stage2_high = Stage1_high.iloc[high].reset_index(drop=True)
Stage2_low = Stage1_high.iloc[low].reset_index(drop=True)

Stage2_high.to_excel('Stage2_high_entropygroup.xlsx', index=False)   # Results for the high entropy group in Stage 2
Stage2_low.to_excel('Stage2_low_entropygroup.xlsx', index=False)    # Results for the low entropy group in Stage 2


In [None]:
# Result after Stage 2: low-entropy Stage 1 cases keep their Stage 1
# probability, high-entropy Stage 1 cases use their Stage 2 probability.
Stage1_low['Stage2_prob'] = Stage1_low['Stage1_prob']
combined_df_1= pd.concat([Stage1_low, Stage1_high], ignore_index=True)
combined_df_1.to_excel('Stage2_result.xlsx', index=False)      # Results adopted from Stage 1, and Stage 2


# Result after Stage 3: cases still in the high-entropy group after Stage 2
# adopt the reference Label. Label is a class code (0 = negative,
# 1 = equivocal, 2 = positive), so it is mapped onto the probability scale
# used by the other stages; copying it as-is would make an equivocal case (1)
# count as a positive probability in the multiclass evaluation.
label_to_prob = {0: 0.0, 1: 0.5, 2: 1.0}
Stage1_low['Stage3_prob'] = Stage1_low['Stage1_prob']
Stage2_low['Stage3_prob'] = Stage2_low['Stage2_prob']
Stage2_high['Stage3_prob'] = Stage2_high['Label'].map(label_to_prob)
combined_df_2= pd.concat([Stage1_low, Stage2_low, Stage2_high], ignore_index=True)
combined_df_2.to_excel('Stage3_result.xlsx', index=False)      # Results adopted from Stage 1, Stage 2, and Stage 3


# Evaluation

In [None]:
# (result file, probability column) pairs to evaluate. Each file is one of the
# spreadsheets written by the earlier cells; the column is the probability
# that is compared against 'Label' for that file.
evaluation_datasets = [
    ('Stage1_low_entropygroup.xlsx', 'Stage1_prob'),  # Results for the low entropy group in Stage 1
    ('Stage1_high_entropygroup.xlsx', 'Stage1_prob'),  # Results for the high entropy group in Stage 1
    ('Stage2_low_entropygroup.xlsx', 'Stage2_prob'),  # Results for the low entropy group in Stage 2
    ('Stage2_high_entropygroup.xlsx', 'Stage2_prob'),  # Results for the high entropy group in Stage 2
    ('Stage1_result.xlsx', 'Stage1_prob'),  # Results adopted from only Stage 1 
    ('Stage2_result.xlsx', 'Stage2_prob'),  # Results adopted from Stage 1, 2 
    ('Stage3_result.xlsx', 'Stage3_prob'),  # Results adopted from Stage 1, 2, 3 
]



In [None]:
def evaluate_binary_classification(df, prob_column, threshold=0.5):
    """Evaluate a probability column as a binary classifier against ``Label``.

    Rows labelled equivocal (``Label == 1``) are excluded; the remaining
    labels are recoded as 0 (negative) and 1 (positive, originally 2).

    Args:
        df: DataFrame with a ``Label`` column (0, 1 or 2) and ``prob_column``.
            It is not modified.
        prob_column: Name of the column holding the predicted probability.
        threshold: A probability strictly above this value is predicted
            positive. Missing probabilities compare as not above it and are
            therefore predicted negative.

    Returns:
        Dict with ``accuracy``, ``sensitivity``, ``specificity``, ``PPV``,
        ``NPV``, ``f1_score`` and ``kappa``. Ratios with an empty denominator
        are reported as 0; ``kappa`` is NaN when no row is left to evaluate.
    """
    # Work on an explicit copy so nothing is assigned into a filtered view.
    evaluated = df.loc[df['Label'] != 1].copy()

    y_true = evaluated['Label'].replace({0: 0, 2: 1}).to_numpy()
    y_pred = (evaluated[prob_column] > threshold).astype(int).to_numpy()

    # Confusion-matrix cells
    TP = np.sum((y_true == 1) & (y_pred == 1))
    TN = np.sum((y_true == 0) & (y_pred == 0))
    FP = np.sum((y_true == 0) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))
    total = TP + TN + FP + FN

    # Performance metrics; every ratio is guarded against a zero denominator.
    accuracy = (TP + TN) / total if total > 0 else 0
    sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0  # also known as Recall
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
    PPV = TP / (TP + FP) if (TP + FP) > 0 else 0  # also known as Precision
    NPV = TN / (TN + FN) if (TN + FN) > 0 else 0
    f1_score = 2 * (PPV * sensitivity) / (PPV + sensitivity) if (PPV + sensitivity) > 0 else 0
    kappa = cohen_kappa_score(y_true, y_pred) if len(y_true) > 0 else float('nan')

    return {
        "accuracy": accuracy,
        "sensitivity": sensitivity,
        "specificity": specificity,
        "PPV": PPV,
        "NPV": NPV,
        "f1_score": f1_score,
        "kappa": kappa
    }


In [None]:
def evaluate_multiclass_classification(df, prob_column, lower_bound=0.45, upper_bound=0.55):
    """Evaluate a probability column as a three-class classifier against ``Label``.

    Probabilities below ``lower_bound`` are classed 0 (Negative), above
    ``upper_bound`` 2 (Positive), and everything else 1 (Equivocal). That
    includes missing probabilities, since they satisfy neither comparison.

    Args:
        df: DataFrame with a ``Label`` column (0, 1 or 2) and ``prob_column``.
        prob_column: Name of the column holding the predicted probability.
        lower_bound: Upper limit (exclusive) of the Negative range.
        upper_bound: Lower limit (exclusive) of the Positive range.

    Returns:
        The text report produced by ``classification_report``.
    """
    y_prob = np.asarray(df[prob_column], dtype=float)
    y_pred = np.select([y_prob < lower_bound, y_prob > upper_bound], [0, 2], default=1)
    y_true = np.asarray(df['Label'])

    # Pass the label set explicitly: without it the report raises when one of
    # the three classes occurs in neither y_true nor y_pred, because the
    # number of classes found no longer matches target_names.
    report = classification_report(
        y_true,
        y_pred,
        labels=[0, 1, 2],
        target_names=['Negative', 'Equivocal', 'Positive'],
        output_dict=False,
    )

    return report

In [None]:
# Print the binary classification metrics (two decimals) for every result file.
for file_name, prob_column in evaluation_datasets:
    ev = pd.read_excel(file_name)
    metrics = evaluate_binary_classification(ev, prob_column)

    lines = [f"Binary Classification Evaluation for {file_name}:"]
    lines.extend(f"{key}: {value:.2f}" for key, value in metrics.items())
    lines.append("\n")  # same trailing blank lines as a separate print("\n")
    print("\n".join(lines))


In [None]:
# Print the three-class classification report for every result file.
for file_name, prob_column in evaluation_datasets:
    ev = pd.read_excel(file_name)
    report = evaluate_multiclass_classification(ev, prob_column)
    header = f"Multiclass Classification Evaluation for {file_name}:"
    print(f"{header}\n{report}\n")