In [1]:
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd
from datasets import ClassLabel, Dataset, load_from_disk

from llm_mri import ActivationAreas
from llm_mri.dimensionality_reduction import UMAP

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load raw data: one Excel workbook per question set (DPOC and IAM).
medical_answers_path = "../datasets/medical_answers/"

dpoc_workbook = medical_answers_path + "Resultados_Anotacoes_Teste_Progresso-DPOC-Fernando.xlsx"
iam_workbook = medical_answers_path + "Resultados_Anotacoes_Teste_Progresso-IAM-Fernando.xlsx"

df_dpoc = pd.read_excel(dpoc_workbook)
df_iam = pd.read_excel(iam_workbook)

# Original Dataset Preprocessing

In [3]:
# Report how many rows of each DataFrame are currently usable
def get_dataframe_len(df_dpoc, df_iam):
    """Print the row count of the DPOC frame, then of the IAM frame.

    If a frame has an ``include_in_analysis`` column, only the rows where that
    column equals ``True`` (which also matches the integer 1) are counted and
    the message says so; otherwise every row of the frame is counted.

    Args:
        df_dpoc: DataFrame reported under the name ``df_dpoc``.
        df_iam: DataFrame reported under the name ``df_iam``.

    Returns:
        None. The counts are written to standard output.
    """
    for frame_name, frame in (("df_dpoc", df_dpoc), ("df_iam", df_iam)):
        if "include_in_analysis" not in frame.columns:
            # No flag column: every row counts.
            print(f"{frame_name} rows:", len(frame))
            continue

        flagged_rows = frame[frame["include_in_analysis"].eq(True)]
        print(f"{frame_name} rows (included in analysis):", len(flagged_rows))

# Print the current row counts of both frames.
get_dataframe_len(df_dpoc, df_iam)

df_dpoc rows (included in analysis): 432
df_iam rows (included in analysis): 447


In [None]:
# Filter only the target columns

# user id ==> unique identifier of the student
# objective test score ==> score obtained by the students in the OSCE (Objective Structured Clinical Examination)
# question 2, question ==> answer to the open-ended (essay) question
# global score ==> global score given to the answer

# target_columns_dpoc = ["user id", "objective test score", "question 2", "global score"]
# target_columns_iam = ["user id", "objective test score", "question", "global score"]

# df_dpoc = df_dpoc[target_columns_dpoc]
# df_iam = df_iam[target_columns_iam]

In [17]:
# Add a new column to mark usable rows: 1 when both the answer and its
# global score are present, 0 when either of them is missing.
dpoc_has_all = df_dpoc[['question 2', 'global score']].notna().all(axis=1)
df_dpoc['include_in_analysis'] = dpoc_has_all.astype(int)

iam_has_all = df_iam[['question', 'global score']].notna().all(axis=1)
df_iam['include_in_analysis'] = iam_has_all.astype(int)

get_dataframe_len(df_dpoc, df_iam)

df_dpoc rows (included in analysis): 432
df_iam rows (included in analysis): 447


In [18]:
# Mark duplicates by "user id"
#
# Duplicates are searched only among the rows that are still flagged for
# analysis. Searching over every row would let a complete row be discarded as
# the "duplicate" of an earlier row that had already been excluded (e.g. for a
# missing answer), which would drop that student from the analysis entirely.
included_dpoc = df_dpoc["include_in_analysis"] == 1
included_iam = df_iam["include_in_analysis"] == 1

# Full-length boolean masks (False for rows that were already excluded).
dup_mask_dpoc = pd.Series(False, index=df_dpoc.index)
dup_mask_dpoc.loc[included_dpoc] = (
    df_dpoc.loc[included_dpoc].duplicated(subset=["user id"], keep="first").to_numpy()
)

dup_mask_iam = pd.Series(False, index=df_iam.index)
dup_mask_iam.loc[included_iam] = (
    df_iam.loc[included_iam].duplicated(subset=["user id"], keep="first").to_numpy()
)

df_dpoc.loc[dup_mask_dpoc, "include_in_analysis"] = 0
df_iam.loc[dup_mask_iam, "include_in_analysis"] = 0

get_dataframe_len(df_dpoc, df_iam)

df_dpoc rows (included in analysis): 432
df_iam rows (included in analysis): 447


In [19]:
# Mark duplicates based on answers column
#
# Only rows that are still flagged for analysis compete for "first occurrence".
# Otherwise a valid answer would be discarded as the duplicate of a row that
# was already excluded (for instance one without a global score).
eligible_dpoc = df_dpoc["include_in_analysis"] == 1
eligible_iam = df_iam["include_in_analysis"] == 1

# Full-length boolean masks (False for rows that were already excluded).
dup_mask_dpoc_answers = pd.Series(False, index=df_dpoc.index)
dup_mask_dpoc_answers.loc[eligible_dpoc] = (
    df_dpoc.loc[eligible_dpoc].duplicated(subset=["question 2"], keep="first").to_numpy()
)

dup_mask_iam_answers = pd.Series(False, index=df_iam.index)
dup_mask_iam_answers.loc[eligible_iam] = (
    df_iam.loc[eligible_iam].duplicated(subset=["question"], keep="first").to_numpy()
)

df_dpoc.loc[dup_mask_dpoc_answers, "include_in_analysis"] = 0
df_iam.loc[dup_mask_iam_answers, "include_in_analysis"] = 0
get_dataframe_len(df_dpoc, df_iam)

df_dpoc rows (included in analysis): 432
df_iam rows (included in analysis): 447


In [20]:
# # save new column to excel files
# df_dpoc.to_excel(
#     medical_answers_path + "Resultados_Anotacoes_Teste_Progresso-DPOC-Fernando.xlsx",
#     index=False
# )
# df_iam.to_excel(
#     medical_answers_path + "Resultados_Anotacoes_Teste_Progresso-IAM-Fernando.xlsx",
#     index=False
# )

After that, a manual review pass was conducted to filter out meaningless answers (such as "Não quero responder", i.e. "I don't want to answer").

### DPOC:

#### Not removed, but worth tracking
* 9001000	76946666-fca1-49ac-aef1-23d6bea602bd --> "Ainda não aprendi sobre doença pulonar obstrutiva crônica no curso de Medicina. Sobre DPOC, apenas sei que se trata de várias doenças que dificultam o fluxo de ar no pulmão, de modo que a respiração seja afetada."
* 9004190	ae0e68aa-78ab-4ad8-8463-342c5ac1c00e --> "fumantes"

#### Removed:
* 5000254	2fd88eb0-3274-4b2c-a5a7-7957bbbe528b
* 5000548	ec425274-fd02-4620-8a81-24db41f5cc22
* 5000556	8396380d-e0b6-4b81-8fe9-0b99c611f9f3
* 7000529	337411a4-d8c4-47dc-acfc-9d72b43d2abf


### IAM:
_No manual removals were needed: every meaningless answer had already been excluded by the automatic sanitization._

## Process Data

In [4]:
# Filter DataFrames to include only rows marked for analysis
#
# The flag column is required here. With `df.get("include_in_analysis", 0)` a
# missing column turned the mask into the scalar `False`, and `df[False]` then
# failed with the opaque `KeyError: False`; fail with an explicit message instead.
for _frame_name, _frame in (("df_dpoc", df_dpoc), ("df_iam", df_iam)):
    if "include_in_analysis" not in _frame.columns:
        raise KeyError(f"'include_in_analysis' column not found in {_frame_name}")

df_dpoc = df_dpoc[df_dpoc["include_in_analysis"] == 1].reset_index(drop=True)
df_iam = df_iam[df_iam["include_in_analysis"] == 1].reset_index(drop=True)

print("df_dpoc rows (included):", len(df_dpoc))
print("df_iam rows (included):", len(df_iam))

df_dpoc rows (included): 432
df_iam rows (included): 447


In [5]:
# Keep only the columns needed downstream
#
# user id              ==> unique identifier of the student
# objective test score ==> score obtained by the students in the OSCE (Objective Structured Clinical Examination)
# question 2, question ==> answer to the open-ended (essay) question
# global score         ==> global score given to the answer

target_columns_dpoc = [
    "user id",
    "objective test score",
    "question 2",
    "global score",
]
df_dpoc = df_dpoc.loc[:, target_columns_dpoc]

target_columns_iam = [
    "user id",
    "objective test score",
    "question",
    "global score",
]
df_iam = df_iam.loc[:, target_columns_iam]

In [6]:
# Give the answer and score columns question-specific names before merging
dpoc_column_names = {
    "question 2": "dpoc_answers",
    "global score": "dpoc_global_score",
}
df_dpoc = df_dpoc.rename(columns=dpoc_column_names)

iam_column_names = {
    "question": "iam_answers",
    "global score": "iam_global_score",
}
df_iam = df_iam.rename(columns=iam_column_names)

In [8]:
# Final length of each DataFrame.
# The column selection above no longer includes `include_in_analysis`, so
# get_dataframe_len takes its "no flag column" branch and counts every row.
get_dataframe_len(df_dpoc, df_iam)

df_dpoc rows: 432
df_iam rows: 447


## Converting the Dataset to HuggingFace Format

In [9]:
# Function to convert a DataFrame to a Huggingface Dataset with top and bottom scoring samples
def get_huggingface_dataset(df, sample_percentage, answer_column, scrore_column):
    """Build a labelled Hugging Face dataset from the lowest- and highest-scoring answers.

    Rows are sorted by ascending score. With ``n = int(len(df) * sample_percentage)``,
    the first ``n`` rows are labelled ``'bottom'`` and the last ``n`` rows
    ``'top'``; every other row is discarded.

    Args:
        df: DataFrame containing at least ``answer_column`` and ``scrore_column``.
        sample_percentage: Fraction of the rows taken from each end of the
            ranking. Must be in the interval (0, 0.5] so that the two groups
            cannot overlap.
        answer_column: Name of the column holding the answer text; it is
            exposed as ``text`` in the returned dataset.
        scrore_column: Name of the column holding the score used for ranking
            (the misspelled parameter name is kept for backward compatibility).

    Returns:
        A ``Dataset`` with a ``text`` column and a ``label`` column cast to
        ``ClassLabel``.

    Raises:
        ValueError: If ``sample_percentage`` is outside (0, 0.5], or if it is
            so small that no row would be selected for ``df``.
    """
    if not 0 < sample_percentage <= 0.5:
        raise ValueError(
            f"sample_percentage must be in the interval (0, 0.5], got {sample_percentage!r}"
        )

    # Select only answer relevant columns
    subset = df[[answer_column, scrore_column]].copy()

    # Order by score
    subset = subset.sort_values(by=scrore_column).reset_index(drop=True)

    # Number of rows taken from each end of the ranking
    subset_len = len(subset)
    n = int(subset_len * sample_percentage)
    if n == 0:
        raise ValueError(
            f"sample_percentage={sample_percentage!r} selects no rows "
            f"from a DataFrame with {subset_len} rows"
        )

    # Assign labels. The start of the 'top' slice is computed explicitly:
    # `labels[-n:]` would cover the whole array when n == 0.
    labels = np.full(subset_len, 'middle', dtype=object)
    labels[:n] = 'bottom'
    labels[subset_len - n:] = 'top'
    subset['label'] = labels

    # Remove middle samples
    subset = subset[subset['label'] != 'middle'].reset_index(drop=True)

    # Keep only the text and its label
    subset = subset.drop(columns=[scrore_column])
    subset = subset.rename(columns={answer_column: 'text'})

    # Convert to Huggingface Dataset
    hf_dataset = Dataset.from_pandas(subset)

    # Convert label column to ClassLabel
    unique_labels = hf_dataset.unique('label')
    class_label = ClassLabel(names=[str(label) for label in unique_labels])
    hf_dataset = hf_dataset.cast_column('label', class_label)

    return hf_dataset

In [12]:
# Build one Hugging Face dataset per question set and persist it to disk.
for answer_type, df in {"dpoc": df_dpoc, "iam": df_iam}.items():
    answers_col = f'{answer_type}_answers'
    score_col = f'{answer_type}_global_score'

    # 0.3 -> the lowest-scoring 30% and the highest-scoring 30% of the rows are kept
    hf_dataset = get_huggingface_dataset(df, 0.3, answers_col, score_col)

    hf_dataset.save_to_disk(f"{medical_answers_path}/{answer_type}_hf_dataset")


Casting the dataset: 100%|██████████| 258/258 [00:00<00:00, 43188.48 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 258/258 [00:00<00:00, 83886.08 examples/s] 
Casting the dataset: 100%|██████████| 268/268 [00:00<00:00, 159307.46 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 268/268 [00:00<00:00, 89846.81 examples/s] 


# Extract Activation Areas

In [3]:
# Constants
GRID_SIZES = [10, 20, 30, 40, 50]  # values passed as `gridsize` to the UMAP reduction
SAMPLE_SIZE = 1000  # number of repetitions run for each question set
model_ckpt = "neuralmind/bert-base-portuguese-cased"  # The same used in the article

# Output locations for the generated graphs
medical_answers_data_path = "../data/medical_answers/"
processed_graphs_path = medical_answers_data_path + "processed/graphs"

In [4]:
# Open each Huggingface Dataset (written by the save_to_disk step earlier in this notebook).
# NOTE(review): `medical_answers_path` is defined in the raw-data loading cell,
# not in the constants cell above; that cell must have been run in this kernel.
dpoc_dataset = load_from_disk(f"{medical_answers_path}/dpoc_hf_dataset")
# Presumably returns the number of cache files removed (see the `datasets` docs) - TODO confirm
dpoc_dataset.cleanup_cache_files()

iam_dataset = load_from_disk(f"{medical_answers_path}/iam_hf_dataset")
# Last expression of the cell: its return value is what the notebook displays.
iam_dataset.cleanup_cache_files()

4

In [5]:
# For each question set, repeat the activation-area extraction SAMPLE_SIZE
# times for every grid size and store the "top" and "bottom" graphs as GEXF.
for answer_type in ["dpoc", "iam"]:
    dataset = {"dpoc": dpoc_dataset, "iam": iam_dataset}[answer_type]

    # nx.write_gexf does not create missing directories; make sure every
    # output folder exists before the expensive processing starts.
    for grid_size in GRID_SIZES:
        Path(f"{processed_graphs_path}/{answer_type}/{grid_size}").mkdir(
            parents=True, exist_ok=True
        )

    for sample in range(SAMPLE_SIZE):
        print(f"Processing {answer_type} sample {sample+1}/{SAMPLE_SIZE}")
        for grid_size in GRID_SIZES:

            # NOTE(review): random_state is the same for every `sample`. If the
            # reduction is deterministic under a fixed seed, all SAMPLE_SIZE
            # repetitions produce identical graphs - confirm this is intended.
            umap = UMAP(n_components=2, random_state=42, gridsize=grid_size)

            llm_mri = ActivationAreas(
                model=model_ckpt, device="cpu", dataset=dataset, reduction_method=umap
            )
            llm_mri.process_activation_areas()

            # One graph per label of the dataset
            g_top = llm_mri.get_graph("top")
            g_bottom = llm_mri.get_graph("bottom")

            nx.write_gexf(
                g_top,
                f"{processed_graphs_path}/{answer_type}/{grid_size}/g_top_{sample}.gexf",
            )
            nx.write_gexf(
                g_bottom,
                f"{processed_graphs_path}/{answer_type}/{grid_size}/g_bottom_{sample}.gexf",
            )