In [2]:
import pandas as pd
import numpy as np
import random
import os
import json
from datetime import datetime
import copy

# load datasets from huggingface hub
from datasets import load_dataset
from datasets import Dataset, DatasetDict
from datasets import Features, Value, ClassLabel, Sequence

# Lift pandas' display limits so wide frames and long text cells are shown in
# full: no cap on the number of columns or rows, and no truncation of the text
# inside a cell.
for display_option in (
    "display.max_columns",
    "display.max_rows",
    "display.max_colwidth",
):
    pd.set_option(display_option, None)

# Debug flag; none of the cells visible here read it.
debug = True

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def _load_test_frame(repo_id):
    """Fetch the ``test`` split of a Hub dataset as a DataFrame sorted by ``id``."""
    return load_dataset(repo_id, split="test").to_pandas().sort_values("id")


# Original and generic-to-brand ("g2b") versions of the MedQA USMLE 4-options set.
orig_filtered_medqa = _load_test_frame(
    "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original"
)
g2b_medqa = _load_test_frame(
    "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand"
)

# The same two versions of MedMCQA.
orig_filtered_medmcqa = _load_test_frame("AIM-Harvard/medmcqa_original")
g2b_medmcqa = _load_test_frame("AIM-Harvard/medmcqa_generic_to_brand")

# Pair every original row with its g2b counterpart by id. Columns that exist in
# both frames are disambiguated with an "_orig" / "_g2b" suffix.
pair_suffixes = ("_orig", "_g2b")
merged_medqa = orig_filtered_medqa.merge(g2b_medqa, on="id", suffixes=pair_suffixes)
merged_medmcqa = orig_filtered_medmcqa.merge(
    g2b_medmcqa, on="id", suffixes=pair_suffixes
)

# Save the paired frames as CSV, without the DataFrame index.
eval_csv_targets = (
    (merged_medqa, "../pre_filter_datasets/eval_csvs/orig_filtered_g2b_medqa.csv"),
    (merged_medmcqa, "../pre_filter_datasets/eval_csvs/orig_filtered_g2b_medmcqa.csv"),
)
for merged_frame, csv_path in eval_csv_targets:
    merged_frame.to_csv(csv_path, index=False)

# Load in Annotated Data

In [4]:
# Annotated evaluation sheets, one CSV per benchmark.
medmcqa_annotation_csv = "../pre_filter_datasets/eval_csvs/annotated_medmcqa_new.csv"
medqa_annotation_csv = "../pre_filter_datasets/eval_csvs/annotated_medqa_new.csv"

annotated_medmcqa_new = pd.read_csv(medmcqa_annotation_csv)
annotated_medqa_new = pd.read_csv(medqa_annotation_csv)

# Preview the first two annotated MedMCQA rows.
annotated_medmcqa_new.head(2)

Unnamed: 0,id,question_orig,opa_orig,opb_orig,opc_orig,opd_orig,cop_orig,choice_type_orig,exp_orig,subject_name_orig,topic_name_orig,found_keywords_orig,local_id_orig,Unnamed: 13,question_g2b,opa_g2b,opb_g2b,opc_g2b,opd_g2b,cop_g2b,choice_type_g2b,exp_g2b,subject_name_g2b,topic_name_g2b,found_keywords_g2b,local_id_g2b,Unnamed: 26,keep/drop,comments
0,006acfff-dc8f-4bb5-97b2-e26144c56483,PGE1 analogue is ?,Carboprost,Alprostadil,Epoprostenol,Dinoprostone,-1,single,,Pharmacology,,['carboprost' 'dinoprostone' 'alprostadil' 'epoprostenol'],4101,,PGE1 analogue is ?,hemabate,caverject,flolan,cervidil,-1,single,,Pharmacology,,['carboprost' 'dinoprostone' 'alprostadil' 'epoprostenol'],4101,,keep,
1,024f96d1-8881-4b52-a7f9-58e5b194a0fa,Which of the following cephalosporin is active against Pseudomonas aeruginosa:,Ceftriaxone,Cephalothin,Ceftazidime,Cefotaxime,-1,single,,Unknown,,['cefotaxime' 'ceftazidime' 'cephalothin' 'ceftriaxone'],1162,,Which of the following cephalosporin is active against Pseudomonas aeruginosa:,rocephin,keflin,fortaz,claforan,-1,single,,Unknown,,['cefotaxime' 'ceftazidime' 'cephalothin' 'ceftriaxone'],1162,,keep,


In [5]:
def _ids_not_marked_keep(annotated):
    """Return the ``id`` of every row whose penultimate column is not "keep".

    The penultimate column is compared as text on a temporary Series, so the
    caller's DataFrame is left untouched and blank cells stay missing instead
    of being rewritten as the string "nan". Any value other than the exact
    string "keep" (blank cells included) selects the row.

    NOTE(review): the penultimate column is "keep/drop" in the MedMCQA sheet;
    the MedQA sheet is assumed to have the same layout -- confirm.
    """
    decision = annotated.iloc[:, -2].astype(str)
    return annotated.loc[decision != "keep", "id"].tolist()


# IDs to remove from each benchmark.
rows_to_filter = _ids_not_marked_keep(annotated_medmcqa_new)
rows_to_filter_medqa = _ids_not_marked_keep(annotated_medqa_new)

print(f"Number of rows to filter in medmcqa: {len(rows_to_filter)}")
print(f"Number of rows to filter in medqa: {len(rows_to_filter_medqa)}")

Number of rows to filter in medmcqa: 82
Number of rows to filter in medqa: 63


In [6]:
# IDs of annotated rows whose penultimate column is anything other than "keep".
medmcqa_drop_mask = annotated_medmcqa_new.iloc[:, -2] != "keep"
medmcqa_rows_to_filter = annotated_medmcqa_new.loc[medmcqa_drop_mask, "id"].tolist()

medqa_drop_mask = annotated_medqa_new.iloc[:, -2] != "keep"
medqa_rows_to_filter = annotated_medqa_new.loc[medqa_drop_mask, "id"].tolist()


def _without_ids(frame, unwanted_ids):
    """Return the rows of ``frame`` whose ``id`` is not in ``unwanted_ids``."""
    return frame[~frame["id"].isin(unwanted_ids)]


# Remove the flagged ids from both the original and the g2b version of each
# benchmark.
filtered_orig_filtered_medmcqa = _without_ids(
    orig_filtered_medmcqa, medmcqa_rows_to_filter
)
filtered_g2b_medmcqa = _without_ids(g2b_medmcqa, medmcqa_rows_to_filter)

filtered_orig_filtered_medqa = _without_ids(orig_filtered_medqa, medqa_rows_to_filter)
filtered_g2b_medqa = _without_ids(g2b_medqa, medqa_rows_to_filter)

# Report the remaining row counts, then how many rows each frame lost.
row_count_report = [
    f"Number of rows in filtered_orig_filtered_medmcqa: {len(filtered_orig_filtered_medmcqa)}",
    f"Number of rows in filtered_g2b_medmcqa: {len(filtered_g2b_medmcqa)}",
    f"Number of rows in filtered_orig_filtered_medqa: {len(filtered_orig_filtered_medqa)}",
    f"Number of rows in filtered_g2b_medqa: {len(filtered_g2b_medqa)}",
    f"Difference in rows in filtered_orig_filtered_medmcqa: {len(orig_filtered_medmcqa) - len(filtered_orig_filtered_medmcqa)}",
    f"Difference in rows in filtered_g2b_medmcqa: {len(g2b_medmcqa) - len(filtered_g2b_medmcqa)}",
    f"Difference in rows in filtered_orig_filtered_medqa: {len(orig_filtered_medqa) - len(filtered_orig_filtered_medqa)}",
    f"Difference in rows in filtered_g2b_medqa: {len(g2b_medqa) - len(filtered_g2b_medqa)}",
]
for report_line in row_count_report:
    print(report_line)

Number of rows in filtered_orig_filtered_medmcqa: 457
Number of rows in filtered_g2b_medmcqa: 457
Number of rows in filtered_orig_filtered_medqa: 378
Number of rows in filtered_g2b_medqa: 378
Difference in rows in filtered_orig_filtered_medmcqa: 82
Difference in rows in filtered_g2b_medmcqa: 82
Difference in rows in filtered_orig_filtered_medqa: 63
Difference in rows in filtered_g2b_medqa: 63


## Write new parquet files with the filtered DataFrames in place of the test set

In [7]:
# Write each filtered frame into its dataset's test directory, with "_filtered"
# appended to the file name:
#   pre_filter_datasets/medmcqa/test
#   pre_filter_datasets/GBaker_MedQA-USMLE-4-options-hf/test
#
# index=False: after sorting by id and dropping rows, the frames no longer have
# a default RangeIndex. With the default setting pandas would store the leftover
# row labels as an extra "__index_level_0__" column, which the source test
# splits do not contain.
filtered_parquet_targets = {
    "../pre_filter_datasets/medmcqa/test/original_filtered.parquet": filtered_orig_filtered_medmcqa,
    "../pre_filter_datasets/medmcqa/test/generic_to_brand_filtered.parquet": filtered_g2b_medmcqa,
    "../pre_filter_datasets/GBaker_MedQA-USMLE-4-options-hf/test/original_filtered.parquet": filtered_orig_filtered_medqa,
    "../pre_filter_datasets/GBaker_MedQA-USMLE-4-options-hf/test/generic_to_brand_filtered.parquet": filtered_g2b_medqa,
}
for parquet_path, filtered_frame in filtered_parquet_targets.items():
    filtered_frame.to_parquet(parquet_path, index=False)