In [1]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Pinned to the release recorded in this cell's last run so that reruns are
# reproducible; %pip installs into the environment of the running kernel.
%pip install datasets==2.20.0

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/547.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.8 kB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB

In [3]:
import os

import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import StratifiedKFold, KFold
import datasets

In [4]:
# Root data folder under the mounted Drive; every dataset is looked up in the
# sub-folder named after its key in `downstream`.
data_dir = "/content/drive/MyDrive/Data/"
# Dataset name -> number of cross-validation folds to generate for it.
downstream = {
    "BioASQ": 10,
    "HoC": 5,
    "mednli": 3,
    "medqa": 3
}

In [9]:
def load_bioasq(data_dir):
    """Load the BioASQ train/dev/test splits from TSV files.

    Reads ``train.tsv``, ``dev.tsv`` and ``test.tsv`` from ``data_dir``. The
    first line of each file is consumed as a header and its names are replaced
    by ``id``, ``text_a``, ``text_b`` and ``label``.

    Args:
        data_dir: Directory containing the three TSV files. A trailing path
            separator is optional.

    Returns:
        A ``(train_df, dev_df, test_df)`` tuple of ``pandas.DataFrame``.
    """
    column_names = ["id", "text_a", "text_b", "label"]

    def read_split(split):
        # os.path.join (rather than string concatenation) so a directory
        # given without a trailing slash still resolves to the right file.
        return pd.read_csv(
            os.path.join(data_dir, f"{split}.tsv"),
            sep="\t",
            header=0,
            names=column_names,
        )

    train_df = read_split("train")
    dev_df = read_split("dev")
    test_df = read_split("test")

    return train_df, dev_df, test_df

def load_hoc(data_dir):
    """Load the HoC train/dev/test splits and decode their label strings.

    Reads ``train.tsv``, ``dev.tsv`` and ``test.tsv`` (tab-separated, with a
    header row) from ``data_dir``. Each file must contain a ``labels`` column
    whose cells are comma-separated ``<name>_<value>`` pairs; every cell is
    replaced by the list of its ``<value>`` parts converted to ``float``, in
    their original order.

    Args:
        data_dir: Directory containing the three TSV files.

    Returns:
        A ``(df_train, df_dev, df_test)`` tuple of ``pandas.DataFrame``.

    Raises:
        ValueError: If a pair has no underscore or its value is not a number.
    """

    def extract_true_labels(label_str):
        values = []
        for pair in label_str.split(','):
            # Split on the LAST underscore only, so a label name that itself
            # contains underscores does not break the unpacking.
            _name, value = pair.rsplit('_', 1)
            values.append(float(value))
        return values

    frames = []
    for split in ("train", "dev", "test"):
        frame = pd.read_csv(os.path.join(data_dir, f"{split}.tsv"), sep="\t")
        frame['labels'] = frame['labels'].apply(extract_true_labels)
        frames.append(frame)

    df_train, df_dev, df_test = frames
    return df_train, df_dev, df_test

def load_mednli(data_dir):
    """Load the MedNLI train/dev/test splits from JSON-lines files.

    Reads ``mli_train_v1.jsonl``, ``mli_dev_v1.jsonl`` and
    ``mli_test_v1.jsonl`` from ``data_dir``, keeps only the ``sentence1``,
    ``sentence2`` and ``gold_label`` fields and renames them to ``text_a``,
    ``text_b`` and ``label``.

    Args:
        data_dir: Directory containing the three ``.jsonl`` files. A trailing
            path separator is optional.

    Returns:
        A ``(train_df, dev_df, test_df)`` tuple of ``pandas.DataFrame``.
    """
    column_names = ['sentence1', 'sentence2', 'gold_label']
    renamed = {'sentence1': 'text_a', 'sentence2': 'text_b', 'gold_label': 'label'}

    def read_split(split):
        # os.path.join (rather than string concatenation) so a directory
        # given without a trailing slash still resolves to the right file.
        path = os.path.join(data_dir, f"mli_{split}_v1.jsonl")
        return pd.read_json(path, lines=True)[column_names].rename(columns=renamed)

    train_df = read_split("train")
    dev_df = read_split("dev")
    test_df = read_split("test")

    return train_df, dev_df, test_df

def load_medqa():
    """Fetch the MedQA dataset through ``datasets.load_dataset``.

    Returns:
        The object ``datasets.load_dataset`` returns for the identifier
        ``"GBaker/MedQA-USMLE-4-options"``.
    """
    medqa = datasets.load_dataset("GBaker/MedQA-USMLE-4-options")
    return medqa


In [10]:
def save(folder, df_train, df_val):
  """Write one fold's training and validation frames as TSV files.

  Creates ``folder`` (and any missing parent directories) when necessary, then
  writes ``df_train`` to ``train.tsv`` and ``df_val`` to ``val.tsv`` inside
  it. Existing files are overwritten. The DataFrame index is written as the
  first column (``to_csv`` default).

  Args:
    folder: Destination directory for the two files.
    df_train: ``pandas.DataFrame`` stored as ``train.tsv``.
    df_val: ``pandas.DataFrame`` stored as ``val.tsv``.
  """
  # exist_ok=True replaces the check-then-create sequence: it is race-free and
  # keeps re-running the cell idempotent.
  os.makedirs(folder, exist_ok=True)

  df_train.to_csv(os.path.join(folder, "train.tsv"), sep="\t")
  df_val.to_csv(os.path.join(folder, "val.tsv"), sep="\t")

for dataset in downstream.keys():
  data_path = f"{os.path.join(data_dir, dataset)}/"

  match dataset:
    case "BioASQ":
      train, dev, test = load_bioasq(f"{data_path}old_structure/")
    case "HoC":
      train, dev, test = load_hoc(f"{data_path}old_structure/")
    case "mednli":
      train, dev, test = load_mednli(f"{data_path}old_structure/")
    case "medqa":
      data = load_medqa()

  if dataset != "medqa":
    full = pd.concat([train, dev])
  else:
    train = data["train"]
    full = pd.DataFrame(train)
    test = data["test"]
    pd.DataFrame(test).to_csv(f"{data_path}test.tsv", sep="\t")

  n_folds = downstream[dataset]
  skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=17)
  kf = KFold(n_splits=n_folds, shuffle=True, random_state=17)

  if dataset == "BioASQ" or dataset == "mednli":
    for fold, (train_idx, val_idx) in enumerate(skf.split(full.index, full["label"])):
      train_df = full.iloc[train_idx]
      val_df = full.iloc[val_idx]

      subfolder = f"{data_path}fold_{fold}"
      save(subfolder, train_df, val_df)

  else:
    for fold, (train_idx, val_idx) in enumerate(kf.split(full.index)):
      train_df = full.iloc[train_idx]
      val_df = full.iloc[val_idx]

      subfolder = f"{data_path}fold_{fold}"
      save(subfolder, train_df, val_df)

  pd.DataFrame(test).to_csv(f"{data_path}test.tsv", sep="\t")
