In [5]:
import json
import os
import pandas as pd
from ast import literal_eval
from pathlib import Path

# Display option only: passing None as max_colwidth turns off truncation of long
# cell values when DataFrames are rendered in the notebook.
pd.set_option("display.max_colwidth", None)

In [1]:
##################
#### CRC 100K ####
##################


# Roots for building the matching training/test sample files for ViT training
# on CRC 100K: result JSON files are read from below src_path_base, and the
# generated files go below dest_path_base.
src_path_base = "./Results/CRC100K"
dest_path_base = "./VisionModels/CRC100K"

# Destination folders, one per result family.
binary_dest_path = os.path.join(dest_path_base, "binary")
complete_dest_path = os.path.join(dest_path_base, "complete")
knn_dest_path = os.path.join(dest_path_base, "knn")

# Binary results: keep only JSON files whose path mentions "interleaved" or "zero_shot".
binary_src_path = [
    json_file
    for json_file in Path(src_path_base, "binary").rglob("*.json")
    if any(tag in str(json_file) for tag in ("interleaved", "zero_shot"))
]

# Complete results: every JSON file except those whose path mentions "deprecated".
complete_src_path = [
    json_file
    for json_file in Path(src_path_base, "complete").rglob("*.json")
    if "deprecated" not in str(json_file)
]

# KNN results: every JSON file found recursively, unfiltered.
knn_src_path = [json_file for json_file in Path(src_path_base, "KNN").rglob("*.json")]

def _train_mapping_from_sample(sample, label_sep):
    """Turn one record's ``sample`` entry into a ``{file name: label}`` dict.

    Args:
        sample: The record's ``"sample"`` value. Only dicts are inspected.
        label_sep: Separator between the class label and the rest of the file
            name; the label is everything before its first occurrence.

    Returns:
        ``None`` when ``sample`` is not a dict or has no ``"multi_shot_mappings"``
        entry; otherwise a dict mapping each referenced file name (the part
        after the last ``/``) to the label parsed from that file name.
    """
    if not isinstance(sample, dict):
        return None
    shot_mappings = sample.get("multi_shot_mappings")
    if shot_mappings is None:
        return None

    # Flatten all path lists, keeping only the final path component.
    file_names = [
        shot_path.rsplit("/", 1)[-1]
        for shot_paths in shot_mappings.values()
        for shot_path in shot_paths
    ]
    return {name: name.split(label_sep)[0] for name in file_names}


def make_train_datafile(paths, destination, label_sep="-"):
    """Convert result JSON files into per-run CSV files listing the training samples.

    For every JSON file, each record that is a dict containing the keys
    ``label``, ``fname`` and ``sample`` becomes one CSV row with the columns
    ``label``, ``fname`` and ``train_data``. ``train_data`` holds the
    ``{file name: label}`` mapping derived from
    ``sample["multi_shot_mappings"]`` and is empty when that entry is absent.

    The CSV is written (with the DataFrame index as first column) to
    ``<destination>/<name of the JSON file's parent folder>/<JSON stem>.csv``;
    missing folders are created. The output path without the ``.csv``
    extension is printed.

    Args:
        paths: Iterable of JSON file paths (``pathlib.Path`` or ``str``).
        destination: Root folder for the generated CSV files.
        label_sep: Separator between class label and the rest of a training
            file name. Defaults to ``"-"``.

    Raises:
        OSError: If a JSON file cannot be read or a CSV cannot be written.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    required_keys = ("label", "fname", "sample")

    for path in paths:
        path = Path(path)  # accept plain strings as well as Path objects

        with open(path, mode="r", encoding="utf-8") as f:
            data = json.load(f)

        # Build plain row dicts once instead of one single-row DataFrame per record.
        records = [
            {
                "label": entry["label"],
                "fname": entry["fname"],
                "train_data": _train_mapping_from_sample(entry["sample"], label_sep),
            }
            for entry in data
            if isinstance(entry, dict) and all(key in entry for key in required_keys)
        ]

        # Without this guard a file with no usable record would abort the whole batch.
        if not records:
            print(f"Skipping {path}: no records with keys {list(required_keys)}")
            continue

        df = pd.DataFrame(records, columns=["label", "fname", "train_data"])

        # Group outputs by the name of the folder that directly contains the JSON file.
        save_dir = os.path.join(destination, path.parent.name)
        os.makedirs(save_dir, exist_ok=True)

        save_path = os.path.join(save_dir, path.stem)
        print(save_path)
        df.to_csv(f"{save_path}.csv")


# run
# make_train_datafile(complete_src_path, complete_dest_path)
# make_train_datafile(binary_src_path, binary_dest_path)
# make_train_datafile(knn_src_path, knn_dest_path)

In [2]:
##############
#### PCAM ####
##############


# Roots for building the matching training/test sample files for ViT training
# on PCam: result JSON files are read from below src_path_base, and the
# generated files go below dest_path_base.
src_path_base = "./Results/PCam"
dest_path_base = "./VisionModels/PCam"

# KNN results: destination folder and every JSON file found recursively.
knn_dest_path = os.path.join(dest_path_base, "knn")
knn_src_path = [json_file for json_file in Path(src_path_base, "KNN").rglob("*.json")]

def _train_mapping_from_sample(sample, label_sep):
    """Turn one record's ``sample`` entry into a ``{file name: label}`` dict.

    Args:
        sample: The record's ``"sample"`` value. Only dicts are inspected.
        label_sep: Separator between the class label and the rest of the file
            name; the label is everything before its first occurrence.

    Returns:
        ``None`` when ``sample`` is not a dict or has no ``"multi_shot_mappings"``
        entry; otherwise a dict mapping each referenced file name (the part
        after the last ``/``) to the label parsed from that file name.
    """
    if not isinstance(sample, dict):
        return None
    shot_mappings = sample.get("multi_shot_mappings")
    if shot_mappings is None:
        return None

    # Flatten all path lists, keeping only the final path component.
    file_names = [
        shot_path.rsplit("/", 1)[-1]
        for shot_paths in shot_mappings.values()
        for shot_path in shot_paths
    ]
    return {name: name.split(label_sep)[0] for name in file_names}


def make_train_datafile(paths, destination, label_sep="-"):
    """Convert result JSON files into per-run CSV files listing the training samples.

    For every JSON file, each record that is a dict containing the keys
    ``label``, ``fname`` and ``sample`` becomes one CSV row with the columns
    ``label``, ``fname`` and ``train_data``. ``train_data`` holds the
    ``{file name: label}`` mapping derived from
    ``sample["multi_shot_mappings"]`` and is empty when that entry is absent.

    The CSV is written (with the DataFrame index as first column) to
    ``<destination>/<name of the JSON file's parent folder>/<JSON stem>.csv``;
    missing folders are created. The output path without the ``.csv``
    extension is printed.

    Args:
        paths: Iterable of JSON file paths (``pathlib.Path`` or ``str``).
        destination: Root folder for the generated CSV files.
        label_sep: Separator between class label and the rest of a training
            file name. Defaults to ``"-"``.

    Raises:
        OSError: If a JSON file cannot be read or a CSV cannot be written.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    required_keys = ("label", "fname", "sample")

    for path in paths:
        path = Path(path)  # accept plain strings as well as Path objects

        with open(path, mode="r", encoding="utf-8") as f:
            data = json.load(f)

        # Build plain row dicts once instead of one single-row DataFrame per record.
        records = [
            {
                "label": entry["label"],
                "fname": entry["fname"],
                "train_data": _train_mapping_from_sample(entry["sample"], label_sep),
            }
            for entry in data
            if isinstance(entry, dict) and all(key in entry for key in required_keys)
        ]

        # Without this guard a file with no usable record would abort the whole batch.
        if not records:
            print(f"Skipping {path}: no records with keys {list(required_keys)}")
            continue

        df = pd.DataFrame(records, columns=["label", "fname", "train_data"])

        # Group outputs by the name of the folder that directly contains the JSON file.
        save_dir = os.path.join(destination, path.parent.name)
        os.makedirs(save_dir, exist_ok=True)

        save_path = os.path.join(save_dir, path.stem)
        print(save_path)
        df.to_csv(f"{save_path}.csv")

# run
# make_train_datafile(knn_src_path, knn_dest_path)

In [3]:
##############
#### MHIST ###
##############


# Roots for building the matching training/test sample files for ViT training
# on MHIST: result JSON files are read from below src_path_base, and the
# generated files go below dest_path_base.
src_path_base = "./Results/MHIST"
dest_path_base = "./VisionModels/MHIST"

# KNN results: destination folder and every JSON file found recursively.
knn_dest_path = os.path.join(dest_path_base, "knn")
knn_src_path = [json_file for json_file in Path(src_path_base, "KNN").rglob("*.json")]

def _train_mapping_from_sample(sample, label_sep):
    """Turn one record's ``sample`` entry into a ``{file name: label}`` dict.

    Args:
        sample: The record's ``"sample"`` value. Only dicts are inspected.
        label_sep: Separator between the class label and the rest of the file
            name; the label is everything before its first occurrence.

    Returns:
        ``None`` when ``sample`` is not a dict or has no ``"multi_shot_mappings"``
        entry; otherwise a dict mapping each referenced file name (the part
        after the last ``/``) to the label parsed from that file name.
    """
    if not isinstance(sample, dict):
        return None
    shot_mappings = sample.get("multi_shot_mappings")
    if shot_mappings is None:
        return None

    # Flatten all path lists, keeping only the final path component.
    file_names = [
        shot_path.rsplit("/", 1)[-1]
        for shot_paths in shot_mappings.values()
        for shot_path in shot_paths
    ]
    return {name: name.split(label_sep)[0] for name in file_names}


def make_train_datafile(paths, destination, label_sep="_"):
    """Convert result JSON files into per-run CSV files listing the training samples.

    For every JSON file, each record that is a dict containing the keys
    ``label``, ``fname`` and ``sample`` becomes one CSV row with the columns
    ``label``, ``fname`` and ``train_data``. ``train_data`` holds the
    ``{file name: label}`` mapping derived from
    ``sample["multi_shot_mappings"]`` and is empty when that entry is absent.

    The CSV is written (with the DataFrame index as first column) to
    ``<destination>/<name of the JSON file's parent folder>/<JSON stem>.csv``;
    missing folders are created. The output path without the ``.csv``
    extension is printed.

    Args:
        paths: Iterable of JSON file paths (``pathlib.Path`` or ``str``).
        destination: Root folder for the generated CSV files.
        label_sep: Separator between class label and the rest of a training
            file name. Defaults to ``"_"``.

    Raises:
        OSError: If a JSON file cannot be read or a CSV cannot be written.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    required_keys = ("label", "fname", "sample")

    for path in paths:
        path = Path(path)  # accept plain strings as well as Path objects

        with open(path, mode="r", encoding="utf-8") as f:
            data = json.load(f)

        # Build plain row dicts once instead of one single-row DataFrame per record.
        records = [
            {
                "label": entry["label"],
                "fname": entry["fname"],
                "train_data": _train_mapping_from_sample(entry["sample"], label_sep),
            }
            for entry in data
            if isinstance(entry, dict) and all(key in entry for key in required_keys)
        ]

        # Without this guard a file with no usable record would abort the whole batch.
        if not records:
            print(f"Skipping {path}: no records with keys {list(required_keys)}")
            continue

        df = pd.DataFrame(records, columns=["label", "fname", "train_data"])

        # Group outputs by the name of the folder that directly contains the JSON file.
        save_dir = os.path.join(destination, path.parent.name)
        os.makedirs(save_dir, exist_ok=True)

        save_path = os.path.join(save_dir, path.stem)
        print(save_path)
        df.to_csv(f"{save_path}.csv")

# run
# make_train_datafile(knn_src_path, knn_dest_path)