In [18]:
import os
import glob
import numpy as np
import pandas as pd

In [None]:
# given a directory path, get the files matching the file type. this is useful for gathering patches, etc.

def get_file_names(directory, extension):
    """Return the names of the files in ``directory`` ending in ``.extension``.

    The directory is searched with an explicit path pattern, so the current
    working directory of the process is left untouched and relative paths
    used elsewhere keep resolving from the same place.

    Args:
        directory: Path of the folder to search (sub-folders are not searched).
        extension: File extension to match, without the leading dot (e.g. 'txt').

    Returns:
        Sorted list of the matching names, relative to ``directory``.

    Raises:
        FileNotFoundError: If ``directory`` is not an existing directory.
    """
    directory = os.fspath(directory)
    if not os.path.isdir(directory):
        raise FileNotFoundError(f'Directory not found: {directory!r}')

    # glob.escape keeps characters such as '[' in the directory path from
    # being interpreted as wildcards; only the file-name part is a pattern.
    pattern = os.path.join(glob.escape(directory), f'*.{extension}')

    # Strip the directory part so callers still receive bare file names, and
    # sort because the order produced by glob is not guaranteed.
    return sorted(os.path.basename(path) for path in glob.glob(pattern))

# Example: list every file of one type found in a directory
directory = 'path_to_your_directory'  # folder to search
extension = 'txt'  # file type to match
file_names = get_file_names(directory=directory, extension=extension)
print(file_names)

In [19]:
# Given a directory, return the names of its sub-folders. This is useful when
# the WSI files are stored in folders named after the WSI ids: the folder
# names can then be used to construct our datasets.

def get_folder_names(directory):
    """Return the names of the entries in ``directory`` that are folders.

    Args:
        directory: Path of the folder whose entries are inspected.

    Returns:
        List of sub-folder names (not full paths), in the order reported by
        ``os.listdir``.
    """
    sub_folders = []
    for entry in os.listdir(directory):
        # Entries are bare names, so rebuild the full path before testing it.
        if os.path.isdir(os.path.join(directory, entry)):
            sub_folders.append(entry)
    return sub_folders

# Example: collect the WSI folder names and report how many were found
directory = 'insert_path_to_directory_for_WSI_files (to get file names)'
folder_names = get_folder_names(directory=directory)
print(folder_names)
print(len(folder_names))

['00a26aaa82c959624d90dfb69fcf259c', '00a76bfbec239fd9f465d6581806ff42', '00a7fb880dc12c5de82df39b30533da9', '00a97615a51ba4c475bdec8505623bf9', '00bbc1482301d16de3ff63238cfd0b34', '00c15b23b30a5ba061358d9641118904', '00c46b336b5b06423fcdec1b4d5bee06', '00c52cb4db1c7a5811a8f070a910c038', '00ca0c23961b5510be46c917be11c43e', '00d7ec94436e3a1416a3b302914957d3', '00d8a8c04886379e266406fdeff81c45', '0a0f8e20b1222b69416301444b117678', '0a107c91216d62b2543122a46eb26541', '0a14e008bce3dbcce2470ca1123bd565', '0a336d5bc71e1f83d453297413f11e9b', '0a3bf331f951938b678a1736ea8d8399', '0a4b7a7499ed55c71033cefb0765e93d', '0a5b11fe9c149f5e2e41a5d16dc5f314', '0a5d6a20d1429dd55b94ae1b857ad573', '0a619ab32b0cd639d989cce1e1e17da0', '0a6c5a120961974a7dae8cf11245ff73', '0a6e7a0cfe6a3203b4cb28d6ff6daa6a', '0a75b377181b60efd8278bce0b6260a5', '0a81500dc48b2bb7ee19849502360270', '0a836ea85157d6e0b92371d8ae18a55c', '0a848ccbbb065ef5ee59dd01710f8531', '0a8b2cefacdad96c6005799832055629', '0a8bc9de5acf2d4a667a452880

In [None]:
# Split the sample names into train/validation/test subsets and write each
# subset to a .txt file (one name per line). These files can be used in
# conjunction with the master csv file to create the data .txt files
# in the correct delimited structure.

def split_dataset(file_names, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, seed=None):
    """Randomly split a collection of names into train/validation/test lists.

    The input is copied before shuffling, so the caller's sequence keeps its
    original order and re-running the split always starts from the same data.

    Args:
        file_names: Sequence of sample names (file or folder names).
        train_ratio: Fraction of the names assigned to the training subset.
        val_ratio: Fraction of the names assigned to the validation subset.
        test_ratio: Fraction for the test subset. It is only used in the
            sum check; the test subset receives whatever is left after the
            train and validation subsets have been cut.
        seed: Optional seed for a reproducible shuffle. When None, the global
            NumPy random state is used (as controlled by ``np.random.seed``).

    Returns:
        Tuple ``(train_files, val_files, test_files)`` of lists.

    Raises:
        AssertionError: If the three ratios do not sum to 1.
    """
    # Compare with a tolerance: sums of binary floats such as
    # 0.6 + 0.3 + 0.1 are not exactly 1.
    ratio_sum = train_ratio + val_ratio + test_ratio
    assert abs(ratio_sum - 1.0) < 1e-9, "Ratios must sum to 1"

    # Shuffle a private copy so the caller's list is never reordered.
    shuffled = list(file_names)
    rng = np.random if seed is None else np.random.default_rng(seed)
    rng.shuffle(shuffled)

    # Subset sizes are truncated; the remainder ends up in the test subset.
    total_files = len(shuffled)
    train_size = int(total_files * train_ratio)
    val_size = int(total_files * val_ratio)
    val_end = train_size + val_size

    train_files = shuffled[:train_size]
    val_files = shuffled[train_size:val_end]
    test_files = shuffled[val_end:]

    return train_files, val_files, test_files

def write_to_file(file_names, file_path):
    """Write every entry of ``file_names`` on its own line of ``file_path``.

    The target file is created, or overwritten if it already exists, and each
    name is followed by a newline.

    Args:
        file_names: Iterable of strings to write.
        file_path: Path of the text file to produce.
    """
    with open(file_path, 'w') as out:
        out.writelines(name + '\n' for name in file_names)


# Split the sample ids into the three subsets. Pass whichever list holds the
# names that correspond with the sample ids: folder_names here, or file_names.
train_files, val_files, test_files = split_dataset(folder_names)

# Write each subset to its own file, one name per line
subset_outputs = (
    (train_files, 'train.txt'),  # training dataset
    (val_files, 'val.txt'),      # validation dataset
    (test_files, 'test.txt'),    # testing dataset
)
for subset_names, subset_path in subset_outputs:
    write_to_file(subset_names, subset_path)

In [15]:
# Takes a csv file and a txt file of sample ids (one per line) and writes a new csv file containing only the rows whose key column value appears in the txt file.
def filter_csv_by_txt(csv_file, txt_file, output_csv_file, key_column):
    """Write the rows of a CSV whose key appears in a text file of keys.

    Args:
        csv_file: Path of the CSV file to filter.
        txt_file: Path of a text file holding one key per line.
        output_csv_file: Path of the CSV file to write (without the index).
        key_column: Name of the column in ``csv_file`` compared to the keys.
    """
    table = pd.read_csv(csv_file)

    # Keys are taken line by line, with the line endings removed.
    with open(txt_file, 'r') as key_source:
        wanted_keys = key_source.read().splitlines()

    # Keep only the rows whose key is listed, preserving the CSV's row order.
    keep_row = table[key_column].isin(wanted_keys)
    table.loc[keep_row].to_csv(output_csv_file, index=False)

# Usage: switch on the subset(s) whose filtered csv should be regenerated
train_set = False
val_set = False
test_set = True
#filter_csv_by_txt('path_to_your_csv_file', 'path_to_your_txt_file', 'path_to_output_csv_file', 'name_of_key_column')
# Each job: (enabled flag, txt file with the sample ids, csv file to write)
filter_jobs = (
    (train_set, '../tmi2022/train_set.txt', '../tmi2022/data/our_project.csv'),
    (val_set, '../tmi2022/val_set.txt', '../tmi2022/data/our_project_val.csv'),
    (test_set, '../tmi2022/test_set.txt', '../tmi2022/data/our_project_test.csv'),
)
for job_enabled, ids_txt_path, filtered_csv_path in filter_jobs:
    if job_enabled:
        filter_csv_by_txt('../train_data/train.csv', ids_txt_path, filtered_csv_path, 'image_id')


In [16]:
def csv_to_txt(csv_file, txt_file, col1, col2, term):
    """Export two columns of a CSV as a tab-separated text file without header.

    Values of ``col1`` are converted to strings and prefixed with ``term``.
    Values of ``col2`` greater than 1 are collapsed to 2, after which 0, 1
    and 2 are replaced by 'normal', 'mild' and 'severe'.

    Args:
        csv_file: Path of the CSV file to read.
        txt_file: Path of the tab-delimited text file to write.
        col1: Name of the column that receives the prefix.
        col2: Name of the column holding the numeric class labels.
        term: Prefix put in front of every ``col1`` value (converted with str).
    """
    def collapse_label(value):
        # Every label above 1 is merged into the top class (2).
        if value > 1:
            return 2
        return value

    label_names = {0: 'normal', 1: 'mild', 2: 'severe'}

    table = pd.read_csv(csv_file)

    # Prefix the first column, e.g. 'THREE-' + id.
    prefix = str(term)
    table[col1] = prefix + table[col1].astype(str)

    # Contract the class labels, then swap the integers for their names.
    table[col2] = table[col2].apply(collapse_label)
    table[col2] = table[col2].replace(label_names)

    # Only the two processed columns are exported, separated by a tab.
    table[[col1, col2]].to_csv(txt_file, sep='\t', index=False, header=False)

# Switch on the subset(s) whose label file should be regenerated
train_set = False
val_set = False
test_set = True

#csv_to_txt('path_to_your_csv_file', 'path_to_output_txt_file', 'name_of_first_column', 'name_of_second_column', 'prefix_term')
# Each job: (enabled flag, filtered csv file to read, txt file to write)
label_jobs = (
    (train_set, '../tmi2022/data/our_project.csv', '../tmi2022/data/our_project_mod_3.txt'),
    (val_set, '../tmi2022/data/our_project_val.csv', '../tmi2022/data/our_project_mod_3_val.txt'),
    (test_set, '../tmi2022/data/our_project_test.csv', '../tmi2022/data/our_project_mod_3_test.txt'),
)
for job_enabled, source_csv_path, labels_txt_path in label_jobs:
    if job_enabled:
        csv_to_txt(source_csv_path, labels_txt_path, 'image_id', 'isup_grade', 'THREE-')


In [None]:
# Visualize Quick Aid
# Loads a saved object from disk and prints it for inspection. 'path' is a
# placeholder to be replaced with the location of the saved file.
import torch

# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code while loading; only open files that come from a trusted source.
model = torch.load('path')
print(model)