In [1]:
import sys
from pathlib import Path

import pandas as pd
from rich import print
from rich.panel import Panel
from rich.pretty import Pretty

# Make the project's own packages (`modules.*`) importable: resolve the directory
# two levels above the working directory and register it on `sys.path` once.
abs_module_path = Path("./../../").resolve()
if abs_module_path.exists():
    # Skip the append when the entry is already present so that re-running
    # this cell does not accumulate duplicate `sys.path` entries.
    if str(abs_module_path) not in sys.path:
        sys.path.append(str(abs_module_path))

from modules.data.processeddatainstance import ProcessedDataInstance
from modules.shared.clioutput import CLIOutput
from modules.shared.config import load_config
from modules.shared.utils import get_repo_root

In [2]:
""" Detect Repository """
# Show the repository root returned by `get_repo_root()` so the reader can confirm
# which checkout this notebook is operating on (`print` here is `rich.print`).
print(f"Repository: '{get_repo_root()}'")

In [3]:
# Split fractions for the two-stage split performed further down.
# The two names are easy to confuse, so note what each one applies to:

# Stage 1: fraction of ALL rows sampled into `training_df`;
# the remaining rows become the test set.
training_ratio: float = 0.8

# Stage 2: fraction of `training_df` sampled into `train_df`;
# the remaining rows become the validation set.
train_ratio: float = 0.9

In [4]:
# Shared state for the rest of the notebook: CLI output helper, configuration,
# and the random seed used by both sampling steps and by the output file name.
cli_out = CLIOutput()
# NOTE(review): this calls a private method of CLIOutput; presumably it names the
# logger used for this notebook's messages — confirm against the class.
cli_out._set_logger("Split Dataset")
config = load_config("0.5.cluster_data.toml")
# The seed is read from the config's "cluster" -> "random_seed" entry.
random_seed = config["cluster"]["random_seed"]

In [5]:
# Load the tabular data that will be split.
# NOTE(review): the earlier note on this cell said it reads `data.csv`; the actual
# path is whatever `processed_di.tabular_file` holds after `parse_config` — confirm.
processed_di = ProcessedDataInstance()
processed_di.parse_config(config)
# `index_col=[0]` turns the first CSV column into the row index; the split cells
# below select rows by these index labels. 'utf_8_sig' accepts a leading UTF-8 BOM
# and is the same encoding used when the result is written back out.
df = pd.read_csv(processed_di.tabular_file, encoding='utf_8_sig', index_col=[0])
cli_out.divide()  # presumably prints a visual separator — confirm against CLIOutput



| 2024-01-10 08:42:20,031 | Processed Data Instance | INFO | Instance Root: 'C:\Users\confocal_microscope\Desktop\WorkingDir\ZebraFish_DB\{Data}_Processed\{20230904_update}_Academia_Sinica_i85'
| 2024-01-10 08:42:20,031 | Processed Data Instance | INFO | Palmskin Processed Dir: 'C:\Users\confocal_microscope\Desktop\WorkingDir\ZebraFish_DB\{Data}_Processed\{20230904_update}_Academia_Sinica_i85\{m113_m222_ku15}_PalmSkin_preprocess'
| 2024-01-10 08:42:20,032 | Processed Data Instance | INFO | Brightfield Processed Dir: 'C:\Users\confocal_microscope\Desktop\WorkingDir\ZebraFish_DB\{Data}_Processed\{20230904_update}_Academia_Sinica_i85\{autothres_triangle}_BrightField_analyze'
| 2024-01-10 08:42:20,046 | Processed Data Instance | INFO | Brightfield Recollect Dir: 'C:\Users\confocal_microscope\Desktop\WorkingDir\ZebraFish_DB\{Data}_Processed\{20230904_update}_Academia_Sinica_i85\{autothres_triangle}_BrightField_reCollection'
| 2024-01-10 08:42:20,046 | Processed Data Instance | INFO | data

In [6]:
# Two-stage random split, keyed on the DataFrame's index labels:
#   1. df          -> training_df (frac=training_ratio) + test_df  (the remainder)
#   2. training_df -> train_df    (frac=train_ratio)    + valid_df (the remainder)
# Both stages sample without replacement and reuse `random_seed` for reproducibility.
#
# Each remainder is selected as "index label not in the sampled part". That is only
# correct when labels are unique: with a duplicated label, sampling one of its rows
# would also exclude its siblings from the remainder, silently dropping them from
# every subset. Fail loudly instead of producing an incomplete split.
if not df.index.is_unique:
    raise ValueError(
        "Cannot split the dataset: the DataFrame index contains duplicate labels."
    )

training_df: pd.DataFrame = df.sample(frac=training_ratio, replace=False, random_state=random_seed)
test_df: pd.DataFrame = df[~df.index.isin(training_df.index)]

train_df: pd.DataFrame = training_df.sample(frac=train_ratio, replace=False, random_state=random_seed)
valid_df: pd.DataFrame = training_df[~training_df.index.isin(train_df.index)]

# Sanity checks: each stage must partition its input exactly.
assert len(training_df) + len(test_df) == len(df), "stage 1 split lost or duplicated rows"
assert len(train_df) + len(valid_df) == len(training_df), "stage 2 split lost or duplicated rows"

In [7]:
# Show the number of rows in the original table and in every derived subset,
# rendered as a pretty-printed dict inside a fixed-width rich panel.
tmp_dict = {
    "original_df": len(df),
    "training_df": len(training_df),
    "train_df": len(train_df),
    "test_df": len(test_df),
    "valid_df": len(valid_df),
}
print(Panel(Pretty(tmp_dict, expand_all=True), width=100))

In [8]:
# Record each row's subset in the "dataset" column of the original table.
# One vectorised assignment per subset replaces the per-row Python loops: the
# boolean mask selects every row of `df` whose index label belongs to the subset,
# which is the same label-based selection the row-by-row `.loc` writes performed.
# The column is created by the first assignment if it does not exist yet.
df.loc[df.index.isin(train_df.index), "dataset"] = "train"
df.loc[df.index.isin(valid_df.index), "dataset"] = "valid"
df.loc[df.index.isin(test_df.index), "dataset"] = "test"

In [9]:
# Persist the labelled table under the instance root. The seed is part of the
# file name, so splits produced with different seeds are written to different files.
df.to_csv(
    path_or_buf=processed_di.instance_root.joinpath(f"datasplit_{random_seed}.csv"),
    encoding='utf_8_sig',
)
cli_out.new_line()


