# Dataset Statistics/exploration

Try to understand the quality of the data better

In [1]:
import os
from pathlib import Path
os.chdir(Path.cwd().parent)   # go one level up
print(os.getcwd())            # check
from xflow import ConfigManager, SqlProvider, PyTorchPipeline, show_model_info
from xflow.data import build_transforms_from_config
from xflow.utils import load_validated_config, save_image
import xflow.extensions.physics
from config_utils import load_config, detect_machine
from utils import *

# Detect machine once
experiment_name = "CAE_validate_clear"
machine = detect_machine()

# Use it in load_config
config = load_config(
    f"{experiment_name}.yaml",
    machine=machine
)

def make_dataset(provider, transforms):
    pipeline = PyTorchPipeline(provider, transforms)
    dataset = pipeline.to_memory_dataset(config["data"]["dataset_ops"])
    return dataset, pipeline.in_memory_sample_count

c:\Users\qiyuanxu\Documents\GitHub\fiber-image-reconstruction-comparison
[config_utils] Using machine profile: win-qiyuanxu


In [2]:
# ==================== 
# Prepare Dataset (Wednesday Chromox)
# ====================

test_dir = config["paths"]["chromox_01"]
# Create SqlProvider to query the database
db_path = f"{test_dir}/db/dataset_meta.db"
query = """
SELECT 
    image_path
FROM mmf_dataset_metadata 
WHERE batch IN (10, 11, 12)
--LIMIT 20
"""
realbeam_provider = SqlProvider(
    sources={"connection": db_path, "sql": query}, output_config={'list': "image_path"}
)
train_provider, evaluation_provider = realbeam_provider.split(ratio=config["data"]["train_val_split"], seed=config["seed"])
val_provider, test_provider = evaluation_provider.split(ratio=config["data"]["val_test_split"], seed=config["seed"])

# For train dataset
config["data"]["transforms"]["torch"].insert(0, {
    "name": "add_parent_dir",
    "params": {
        "parent_dir": test_dir
    }
})
transforms = build_transforms_from_config(config["data"]["transforms"]["torch"])
train_dataset, n1 = make_dataset(train_provider, transforms)
val_dataset, n2 = make_dataset(val_provider, transforms)
test_dataset, n3 = make_dataset(test_provider, transforms)

Loading data into memory:   0%|          | 0/263 [00:00<?, ?it/s]Failed to preprocess item in transform 'multi_transform': Centroid (143.6953582763672, 181.52749633789062) outside rectangle bounds
Failed to preprocess item in transform 'multi_transform': Centroid (140.02857971191406, 92.65306091308594) outside rectangle bounds
Loading data into memory:   8%|▊         | 21/263 [00:00<00:01, 205.70it/s]Failed to preprocess item in transform 'multi_transform': Centroid (115.63353729248047, 185.60708618164062) outside rectangle bounds
Failed to preprocess item in transform 'multi_transform': Centroid (129.43499755859375, 89.33609008789062) outside rectangle bounds
Failed to preprocess item in transform 'multi_transform': Centroid (142.36422729492188, 87.2079849243164) outside rectangle bounds
Loading data into memory:  15%|█▌        | 40/263 [00:00<00:01, 195.34it/s]Failed to preprocess item in transform 'multi_transform': Centroid (143.44810485839844, 85.41702270507812) outside rectangle 

In [6]:
realbeam_provider()

['dataset/10/1763562612691531700.png',
 'dataset/10/1763562614458815400.png',
 'dataset/10/1763562616142945200.png',
 'dataset/10/1763562617803188000.png',
 'dataset/10/1763562619504166300.png',
 'dataset/10/1763562621181598100.png',
 'dataset/10/1763562623057639400.png',
 'dataset/10/1763562624913657800.png',
 'dataset/10/1763562626609045800.png',
 'dataset/10/1763562628537380500.png',
 'dataset/10/1763562630300575100.png',
 'dataset/10/1763562632006963800.png',
 'dataset/10/1763562633721144500.png',
 'dataset/10/1763562635611562500.png',
 'dataset/10/1763562637325415200.png',
 'dataset/10/1763562639055205300.png',
 'dataset/10/1763562640803819600.png',
 'dataset/10/1763562642552485400.png',
 'dataset/10/1763562644221881100.png',
 'dataset/10/1763562646314752500.png',
 'dataset/10/1763562647998036300.png',
 'dataset/10/1763562649712443500.png',
 'dataset/10/1763562651360805800.png',
 'dataset/10/1763562653116598700.png',
 'dataset/10/1763562654795746300.png',
 'dataset/10/176356265645