In [11]:
from pathlib import Path
import numpy as np
import tempfile
import os

# =======================
# SETTINGS
# =======================
# Source file holding the dataset to split.
INPUT_PATH = Path("moving_target_dataset.npy")

# Folder for the part files: "<input stem>_parts", relative to the working directory.
OUT_DIR = Path(INPUT_PATH.stem + "_parts")

# Target upper bound for the size of a single part file, in MiB.
MAX_MB = 100.0

# =======================
# LOADING
# =======================
# NOTE: allow_pickle=True executes pickle data from the file; only use with trusted input.
arr = np.load(INPUT_PATH, allow_pickle=True)
print("Loaded:", arr.dtype, arr.shape)

# The splitter slices along a single axis, so it needs a 1-D array of Python objects.
is_object_vector = isinstance(arr, np.ndarray) and arr.dtype == object and arr.ndim == 1
if not is_object_vector:
    raise ValueError(
        f"Oczekuję 1D ndarray dtype=object. Mam: dtype={arr.dtype}, shape={arr.shape}, ndim={arr.ndim}"
    )

# Part-file names are built from the input file name without its extension.
base = INPUT_PATH.stem
# Size limit converted from MiB to bytes.
max_bytes = int(MAX_MB * 1024 * 1024)

OUT_DIR.mkdir(parents=True, exist_ok=True)

class _ByteCounter:
    """Write-only sink that counts the bytes written to it and discards them."""

    def __init__(self) -> None:
        self.total = 0

    def write(self, data) -> int:
        # memoryview accepts bytes, bytearray and any other buffer object the
        # serializer may hand over, and reports its true byte length.
        written = memoryview(data).nbytes
        self.total += written
        return written


def chunk_size_bytes(chunk: np.ndarray) -> int:
    """Return the size in bytes that `chunk` would occupy as a .npy file.

    The array is serialized with ``np.save(..., allow_pickle=True)`` into a
    counting sink instead of a temporary file, so the result equals the size
    of the file ``np.save`` would produce, but nothing is written to disk and
    the serialized bytes are not kept in memory.

    Parameters
    ----------
    chunk : np.ndarray
        Array (typically a slice of the dataset) to measure.

    Returns
    -------
    int
        Number of bytes of the serialized .npy representation.
    """
    counter = _ByteCounter()
    # np.save treats any object with a `write` method as an open file.
    np.save(counter, chunk, allow_pickle=True)
    return counter.total

# =======================
# SPLIT INTO SIZE-LIMITED PARTS
# =======================
def _write_part(items: np.ndarray, index: int) -> Path:
    """Save `items` as part number `index` inside OUT_DIR and return the written path."""
    path = OUT_DIR / f"{base}.part{index:04d}.npy"
    np.save(path, items, allow_pickle=True)
    return path


def _size_mb(path: Path) -> float:
    """Return the on-disk size of `path` in MiB."""
    return path.stat().st_size / 1024 / 1024


n = len(arr)
part_idx = 0

print(f"Splitting {n} items into parts <= {MAX_MB} MB")
print("Output dir:", OUT_DIR.resolve())

# The current chunk is arr[current_start:current_end] (end exclusive). It is
# grown greedily one item at a time for as long as its serialized size fits.
current_start = 0
current_end = 0  # exclusive

while current_end < n:
    # Try to extend the chunk by one more item.
    candidate_end = current_end + 1
    size = chunk_size_bytes(arr[current_start:candidate_end])

    if size <= max_bytes:
        # Still within the limit: accept the larger chunk and keep growing.
        current_end = candidate_end
        continue

    if current_end == current_start:
        # Even a single item exceeds the limit: it cannot be split further,
        # so it gets a part of its own (which will be larger than the limit).
        out_path = _write_part(arr[current_start:current_start + 1], part_idx)
        print(f"[WARN] Single item > limit -> {out_path.name} ({_size_mb(out_path):.1f} MB)")

        part_idx += 1
        current_start += 1
        current_end = current_start
        continue

    # The candidate overflowed: write the chunk that did fit. The rejected
    # item becomes the first item of the next chunk on the next iteration.
    out_path = _write_part(arr[current_start:current_end], part_idx)
    print(f"Saved {out_path.name}: items {current_start}:{current_end} ({_size_mb(out_path):.1f} MB)")

    part_idx += 1
    current_start = current_end

# Write the last chunk, if anything is left.
if current_end > current_start:
    out_path = _write_part(arr[current_start:current_end], part_idx)
    print(f"Saved {out_path.name}: items {current_start}:{current_end} ({_size_mb(out_path):.1f} MB)")
    part_idx += 1

# An earlier run with a smaller limit may have left higher-numbered parts of
# this dataset behind. They would be picked up when the parts are merged and
# silently corrupt the result, so remove them.
part_prefix = f"{base}.part"
for leftover in sorted(OUT_DIR.iterdir()):
    name = leftover.name
    if not (name.startswith(part_prefix) and name.endswith(".npy")):
        continue
    index_text = name[len(part_prefix):-len(".npy")]
    if index_text.isdecimal() and int(index_text) >= part_idx:
        leftover.unlink()
        print(f"[WARN] Removed stale part from a previous run: {name}")

print(f"Done ✅ Created {part_idx} files")


Loaded: object (350,)
Splitting 350 items into parts <= 100.0 MB
Output dir: C:\Users\user\Desktop\data\moving_target_dataset_parts
Saved moving_target_dataset.part0000.npy: items 0:114 (96.7 MB)
Saved moving_target_dataset.part0001.npy: items 114:133 (96.5 MB)
Saved moving_target_dataset.part0002.npy: items 133:164 (98.4 MB)
Saved moving_target_dataset.part0003.npy: items 164:171 (72.9 MB)
Saved moving_target_dataset.part0004.npy: items 171:172 (42.9 MB)
Saved moving_target_dataset.part0005.npy: items 172:173 (75.6 MB)
Saved moving_target_dataset.part0006.npy: items 173:175 (71.4 MB)
Saved moving_target_dataset.part0007.npy: items 175:191 (80.2 MB)
Saved moving_target_dataset.part0008.npy: items 191:203 (89.2 MB)
Saved moving_target_dataset.part0009.npy: items 203:210 (94.2 MB)
Saved moving_target_dataset.part0010.npy: items 210:234 (98.7 MB)
Saved moving_target_dataset.part0011.npy: items 234:253 (98.0 MB)
Saved moving_target_dataset.part0012.npy: items 253:268 (91.2 MB)
Saved moving

In [12]:
from pathlib import Path
import numpy as np
import hashlib
import pickle
import random

# =======================
# SETTINGS
# =======================
# Folder containing the part files, and the file the merged array is written to.
PARTS_DIR = Path("moving_target_dataset_parts")
OUTPUT_PATH = Path("moving_target_dataset_merged.npy")

# Optional: the original file, used for a 1:1 verification.
# Set to None if you do not have it or do not want the check.
ORIGINAL_PATH = Path("moving_target_dataset.npy")

# Number of randomly chosen records to compare (more = more confidence, but slower)
# and the seed that makes the choice reproducible.
SAMPLE_CHECKS = 20
SEED = 123

# =======================
# HELPERS
# =======================
def stable_item_digest(obj) -> str:
    """Return the SHA-256 hex digest of `obj` pickled with protocol 4.

    Used to compare records for equality; the pickle bytes exist only in memory.
    """
    hasher = hashlib.sha256()
    hasher.update(pickle.dumps(obj, protocol=4))
    return hasher.hexdigest()

# =======================
# 1) FIND THE PART FILES
# =======================
def _part_sort_key(path: Path):
    """Sort key ordering part files by base name, then by numeric part index.

    A plain name sort breaks as soon as an index needs more digits than the
    zero padding provides ("part10000" sorts before "part1001"), which would
    merge the records in the wrong order. Files whose suffix after ".part" is
    not a number are placed after the numbered parts of the same base name.
    """
    base_name, _, index_text = path.stem.rpartition(".part")
    if index_text.isdecimal():
        return (base_name, 0, int(index_text), path.name)
    return (base_name, 1, 0, path.name)


parts = sorted(PARTS_DIR.glob("*.part*.npy"), key=_part_sort_key)
if not parts:
    raise FileNotFoundError(f"Nie znaleziono plików *.part*.npy w: {PARTS_DIR.resolve()}")

print(f"Found {len(parts)} part files in: {PARTS_DIR.resolve()}")
print("First:", parts[0].name)
print("Last :", parts[-1].name)

# =======================
# 2) LOAD AND MERGE
# =======================
def _load_part(path: Path) -> np.ndarray:
    """Load one part file and check that it is a 1-D object array."""
    loaded = np.load(path, allow_pickle=True)
    is_object_vector = isinstance(loaded, np.ndarray) and loaded.dtype == object and loaded.ndim == 1
    if not is_object_vector:
        raise ValueError(f"Zły format w {path.name}: dtype={getattr(loaded,'dtype',None)}, shape={getattr(loaded,'shape',None)}")
    return loaded


# Parts are loaded in the order established above; the first bad file aborts the merge.
arrays = [_load_part(part_path) for part_path in parts]
total_len = sum(len(part) for part in arrays)

merged = np.concatenate(arrays, axis=0)
print("Merged dtype/shape:", merged.dtype, merged.shape)
print("Merged length:", len(merged), "Expected:", total_len)

# Sanity check: concatenation must neither drop nor add records.
if total_len != len(merged):
    raise AssertionError("Długość merged nie zgadza się z sumą długości części!")

# =======================
# 3) SAVE
# =======================
np.save(OUTPUT_PATH, merged, allow_pickle=True)
merged_size_mb = OUTPUT_PATH.stat().st_size / 1024 / 1024
print("Saved merged file:", OUTPUT_PATH.resolve(), f"({merged_size_mb:.1f} MB)")

# =======================
# 4) TEST: CAN THE FILE BE READ BACK
# =======================
reloaded = np.load(OUTPUT_PATH, allow_pickle=True)
# The file must come back as a 1-D object array of the same length as `merged`.
roundtrip_ok = reloaded.dtype == object and reloaded.ndim == 1 and len(reloaded) == len(merged)
assert roundtrip_ok
print("Reload test: OK ✅")

# =======================
# 5) TEST: VERIFICATION AGAINST THE ORIGINAL (optional, but the strongest check)
# =======================
if ORIGINAL_PATH is not None and Path(ORIGINAL_PATH).exists():
    print("\nVerifying against original:", Path(ORIGINAL_PATH).resolve())
    orig = np.load(ORIGINAL_PATH, allow_pickle=True)

    if not (isinstance(orig, np.ndarray) and orig.dtype == object and orig.ndim == 1):
        raise ValueError(f"Oryginał ma zły format: dtype={orig.dtype}, shape={orig.shape}")

    assert len(orig) == len(merged), f"Różna liczba elementów: orig={len(orig)} merged={len(merged)}"
    print("Length match: OK ✅")

    # A private generator yields the same indices as random.seed(SEED) followed by
    # random.sample(...), without reseeding the process-wide RNG as a side effect.
    sampler = random.Random(SEED)
    idxs = sampler.sample(range(len(orig)), k=min(SAMPLE_CHECKS, len(orig)))

    # Each sampled record must match both the in-memory merge and the array
    # read back from OUTPUT_PATH, so the file on disk is verified as well.
    mismatches = []
    for i in idxs:
        expected_digest = stable_item_digest(orig[i])
        if (stable_item_digest(merged[i]) != expected_digest
                or stable_item_digest(reloaded[i]) != expected_digest):
            mismatches.append(i)

    if mismatches:
        raise AssertionError(f"Mismatch w rekordach na indeksach: {mismatches[:10]} (pokazuję max 10)")
    print(f"Sample content check ({len(idxs)} items): OK ✅")

    # Extra sanity check: dict keys of the first record agree (only when records
    # are dicts). Guarded by a length test so an empty dataset does not raise IndexError.
    if len(orig) > 0 and isinstance(orig[0], dict) and isinstance(merged[0], dict):
        assert orig[0].keys() == merged[0].keys()
        print("Dict keys check: OK ✅")

else:
    print("\n[INFO] ORIGINAL_PATH nie ustawiony albo plik nie istnieje — pomijam porównanie 1:1.")
    print("Masz wciąż test 'Reload OK' + sumę długości części.")


Found 17 part files in: C:\Users\user\Desktop\data\moving_target_dataset_parts
First: moving_target_dataset.part0000.npy
Last : moving_target_dataset.part0016.npy
Merged dtype/shape: object (350,)
Merged length: 350 Expected: 350
Saved merged file: C:\Users\user\Desktop\data\moving_target_dataset_merged.npy (1452.7 MB)
Reload test: OK ✅

Verifying against original: C:\Users\user\Desktop\data\moving_target_dataset.npy
Length match: OK ✅
Sample content check (20 items): OK ✅
Dict keys check: OK ✅
