In [None]:
from IPython.display import clear_output
from pathlib import Path
import re

import matplotlib.pyplot as plt
import numpy as np

from topostats.plottingfuncs import Colormap

# Colormap object obtained from the topostats `Colormap` helper; it is passed as
# `cmap` to the matplotlib plotting and image-saving calls further down.
colormap = Colormap().get_cmap()

from pySPM import Bruker

In [None]:
# SAVE THE IMAGES

# for sample_type in ["on_target", "off_target"]:
#     for sample in (DATA_DIR / sample_type).glob("*.npy"):
#         data = np.load(sample)
#         plt.imshow(data, vmin=-8, vmax=8, cmap=colormap)
#         plt.title(f"{sample_type} {sample.stem}")
#         plt.show()
#         plt.imsave(
#             f"{FLATTENED_IMAGES_DIR / sample_type / sample.stem}.png",
#             data,
#             vmin=-8,
#             vmax=8,
#             cmap=colormap,
#         )

In [None]:
# DATA_DIR = Path(
#     "/Volumes/shared/pyne_group/Shared/AFM_Data/Cas9_Minicircles/20231130_Cas9Minicircles/output165_300nm2/processed/"
# )
# TRAINING_DATA_DIR = Path("/Users/sylvi/topo_data/hariborings/neat_protein_outputs/LINEAR/")

# Cas9
# DATA_DIR = Path("/Users/sylvi/topo_data/hariborings/cas9_data_p2nm/OT2_SC/")
# CROP_OUTPUT_DIR = Path("/Users/sylvi/topo_data/hariborings/cas9_crops_p2nm/OT2_SC_p2nm/")
# DNA only

# --- Run configuration -------------------------------------------------------
SAMPLE_GROUP = "unbound"  # "bound" or "unbound"; selects the input data directory
SAMPLE_TYPE = "ON_REL"  # sub-directory name inside the selected data directory
ZOOM_IN = True  # selects which figure_1 crop directory the crops are written to
CROP_SIZE_NM = 40  # forms part of the crop output directory path

if SAMPLE_GROUP == "bound":
    # bound samples
    DATA_DIR = Path(f"/Users/sylvi/topo_data/hariborings/cas9_data_p2nm/{SAMPLE_TYPE}/")
elif SAMPLE_GROUP == "unbound":
    # unbound samples
    # SPM_DATA_DIR = Path(f"/Users/sylvi/topo_data/hariborings/testing_all_unbound_data/data/{SAMPLE_TYPE}/")
    # DATA_DIR = Path(f"/Users/sylvi/topo_data/hariborings/testing_all_unbound_data/output_{SAMPLE_TYPE}/processed/")
    # NPY_P_TO_NM_DATA_DIR = Path(f"/Users/sylvi/topo_data/hariborings/dna_data_p2nm/{SAMPLE_TYPE}/")

    DATA_DIR = Path(f"/Users/sylvi/topo_data/hariborings/dna_data_p2nm/{SAMPLE_TYPE}/")
else:
    # Fail here rather than with a NameError on DATA_DIR further down.
    raise ValueError(f"Unknown SAMPLE_GROUP {SAMPLE_GROUP!r}; expected 'bound' or 'unbound'")

# Only the crop directory name depends on ZOOM_IN; the rest of the path is shared.
crop_dir_name = "crops_for_figure_zoom_in" if ZOOM_IN else "crops_for_figure"
CROP_OUTPUT_DIR = Path(
    f"/Users/sylvi/topo_data/hariborings/figure_1/{crop_dir_name}/{CROP_SIZE_NM}/{SAMPLE_GROUP}/{SAMPLE_TYPE}"
)

# Explicit checks (instead of bare asserts) so the failing path is reported and
# the checks are not stripped when Python runs with -O.
if not DATA_DIR.exists():
    raise FileNotFoundError(f"Data directory not found: {DATA_DIR}")
if not CROP_OUTPUT_DIR.exists():
    raise FileNotFoundError(f"Crop output directory not found: {CROP_OUTPUT_DIR}")

# TRAINING_DATA_CROPPED_DIR = TRAINING_DATA_DIR / "cropped" / "dna_cas9" / "images"

# All .npy images for the selected sample, in sorted path order.
files = sorted(DATA_DIR.glob("*.npy"))

# Re-save the data with the p2nm in the filename
# for file in files:
#     filename = file.stem
#     orignal_filename = f"{filename.split('_height_thresholded')[0]}.spm"
#     # Grab the pixel to nm scaling factor from the original spm file
#     scan = Bruker(f"{SPM_DATA_DIR / orignal_filename}")
#     channel_data = scan.get_channel("Height")
#     p_to_nm_data = channel_data.pxs()
#     unit_dict = {
#             "nm": 1,
#             "um": 1e3,
#     }
#     p_to_nm = p_to_nm_data[0][0] * unit_dict[p_to_nm_data[0][1]]
#     filename_with_p2nm = f"{filename}_image_p2nm_{p_to_nm:.2f}.npy"
#     data = np.load(file)
#     # save the data with p2nm in the filename
#     # np.save(f"{NPY_P_TO_NM_DATA_DIR / filename_with_p2nm}", data)

#     print(f"{filename} | {orignal_filename} | {p_to_nm:.2f}")


# Report how many input files were found and where the crops will be written.
print(
    f"number of files for {SAMPLE_GROUP} {SAMPLE_TYPE}: {len(files)}",
    f"save dir: {CROP_OUTPUT_DIR}",
    sep="\n",
)

In [None]:
# Quick look at the first image in the file list on a fixed -8..8 colour scale.
image_index = 0
image = np.load(files[image_index])
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
ax.imshow(image, vmin=-8, vmax=8, cmap=colormap)

In [None]:
# Load one image and read its pixel-to-nm scaling factor from the filename.
image_index = 2
file = files[image_index]
print(f"file: {file.stem}")
# The scaling factor is the decimal number that directly follows "_p2nm_" in the
# file stem. Check the match explicitly so a filename without it gives a clear
# error instead of "'NoneType' object has no attribute 'group'".
p_to_nm_match = re.search(r"(?<=_p2nm_)\d+\.\d+", file.stem)
if p_to_nm_match is None:
    raise ValueError(f"No '_p2nm_<value>' scaling factor found in filename: {file.name}")
p_to_nm = float(p_to_nm_match.group(0))
print(f"px to nm: {p_to_nm}")
image = np.load(file)

In [None]:
# Manual plotting and cropping with a fixed colour scale. (The OpenCV cropping
# cell normalises every view to its own min/max, so its colours are not comparable.)
image_index = 51
file = files[image_index]
image = np.load(file)

# The pixel-to-nm scaling factor is the decimal number after "_p2nm_" in the stem.
p_to_nm_match = re.search(r"(?<=_p2nm_)\d+\.\d+", file.stem)
if p_to_nm_match is None:
    raise ValueError(f"No '_p2nm_<value>' scaling factor found in filename: {file.name}")
p_to_nm = float(p_to_nm_match.group(0))
print(f"p_to_nm: {p_to_nm}")

# Colour-scale limits depend on the sample group.
if SAMPLE_GROUP == "bound":
    VMIN, VMAX = -8, 8
elif SAMPLE_GROUP == "unbound":
    VMIN, VMAX = -3, 4
else:
    # Without this branch VMIN/VMAX would be undefined (or stale from a previous run).
    raise ValueError(f"Unknown SAMPLE_GROUP {SAMPLE_GROUP!r}; expected 'bound' or 'unbound'")

fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.imshow(image, cmap=colormap, vmax=VMAX, vmin=VMIN)
ax.set_title(f"{file.stem}\nsample type: {SAMPLE_GROUP} {SAMPLE_TYPE} p_to_nm: {p_to_nm}")
plt.show()

# Top-left corner of the crop, in pixels.
x = 70
y = 100
# Crop width in nm. Taken from the configuration cell so the crop size always
# matches the size encoded in CROP_OUTPUT_DIR.
w_nm = CROP_SIZE_NM

# Crop width in pixels (truncated towards zero).
w = int(w_nm / p_to_nm)

# Slicing past the image edge would silently return a smaller crop than requested,
# so reject it here while the full image is still on screen to pick a new x / y.
if w < 1 or x < 0 or y < 0 or y + w > image.shape[0] or x + w > image.shape[1]:
    raise ValueError(
        f"Crop of {w} px at (x={x}, y={y}) does not fit inside image of shape {image.shape}"
    )

image_cropped = image[y : y + w, x : x + w]

fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.imshow(image_cropped, cmap=colormap, vmax=VMAX, vmin=VMIN)
ax.set_title(f"{file.stem}\n sample type: {SAMPLE_GROUP} {SAMPLE_TYPE} p_to_nm: {p_to_nm} crop size: {w_nm} nm")
plt.show()

In [None]:
# Save the cropped image as a raw array (.npy) and as a rendered preview (.png).
posx_nm = x * p_to_nm
posy_nm = y * p_to_nm
# Build the name WITHOUT an extension and append each extension exactly once.
# Previously ".npy" was already part of the name, so the outputs were written as
# "*.npy.npy" and "*.npy.png". Path.with_suffix is not used because the name
# contains decimal points that would be mistaken for a suffix.
crop_stem = f"{file.stem}_crop_{w * p_to_nm:.2f}_nm_{posx_nm:.2f}_{posy_nm:.2f}_nm"
filename = CROP_OUTPUT_DIR / f"{crop_stem}.npy"
np.save(filename, image_cropped)

plt.imsave(CROP_OUTPUT_DIR / f"{crop_stem}.png", image_cropped, cmap=colormap, vmax=VMAX, vmin=VMIN)

In [None]:
import cv2
import cmapy

# Interactive cropping tool using OpenCV windows.
#
# Keys:
#   a / d / w / s : move the bounding box left / right / up / down
#   e / r         : shrink / grow the bounding box
#   f / g         : next / previous image (wraps around the file list)
#   space         : save the current crop as image_<n>.npy and image_<n>.png
#   q             : quit
image_index = 0
file = files[image_index]
image = np.load(file)

bounding_box_size = 120
# Bounding box: top-left corner (x, y) plus width and height, all in pixels.
x, y, w, h = 100, 100, bounding_box_size, bounding_box_size

window_name = "image_display"
cropped_window_name = "cropped_image_display"

step = 10  # pixels moved / resized per key press

try:
    while True:
        # Current crop (copied so OpenCV receives a contiguous array).
        cropped_image = image[y : y + h, x : x + w]
        cropped_image_rgb = cropped_image.copy()

        # Turn the full heightmap into a colour image, scaled to its own min/max.
        display_image_norm = cv2.normalize(image, None, 0, 255, cv2.NORM_MINMAX)
        display_image = cv2.applyColorMap(display_image_norm.astype(np.uint8), cmapy.cmap("afmhot"))

        # Draw the bounding box and show the full image.
        cv2.rectangle(display_image, (x, y), (x + w, y + h), (0, 255, 0), 2)
        cv2.imshow(window_name, display_image)

        # Show the crop in its own window (it cannot be stacked next to the full
        # image because the sizes differ). Note that the crop is normalised to
        # its OWN min/max, so its colours differ from the full-image view.
        cropped_image_rgb = cv2.normalize(cropped_image_rgb, None, 0, 255, cv2.NORM_MINMAX)
        cropped_image_rgb = cv2.applyColorMap(cropped_image_rgb.astype(np.uint8), cmapy.cmap("afmhot"))
        cv2.imshow(cropped_window_name, cropped_image_rgb)

        # Poll for a key press; mask to the low byte so the comparison with
        # ord(...) does not depend on platform-specific high bits.
        key = cv2.waitKey(1) & 0xFF

        if key == ord("q"):
            # Quit; the windows are closed in the finally block.
            break
        elif key == ord("a"):
            x -= step
        elif key == ord("d"):
            x += step
        elif key == ord("w"):
            y -= step
        elif key == ord("s"):
            y += step
        elif key == ord("e"):
            # Never shrink to an empty box: an empty crop cannot be displayed.
            if w > step and h > step:
                w -= step
                h -= step
        elif key == ord("r"):
            w += step
            h += step
        elif key == ord("f"):
            # Wrap around instead of running past the end of the file list.
            image_index = (image_index + 1) % len(files)
            file = files[image_index]
            image = np.load(file)
        elif key == ord("g"):
            image_index = (image_index - 1) % len(files)
            file = files[image_index]
            image = np.load(file)
        elif key == ord(" "):
            # Save the region in the bounding box. The output index is the
            # number of .npy crops already present in the output directory.
            output_index = len(list(CROP_OUTPUT_DIR.glob("*.npy")))
            npy_path = CROP_OUTPUT_DIR / f"image_{output_index}.npy"
            png_path = CROP_OUTPUT_DIR / f"image_{output_index}.png"

            # Refuse to overwrite an existing crop. Stop the tool rather than
            # calling exit(), which would take the notebook kernel down with it.
            if npy_path.exists() or png_path.exists():
                print("File already exists")
                break

            np.save(npy_path, cropped_image)
            # PNG preview scaled to the range of the full image.
            plt.imsave(
                png_path,
                cropped_image,
                vmin=image.min(),
                vmax=image.max(),
            )
            print(f"saving image_{output_index}.png")

        # If the box has left the image (after a move, a resize or an image
        # switch), reset the offending coordinate and the box size.
        if x < 0:
            x = 10
            w = 100
            h = 100
        if y < 0:
            y = 10
            w = 100
            h = 100
        if x + w > image.shape[1]:
            x = 10
            w = 100
            h = 100
        if y + h > image.shape[0]:
            y = 10
            w = 100
            h = 100
finally:
    # Always close the OpenCV windows, including after an error.
    cv2.destroyAllWindows()

In [None]:
# Persist the cropped molecule as image_<n>.npy plus an image_<n>.png preview,
# where <n> is the number of .npy files already present in the output directory.
# NOTE(review): TRAINING_DATA_CROPPED_DIR and cropped_molecule are not assigned in
# any visible, uncommented cell of this notebook - confirm where they are meant to
# come from before running this cell.
output_index = len([*TRAINING_DATA_CROPPED_DIR.glob("*.npy")])
np.save(
    TRAINING_DATA_CROPPED_DIR / f"image_{output_index}.npy",
    cropped_molecule,
)
# The preview uses the value range of the full image, not of the crop.
plt.imsave(
    TRAINING_DATA_CROPPED_DIR / f"image_{output_index}.png",
    cropped_molecule,
    vmin=image.min(),
    vmax=image.max(),
)

In [None]:
# Rename files to "<prefix>_<index><file_type>", numbering them consecutively from
# image_starting_index in the order of the first integer found in each file stem.

path = Path("/Users/sylvi/topo_data/hariborings/dna_crops_all/dna_crops_additional/")
image_starting_index = 133
testing_mode = False  # True: only print the planned renames, touch nothing
file_type = ".png"
prefix = "image"

# Pair every file with the first integer in its stem. Distinct variable names are
# used so the notebook-wide `files` / `file` / `image_index` are not clobbered.
indexed_files = []
for candidate in path.glob(f"*{file_type}"):
    index_match = re.search(r"\d+", candidate.stem)
    if index_match is None:
        raise ValueError(f"Cannot find an index in filename: {candidate.name}")
    indexed_files.append((int(index_match.group(0)), candidate))
# Sort by the index in the filename.
indexed_files.sort(key=lambda pair: pair[0])

rename_plan = [
    (source, path / f"{prefix}_{image_starting_index + offset}{file_type}")
    for offset, (_, source) in enumerate(indexed_files)
]

for source, target in rename_plan:
    print(source.name)
    if testing_mode:
        print(f"renaming {source.name} to {target.name}")

if not testing_mode:
    # Rename in two passes via temporary names. Renaming directly can overwrite a
    # file that has not been processed yet whenever the old and new index ranges
    # overlap (e.g. image_130.png -> image_133.png while image_133.png still exists).
    staged = []
    for source, target in rename_plan:
        temporary = source.with_name(f"{source.name}.renaming_tmp")
        source.rename(temporary)
        staged.append((temporary, target))
    for temporary, target in staged:
        temporary.rename(target)

In [None]:
# Inspect the first mask (in sorted path order): print its path, display it and
# list the distinct values it contains.
images = sorted(
    Path("/Users/sylvi/topo_data/hariborings/training_data/cropped/masks/").glob("*.npy")
)
print(images[0])
image = np.load(images[0])
plt.imshow(image)
print(np.unique(image))

In [None]:
# Merge the per-class masks of each task into one multiclass mask:
# 0 = background, 1 = ring, 2 = gem (gem overrides ring where they overlap).
MULTICLASS_MASK_DIR = Path("/Users/sylvi/topo_data/hariborings/training_data/cropped/multiclass_masks/")

mask_files = sorted(MULTICLASS_MASK_DIR.glob("*.npy"))

# Files with the same task id are the same molecule and need to be merged.
# Outputs are numbered from 0 in task order.
for image_index, task in enumerate(range(252, 302)):
    print(f"task: {task}")
    # Match "task-<id>" only when it is not followed by another digit, so that
    # e.g. task-252 does not also pick up files belonging to task-2520.
    task_pattern = re.compile(rf"task-{task}(?!\d)")

    class_masks = {}
    for class_name in ("ring", "gem"):
        # Each class mask has both "task-<id>" and the class name in its filename.
        candidates = [
            mask_file
            for mask_file in mask_files
            if task_pattern.search(mask_file.name) and class_name in mask_file.name
        ]
        if not candidates:
            raise FileNotFoundError(f"No '{class_name}' mask found for task-{task} in {MULTICLASS_MASK_DIR}")
        class_masks[class_name] = np.load(candidates[0]).astype(bool)

    ring_mask = class_masks["ring"]
    gem_mask = class_masks["gem"]
    if ring_mask.shape != gem_mask.shape:
        raise ValueError(
            f"Mask shapes differ for task-{task}: ring {ring_mask.shape} vs gem {gem_mask.shape}"
        )

    combined_mask = np.zeros(ring_mask.shape, dtype=int)
    combined_mask[ring_mask] = 1
    combined_mask[gem_mask] = 2

    plt.imsave(MULTICLASS_MASK_DIR / f"mask_{image_index}.png", combined_mask)
    np.save(MULTICLASS_MASK_DIR / f"mask_{image_index}.npy", combined_mask)