In [1]:
import psutil
import pickle
import re
from pathlib import Path
from typing import Generator

from hashlib import sha256
import matplotlib.pyplot as plt
from pydantic import BaseModel, ConfigDict
import numpy as np
import numpy.typing as npt
import pandas as pd
import seaborn as sns
from AFMReader.topostats import load_topostats
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize

from topostats.damage.damage import (
    Defect,
    DefectGap,
    OrderedDefectGapList,
    calculate_indirect_defect_gaps,
    get_defects_and_gaps_from_bool_array,
)
from topostats.io import LoadScans
from topostats.measure.curvature import discrete_angle_difference_per_nm_circular, total_turn_in_region_radians
from topostats.tracing.splining import resample_points_regular_interval
from topostats.unet_masking import make_bounding_box_square, pad_bounding_box_cutting_off_at_image_bounds

In [None]:
def clear_output(wait: bool = False) -> None:
    """Clear the output area of the cell that is currently executing.

    Thin wrapper around ``IPython.display.clear_output``.

    Parameters
    ----------
    wait : bool, optional
        Forwarded to IPython. When ``True`` the existing output is only removed once
        new output is available to replace it, which avoids flicker when a cell
        repeatedly redraws its output. Defaults to ``False`` (clear immediately).
    """
    # Imported locally under an alias because this wrapper deliberately reuses the
    # name ``clear_output``; a module-level import of the same name would be shadowed.
    from IPython.display import clear_output as ipy_clear_output

    ipy_clear_output(wait=wait)

In [None]:
# Get the data directories set up. Every directory that must already exist is checked
# up front so that a missing network mount fails here with a clear message.
dir_base = Path("/Volumes/shared/pyne_group/Shared/AFM_Data/dna_damage/Cs137_irradiations")
assert dir_base.exists(), f"could not find base data directory at {dir_base}"
dir_this_analysis = dir_base / "20260204-analysis-getting-back-into-the-project"
assert dir_this_analysis.exists(), f"could not find analysis directory at {dir_this_analysis}"
dir_processed_data = dir_this_analysis / "output"
assert dir_processed_data.exists(), f"could not find processed data directory at {dir_processed_data}"
# The results directory is the only one this notebook creates itself.
dir_results = dir_this_analysis / "analysis_results"
dir_results.mkdir(exist_ok=True)
assert dir_results.exists(), f"could not create results directory at {dir_results}"

# Collect the paths of the .topostats files. Only paths are gathered here; no file
# contents are read in this cell. The pattern only matches files that sit at least one
# directory below dir_processed_data.
# Sorted because glob order depends on the file system, and an unsorted list would make
# any later per-file processing order differ between runs/machines.
topo_files = sorted(dir_processed_data.glob("*/**/*.topostats"))
print(f"found {len(topo_files)} topo files")

# Load the corresponding statistics csv file
csv_grain_stats = dir_processed_data / "grain_statistics.csv"
assert csv_grain_stats.exists(), f"could not find grain stats csv at {csv_grain_stats}"
df_grain_stats = pd.read_csv(csv_grain_stats)
print(f"grain stats columns: {df_grain_stats.columns}")

# convert some columns to nanometres
# Dividing by 1e-9 scales the values up by 1e9 (presumably the csv stores metres - confirm).
# The csv is re-read above on every run of this cell, so the scaling is applied exactly once.
df_grain_stats["total_contour_length"] /= 1e-9

In [None]:
# Minimum total contour length (nm) a grain must have to be kept.
threshold_contour_length = 300


def _plot_contour_length_distributions(df: pd.DataFrame, title: str) -> None:
    """Show a strip plot and a violin plot of total contour length per basename on one set of axes.

    Parameters
    ----------
    df : pd.DataFrame
        Grain statistics; the columns "basename" and "total_contour_length" are plotted.
    title : str
        Title of the figure, used to tell the before/after-filter plots apart.
    """
    _, ax = plt.subplots()
    # Same draw order as before: strip plot first, violin plot second.
    sns.stripplot(data=df, x="basename", y="total_contour_length", s=2, ax=ax)
    sns.violinplot(data=df, x="basename", y="total_contour_length", inner=None, ax=ax)
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_ylabel("total_contour_length (nm)")
    ax.set_title(title)
    plt.show()


# plot contour length distributions before filtering
_plot_contour_length_distributions(df_grain_stats, "Contour length distributions (before length filter)")

# drop any rows with contour length less than a threshold
# NOTE: this rebinds df_grain_stats, so re-running this cell on its own plots the already
# filtered data as "before" and reports 0 dropped rows; re-run the loading cell first.
n_rows_before = len(df_grain_stats)
df_grain_stats = df_grain_stats[df_grain_stats["total_contour_length"] >= threshold_contour_length]
n_rows_after = len(df_grain_stats)
print(
    f"dropped {n_rows_before - n_rows_after} rows with contour length < {threshold_contour_length} nm. remaining rows: {n_rows_after}"
)

# plot contour length distributions of the rows that survived the filter
_plot_contour_length_distributions(
    df_grain_stats, f"Contour length distributions (contour length >= {threshold_contour_length} nm)"
)

In [None]:
# Function to check ram usage of the notebook
def notebook_ram_usage():
    """Print the current process, its psutil memory info, and its resident set size in GB.

    Nothing is returned; the three lines are written to stdout. The GB figure is the
    ``rss`` field divided by 1024**3.
    """
    this_process = psutil.Process()
    print(f"process: {this_process}")

    memory = this_process.memory_info()
    print(f"memory info: {memory}")

    bytes_per_gb = 1024**3
    rss_gb = memory.rss / bytes_per_gb
    print(f"RAM usage: {rss_gb:.2f} GB")


notebook_ram_usage()