# Overview

Purpose of notebook is to investigate known feature engineering techniques from literature on 4D-STEM datasets. Identify key microstructure features.
- control unzip
- AA
- BD spot extraction
- Std dev thresholding
- radial profiling
- IFT
- Binarizer

# Imports


In [17]:
%matplotlib qt5

import pyxem as pxm
import numpy as np
import hyperspy.api as hs
from matplotlib import pyplot as plt
from sklearn.preprocessing import RobustScaler, Binarizer
from sklearn.pipeline import Pipeline
import umap
import hdbscan

from sklearn.base import BaseEstimator, TransformerMixin

plt.rcParams['figure.dpi'] = 300
plt.rcParams['font.family']='sans-serif'
plt.rcParams['font.sans-serif']='Arial'
plt.rcParams['font.size']=8
plt.rcParams["font.weight"] = "bold"
plt.rcParams["font.style"] = "normal"


plt.rcParams['axes.linewidth'] = 1
plt.rcParams["xtick.direction"] = "in"
plt.rcParams["ytick.direction"] = "in"
plt.rcParams['lines.linewidth'] = 0.8
plt.rcParams['figure.figsize'] = [3 , 3 ]

# Supporting functions

In [18]:

def palette_for_label_map(no_cluster, unclassified=False, mask=False):
    """
    no_cluster = number of real clusters
    unclassified = no_cluster + 1
    mask = no_cluster + 2

    Returns palette, CustomCmap
    """
    import seaborn as sns
    import matplotlib

    # Extract colors from tab20 and tab20b
    tab20_colors = [plt.cm.tab20(i) for i in range(20)]
    tab20b_colors = [plt.cm.tab20b(i) for i in range(20)]
    combined_colors = tab20_colors + tab20b_colors

    custom_palette = combined_colors[:no_cluster]

    if unclassified == True:
        custom_palette.insert(0, "white")  # UNCLASSIFIED
    if mask == True:
        custom_palette.insert(0, "black")  # MASKED OUT

    CustomCmap = matplotlib.colors.ListedColormap(custom_palette)
    palette = sns.color_palette(palette=custom_palette)
    
    return (palette, CustomCmap)

def include_scalebar(dp):
    from matplotlib_scalebar.scalebar import ScaleBar

    dx = dp.axes_manager[0].scale
    scalebar = ScaleBar(
        dx,
        "nm",
        length_fraction=0.25,
        width_fraction=0.015,
        location="lower left",
        frameon=False,
        color="w",
        scale_loc="top",
        border_pad=0.5,
    )
    plt.gca().add_artist(scalebar)


def plot_label_map(labels_highest_soft_spatial, no_cluster_soft, scalebar=False):
    palette, CustomCmap = palette_for_label_map(
        no_cluster_soft, unclassified=False, mask=False
    )

    plt.figure()
    plt.imshow(labels_highest_soft_spatial, cmap=CustomCmap)

    if scalebar == True:
        include_scalebar(dp)
    
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
    plt.show()

def plot_manifold_labels(no_cluster_soft,labels_highest_soft, reduced_data ):

    import seaborn as sns
    cluster_colors = [palette_for_label_map(no_cluster_soft)[0][x] if x >= 0
                    else (0.5, 0.5, 0.5)
                    for x in labels_highest_soft]
    if reduced_data.shape[1]==3:
        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')
        ax.scatter(reduced_data[:,0], reduced_data[:,1], reduced_data[:,2], alpha=0.2,  c=cluster_colors)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.set_zticklabels([])

    if reduced_data.shape[1]==2:
        fig = plt.figure()
        plt.scatter(reduced_data[:,0], reduced_data[:,1], alpha=0.2,  c=cluster_colors)
        plt.gca().set_xticklabels([])
        plt.gca().set_yticklabels([])
 

In [19]:
def dp_cluster_std_dev(selected_dp):
    cluster_std_dev_dp_pixels = np.std(selected_dp, axis=0)
    return cluster_std_dev_dp_pixels


def get_cluster_mean_dp_list(dp, memberships_highest_soft, labels_highest_soft):
    dp.fold()
    dp.unfold_navigation_space()
    dp_soft_cluster_mean_list = []

    for i in range(no_cluster_soft):
        selected_dp = np.take(
            dp.data,
            np.where((memberships_highest_soft == 1) & (labels_highest_soft == i))[0],
            axis=0)
        
        dp_soft_cluster_mean = np.mean(selected_dp, axis=0)
        dp_soft_cluster_mean_list.append(dp_soft_cluster_mean)
    return dp_soft_cluster_mean_list


def get_cluster_std_dev_dp_list(dp_bina, memberships_highest_soft, labels_highest_soft):
    dp_bina.fold()
    dp_bina.unfold_navigation_space()
    dp_soft_cluster_std_dev_list = []
    for i in range(no_cluster_soft):
        selected_dp = np.take(
            dp_bina.data,
            np.where((memberships_highest_soft == 1) & (labels_highest_soft == i))[0],
            axis=0,
        )
        dp_soft_cluster_std_dev_list.append(dp_cluster_std_dev(selected_dp))
    return dp_soft_cluster_std_dev_list


def get_cluster_mean_eds_list(eds, memberships_highest_soft, labels_highest_soft):
    eds.fold()
    eds.unfold_navigation_space()
    eds_soft_cluster_mean_list = []
    for i in range(no_cluster_soft):
        selected_eds = np.take(
            eds.data,
            np.where((memberships_highest_soft == 1) & (labels_highest_soft == i))[0],
            axis=0,
        )
        eds_soft_cluster_mean = np.mean(selected_eds, axis=0)
        eds_soft_cluster_mean_list.append(eds_soft_cluster_mean)
    return eds_soft_cluster_mean_list


def get_cluster_std_dev_eds_list(eds, memberships_highest_soft, labels_highest_soft):
    eds.fold()
    eds.unfold_navigation_space()
    eds_soft_cluster_std_dev_list = []
    for i in range(no_cluster_soft):
        selected_eds = np.take(
            eds.data,
            np.where((memberships_highest_soft == 1) & (labels_highest_soft == i))[0],
            axis=0,
        )
        eds_soft_cluster_std_dev = np.std(selected_eds, axis=0)
        eds_soft_cluster_std_dev_list.append(eds_soft_cluster_std_dev)
    return eds_soft_cluster_std_dev_list


def plot_cluster_mean_analysis(dp,
    include_eds_mean_plot=False,
    include_eds_std_dev_plot=False,
    include_dp_std_dev_plot=False,
    include_dp_mean_plot=True,
    include_probability_map=True,
    dp_vmax=0.1,
):
    from matplotlib_scalebar.scalebar import ScaleBar

    palette, CustomCmap = palette_for_label_map(
        no_cluster_soft, unclassified=False, mask=False
    )

    # Calculate the total number of rows required based on the plots to be included
    total_rows = 0

    if include_dp_mean_plot:
        total_rows += 1

    if include_probability_map:
        total_rows += 2

    if include_dp_std_dev_plot:
        total_rows += 1

    if include_eds_mean_plot:
        total_rows += 1

    if include_eds_std_dev_plot:
        total_rows += 1

    fig = plt.figure(figsize=(7.08661, 7.08661))

    gs = plt.GridSpec(
        total_rows,
        no_cluster_soft,
        wspace=0.1,
        hspace=0.1,
        height_ratios=[1] * (total_rows - 1) + [0.1],
    )

    for i in range(no_cluster_soft):
        row_index = 0  # Initialize row index

        ax_outer = plt.subplot(gs[:, i])
        ax_outer.set_facecolor(palette[i])
        plt.xticks([])
        plt.yticks([])

        if include_dp_mean_plot:
            dp_soft_cluster_mean_list = get_cluster_mean_dp_list(
                dp, memberships_highest_soft, labels_highest_soft
            )

            ax_ebsd = fig.add_subplot(gs[row_index, i])
            dp_cluster_mean = dp_soft_cluster_mean_list[i]  # current cluster
            dp_cluster_mean_normalize = (dp_cluster_mean - dp_cluster_mean.min()) / (
                dp_cluster_mean.max() - dp_cluster_mean.min()
            )  # normalize
            plt.imshow(dp_cluster_mean_normalize, cmap="Greys_r", vmax=dp_vmax)
            plt.xticks([])
            plt.yticks([])

            row_index += 1

        if include_dp_std_dev_plot:

            dp_soft_cluster_std_dev_list = get_cluster_std_dev_dp_list(
                dp, memberships_highest_soft, labels_highest_soft
            )
            ax_dp_std = fig.add_subplot(gs[row_index, i])
            dp_cluster_std = dp_soft_cluster_std_dev_list[i]
            plt.imshow(dp_cluster_std, cmap="inferno")
            plt.xticks([])
            plt.yticks([])

            row_index += 1

        if include_eds_mean_plot:
            eds_soft_cluster_mean_list = get_cluster_mean_eds_list(
                eds, memberships_highest_soft, labels_highest_soft
            )
            ax_eds = fig.add_subplot(gs[row_index, i])
            eds_cluster_mean = eds_soft_cluster_mean_list[i]
            plt.plot(eds_cluster_mean)
            plt.xticks([])
            plt.yticks([])

            row_index += 1

        if include_eds_std_dev_plot:
            eds_soft_cluster_std_dev_list = get_cluster_std_dev_eds_list(
                eds, memberships_highest_soft, labels_highest_soft
            )
            ax_eds_std = fig.add_subplot(gs[row_index, i])
            eds_cluster_std = eds_soft_cluster_std_dev_list[i]
            plt.plot(eds_cluster_std, "g")
            plt.xticks([])
            plt.yticks([])

            row_index += 1

        if include_probability_map:
            ax_loading = fig.add_subplot(gs[row_index, i])
            memberships_highest_soft_spatial2 = memberships_highest_soft_spatial.copy()
            memberships_highest_soft_spatial2[
                np.where(labels_highest_soft_spatial != i)
            ] = 0
            pc = plt.imshow(memberships_highest_soft_spatial2, cmap='inferno')

            # scalebar = ScaleBar(2, 'nm', length_fraction=0.25, width_fraction=0.015, location='lower left',
            #             frameon=False, color='w', scale_loc='top', border_pad=0.1)  # 1 pixel = 0.2 meter
            # plt.gca().add_artist(scalebar)

            plt.xticks([])
            plt.yticks([])

            row_index += 1

    ax_colorbar = fig.add_subplot(gs[row_index, :])
    plt.colorbar(pc, cax=ax_colorbar, orientation="horizontal")

# Purpose of notebook

Try all feature engineering techniques. Identify key microstructure features
- control unzip
- AA
- BD spot extraction
- Std dev thresholding
- radial profiling
- IFT
- Binarizer



# Load data


In [20]:
dirpath = f"C:\\Users\\Zhi Quan\\Dropbox (Personal)\\Jupyter backup\\Simulations\\SimulatedData\\data\\"

dp = hs.load(dirpath + "send_reduced.zspy", lazy=False)
dp.change_dtype("float32")
dp = dp.isig[64:192, 64:192]
dp

<Signal2D, title: , dimensions: (80, 80|128, 128)>

In [5]:
# dir_path = 'C:\\Users\\Zhi Quan\\Dropbox (Personal)\\Jupyter backup\\TRISO\\2025\\data\\sped\\'

# dp = hs.load(dir_path + '1100-100h.blo')

In [14]:
dp.unfold()

plt.plot(dp.mean().data)
plt.xlabel('Bin channels (a.u.)')
plt.ylabel('Intensity (a.u.)')

Text(25.625000000000004, 0.5, 'Intensity (a.u.)')

In [8]:
dp.unfold()
dp_bina = hs.signals.Signal2D(Binarizer(threshold=np.percentile(np.log1p(dp.data), 90)).fit_transform(np.log1p(dp.data)).reshape(80,80, 128, 128))
dp.fold()

In [None]:
dp_bina.plot()

In [None]:
dp_bina.unfold()

plt.plot(dp_bina.mean().data)
plt.xlabel('Bin channels (a.u.)')
plt.ylabel('Intensity (a.u.)')

# Start

In [21]:
def apply_mask(img, center_x, center_y, radius):

    x, y = np.meshgrid(
        np.arange(img.shape[0]),
        np.arange(img.shape[1]),
    )

    # Calculate the distance of each pixel to the center of the circle
    distances = np.sqrt((x - center_x) ** 2 + (y - center_y) ** 2)

    # Create a boolean mask where True values correspond to pixels within the circle
    mask = distances <= radius

    img[mask] = 0

    return img

In [22]:
dp.map(apply_mask, center_x = 64, center_y = 64, radius=10)

[########################################] | 100% Completed | 753.64 ms


In [None]:
dp.plot()

In [12]:
save_dir = 'C:\\Users\\Zhi Quan\\Dropbox (Personal)\\Final thesis\\methods_figures\\'

# Control

In [None]:
dp.unfold()

pipe = Pipeline(
[
('reduce_dims', umap.UMAP(densmap=True, n_neighbors=10, n_components=3, min_dist=0, metric='euclidean', random_state=1)), 
]
)

reduced_data = pipe.fit_transform(np.log1p(dp).data)
clust = hdbscan.HDBSCAN(min_cluster_size=20, min_samples=10, prediction_data=True, metric='euclidean')

clust.fit(reduced_data)
x,y = 80,80

memberships_all_soft = hdbscan.all_points_membership_vectors(clust)
memberships_highest_soft = np.array([np.max(x) for x in memberships_all_soft])
memberships_highest_soft_spatial = memberships_highest_soft.reshape(x, y)
labels_highest_soft = np.array([np.argmax(x) for x in memberships_all_soft])
labels_highest_soft_spatial = labels_highest_soft.reshape(x, y)
no_cluster_soft = len(set(labels_highest_soft))


In [23]:
save_dir = 'C:\\Users\\Zhi Quan\\Dropbox (Personal)\\Final thesis\\methods_figures\\'

In [None]:
np.save('..\\2025\\data\\feature_engi_labels\\control_labels.npy', labels_highest_soft_spatial)

In [None]:
np.save('..\\2025\\data\\feature_engi_memberships\\control_memberships.npy', memberships_all_soft)

np.save('..\\2025\\data\\feature_engi_reduced_data\\control_reduced.npy', reduced_data)

In [None]:

plot_label_map(labels_highest_soft_spatial, no_cluster_soft)
plt.savefig(save_dir + 'control_label_map.png', bbox_inches='tight')
plt.close('all')

plot_manifold_labels(no_cluster_soft, labels_highest_soft, reduced_data)
plt.savefig(save_dir + 'control_manifold.png', bbox_inches='tight')
plt.close('all')

In [None]:

plot_cluster_mean_analysis(dp=dp, include_dp_std_dev_plot=True)


# Std dev thresholding

In [None]:
dp.fold()
dp_shape = dp.data.shape[2]
dp.unfold()

std_dev_dp_pixels = np.std(dp.data, axis=0)
std_dev_dp_pixels = hs.signals.Signal2D(
    std_dev_dp_pixels.reshape(dp_shape, dp_shape)
)

center_x, center_y = (dp_shape/2, dp_shape/2)

radius = 9

# Create a grid of pixel coordinates
x, y = np.meshgrid(
    np.arange(std_dev_dp_pixels.data.shape[0]),
    np.arange(std_dev_dp_pixels.data.shape[1]),
)

# Calculate the distance of each pixel to the center of the circle
distances = np.sqrt((x - center_x) ** 2 + (y - center_y) ** 2)

# Create a boolean mask where True values correspond to pixels within the circle
mask = distances <= radius

# Apply the mask to the image
std_dev_dp_pixels_masked_central = np.copy(std_dev_dp_pixels.data)
std_dev_dp_pixels_masked_central[mask] = 0


std_dev_dp_pixels_masked_central = hs.signals.Signal2D(std_dev_dp_pixels_masked_central)

final_thres = 0.4

matrix = std_dev_dp_pixels_masked_central.data

n = int(np.ceil(matrix.size * final_thres))
# n=1024

# Flatten the matrix into a 1D array and sort it in descending order
sorted_array = np.sort(matrix.flatten())[::-1]

# Extract the highest n values from the sorted array
top_n = sorted_array[:n]

# Reshape the top_n array back into a 2D matrix
# Create a mask for the top n values
mask = np.isin(matrix, top_n)

# Set all values not in the top n to zero
result = matrix * mask


result = hs.signals.Signal2D(result)

result.plot(norm='symlog')

In [None]:

dp.fold()
dp.unfold_navigation_space()

extracted_pixels_list = []

for d in dp.data:
    extracted_pixels = d[np.where(mask==1)]
    extracted_pixels_list.append(extracted_pixels)

extracted_pixels_array = np.array(extracted_pixels_list)
extracted_pixels_array[extracted_pixels_array < 0] = 0


In [None]:
plt.plot(np.mean(extracted_pixels_array, axis=0))
plt.xlabel('Bin channels (a.u.)')
plt.ylabel('Intensity (a.u.)')

In [None]:
dp.unfold()

pipe = Pipeline(
[
('reduce_dims', umap.UMAP(densmap=True, n_neighbors=10, n_components=3, min_dist=0, metric='euclidean', random_state=0)), 
]
)

reduced_data = pipe.fit_transform(np.log1p(extracted_pixels_array).data)
clust = hdbscan.HDBSCAN(min_cluster_size=20, min_samples=10, prediction_data=True, metric='euclidean')

clust.fit(reduced_data)
x,y = 80,80

memberships_all_soft = hdbscan.all_points_membership_vectors(clust)
memberships_highest_soft = np.array([np.max(x) for x in memberships_all_soft])
memberships_highest_soft_spatial = memberships_highest_soft.reshape(x, y)
labels_highest_soft = np.array([np.argmax(x) for x in memberships_all_soft])
labels_highest_soft_spatial = labels_highest_soft.reshape(x, y)
no_cluster_soft = len(set(labels_highest_soft))

In [None]:
plot_label_map(labels_highest_soft_spatial, no_cluster_soft)

In [None]:

plot_label_map(labels_highest_soft_spatial, no_cluster_soft)
plt.savefig(save_dir + 'std_dev_thres_label_map.png')
plt.close('all')

plot_manifold_labels(no_cluster_soft, labels_highest_soft, reduced_data)
plt.savefig(save_dir + 'std_dev_thres_manifold.png')
plt.close('all')

In [None]:

np.save('..\\2025\\data\\feature_engi_labels\\std_dev_thres_labels.npy', labels_highest_soft_spatial)


In [None]:
np.save('..\\2025\\data\\feature_engi_memberships\\std_dev_thres_memberships.npy', memberships_all_soft)

In [None]:
np.save('..\\2025\\data\\feature_engi_reduced_data\\std_dev_thres_reduced.npy', reduced_data)

## Binarizer

In [9]:
dp_bina.fold()
dp_bina.plot()
dp_bina

<Signal2D, title: , dimensions: (80, 80|128, 128)>

In [10]:
from sklearn.pipeline import Pipeline
import umap
import hdbscan
from sklearn.preprocessing import RobustScaler, Binarizer

dp_bina.unfold()

pipe = Pipeline(
[
('reduce_dims', umap.UMAP(densmap=True, n_neighbors=10, n_components=3, min_dist=0, metric='jaccard', random_state=0)),
]
)

reduced_data = pipe.fit_transform(dp_bina.data)

clust = hdbscan.HDBSCAN(min_cluster_size=20, min_samples=10, prediction_data=True)

clust.fit(reduced_data)

x,y = 80,80 

memberships_all_soft = hdbscan.all_points_membership_vectors(clust)
memberships_highest_soft = np.array([np.max(x) for x in memberships_all_soft])
memberships_highest_soft_spatial = memberships_highest_soft.reshape(x, y)
labels_highest_soft = np.array([np.argmax(x) for x in memberships_all_soft])
labels_highest_soft_spatial = labels_highest_soft.reshape(x, y)
no_cluster_soft = len(set(labels_highest_soft))

# plot_label_map(labels_highest_soft_spatial, no_cluster_soft)

  warn(
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [None]:

np.save('..\\2025\\data\\feature_engi_labels\\bina_labels.npy', labels_highest_soft_spatial)


In [None]:
np.save('..\\2025\\data\\feature_engi_memberships\\bina_memberships.npy', memberships_all_soft)
np.save('..\\2025\\data\\feature_engi_reduced_data\\bina_reduced.npy', reduced_data)

In [13]:
plot_label_map(labels_highest_soft_spatial, no_cluster_soft)
plt.savefig(save_dir + 'binarize_label_map.png')
plt.close('all')

plot_manifold_labels(no_cluster_soft, labels_highest_soft, reduced_data)
plt.savefig(save_dir + 'binarize_manifold.png')
plt.close('all')

# plot_cluster_mean_analysis(dp=dp, include_dp_std_dev_plot=True)


## Radial variance

In [25]:
def radial_profiling_with_mask(image, x0, y0, binsize=1, fn=np.var, radius=None):
    """
    Perform radial profiling on a 2D image with a customizable statistical measure
    and an optional circular mask.
    
    Args:
        image (np.ndarray): 2D array representing the image or pattern.
        x0, y0 (float): Center coordinates for radial integration.
        binsize (int): Size of the bin for radial distances.
        fn (function): A function to apply within each radial bin (e.g., np.mean, np.sum, np.var, np.median).
        radius (float or None): Radius of the circular mask. If None, no mask is applied.
    
    Returns:
        np.ndarray: Array of the chosen statistical measure for each radial bin within the mask.
    """
    
    # Step 1: Create a circular mask if radius is provided
    if radius is not None:
        y, x = np.indices(image.shape)
        circular_mask = (np.hypot(x - x0, y - y0) >= radius)
        image = image * circular_mask  # Apply the mask to the image
    
    # Step 2: Calculate the radial distances for each pixel
    qx, qy = np.indices(image.shape)
    r = np.hypot(qx - x0, qy - y0).ravel()
    
    # Step 3: Bin the radial distances
    r_bins = np.floor(r / binsize).astype(int)
    
    # Step 4: Initialize an array to hold the results
    radial_profile = []
    
    # Step 5: Apply the chosen statistical measure within each bin
    for i in range(r_bins.max() + 1):
        mask = r_bins == i
        if np.any(mask):
            radial_value = fn(image.ravel()[mask])
            radial_profile.append(radial_value)
    
    return np.array(radial_profile)
dp.fold()
dp.unfold_navigation_space()

True

In [32]:
# import pyxem as pxm
# dp.set_signal_type('electron_diffraction')
# dp.unit = "k_A^-1"
# dp.beam_energy = 200 # in 200 keV
# dp.set_ai()
# integration1d  = dp.get_azimuthal_integral1d(npt=100)

In [27]:
rv_dp = np.array([radial_profiling_with_mask(dp.data[i], 64, 64, fn=np.var, radius=9) for i in range(dp.data.shape[0])])

In [28]:
rv_dp = hs.signals.Signal1D(rv_dp.reshape(80,80, -1))

In [88]:
plt.plot(rv_dp.sum().data)
plt.xlabel('Bin channels (a.u.)')
plt.ylabel('Intensity (a.u.)')

Text(0, 0.5, 'Intensity (a.u.)')

In [36]:
integration1d

<ElectronDiffraction1D, title: , unfolded dimensions: (6400|100)>

In [37]:
integration1d.unfold()

pipe = Pipeline(
[
('reduce_dims', umap.UMAP(densmap=True, n_neighbors=10, n_components=3, min_dist=0, metric='euclidean', random_state=0)), 
]
)

reduced_data = pipe.fit_transform(integration1d.data)
clust = hdbscan.HDBSCAN(min_cluster_size=20, min_samples=10, prediction_data=True, metric='euclidean')

clust.fit(reduced_data)

x,y = 80,80

memberships_all_soft = hdbscan.all_points_membership_vectors(clust)
memberships_highest_soft = np.array([np.max(x) for x in memberships_all_soft])
memberships_highest_soft_spatial = memberships_highest_soft.reshape(x, y)
labels_highest_soft = np.array([np.argmax(x) for x in memberships_all_soft])
labels_highest_soft_spatial = labels_highest_soft.reshape(x, y)
no_cluster_soft = len(set(labels_highest_soft))
print(no_cluster_soft)

# plot_label_map(labels_highest_soft_spatial, no_cluster_soft)


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


34


In [38]:
plot_label_map(labels_highest_soft_spatial, no_cluster_soft)

In [None]:

np.save('..\\2025\\data\\feature_engi_labels\\rv_labels.npy', labels_highest_soft_spatial)


In [None]:
np.save('..\\2025\\data\\feature_engi_memberships\\rv_memberships.npy', memberships_all_soft)
np.save('..\\2025\\data\\feature_engi_reduced_data\\rv_reduced.npy', reduced_data)

In [39]:

plot_label_map(labels_highest_soft_spatial, no_cluster_soft)
plt.savefig(save_dir + 'rv_label_map.png')
plt.close('all')

plot_manifold_labels(no_cluster_soft, labels_highest_soft, reduced_data)
plt.savefig(save_dir + 'rv_manifold.png')
plt.close('all')


In [None]:

plot_cluster_mean_analysis(dp=dp, include_dp_std_dev_plot=True)


## Annular averaging

In [41]:
from scipy.signal import find_peaks
from scipy.ndimage import gaussian_filter1d


def extract_annular_ring(image, center, r1, r2):
    """
    Extracts the annular ring region between r1 and r2 radii.
    
    Parameters:
    image (2D numpy array): Input diffraction pattern.
    center (tuple): Coordinates of the center (x, y).
    r1 (float): Inner radius of the annular ring.
    r2 (float): Outer radius of the annular ring.
    
    Returns:
    annular_ring (2D numpy array): Masked image with only the annular ring region.
    """
    y, x = np.indices(image.shape)
    r = np.sqrt((x - center[1])**2 + (y - center[0])**2)
    mask = (r >= r1) & (r < r2)
    return image * mask

def average_intensity_per_angle(image, center, r1, r2, degree_step=5):
    """
    Averages the intensity every 'degree_step' degrees in the annular ring.
    
    Parameters:
    image (2D numpy array): Input diffraction pattern.
    center (tuple): Coordinates of the center (x, y).
    r1 (float): Inner radius of the annular ring.
    r2 (float): Outer radius of the annular ring.
    degree_step (int): Step size in degrees for angular averaging.
    
    Returns:
    angular_profile (list): Average intensity values for each angular segment.
    """
    y, x = np.indices(image.shape)
    r = np.sqrt((x - center[1])**2 + (y - center[0])**2)
    theta = np.arctan2(y - center[0], x - center[1]) * 180 / np.pi  # Convert to degrees
    theta[theta < 0] += 360  # Normalize to [0, 360]

    # Extract the annular ring
    ring_image = extract_annular_ring(image, center, r1, r2)
    ring_theta = theta[ring_image > 0]  # Angular values in the ring
    ring_intensity = ring_image[ring_image > 0]  # Intensity values in the ring

    # Average over angular segments
    angular_profile = []
    for angle in range(0, 360, degree_step):
        angle_mask = (ring_theta >= angle) & (ring_theta < angle + degree_step)
        average_intensity = np.mean(ring_intensity[angle_mask]) if np.any(angle_mask) else 0
        angular_profile.append(average_intensity)
    
    return angular_profile

def compute_annular_averaging(image, center, local_minima, degree_step=5):
    """
    Computes the annular averaging features for all rings defined by local minima.
    
    Parameters:
    image (2D numpy array): Input diffraction pattern.
    center (tuple): Coordinates of the center (x, y).
    local_minima (list): List of radial distances defining ring boundaries.
    degree_step (int): Step size in degrees for angular averaging.
    
    Returns:
    all_profiles (list): List of angular profiles for each annular ring.
    """
    all_profiles = []
    
    for i in range(len(local_minima) - 1):
        r1 = local_minima[i]
        r2 = local_minima[i + 1]
        angular_profile = average_intensity_per_angle(image, center, r1, r2, degree_step)
        all_profiles.append(angular_profile)
    
    return all_profiles

In [80]:
def find_peaks_in_sum_rv(sum_rv):
    # Step 2: Identify local minima in the radial profile to define the boundaries of the annular rings
    sum_rv = gaussian_filter1d(sum_rv, sigma=1)
    local_minima, _ = find_peaks(-sum_rv, prominence=0.1)  # Minima are peaks of the negative radial profile
    return local_minima

In [83]:
integration1d.unfold()

local_minima = find_peaks_in_sum_rv(np.log1p(integration1d.sum(axis=0).data))

local_minima = np.insert(local_minima, 0, 0)
local_minima = np.append(local_minima, integration1d.sum(axis=0).data.shape[0] - 1)

plt.plot(integration1d.sum(axis=0).data, label='Radial Profile', color='blue')

# Plot the local minima
plt.plot(local_minima, integration1d.sum(axis=0).data[local_minima], 'ro', label='Local Minima')


[<matplotlib.lines.Line2D at 0x2675443a970>]

In [None]:
aa_dp_list = []

center=(64,64)

for ind, d in enumerate(dp.data):
    aa_dp = compute_annular_averaging(d, center, local_minima, degree_step=5)
    aa_dp_list.append(aa_dp)

aa_dp = np.array(aa_dp_list)
aa_dp = hs.signals.Signal1D(aa_dp.reshape(80,80,-1))


# 11 rings with 72 segments to each ring

In [94]:
aa_dp.plot()

In [95]:
aa_dp.unfold()

pipe = Pipeline(
[
('reduce_dims', umap.UMAP(densmap=True, n_neighbors=10, n_components=3, min_dist=0, metric='euclidean', random_state=0)), 
]
)

reduced_data = pipe.fit_transform(aa_dp.data)
clust = hdbscan.HDBSCAN(min_cluster_size=20, min_samples=10, prediction_data=True, metric='euclidean')

clust.fit(reduced_data)
x,y = 80,80

memberships_all_soft = hdbscan.all_points_membership_vectors(clust)
memberships_highest_soft = np.array([np.max(x) for x in memberships_all_soft])
memberships_highest_soft_spatial = memberships_highest_soft.reshape(x, y)
labels_highest_soft = np.array([np.argmax(x) for x in memberships_all_soft])
labels_highest_soft_spatial = labels_highest_soft.reshape(x, y)
no_cluster_soft = len(set(labels_highest_soft))
print(no_cluster_soft)

# plot_label_map(labels_highest_soft_spatial, no_cluster_soft)


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


14


In [None]:

np.save('..\\2025\\data\\feature_engi_labels\\aa_labels.npy', labels_highest_soft_spatial)


In [None]:

np.save('..\\2025\\data\\feature_engi_memberships\\aa_memberships.npy', memberships_all_soft)
np.save('..\\2025\\data\\feature_engi_reduced_data\\aa_reduced.npy', reduced_data)

In [96]:

plot_label_map(labels_highest_soft_spatial, no_cluster_soft)
plt.savefig(save_dir + 'aa_dp_label_map.png')
plt.close('all')

plot_manifold_labels(no_cluster_soft, labels_highest_soft, reduced_data)
plt.savefig(save_dir + 'aa_dp_manifold.png')
plt.close('all')


In [None]:

plot_cluster_mean_analysis(dp=dp, include_dp_std_dev_plot=True)

In [97]:
plt.plot(aa_dp.mean().data)
plt.xlabel('Bin channels (a.u.)')
plt.ylabel('Intensity (a.u.)')

Text(0, 0.5, 'Intensity (a.u.)')

# IFT

In [98]:
import numpy as np

def compute_cepstrum(I_k):
    """
    Compute the cepstrum of an input electron diffraction pattern.

    Parameters:
    I_k (numpy.ndarray): Input diffraction pattern (2D array).

    Returns:
    numpy.ndarray: Cepstrum of the input diffraction pattern.
    """
    # Compute the natural logarithm of the intensity
    log_I_k = np.log1p(I_k)
    
    # Compute the Fourier transform of the log-transformed intensity
    F_log_I_k = np.fft.fft2(log_I_k)
    
    # Step 5: Shift the FFT to center low frequencies
    F_log_I_k_shifted = np.fft.fftshift(F_log_I_k)
    
    # Step 6: Compute the magnitude of the Fourier transform
    C_p = np.abs(F_log_I_k_shifted)
    
    return C_p


In [102]:
dp.plot()

In [99]:
dp.fold()
dp_ift = dp.map(compute_cepstrum, inplace=False)

[########################################] | 100% Completed | 952.80 ms


In [103]:
dp_ift.plot(norm='symlog')

In [100]:
dp_ift.fold()
dp_ift

<ElectronDiffraction2D, title: , dimensions: (80, 80|128, 128)>

In [104]:
dp_ift.unfold()

plt.plot(np.mean(dp_ift.data, axis=0))
plt.xlabel('Bin channels (a.u.)')
plt.ylabel('Intensity (a.u.)')

dp_ift.fold()


In [105]:
from sklearn.pipeline import Pipeline
import umap
import hdbscan
from sklearn.preprocessing import RobustScaler, Binarizer

dp_ift.unfold()

pipe = Pipeline(
[
('reduce_dims', umap.UMAP(densmap=True, n_neighbors=10, n_components=3, min_dist=0, metric='euclidean', random_state=0)),
]
)

reduced_data = pipe.fit_transform(np.log1p(dp_ift.data))
clust = hdbscan.HDBSCAN(min_cluster_size=20, min_samples=10, prediction_data=True, metric='euclidean')

clust.fit(reduced_data)
x,y = 80,80 

memberships_all_soft = hdbscan.all_points_membership_vectors(clust)
memberships_highest_soft = np.array([np.max(x) for x in memberships_all_soft])
memberships_highest_soft_spatial = memberships_highest_soft.reshape(x, y)
labels_highest_soft = np.array([np.argmax(x) for x in memberships_all_soft])
labels_highest_soft_spatial = labels_highest_soft.reshape(x, y)
no_cluster_soft = len(set(labels_highest_soft))

print(no_cluster_soft)
# plot_label_map(labels_highest_soft_spatial, no_cluster_soft)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


17


In [None]:

np.save('..\\2025\\data\\feature_engi_labels\\ift_labels.npy', labels_highest_soft_spatial)


In [None]:
np.save('..\\2025\\data\\feature_engi_memberships\\ift_memberships.npy', memberships_all_soft)
np.save('..\\2025\\data\\feature_engi_reduced_data\\ift_reduced.npy', reduced_data)

In [106]:

plot_label_map(labels_highest_soft_spatial, no_cluster_soft)
plt.savefig(save_dir + 'ift_label_map.png')
plt.close('all')

plot_manifold_labels(no_cluster_soft, labels_highest_soft, reduced_data)
plt.savefig(save_dir + 'ift_manifold.png')
plt.close('all')

# plot_cluster_mean_analysis(dp=dp, include_dp_std_dev_plot=True)

In [None]:
plt.figure(figsize=(8, 6))
plt.hist(dp_ift.data[32,32], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
plt.title('Distribution of Feature Array', fontsize=14)
plt.xlabel('Feature Values', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.grid(alpha=0.3)
plt.show()
