In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import cv2
import shutil
from PIL import Image
from skimage import exposure, restoration, color
from skimage.morphology import disk, closing
from skimage.filters import gaussian
from scipy.signal import convolve2d
import matplotlib.pyplot as plt
import glob
import mrcfile


**Setup Directory Paths**

Path - location of raw datasets

path_output - where preprocessed images will be saved

**NOTE**
Images will be saved as .npy, the NumPy binary array file type.
We need to save the files as .npy in order to preserve the normalized floating-point pixel values; saving as PNG or JPG would quantize them to integers.

If you want to save the images as png or jpg, then run the npyToFiletype utility.

In [2]:
# path to the directory containing the raw datasets (one subfolder per label)
path = "../Data Sets/Raw Datasets"

# path to the output directory where the preprocessed data is written
path_output = "../Data Sets/Processed Datasets"

**PreProcessing Function**

Preprocessing is a critical step in image analysis that involves preparing the image data for further processing. This typically involves a series of operations to correct for various artifacts and distortions in the image data, and to extract relevant features from the images. 


The goal of preprocessing is to improve the quality and relevance of the image data, and to prepare it for further processing and analysis. Proper preprocessing can have a significant impact on the accuracy and reliability of downstream analysis, so it is important to carefully choose and optimize preprocessing steps for each particular application. 

For this model the following are the preprocessing steps taken:


>   **Image Normalization**
>> Scales the pixel values of an image to a consistent range or distribution. This helps to remove biases or inconsistencies in the data and improve its suitability for subsequent processing and analysis.

> **Image Adjustment**
>> involves modifying the pixel values of an image to improve its quality or contrast. Here we use stretch and gamma adjustment

> **Image Restoration**
>>  Aims to improve the quality of an image by removing noise, blurring, or other distortions. The practices used here are:
>>>**Histogram equalization**
>>>>enhances the contrast of an image by redistributing the pixel intensities. This involves adjusting the image's histogram so that it is more evenly distributed across the available range of pixel values.

>>>**Wiener Deconvolution**
>>>> technique used to restore images that have been degraded by blurring or noise. It involves applying a deconvolution filter to the image, which estimates the original, unblurred image by removing the effects of the blurring process. 

>**Adaptive Histogram Equalization**
>>Enhances the contrast of an image, particularly in areas with low contrast or uneven illumination. Unlike traditional histogram equalization, which operates on the entire image, AHE applies the equalization on local patches of the image. This means that the contrast enhancement is localized to specific regions of the image, preserving the contrast in other regions. 

>**Gaussian Filtering**
>>Smooths an image by applying a Gaussian filter to the image data. The Gaussian filter is a low-pass filter that effectively removes high-frequency noise from the image, while preserving the overall spatial structure of the image.

>**Morphological Operations (closing)**
>> technique that involves modifying the shape or structure of objects in an image. These operations can be used to remove noise or to enhance specific features in an image, such as edges or boundaries. 





In [3]:
def preprocess_image(image_data):
    """Run the full preprocessing pipeline on a single image.

    The steps are applied in this order: max-normalization, greyscale
    conversion, 5th-95th percentile contrast stretch, gamma adjustment,
    histogram equalization, box blur followed by Wiener deconvolution,
    a second histogram equalization, two passes of adaptive histogram
    equalization, four rounds of Gaussian smoothing plus total-variation
    denoising, and finally a morphological closing with a disk of radius 5.

    Parameters
    ----------
    image_data : array-like
        Either an RGB image (channels on the last axis) or an image that is
        already single-channel (2-D).

    Returns
    -------
    numpy.ndarray
        The 2-D greyscale image produced by the closing step.
    """
    image_data = np.asarray(image_data)
    # image normalization: divide by the maximum so the brightest pixel becomes 1.
    # An all-zero image has no maximum to scale by; dividing would fill the
    # array with NaNs, so it is passed through as floats instead.
    peak = np.max(image_data)
    if peak != 0:
        normalized_data = image_data / peak
    else:
        normalized_data = image_data.astype(float)
    # convert image to greyscale (rgb2gray rejects input that is already 2-D,
    # so single-channel images skip the conversion)
    if normalized_data.ndim == 2:
        gray_image_data = normalized_data
    else:
        gray_image_data = color.rgb2gray(normalized_data)
    # image adjustment (stretch and adjust)
    # stretch the contrast so the 5th and 95th percentiles of the pixel
    # intensity distribution span the output range
    p1, p2 = np.percentile(gray_image_data, (5, 95))
    stretched_data = exposure.rescale_intensity(gray_image_data, in_range=(p1, p2))
    # adjust the contrast and brightness of the image
    adjusted_data = exposure.adjust_gamma(stretched_data, gamma=0.5)
    # image restoration (histogram equalization and Wiener deconvolution)
    # histogram equalization
    histeq_data = exposure.equalize_hist(adjusted_data)
    # Wiener deconvolution
    # the psf used is a general approximation (3x3 box filter); a better
    # estimate could be created for our data set (possible improvement,
    # though probably small)
    psf = np.ones((3, 3)) / 9
    blurred_data = convolve2d(histeq_data, psf, mode='same', boundary='symm')
    wiener_deconvolved_data = restoration.wiener(blurred_data, psf, balance=0.1)
    # histogram equalization
    histeq_data = exposure.equalize_hist(wiener_deconvolved_data)
    # adaptive histogram equalization
    adapeq_data = exposure.equalize_adapthist(histeq_data, clip_limit=0.02, kernel_size=None)
    # adaptive histogram equalization again, this time with a much higher clip limit
    adapeq_data = exposure.equalize_adapthist(adapeq_data, clip_limit=0.99, kernel_size=None)
    # Gaussian filtering 4 times, each followed by total-variation denoising
    filtered_image_data = adapeq_data.copy()
    for _ in range(4):
        filtered_image_data = gaussian(filtered_image_data, sigma=1)
        filtered_image_data = restoration.denoise_tv_chambolle(filtered_image_data, weight=0.1)
    # morphological closing operation with a disk-shaped footprint
    footprint = disk(5)
    closed_image_data = closing(filtered_image_data, footprint)
    return closed_image_data

**Importing Datasets and Directory Set Up**

imports raw dataset into notebook.

Imports images to a dictionary of format dataset[folder]->Image

Images are imported and the preprocessing is applied before converting to numpy array, and saving to the dictionary dataset

In [4]:
def _mrc_to_png(mrc_path, png_path):
    """Convert the image stored in an .mrc file to an 8-bit PNG.

    Returns True when the PNG was written and False when the file holds no
    readable data.
    """
    # PIL has no MRC decoder, so the array is read with mrcfile instead.
    # Permissive mode downgrades header problems to warnings.
    with mrcfile.open(mrc_path, permissive=True) as mrc:
        if mrc.data is None:
            return False
        mrc_data = np.array(mrc.data, dtype=np.float64)
    if mrc_data.size == 0:
        return False
    # For stacks/volumes only the first 2-D section is converted.
    while mrc_data.ndim > 2:
        mrc_data = mrc_data[0]
    # Rescale the full value range onto 0..255 so it fits a PNG.
    low, high = mrc_data.min(), mrc_data.max()
    if high > low:
        mrc_data = (mrc_data - low) / (high - low) * 255.0
    else:
        mrc_data = np.zeros_like(mrc_data)
    Image.fromarray(mrc_data.astype(np.uint8)).save(png_path, 'PNG')
    return True


dataset = {}

# Loop through all folders in the directory and import images
for folder in sorted(os.listdir(path)):
    folder_path = os.path.join(path, folder)
    # Stray files next to the label folders (e.g. .DS_Store) are not datasets
    if not os.path.isdir(folder_path):
        continue
    data = []
    # Find all files in the folder, in sorted order
    image_paths = sorted(glob.glob(os.path.join(folder_path, '*')))
    # Stems of every .mrc file: the PNG sitting next to one is its converted
    # copy and must not be loaded a second time as an image of its own
    mrc_stems = {
        os.path.splitext(p)[0]
        for p in image_paths
        if os.path.splitext(p)[1].lower() == '.mrc'
    }

    for image_path in image_paths:
        # Check the file extension
        stem, ext = os.path.splitext(image_path)
        ext = ext.lower()
        if ext == '.png' and stem in mrc_stems:
            continue
        if ext == '.mrc':
            # Convert .mrc images to PNG once; later runs reuse the PNG
            png_path = stem + '.png'
            if not os.path.exists(png_path) and not _mrc_to_png(image_path, png_path):
                continue
            image = cv2.imread(png_path)
        else:
            image = cv2.imread(image_path)
        # cv2.imread returns None for anything it cannot decode; skip those
        # before any further OpenCV call touches the image
        if image is None:
            continue
        # OpenCV loads BGR; the preprocessing expects RGB
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        preprocessed_image = preprocess_image(image)
        data.append(preprocessed_image)

    # Convert data to numpy arrays
    data = np.array(data)

    # Add the data to the dataset dictionary
    dataset[folder] = data


**Setup output directories**

setup subdirectory in output directory based on folder labels

output directory will now match the shape and style of the input directory

In [5]:
# Start from a clean slate: drop any previous output tree, files and subfolders included
if os.path.exists(path_output):
    shutil.rmtree(path_output)
# Recreate the (now empty) output root; the preprocessing results are stored
# in one subfolder per label
os.makedirs(path_output)
# every key of the dataset dictionary is a label that needs its own subdirectory
labels = dataset.keys()
for label in labels:
    label_dir = os.path.join(path_output, label)
    # nothing to do when the subdirectory is already there
    if os.path.exists(label_dir):
        continue
    os.makedirs(label_dir)

**Save processed data**

Saves the processed data as .npy files in the output directories

In [6]:
# Write every preprocessed image to <path_output>/<folder>/<folder>_NNN.npy
for folder, images in dataset.items():
    folder_path = os.path.join(path_output, folder)
    for i, image in enumerate(images):
        # numbering starts at 1 and is zero-padded to three digits
        file_name = f"{folder}_{i+1:03}.npy"
        np.save(os.path.join(folder_path, file_name), image)