Libraries

In [1]:
import os
from PIL import Image
import numpy as np
import random
import datetime
import time

Protein Classes

In [2]:
# Lookup table: EMD class identifier -> pixel value used for that class
# while a mask is being assembled.
class_masks = dict([
    ('EMD-4214', 104),  # 40S Ribosome
    ('EMD-2811', 153),  # 60S Ribosome
    ('EMD-2858', 202),  # 80S Ribosome
    ('EMD-2788', 251),  # Apoferritin
])

Get Projection Image Files

In [4]:
def find_protein_directories(root=".", excluded=(".ipynb_checkpoints", "Dataset", "raw_images", "Mini_Dataset")):
    """Return the sorted names of sub-directories of ``root`` that hold projections.

    A directory qualifies when it is not listed in ``excluded`` and contains a
    ``projections`` sub-folder. Filtering on the sub-folder (instead of calling
    ``list.remove`` for each known non-protein folder) means a missing
    ``.ipynb_checkpoints`` or an extra unrelated folder no longer raises.

    Args:
        root: Directory to scan; defaults to the current working directory.
        excluded: Directory names that are never treated as protein folders.

    Returns:
        Sorted list of directory names (relative to ``root``).
    """
    names = []
    for entry in os.listdir(root):
        if entry in excluded:
            continue
        if os.path.isdir(os.path.join(root, entry, "projections")):
            names.append(entry)
    return sorted(names)


def list_projection_files(directories, root="."):
    """Collect extension-less paths of every ``.jpg`` projection image.

    Args:
        directories: Protein directory names, each containing ``projections``.
        root: Directory the names are relative to.

    Returns:
        Sorted list of unique strings ``<dir>/projections/<file stem>``. The
        result is sorted so that a seeded ``random.choice`` over it is
        repeatable between runs (set iteration order is not).
    """
    found = set()
    for d in directories:
        # normpath drops a leading "./" so the default root yields "<dir>/projections/..."
        prefix = os.path.normpath(os.path.join(root, d)) + "/projections/"
        for name in os.listdir(os.path.join(root, d, "projections")):
            stem, ext = os.path.splitext(name)
            # Only ".jpg" files can be re-opened later as stem + ".jpg"
            if ext == ".jpg":
                found.add(prefix + stem)
    return sorted(found)


# Get protein directory names
directories = find_protein_directories()

projections = list_projection_files(directories)
# Sanity check used during development: len(projections) == 400

Single Image Creation Helper Function

In [5]:
def createSingleImage(projections, class_masks, image_size=4096, particle_range=(200, 400)):
    """Compose one synthetic image of non-overlapping projections plus its mask.

    Projections are drawn uniformly at random from ``projections`` and pasted
    at random positions onto a square canvas. A placement is rejected when any
    foreground pixel of the new projection would land on a pixel already
    labelled in the mask. Generation stops once the target particle count is
    reached, or after 26 consecutive rejected placements.

    Args:
        projections: Sequence of projection paths without the ``.jpg``
            extension. The last 8 characters of each path are used as the
            class key into ``class_masks``.
        class_masks: Mapping of class key -> non-zero mask pixel value.
        image_size: Side length in pixels of the square canvas.
        particle_range: Inclusive ``(low, high)`` bounds for the randomly
            chosen target number of particles.

    Returns:
        ``(img, msk, num_particles)``: the inverted 8-bit image, the 8-bit
        mask (values 104/153/202/251 remapped to 100/150/200/250) and the
        number of particles actually placed.

    Raises:
        ValueError: If a projection's class key is missing from
            ``class_masks``, or a projection is larger than the canvas
            (raised by ``np.random.randint``).
    """
    background_level = 16       # canvas grey level before inversion
    foreground_threshold = 70   # projection pixels >= this count as particle
    max_failed_attempts = 25    # consecutive collisions tolerated before giving up

    image = np.full((image_size, image_size), background_level)
    mask = np.zeros((image_size, image_size))

    # randint's upper bound is exclusive, hence the +1 for an inclusive range
    max_particles = np.random.randint(particle_range[0], particle_range[1] + 1)

    num_particles = 0
    giveup_thresh = max_failed_attempts

    while num_particles < max_particles and giveup_thresh > -1:
        # Select random projection from all class projections (uniform distribution)
        proj_file = random.choice(projections)
        proj_class = proj_file[-8:]
        proj_mask_val = class_masks.get(proj_class)
        if proj_mask_val is None:
            raise ValueError(
                "Unknown projection class %r for file %r" % (proj_class, proj_file)
            )

        with Image.open(proj_file + ".jpg") as proj_img:
            proj_pixels = np.array(proj_img)

        # Binary footprint of the particle, labelled with its class value
        proj_mask = np.where(proj_pixels >= foreground_threshold, proj_mask_val, 0)

        height, width = proj_pixels.shape
        # +1 so the last valid offset (projection flush with the edge) is reachable
        row = np.random.randint(0, image.shape[0] - height + 1)
        column = np.random.randint(0, image.shape[1] - width + 1)
        rows = slice(row, row + height)
        cols = slice(column, column + width)

        # Check for overlap: a collision is any pixel labelled in both the
        # existing mask and the new footprint (independent of class values).
        if np.any((mask[rows, cols] != 0) & (proj_mask != 0)):
            giveup_thresh -= 1
            continue

        num_particles += 1
        giveup_thresh = max_failed_attempts
        mask[rows, cols] += proj_mask

        # Null projection background
        add_proj = proj_pixels.copy()
        if proj_class == 'EMD-2858':  # Edge Case, 80S Rib. has strange glow which is harder to eliminate
            add_proj[add_proj <= 65] = 0
        else:
            # The top-left pixel is taken as the background grey level
            background_pix_val = add_proj[0, 0]
            add_proj[add_proj == background_pix_val] = 0

        image[rows, cols] += add_proj

    # Invert the image. Sums above 255 are clamped first so saturated particle
    # pixels map to 0 instead of folding back to small positive values.
    output_image = np.uint8(255 - np.clip(image, 0, 255))
    img = Image.fromarray(output_image)  # 2-D uint8 array -> 8-bit greyscale

    # Convert mask class values, now that we have ensured no overlaps, to factors of 50
    for placed_value, final_value in ((104, 100), (153, 150), (202, 200), (251, 250)):
        mask[mask == placed_value] = final_value

    msk = Image.fromarray(np.uint8(mask))
    return img, msk, num_particles
        
# fin_image, fin_mask, num_p = createSingleImage(projections, class_masks)
# fin_image.show() 
# fin_image.save("test_raw_image_no_ctf.png")
# fin_mask.show() 
# fin_mask.save("test_mask.png")
# print("The number of particles in the image: " + str(num_p))

Apply CTF + Noise to Image Function and Save to Dataset Folder

In [6]:
def apply_ctf_and_noise(raw_img, msk, e2proc2d_path="~/Desktop/eman2/programs/e2proc2d.py"):
    """Save a raw image, run it through e2proc2d.py, and save the matching mask.

    The raw image is written to ``raw_images/raw_image_<stamp>.png``. The
    e2proc2d.py script is then run with the ``math.simulatectf`` processor to
    produce ``Dataset/images/image_<stamp>.jpg``, and the mask is written to
    ``Dataset/masks/mask_<stamp>.jpg`` using the same timestamp.

    The script is launched as a child process with the current interpreter
    instead of the ``%run`` magic, so the function is plain Python and a
    failing conversion raises before the mask is written (no orphaned masks).

    Args:
        raw_img: Object with a ``save(path)`` method (e.g. a PIL image).
        msk: Object with a ``save(path)`` method holding the class mask.
        e2proc2d_path: Location of e2proc2d.py; ``~`` is expanded.

    Raises:
        RuntimeError: If e2proc2d.py exits with a non-zero status.
    """
    import subprocess
    import sys

    postfix = datetime.datetime.now().strftime("%d_%b_%Y-H%H_M%M_S%S.%f") # Get date/time for file naming
    # Sleep 1 ms so two consecutive calls cannot share a microsecond timestamp
    time.sleep(.001)

    # Save raw_image to .png format
    raw_img_file = os.path.join("raw_images", "raw_image_" + postfix + ".png")
    raw_img.save(raw_img_file)

    # Apply CTF + Noise command
    fin_img_file = os.path.join("Dataset", "images", "image_" + postfix + ".jpg")
    command = [
        sys.executable,
        os.path.expanduser(e2proc2d_path),
        "./" + raw_img_file,
        "./" + fin_img_file,
        "--process=math.simulatectf:ampcont=10:defocus=0.25:noiseamp=2",
    ]
    result = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    # Echo the tool's output so it still shows up under the notebook cell
    if result.stdout:
        print(result.stdout, end="")
    if result.returncode != 0:
        raise RuntimeError(
            "e2proc2d.py failed with exit code %d for %s" % (result.returncode, raw_img_file)
        )

    # Save mask using same postfix as image
    # NOTE(review): JPEG is lossy, so mask label values may not survive exactly;
    # the .jpg name is kept because existing dataset consumers may rely on it.
    msk_file = os.path.join("Dataset", "masks", "mask_" + postfix + ".jpg")
    msk.save(msk_file)

Make X New Samples for Dataset

In [7]:
# How many new (image, mask) pairs to add to the dataset on this run
num_samples = 1

for _ in range(num_samples):
    # The particle count returned as the third item is not needed here
    generated = createSingleImage(projections, class_masks)
    fin_raw_image = generated[0]
    fin_mask = generated[1]
    apply_ctf_and_noise(fin_raw_image, fin_mask)

1 images, processing 0-0 stepping by 1


Tests