In [24]:
import ops
from ops.imports_ipython import *

# Run the example from inside the repository: the repository root is taken
# to be two directory levels above the `ops` package's `__file__`.
home = os.path.dirname(
    os.path.dirname(ops.__file__)
)
os.chdir(
    os.path.join(home, 'projects', 'example')
)
# show the directory that all relative paths below are resolved against
print(os.getcwd())

/Users/sasha/PycharmProjects/OpticalPooledScreens/projects/example


In [25]:
# image processing thresholds and expected values
THRESHOLD_READS = 50    # passed to Snake._extract_bases
THRESHOLD_DAPI = 200    # passed to Snake._segment_nuclei
THRESHOLD_CELL = 600    # passed to Snake._segment_cells
NUCLEUS_AREA = (40, 400)  # used as (area_min, area_max) for nuclei segmentation

# change these to change the well and tile that you want to analyze
WILDCARDS = {'well': 'A1', 'tile': 107}

# should be 1 to n + 1 where n = number of cycles
SBS_CYCLES = range(1, 13)

# Colors used when saving images.
# LUT = "lookup table": maps one color to another, like a filter.
# Passed as `luts=` to save(); presumably one entry per image channel -- TODO confirm.
LUTS = [ops.io.GRAY, ops.io.GREEN, ops.io.RED, ops.io.MAGENTA, ops.io.CYAN]

# Pairs of numbers passed as `display_ranges=` to save(), alongside LUTS.
# Presumably the [min, max] display intensity per channel of the saved tif -- TODO confirm.
DISPLAY_RANGES = [
    [500, 15000], [100, 10000], [100, 20000], [100, 8000], [100, 6000],
]

In [26]:
# Find the SBS images for the well/tile selected in WILDCARDS and print their paths.
# The pattern is relative to the current working directory.
search = 'experimentC/input/*/10X*{well}_Tile-{tile}.tif'.format(**WILDCARDS)

# natural sort of the matched paths (the printed list shows c2 before c10)
input_files = natsorted(glob(search))
if not input_files:
    # glob() returns an empty list when nothing matches; fail here with the
    # pattern in the message rather than with a bare IndexError on
    # input_files[0] below.
    raise FileNotFoundError(
        'no SBS images match pattern {!r}; check WILDCARDS and the working directory'.format(search)
    )
print(input_files)

# used to format output filenames
description = parse(input_files[0])
description['subdir'] = 'experimentC/process_ipynb'
# Drop the per-cycle field from the description; as the last expression of
# the cell, the removed value is displayed.
description.pop('cycle')

['experimentC/input/10X_c1-SBS-1/10X_c1-SBS-1_A1_Tile-107.tif', 'experimentC/input/10X_c2-SBS-2/10X_c2-SBS-2_A1_Tile-107.tif', 'experimentC/input/10X_c3-SBS-3/10X_c3-SBS-3_A1_Tile-107.tif', 'experimentC/input/10X_c4-SBS-4/10X_c4-SBS-4_A1_Tile-107.tif', 'experimentC/input/10X_c5-SBS-5/10X_c5-SBS-5_A1_Tile-107.tif', 'experimentC/input/10X_c6-SBS-6/10X_c6-SBS-6_A1_Tile-107.tif', 'experimentC/input/10X_c7-SBS-7/10X_c7-SBS-7_A1_Tile-107.tif', 'experimentC/input/10X_c8-SBS-8/10X_c8-SBS-8_A1_Tile-107.tif', 'experimentC/input/10X_c9-SBS-9/10X_c9-SBS-9_A1_Tile-107.tif', 'experimentC/input/10X_c10-SBS-10/10X_c10-SBS-10_A1_Tile-107.tif', 'experimentC/input/10X_c11-SBS-11/10X_c11-SBS-11_A1_Tile-107.tif', 'experimentC/input/10X_c12-SBS-12/10X_c12-SBS-12_A1_Tile-107.tif']


'c1-SBS-1'

In [28]:
# Load every input image into one array (indexed below as a 4-D stack).
data = np.array([read(f) for f in input_files])

# rigid alignment of sequencing cycles and channels
aligned, x_offsets, y_offsets = Snake._align_SBS(data, method="SBS_mean")

# Crop the raw stack so its shape matches the aligned image.
# The stop index is the negated last offset. When that offset is 0 the
# original expression `start:-0` is `start:0`, which selects nothing and
# leaves `data` empty along that axis. Use an open-ended stop (None) in that
# case; nonzero offsets slice exactly as before.
x_stop = -x_offsets[-1] or None
y_stop = -y_offsets[-1] or None
data = data[:, :, abs(x_offsets[0]):x_stop, abs(y_offsets[0]):y_stop]

save(name(description, tag='aligned'), aligned, display_ranges=DISPLAY_RANGES, luts=LUTS)

In [29]:
# apply Laplacian-of-Gaussian filter from scipy.ndimage
loged = Snake._transform_log(aligned, skip_index=0)
save(
    name(description, tag='log'),
    loged,
    display_ranges=DISPLAY_RANGES,
    luts=LUTS,
)

In [30]:
# Apply a maximum filter in a window of `width`. Conventionally operates on
# Laplacian-of-Gaussian filtered SBS data, dilating sequencing channels to
# compensate for single-pixel alignment error.
maxed = Snake._max_filter(loged, 3, remove_index=0)
# The first display range and LUT are skipped here; presumably because
# remove_index=0 drops the first channel -- TODO confirm.
save(
    name(description, tag='maxed'),
    maxed,
    display_ranges=DISPLAY_RANGES[1:],
    luts=LUTS[1:],
)

In [31]:
# Use standard deviation over cycles, followed by mean across channels, to
# estimate sequencing read locations.
std = Snake._compute_std(
    loged,
    remove_index=0,
)
save(name(description, tag='std'), std)

In [32]:
# where are the spots: locate peaks in the `std` image
peaks = Snake._find_peaks(std)
save(
    name(description, tag='peaks'),
    peaks,
)

### segment nuclei and cells

In [33]:
# Find nuclei from DAPI (fluorescent stain), using the first entry of the
# cropped `data` stack and the area bounds from NUCLEUS_AREA.
nuclei = Snake._segment_nuclei(
    data[0],
    THRESHOLD_DAPI,
    area_min=NUCLEUS_AREA[0],
    area_max=NUCLEUS_AREA[1],
)
save(name(description, tag='nuclei'), nuclei, compress=1)

In [34]:
# Matches cell labels to nuclei labels.
cells = Snake._segment_cells(
    data[0],
    nuclei,
    THRESHOLD_CELL,
)
save(name(description, tag='cells'), cells, compress=1)

### extract base intensity, call reads, assign to cells

In [35]:
# Find the signal intensity from `maxed` at each point in `peaks` above
# `threshold_peaks` (THRESHOLD_READS is the value passed here).
df_bases = Snake._extract_bases(
    maxed,
    peaks,
    cells,
    THRESHOLD_READS,
    wildcards=WILDCARDS,
)
print(df_bases)
# write the table without the index column
df_bases.to_csv(
    name(description, tag='bases', ext='csv'),
    index=None,
)

         read  cycle channel  intensity  cell    i    j  tile well
0           0      1       A        653     0    5   81   107   A1
1           0      1       C        431     0    5   81   107   A1
2           0      1       G         28     0    5   81   107   A1
3           0      1       T         16     0    5   81   107   A1
4           0      2       A         10     0    5   81   107   A1
...       ...    ...     ...        ...   ...  ...  ...   ...  ...
730699  15222     11       T         98  3457  986  859   107   A1
730700  15222     12       A        323  3457  986  859   107   A1
730701  15222     12       C        231  3457  986  859   107   A1
730702  15222     12       G         79  3457  986  859   107   A1
730703  15222     12       T         73  3457  986  859   107   A1

[736080 rows x 9 columns]


In [36]:
# Call reads by compensating for channel cross-talk and calling the base
# with the highest corrected intensity for each cycle.
# NOTE(review): "Q" in the output presumably stands for quality -- confirm.
df_reads = Snake._call_reads(
    df_bases,
    peaks=peaks,
)
# `filename` is reused by the next cell to read this table back from disk
filename = name(description, tag='reads', ext='csv')
df_reads.to_csv(filename, index=None)

In [37]:
# Reload the reads from csv to match the numerical precision of the
# snakemake pipeline.
df_reads = pd.read_csv(filename)

# gets the two most-common barcode reads for each cell
df_cells = Snake._call_cells(df_reads)
df_cells.to_csv(
    name(description, tag='cells', ext='csv'),
    index=None,
)

### annotated SBS images

In [38]:
# last channel annotates base calls
# NOTE(review): this builds len(LUTS) + 2 lookup tables but only
# len(DISPLAY_RANGES) + 1 display ranges -- confirm which count matches the
# channels returned by Snake._annotate_SBS.
annotate_luts = [*LUTS, ops.annotate.GRMC, ops.io.GRAY]

# every existing display range scaled down by 4, plus (0, 4) for the extra channel
annotate_display_ranges = [(low / 4, high / 4) for low, high in DISPLAY_RANGES]
annotate_display_ranges.append((0, 4))

annotate_SBS = Snake._annotate_SBS(log=loged, df_reads=df_reads)

save(
    name(description, tag='annotate_SBS'),
    annotate_SBS,
    display_ranges=annotate_display_ranges,
    luts=annotate_luts,
    compress=1,
)