**This notebook runs the individual steps of the provided Snakemake pipeline. This may be useful for understanding the functions, but it is highly recommended to use Snakemake to run the pipeline on screening data.**

In [86]:
import ops
from ops.imports_ipython import *
from ops.paper.cell_idr import setup_example

# The example is run from the repository directory: the folder that contains
# the installed `ops` package (two levels above ops/__init__.py).
home = os.path.split(os.path.split(ops.__file__)[0])[0]
os.chdir(home)

In [87]:
# Work inside <home>/example; the bare expression on the last line displays
# the resolved path as the cell output.
example_dir = os.path.join(home, 'example')
os.chdir(example_dir)
example_dir

'/Users/sasha/PycharmProjects/OpticalPooledScreens/example'

In [88]:
# --- processing parameters -------------------------------------------------
# thresholds handed to read extraction, nucleus segmentation and cell segmentation
THRESHOLD_READS = 50
THRESHOLD_DAPI = 200
THRESHOLD_CELL = 600

# (area_min, area_max) passed to nucleus segmentation
NUCLEUS_AREA = (40, 400)

# the single well / tile processed by this notebook; also used to build the input glob
WILDCARDS = {'well': 'A1', 'tile': 107}

# NOTE(review): cycle 6 is absent from this list -- confirm that is intentional
SBS_CYCLES = [1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12]

# --- display settings ------------------------------------------------------
# LUTS and DISPLAY_RANGES are passed side by side to save(); presumably entry k
# of each describes image channel k -- TODO confirm
LUTS = [ops.io.GRAY, ops.io.GREEN, ops.io.RED, ops.io.MAGENTA, ops.io.CYAN]

DISPLAY_RANGES = [
    [500, 15000], [100, 10000], [100, 20000], [100, 8000], [100, 6000],
]

# --- barcode table ---------------------------------------------------------
# read relative to the current working directory
barcodes = pd.read_csv('barcodes.csv')

In [89]:
# Collect the image of every SBS cycle for the selected well and tile.
# natsorted is applied so the cycles come back in numeric order (c2 before c10).
search = 'experimentC/input/*/10X*{well}_Tile-{tile}.tif'.format(**WILDCARDS)
input_files = natsorted(glob(search))

# An empty match would otherwise surface below as a bare IndexError on
# input_files[0]; report the pattern and working directory instead.
if not input_files:
    raise FileNotFoundError(
        'no input images match {!r} (working directory: {})'.format(search, os.getcwd())
    )

for f in input_files:
    print(f)

# used to format output filenames
description = parse(input_files[0])
description['subdir'] = 'experimentC/process_ipynb'
# drop the cycle field parsed from the first file (presumably because the
# outputs written below combine all cycles -- confirm); the trailing `;`
# suppresses the popped value in the cell output
description.pop('cycle');

experimentC/input/10X_c1-SBS-1/10X_c1-SBS-1_A1_Tile-107.tif
experimentC/input/10X_c2-SBS-2/10X_c2-SBS-2_A1_Tile-107.tif
experimentC/input/10X_c3-SBS-3/10X_c3-SBS-3_A1_Tile-107.tif
experimentC/input/10X_c4-SBS-4/10X_c4-SBS-4_A1_Tile-107.tif
experimentC/input/10X_c5-SBS-5/10X_c5-SBS-5_A1_Tile-107.tif
experimentC/input/10X_c6-SBS-6/10X_c6-SBS-6_A1_Tile-107.tif
experimentC/input/10X_c7-SBS-7/10X_c7-SBS-7_A1_Tile-107.tif
experimentC/input/10X_c8-SBS-8/10X_c8-SBS-8_A1_Tile-107.tif
experimentC/input/10X_c9-SBS-9/10X_c9-SBS-9_A1_Tile-107.tif
experimentC/input/10X_c10-SBS-10/10X_c10-SBS-10_A1_Tile-107.tif
experimentC/input/10X_c11-SBS-11/10X_c11-SBS-11_A1_Tile-107.tif
experimentC/input/10X_c12-SBS-12/10X_c12-SBS-12_A1_Tile-107.tif


In [90]:
# Load every cycle image into one stacked array, align it, and write the
# aligned stack using the shared LUTs / display ranges.
data = np.array(list(map(read, input_files)))
aligned = Snake._align_SBS(data)

aligned_path = name(description, tag='aligned')
save(aligned_path, aligned, luts=LUTS, display_ranges=DISPLAY_RANGES)

In [91]:
# Apply the pipeline's log transform to the aligned stack and write it out.
# skip_index=0 is forwarded unchanged; presumably channel 0 is left
# untransformed -- TODO confirm against Snake._transform_log.
loged = Snake._transform_log(aligned, skip_index=0)

log_path = name(description, tag='log')
save(log_path, loged, luts=LUTS, display_ranges=DISPLAY_RANGES)

In [92]:
# Max-filter the log-transformed stack. The positional 3 is presumably the
# filter width -- TODO confirm against Snake._max_filter.
maxed = Snake._max_filter(loged, 3, remove_index=0)

# The first LUT / display range is dropped when saving, in line with
# remove_index=0 (presumably channel 0 is removed from the result -- confirm).
maxed_path = name(description, tag='maxed')
save(maxed_path, maxed, luts=LUTS[1:], display_ranges=DISPLAY_RANGES[1:])

### detect candidate reads

In [93]:
# Compute the standard-deviation image from the log-transformed stack
# (remove_index=0 is forwarded unchanged) and write it with default display settings.
std = Snake._compute_std(loged, remove_index=0)

std_path = name(description, tag='std')
save(std_path, std)

In [94]:
# Find peaks in the standard-deviation image; these are the candidate reads
# used by the extraction step further down.
peaks = Snake._find_peaks(std)

peaks_path = name(description, tag='peaks')
save(peaks_path, peaks)

### segment nuclei and cells

In [95]:
# Segment nuclei on the first entry of the (unaligned) input stack, using the
# DAPI threshold and the configured nucleus area bounds.
area_min, area_max = NUCLEUS_AREA
nuclei = Snake._segment_nuclei(
    data[0],
    THRESHOLD_DAPI,
    area_min=area_min,
    area_max=area_max,
)

nuclei_path = name(description, tag='nuclei')
save(nuclei_path, nuclei, compress=1)

In [96]:
# Segment cells on the same first-cycle image, passing in the nuclei
# segmentation and the cell threshold.
cells = Snake._segment_cells(data[0], nuclei, THRESHOLD_CELL)

cells_path = name(description, tag='cells')
save(cells_path, cells, compress=1)

### extract base intensity, call reads, assign to cells

In [97]:
# Extract base intensities at the candidate peaks from the max-filtered stack,
# using the cell segmentation and the read threshold; WILDCARDS supplies the
# well / tile values that appear as columns in the resulting table.
df_bases = Snake._extract_bases(
    maxed, peaks, cells, THRESHOLD_READS, wildcards=WILDCARDS
)
print(df_bases)

# `index` is documented as a bool: pass False explicitly (rather than None)
# to leave the row index out of the CSV.
df_bases.to_csv(name(description, tag='bases', ext='csv'), index=False)

         read  cycle channel  intensity  cell     i    j  tile well
0           0      1       A        653     0     5   81   107   A1
1           0      1       C        431     0     5   81   107   A1
2           0      1       G         28     0     5   81   107   A1
3           0      1       T         16     0     5   81   107   A1
4           0      2       A         17     0     5   81   107   A1
...       ...    ...     ...        ...   ...   ...  ...   ...  ...
800923  16685     11       T          0  3693  1014  128   107   A1
800924  16685     12       A        666  3693  1014  128   107   A1
800925  16685     12       C        835  3693  1014  128   107   A1
800926  16685     12       G        472  3693  1014  128   107   A1
800927  16685     12       T        547  3693  1014  128   107   A1

[808608 rows x 9 columns]


In [98]:
# Call reads from the per-base intensity table, passing the peaks image along.
df_reads = Snake._call_reads(df_bases, peaks=peaks)

# `filename` is reused by the next cell to read this table back from disk.
filename = name(description, tag='reads', ext='csv')
# `index` is documented as a bool: pass False explicitly (rather than None)
# to leave the row index out of the CSV.
df_reads.to_csv(filename, index=False)

In [99]:
# Reload the reads table from the CSV written in the previous cell (rather than
# reusing the in-memory frame) to match the numerical precision of the
# snakemake pipeline.
df_reads = pd.read_csv(filename)

# Call cells from the reloaded reads.
df_cells = Snake._call_cells(df_reads)
# `index` is documented as a bool: pass False explicitly (rather than None)
# to leave the row index out of the CSV.
df_cells.to_csv(name(description, tag='cells', ext='csv'), index=False)

### extract phenotypes and combine with called cells

In [100]:
# NOTE(review): this whole cell is commented out, so `df_combined` and
# `barcode_info` are never created and no 'combined' / 'barcode_info' CSV is
# written by this notebook as it stands. The disabled call passes
# phenotype_tables=None. Confirm whether the step should be restored or the
# cell removed.
# df_combined, barcode_info = Snake._merge_sbs_phenotype(
#     sbs_tables=df_cells,
#     phenotype_tables=None,
#     barcode_table=barcodes,
#     sbs_cycles=SBS_CYCLES
# )
# df_combined.to_csv(name(description, tag='combined', ext='csv'), index=None)
# barcode_info.to_csv(name(description, tag='barcode_info', ext='csv'), index=None)
# # print(barcode_info)

### annotated SBS images

In [101]:
# Display settings for the annotated image: the base LUTs plus two extra
# entries, and the base display ranges divided by 4 plus a [0, 4] range.
# (Original note: the last channel annotates base calls.)
# NOTE(review): annotate_luts ends up with len(LUTS) + 2 entries while
# annotate_display_ranges has len(DISPLAY_RANGES) + 1 -- confirm which one
# matches the channel count of annotate_SBS and how save() treats the mismatch.
annotate_luts = [*LUTS, ops.annotate.GRMC, ops.io.GRAY]
scaled_ranges = [(low / 4, high / 4) for low, high in DISPLAY_RANGES]
annotate_display_ranges = scaled_ranges + [[0, 4]]

# Build the annotated stack from the log-transformed data and the called reads.
annotate_SBS = Snake._annotate_SBS(log=loged, df_reads=df_reads)

annotate_path = name(description, tag='annotate_SBS')
save(
    annotate_path,
    annotate_SBS,
    luts=annotate_luts,
    display_ranges=annotate_display_ranges,
    compress=1,
)

In [102]:
# NOTE(review): this whole cell is commented out, so `annotate_SBS_extra` is
# never computed or saved by this notebook as it stands. The disabled code
# uses `barcodes` and `SBS_CYCLES`, and saves with the first LUT / display
# range dropped ([1:]). Confirm whether the step should be restored or the
# cell removed.
# # second-to-last channel annotates base calls (notches are mapped reads, pluses are unmapped reads)
# # last channel encodes peaks value
# annotate_extra_luts = LUTS + [ops.annotate.GRMC, ops.io.GRAY, ops.io.GRAY]
# annotate_extra_display_ranges = (
#     [(a/4, b/4) for a,b in DISPLAY_RANGES]
#     +[[0, 4], [0, THRESHOLD_READS*4], [0, 30]]
# )
# annotate_SBS_extra = Snake._annotate_SBS_extra(
#     log=loged,
#     peaks=peaks,
#     df_reads=df_reads,
#     barcode_table=barcodes,
#     sbs_cycles=SBS_CYCLES
# )
# save(name(description, tag='annotate_SBS_extra'), annotate_SBS_extra,
#      display_ranges=annotate_extra_display_ranges[1:], luts=annotate_extra_luts[1:], compress=1)