In [None]:
from spacr.core import preprocess_generate_masks
%matplotlib inline

# Settings for preprocessing the raw images and generating object masks.
settings = {
    # --- Input and file-name metadata ---
    'src': 'path',  # (path) - Path to source folder (where original images were stored)
    'metadata_type': 'cellvoyager',  # (string) - Type of file name metadata (cellvoyager, cq1, Nikon)
    'custom_regex': None,  # (regex) - Regular expression if filename metadata not in metadata_type
    'experiment': 'screen',  # (string) - Name of experiment
    'channels': [0, 1, 2, 3],  # (list) - List of integers representing available channels

    # --- Cell channel ---
    'cell_channel': 3,  # (integer or NoneType) - Cell image dimension
    'cell_background': 100,  # (integer) - Background value in cell images
    'cell_Signal_to_noise': 10,  # (integer) - Signal to noise ratio for cell channel
    'cell_CP_prob': -1,  # (integer) - Cellpose cell probability
    'remove_background_cell': False,  # (bool) - Set background to 0 for cell channel

    # --- Nucleus channel ---
    'nucleus_channel': 0,  # (Optional, integer or NoneType) - Nucleus image dimension
    'nucleus_background': 200,  # (Optional, integer) - Background value in nucleus images
    'nucleus_Signal_to_noise': 5,  # (Optional, integer) - Signal to noise ratio for nucleus channel
    'nucleus_CP_prob': 0,  # (Optional, integer) - Cellpose nucleus probability
    'remove_background_nucleus': False,  # (Optional, bool) - Set background to 0 for nucleus channel

    # --- Pathogen channel ---
    'pathogen_model': None,  # (Optional, path or NoneType) - Custom cellpose model path for pathogen detection
    'pathogen_channel': 2,  # (Optional, integer or NoneType) - Pathogen image dimension
    'pathogen_background': 150,  # (Optional, integer) - Background value in pathogen images
    'pathogen_Signal_to_noise': 6,  # (Optional, integer) - Signal to noise ratio for pathogen channel
    'pathogen_CP_prob': -2,  # (Optional, integer) - Cellpose pathogen probability
    'remove_background_pathogen': True,  # (Optional, bool) - Set background to 0 for pathogen channel

    # --- Run options ---
    # NOTE(review): 'consolidate' is not documented in this notebook - confirm its meaning.
    'consolidate': False,
    # NOTE(review): the value 20 is not among the magnifications listed below - confirm it is supported.
    'magnification': 20,  # (integer) - Objective magnification used to acquire images (40, 60, 100)
    'save': True,  # (bool) - Save masks and object data to database
    'preprocess': True,  # (bool) - Preprocess images
    'masks': True,  # (bool) - Generate masks
    'batch_size': 100,  # (integer) - Number of images to be normalized together and loaded onto the GPU
    'filter': False,  # (bool) - Filter objects based on size
    'merge_pathogens': False,  # (bool) - Merge pathogens that share > 75% perimeter
    'plot': False,  # (bool) - Plot normalized intensity and object images
    'adjust_cells': True,  # (bool) - If cell, nucleus and pathogen: merge cells that share a pathogen

    # --- Test mode ---
    'test_mode': False,  # (bool) - Test settings in test mode before analyzing entire experiment
    'test_images': 100,  # (integer) - Number of images to analyze in test mode
    'random_test': True,  # (bool) - Randomize images for test mode
}

preprocess_generate_masks(settings)

In [None]:
from spacr.measure import measure_crop
%matplotlib inline

# Settings for measuring objects and cropping single-object images.
settings = {
    # --- Input ---
    'src': 'path',  # (path) - Path to source folder (ends in /merged)
    'channels': [0, 1, 2, 3],  # (list) - List of integers representing available channels

    # --- Mask dimensions and minimum object sizes ---
    'cell_mask_dim': 4,  # (integer or NoneType) - Cell mask dimension
    'cell_min_size': 2000,  # (integer) - Minimum size in px^2 of cell objects
    'nucleus_mask_dim': 5,  # (integer or NoneType) - Nucleus mask dimension
    'nucleus_min_size': 1000,  # (integer) - Minimum size in px^2 of nuclei objects
    'pathogen_mask_dim': 6,  # (integer or NoneType) - Pathogen mask dimension
    'pathogen_min_size': 400,  # (integer) - Minimum size in px^2 of pathogen objects
    'cytoplasm_min_size': 0,  # (integer) - Minimum size in px^2 of cytoplasm (cell-(nucleus+pathogen)) objects

    # --- PNG cropping ---
    'save_png': True,  # (bool) - Save objects as PNGs
    'crop_mode': ['cell'],  # (list) - Object(s) to be cropped into images ('cell', 'nuclei', 'pathogen')
    'use_bounding_box': False,  # (bool) - Use bounding box for cropped images instead of object area
    'png_size': [[224, 224]],  # (list of lists) - Size of single object PNGs
    'normalize': False,  # (bool or list) - Normalize PNGs to percentiles
    'png_dims': [0, 1, 2],  # (list) - Dimensions to include in PNG images
    'normalize_by': 'png',  # (string) - If normalize, normalize to fov (field of view) or png

    # --- Measurements and plotting ---
    'save_measurements': True,  # (bool) - Save measurements
    'plot': False,  # (bool) - Plot images during analysis
    'plot_filtration': False,  # (bool) - Plot filtration steps
    'uninfected': False,  # (bool) - Include uninfected

    # --- Test mode ---
    'test_mode': False,  # (bool) - Activate test mode
    'test_nr': 10,  # (integer) - Number of images to analyze in test mode
}

measure_crop(settings)

In [None]:
# Description: Train an ML model to classify cells based on measurement data
from spacr.ml import generate_ml_scores
%matplotlib inline

# Settings for training a classical ML model on measurement data and scoring cells.
settings = {
    # --- Input and model ---
    'src': 'path',  # (path) - Path to source folder (where original images were stored)
    'model_type_ml': 'xgboost',  # (string) - Type of model ('random_forest', 'xgboost', 'gradient_boosting')

    # --- Heatmap ---
    'heatmap_feature': 'predictions',  # (string) - Column to display in heatmaps
    'grouping': 'mean',  # (string) - Grouping for heatmap
    'min_max': 'allq',  # (string) - Quantiles to normalize heatmap to (all, allq)
    'cmap': 'viridis',  # (string) - Heatmap cmap

    # --- Training ---
    'n_estimators': 100,  # (integer) - Number of estimators for model
    'test_size': 0.2,  # (float) - Fraction of images used for the test set

    # --- Controls and filtering ---
    'location_column': 'column_name',  # (string) - Column containing negative/positive control metadata information
    'positive_control': 'c2',  # (string) - Value for positive control in location column
    'negative_control': 'c1',  # (string) - Value for negative control in location column
    'exclude': None,  # (string, NoneType) - Rows to exclude in location_column
    'nuclei_limit': 1,  # (integer) - Maximum number of nuclei for each cell
    'pathogen_limit': 3,  # (integer) - Maximum number of pathogens per cell

    # --- Feature importance and feature selection ---
    'n_repeats': 10,  # (integer) - Number of repeats for permutation importance
    'top_features': 30,  # (integer) - Number of top features to plot based on permutation importance, feature importance and shap
    # NOTE(review): no description was given for 'channel_of_interest' - confirm its meaning.
    'channel_of_interest': 1,  # (integer)
    'minimum_cell_count': 25,  # (integer) - Minimum number of cells per well
    'remove_low_variance_features': True,  # (bool) - Remove columns with low variance
    'remove_highly_correlated_features': True,  # (bool) - Remove highly correlated features

    # --- Runtime ---
    'verbose': False,  # (bool) - Display verbose output
    'n_jobs': 10,  # (integer) - Number of threads
}

results = generate_ml_scores(settings)

In [None]:
# Description: Fit a regression model to estimate the effect size of gRNAs on cell scores.
from spacr.ml import perform_regression
import pandas as pd
%matplotlib inline

# Settings for the regression of cell scores on gRNA content.
settings = {
    # --- Input data ---
    'count_data': 'path',  # (path) - Path or list of paths to sequencing count data
    'score_data': 'path',  # (path) - Path or list of paths to score data
    'score_column': 'column',  # (string) - Column with cell scores
    'metadata_files': ['path.csv', 'path.csv'],  # (list) - Paths to gene metadata

    # --- Controls highlighted in the volcano plot ---
    'positive_control': 'gene',  # (string) - Gene to highlight in volcano plot
    'negative_control': 'gene',  # (string) - Gene to highlight in volcano plot

    # --- gRNA fraction filtering and plots ---
    # NOTE(review): no type or description was given for 'min_n' - confirm its meaning.
    'min_n': 3,
    'fraction_threshold': None,  # (Optional, float or NoneType) - Minimum threshold for gene fraction, if None automatically calculated
    'target_unique_count': 5,  # (integer) - Number of expected unique gRNAs per well
    'tolerance': 0.02,  # (float) - Tolerance for cells per well limit
    'log_x': False,  # (bool) - gRNA fraction plot X axis log
    'log_y': False,  # (bool) - gRNA fraction plot Y axis log
    # NOTE(review): accepted type for 'x_lim' is not documented (None here) - confirm.
    'x_lim': None,  # Volcano X axis limit

    # --- Rows excluded from the model ---
    'control_wells': ['c1', 'c2', 'c3'],  # (list) - Metadata to exclude from regression model
    'filter_column': 'column',  # (string) - Column containing control metadata to remove

    # --- Regression model ---
    'dependent_variable': 'column',  # (string) - Dependent variable for regression
    'threshold_method': 'var',  # (string) - Effect size threshold type (std or var)
    'threshold_multiplier': 4,  # (integer) - Effect size threshold multiplier
    'transform': 'log',  # (string) - Transform dependent variable
    'agg_type': 'mean',  # (string) - Aggregation for dependent variable
    'min_cell_count': None,  # (integer or NoneType) - Minimum number of cells per well
    'regression_type': 'ols',  # (string) - Type of regression (ols, glm, mixed, ridge, lasso)
    'random_row_column_effects': False,  # (bool) - Remove plate, row and column random effects

    # --- Output / plotting ---
    'y_lims': [[0, 9], [12, 16]],  # (list of lists) - Limits for broken y axis
    'plate': None,  # (string or NoneType) - String to replace plate column values with
    'cov_type': None,  # (string or NoneType) - Covariance type for ols regression
    'volcano': 'gene',  # (string) - Mode for significant results (gene, grna, all)
    # NOTE(review): the original comment said "hinge and lasso", but 'hinge' is not among the
    # regression types listed for 'regression_type'; possibly 'ridge' was meant - confirm.
    'alpha': 0.8,  # (float) - alpha for hinge and lasso regression
}

coef_df = perform_regression(settings)

In [None]:
# Description: use this cell to generate train and test folders: datasets/train/nc and pc and datasets/test/nc and pc
from spacr.io import generate_training_dataset

# Settings for building the train/test image folders.
settings = {
    # --- Input and selection mode ---
    'src': 'path',  # (path) - Path to source folder (where original images were stored)
    # NOTE(review): the value 'metadata_annotation' is not among the modes listed in the comment
    # below (which names 'annotation_metadata') - confirm which spelling the library expects.
    'dataset_mode': 'metadata_annotation',  # (string) - annotation, measurement, metadata, annotation_metadata
    'tables': ['cell'],  # (list of strings) - The tables present in the database, excluding png_list
    'test_split': 0.1,  # (float) - Fraction of images used for the test set

    # --- Annotation mode ---
    'annotation_column': 'test',  # (Optional, string) - If using mode annotation, the annotation column in the database
    # (Optional, list of integers) - If using mode annotation, the integer in annotation_column;
    # if len(annotated_classes) is 1, class 2 will be generated from a random selection of images.
    'annotated_classes': [1],

    # --- Metadata mode ---
    'metadata_type_by': 'column_name',  # (Optional, string) - If using mode metadata, the column the class_metadata elements are in
    # (Optional, list of lists of strings) - If using mode metadata, the elements that define each class.
    # NOTE(review): the value here is a flat list of strings, not a list of lists - confirm expected shape.
    'class_metadata': ['c10', 'c11', 'c12', 'c22', 'c23', 'c24'],

    # --- Image filtering ---
    'png_type': 'cell_png',  # (Optional, string) - String in the path of each image (used to filter images)
    'nuclei_limit': False,  # (Optional, bool) - If cell and nucleus in tables, filter for number of nuclei per cell
    'pathogen_limit': 0,  # (Optional, integer) - If cell and pathogen in tables, filter for number of pathogens per cell
    'uninfected': True,  # (Optional, bool) - If cell and pathogen in tables, bool for uninfected cells
    'size': None,  # (Optional, integer or NoneType) - Limit for number of images to include in total (test + train) per class
}

generate_training_dataset(settings)

In [None]:
# Description: train a torch model
from spacr.deep_spacr import train_test_model

# Settings for training and/or testing a torch classification model.
settings = {
    # --- Input and run mode ---
    'src': 'path',  # (path) - Path to source folder (ends with datasets/training)
    'train': False,  # (bool) - Train
    'test': True,  # (bool) - Test
    'custom_model': 'path',  # (path) - Path to a custom model
    'classes': ['nc', 'pc'],  # (list) - List of classes (folder names in dataset/training/train or test)

    # --- Model, optimizer and loss ---
    'model_type': 'maxvit_t',  # (string) - Name of torch model architecture
    'optimizer_type': 'adamw',  # (string) - Type of optimizer
    'schedule': 'reduce_lr_on_plateau',  # (string) - Type of scheduler (reduce_lr_on_plateau or step_lr)
    'loss_type': 'focal_loss',  # (string) - Loss function (binary_cross_entropy_with_logits or focal_loss)

    # --- Data ---
    'normalize': True,  # (bool) - Apply ImageNet normalization to images before training
    'image_size': 224,  # (int) - Size of images, height and width
    'batch_size': 64,  # (int) - Nr. of images per batch
    'epochs': 100,  # (int) - Nr. of epochs for training
    'val_split': 0.1,  # (float) - Fraction of images in validation dataset

    # --- Optimization and regularization ---
    'learning_rate': 0.0001,  # (float) - Learning rate per epoch
    'weight_decay': 0.00001,  # (float) - Fraction of random weights decay (regularization)
    'dropout_rate': 0.1,  # (float) - Fraction of weights to omit per epoch (regularization)
    'init_weights': True,  # (bool) - Initiate model with ImageNet weights
    'amsgrad': True,  # (bool) - Guard against exploding gradients
    'use_checkpoint': True,  # (bool) - Checkpoint gradient calculations to save VRAM at the expense of computation
    'gradient_accumulation': True,  # (bool) - Accumulate gradients to mimic larger batches
    'gradient_accumulation_steps': 4,  # (int) - Epochs to accumulate gradients

    # --- Saving, loading and runtime ---
    # NOTE(review): key spelling kept exactly as in the original; presumably it must match
    # what train_test_model reads - confirm before renaming.
    'intermedeate_save': True,  # (bool) - Save intermediate states of the model
    'pin_memory': True,  # (bool) - Whether to pin memory for the data loader
    'n_jobs': 30,  # (int) - Number of threads to use
    'train_channels': ['r', 'g', 'b'],  # (list of 'r', 'g', and/or 'b') - PNG channels to use for training
    # (bool) - Augment the dataset: vertical, horizontal flip and rotate each image
    # to artificially expand the dataset 8 fold.
    'augment': False,
    'verbose': True,  # (bool)
}

train_test_model(settings)

In [None]:
# Description: generate a tar dataset
from spacr.io import generate_dataset

# Settings for packing images into a tar dataset.
settings = {
    'src': 'path',  # (path) - Path to source folder (where original images were stored)
    'file_metadata': 'cell_png',  # (Optional, string) - String in the path of each image (used to filter images)
    'experiment': 'test',  # (string) - Name of dataset
    'sample': 10000,  # (Optional, integer or NoneType) - Limit for number of images to include in the dataset
}

generate_dataset(settings)

In [None]:
# Description: apply a model to a tar dataset
from spacr.deep_spacr import apply_model_to_tar

# Settings for applying a trained model to a tar dataset.
settings = {
    # --- Inputs ---
    'dataset': 'path.tar',  # (path) - Path to tar dataset (ends with .tar)
    'model_path': 'path.pth',  # (path) - Path to model (ends with .pth)
    'file_type': 'cell_png',  # (Optional, string) - String in the path of each image (used to filter images)

    # --- Image handling ---
    'image_size': 224,  # (int) - Size of images, height and width
    'batch_size': 64,  # (int) - Nr. of images per batch
    'normalize': True,  # (bool) - Apply ImageNet normalization to images before training

    # --- Scoring and runtime ---
    # NOTE(review): the original comment read "Score to byass the classes"; presumably this is the
    # score cut-off used to split the classes - confirm.
    'score_threshold': 0.5,  # (float)
    'n_jobs': 30,  # (int) - Number of threads to use
    'verbose': True,  # (bool)
}

result_df = apply_model_to_tar(settings)

In [None]:
from spacr.sequencing import generate_barecode_mapping

# Settings for mapping sequencing reads to column / gRNA / row barcodes.
# NOTE(review): none of these keys were documented in the original cell; the comments below
# only restate what the literals show - confirm semantics against the spacr documentation.
settings = {
    # --- Input and read parsing ---
    'src': 'path',
    # Regex with three named groups: 'column' (8 chars), 'grna' (20-21 chars) and 'row_name' (8 chars).
    'regex': '^(?P<column>.{8})TGCTG.*TAAAC(?P<grna>.{20,21})AACTT.*AGAAG(?P<row_name>.{8}).*',
    # Starts with 'TGCTG' and ends with 'TAAAC', the two literals that precede the grna group in 'regex'.
    'target_sequence': 'TGCTGTTTCCAGCATAGCTCTTAAAC',
    'offset_start': -8,
    'expected_end': 89,

    # --- Barcode lookup tables (file names kept exactly as given) ---
    'column_csv': 'path to column_barecodes.csv',
    'grna_csv': 'path to grna_barcodes_RC.csv',
    'row_csv': 'path to row_barecodes_RC.csv',

    # --- Output ---
    'save_h5': True,
    'comp_type': 'zlib',
    'comp_level': 5,

    # --- Processing ---
    'chunk_size': 10000,
    'n_jobs': None,
    'mode': 'paired',
    'single_direction': 'R1',
    'test': False,
    'fill_na': True,
}

generate_barecode_mapping(settings)

In [None]:
# generate cellpose dataset
from spacr.io import prepare_cellpose_dataset

# Root folder handed to prepare_cellpose_dataset; replace 'path' with the real location.
input_root = 'path'

prepare_cellpose_dataset(
    input_root,
    augment_data=True,
    train_fraction=0.8,
    n_jobs=None,
)

In [None]:
# train cellpose model
from spacr.submodules import train_cellpose
%matplotlib inline

# Settings for training a cellpose model.
# NOTE(review): none of these keys were documented in the original cell; the group headings
# below are only a reading aid based on key names - confirm semantics against the spacr docs.
settings = {
    # --- Input and run mode ---
    'src': 'path',
    'test': False,

    # --- Image preparation ---
    'normalize': False,
    'percentiles': None,
    'invert': False,
    'grayscale': True,
    'rescale': False,
    'circular': False,
    'channels': [0, 0],

    # --- Model ---
    'model_name': 'test',
    'model_type': 'cyto',

    # --- Intensity / background ---
    'Signal_to_noise': 10,
    'background': 200,
    'remove_background': False,

    # --- Training hyper-parameters ---
    'learning_rate': 0.2,
    'weight_decay': 1e-05,
    'batch_size': 8,
    'n_epochs': 25000,
    'from_scratch': False,

    # --- Object and image size ---
    'diameter': 30,
    'resize': False,
    'target_dimensions': 1000,

    # --- Runtime ---
    'verbose': True,
}

train_cellpose(settings)