# Upload data to the Zooniverse


## Installing dependencies
First of all, we need to install Zooniverse's `panoptes_client` library to be able to interact with the server.

For information about how Zooniverse projects are set up, visit the [Project Builder how-to](https://help.zooniverse.org/getting-started/)

In [None]:
!pip install --user panoptes_client

## Set up Zooniverse project information
Each project and workflow has a unique identifier; edit the values below to reflect your project.

In [None]:
# Project slug ("owner/project-name"), taken from the project URL,
# e.g. https://www.zooniverse.org/projects/h-spiers/etch-a-cell
# It is passed to Project.find(slug=...) when connecting.
PROJECT_ID = "h-spiers/etch-a-cell"   # Change this to the project location, taken from the URL, e.g. https://www.zooniverse.org/projects/h-spiers/etch-a-cell
# Local directory that is searched for the jpeg files to upload
DATA_DIR = "/path/to/my/jpeg/files"
# Zooniverse account name used to log in (the password is prompted for interactively)
USERNAME = "enter_your_user_name_here"
# Only files in DATA_DIR whose names start with this prefix (and end in .jpg) are listed
PREFIX = "prefix_of_your_data"   # e.g. filename HeLa_cell2_Run1_0003.jpg, use prefix like "HeLa_cell2_Run1_"
# Placed at the start of every subject set name created by the upload cell
OUTPUT_PREFIX = "Subject_Prefix" # adds a prefix to the uploaded subjects to help with bookkeeping
# NOTE(review): TILE_X / TILE_Y are not read by the code visible in this notebook
TILE_X = 1    # Keeps track of tiling of raw image - for Etch a Cell there was no additional tiling of ROIs
TILE_Y = 1

Import the necessary libraries and modules

In [None]:
from panoptes_client import Project, Panoptes, Subject, SubjectSet
import glob
import os
import sys
import getpass
import re

## Set up properties for image upload
Each __subject__ in an Etch a Cell project on the Zooniverse usually consists of several images arranged in a __flipbook__. For 3D data, this allows us to provide some context for the volunteer to see how the structures change in nearby z-slices. 

For organelles and structures of different sizes, the spacing between these z-slices can vary, so we set that here. The variable `SPAN` is how many images above and below the central image we include in the subject. E.g. `SPAN = 2` is the central slice plus 2 from above and 2 from below (a total of 5 images). The variable `STEP` is the distance between these images (in slices), e.g. a `SPAN = 2` and `STEP = 10`, centred on slice 50 would include the slices [30, 40, 50, 60, 70] in the subject.

In [None]:
# Number of images taken on each side of the central slice (a subject holds 2 * SPAN + 1 images)
SPAN = 2
# Distance, in slices, between consecutive images of one subject
STEP = 1

When building a subject, we need to attach the set of images that we have chosen and fill in some metadata so we can track important information about the experiment. Additional metadata fields can be added as, e.g. `subject.metadata['New field'] = 'New field information'`

In [None]:
# This function builds the subject from the chosen set of images and attaches metadata
def build_subject(project, file_list, centre_idx, span, step):
    """Create, fill and save one flipbook subject centred on ``centre_idx``.

    The subject holds ``2 * span + 1`` images: the file at ``centre_idx`` plus
    ``span`` files on either side, taken every ``step`` entries of ``file_list``.

    Args:
        project: Project the new subject is linked to.
        file_list: Ordered sequence of image paths (one per z-slice).
        centre_idx: Index into ``file_list`` of the central image.
        span: Number of images to include on each side of the centre.
        step: Distance, in list entries, between consecutive images.

    Returns:
        The saved subject.

    Raises:
        IndexError: If the requested window does not fit inside ``file_list``.
    """
    first_idx = centre_idx - step * span
    last_idx = centre_idx + step * span
    # Check the whole window up front: a negative index would not fail but
    # silently wrap around and attach slices from the far end of the stack.
    if first_idx < 0 or last_idx >= len(file_list):
        raise IndexError(
            "Cannot build a subject centred on index %d: it needs indices %d-%d "
            "but only %d files are available" % (centre_idx, first_idx, last_idx, len(file_list))
        )

    subject_id = first_idx + 1  # 1-based position of the first image of the flipbook

    subject = Subject() # Initialise a subject
    subject.links.project = project # ...attach it to a project
    subject.metadata['Subject ID'] = subject_id

    # For loop to attach the images to the subject one-by-one
    for i, idx in enumerate(range(first_idx, last_idx + 1, step)):
        fname = str(file_list[idx])
        print("Attaching %s to subject %d" % (os.path.basename(fname), subject_id))
        subject.add_location(fname)
        subject.metadata['Image %d' % i] = os.path.basename(fname)  # Record the name of each image
    subject.metadata['default_frame'] = span + 1  # We want people to annotate the middle image

    # Metadata from here should be changed according to the data
    subject.metadata['Microscope'] = 'SBF SEM'
    subject.metadata['Raw XY resolution (nm)'] = 10
    subject.metadata['Raw Z resolution (nm)'] = 50
    subject.metadata['Scaling factor'] = 2   # The scaling between the raw ROI and jpeg image
    subject.metadata['jpeg quality (%)'] = 90
    subject.metadata['Attribution'] = 'Person who acquired the data'
    subject.metadata['Description'] = 'Free text description of the data'
    print("Starting to save")
    print(subject)
    subject.save()
    print("Subject saved")

    return subject



This function connects to the Zooniverse. Make sure the project ID and username were set correctly earlier.

In [None]:
def connect_to_zooniverse(project_id=PROJECT_ID, user_name=USERNAME):
    """Log in to the Zooniverse and return the project identified by ``project_id``.

    The password is requested interactively. If asking for the password or
    logging in raises, the error is printed and the interpreter exits with
    status 1.

    Args:
        project_id: Project slug handed to ``Project.find``.
        user_name: Account name used for the login.

    Returns:
        Whatever ``Project.find(slug=project_id)`` returns.
    """
    try:
        secret = getpass.getpass(prompt='Password: ', stream=None)
        Panoptes.connect(username=user_name, password=secret)
        print("Connected to Zooniverse")
    except Exception as err:
        # Report the failure and stop: nothing below can work without a session
        print("Couldn't connect to Zooniverse", "Exception {}".format(err), sep="\n")
        sys.exit(1)

    print(f"Connecting to {project_id}...")
    found_project = Project.find(slug=project_id)
    print("...connected!")
    return found_project

In [None]:
# Helper function to initialise a "subject set" and attach it to a project
def initialise_subject_set(project, subject_name):
    """Create a subject set named ``subject_name``, link it to ``project`` and save it.

    Args:
        project: Project the subject set is linked to.
        subject_name: Display name given to the subject set.

    Returns:
        The saved subject set.
    """
    new_set = SubjectSet()
    new_set.links.project = project
    new_set.display_name = subject_name
    new_set.save()  # the set is saved before it is handed back to the caller
    return new_set

# Helper function to read all of the jpegs in a directory that have the given prefix
def get_image_list(input_directory, prefix):
    """List the ``.jpg`` files in ``input_directory`` whose names start with ``prefix``.

    Args:
        input_directory: Directory to search (not searched recursively).
        prefix: Leading part of the file names to match.

    Returns:
        A tuple ``(file_list, n_files)``: the matching paths in sorted order
        and how many there are.
    """
    pattern = os.path.join(input_directory, prefix + '*.jpg')
    # glob returns matches in arbitrary order; sort so that zero-padded slice
    # numbers come back in z order for every caller.
    file_list = sorted(glob.glob(pattern))
    n_files = len(file_list)
    print(f"There are {n_files} jpg files in the directory with prefix {prefix}")
    return file_list, n_files


# Function to build a subject set from a fixed range of images
def build_subject_set(project, file_list, file_idx_start, file_idx_stop, span, step):
    """Build one subject for every usable centre index in a range of files.

    A centre index is usable only if the whole flipbook window
    (``span`` images on each side, ``step`` entries apart) lies inside
    ``file_list``; centres too close to either end are left out.

    Args:
        project: Project the subjects are linked to.
        file_list: Ordered sequence of image paths.
        file_idx_start: First centre index to consider (inclusive).
        file_idx_stop: Last centre index to consider (exclusive).
        span: Number of images on each side of the centre.
        step: Distance, in list entries, between consecutive images.

    Returns:
        List of the subjects returned by ``build_subject``, possibly empty.
    """
    print(f"Building subject set from files {file_idx_start}-{file_idx_stop-1}")
    # Keep a margin of span * step at both ends so that no window reaches
    # before the first or past the last file.
    margin = span * step
    min_idx = margin
    max_idx = len(file_list) - margin

    first_centre = max(min_idx, file_idx_start)
    stop_centre = min(max_idx, file_idx_stop)
    return [
        build_subject(project, file_list, centre_idx, span, step)
        for centre_idx in range(first_centre, stop_centre)
    ]



Connect to the Zooniverse using your credentials

In [None]:
# Prompts for the password, logs in as USERNAME (the function's default) and looks up the project;
# the returned project is reused by the upload cells below.
project = connect_to_zooniverse(project_id=PROJECT_ID)

To allow fine-control over the data that is active in the project, it is useful to split the data into multiple subject sets. If all data is uploaded at once, then the time taken to achieve retirement for any given image is very long. By uploading smaller subject sets it is possible to start getting completed subjects much earlier, allowing downstream data analysis to be developed early and refined as more completed data arrives. This requires more effort on the project owner's part as they must monitor the progress of the project and attach new subject sets to the project as others are completed.

A good strategy is to have chunks of sequential images making up each subject set, since contiguous 3D volumes are most useful for 3D machine learning strategies. However, the nature of some types of microscopy (e.g. FIB SEM) means that many of these nearby slices will look nearly identical. Attaching multiple small subject sets from distant parts of the sample prevents volunteers from being presented with nearly identical images in quick succession. 

The variable `subject_set_size` defines how many subjects are to be attached to each subject set. The minimum useful number of slices is around 20, since that should be compatible with most 3D machine learning approaches, but small subject sets are also more labour intensive for project owners to curate. 

In [None]:
# Number of subjects attached to each subject set; also the stride used when walking through the files
subject_set_size = 100  # typically around 100 should be OK

Check for files in your local data directory to upload

In [None]:
# Collect the jpegs that match PREFIX and put them in name order
file_list, n_files = get_image_list(DATA_DIR, PREFIX)
file_list.sort()
# File names without their directory part
file_basenames = list(map(os.path.basename, file_list))
# Everything before the first "_z" of each file name
file_basename_prefixes = [name.split('_z')[0] for name in file_basenames]

Note that we can only build subjects around slices that have enough slices above and below to fulfil the `SPAN` and `STEP` settings. For example, for `SPAN=2` and `STEP=10`, the first slice that we can build into a subject (as the central slice) is slice 20 (counting from 0), so the subject would contain slices [0, 10, 20, 30, 40].

The subject set name is built from `OUTPUT_PREFIX`, the `SPAN` and `STEP` settings, the raw data file prefix (which includes the x, y coordinates of the top-left pixel of the crop from the raw data), and the range of z-values in the subject set. For example, with `OUTPUT_PREFIX = "Subject_Prefix"`, `SPAN = 2` and `STEP = 1`, a raw file with a prefix `HeLa_cell2_Run1_Tile_*.tif`, starting at (x, y) = (2334, 0000) and containing z-slices from 20 to 119 will have the subject-set name: `Subject_Prefix_S2_1_HeLa_cell2_Run1_Tile_x2334_y0000_z0020-0119`

### Note: this can be a very slow process to upload!

In [None]:
# Walk through the sorted files in chunks of subject_set_size and upload each
# complete chunk as its own subject set.
# File names are expected to end in "_z<number>.jpg"; everything before the
# last "_z" (including any x/y part) is reused in the subject set name.
# TODO: Investigate speeding up the upload by using the "async_saves()" function in Panoptes
if not file_list:
    raise FileNotFoundError(f"No jpg files with prefix {PREFIX} were found in {DATA_DIR}")

# Parse the base name only, so a "_z" somewhere in the directory path cannot break the split
prefix, z_str = os.path.basename(file_list[0]).rsplit('_z', 1)
minimum_z = int(z_str.split('.jpg')[0])
# First index from which a full (2 * SPAN + 1) slice subject can be built.
# NOTE(review): this value is used both as a position in file_list and as a z value
# in the check below; the two only coincide when the first file is slice 0 - confirm.
starting_index = SPAN * STEP + minimum_z
found_subjects_to_upload = False
for counter, list_start_abs in enumerate(range(starting_index, starting_index + n_files, subject_set_size)):
    print(f"\n*******\nStep {counter}")
    if list_start_abs >= len(file_list):
        # The range above can run past the last file; stop instead of raising IndexError
        print(f"Stopping, index {list_start_abs} is beyond the last of the {len(file_list)} files")
        break
    list_start = list_start_abs - starting_index # Make sure we subtract the offset
    # Perform some text manipulation to extract the z value from the file name
    file_name = os.path.basename(file_list[list_start_abs])
    prefix, z_str = file_name.rsplit('_z', 1)
    z_start = z_str.split('.jpg')[0]  # kept as text so the zero padding survives in the set name
    z_start_value = int(z_start)
    z_end = z_start_value + subject_set_size - 1
    print(f"List start (abs): {list_start_abs}\n"
      f"List start: {list_start}\n"
      f"Z start: {z_start}\n"
      f"Filename: {file_name}\n"
      f"Start index: {starting_index}\n")
    if z_start_value < starting_index or z_end > (starting_index + n_files - SPAN*STEP):  # Skip the highest and lowest z slices
        print(f"Skipping, can't build {2*SPAN+1} slice subject from slice {z_start}")
        continue
    found_subjects_to_upload = True
    list_end_abs = list_start_abs + subject_set_size

    subject_set_name = f"{OUTPUT_PREFIX}_S{SPAN}_{STEP}_{prefix}_z{z_start}-{z_end:04d}"
    print(subject_set_name)
    try:
        subject_set = initialise_subject_set(project, subject_set_name)
    except Exception as e:
        # Catch Exception rather than everything, so Ctrl-C still interrupts a long upload,
        # and show the real error because a taken name is only the most likely cause.
        print(f"Subject set name {subject_set_name} appears to already be taken, skipping\n\n")
        print(f"Exception {e}")
        # TODO: output a logfile with skipped subject set names
        continue
    print(f"Creating subject set name {subject_set_name}\n\n")
    subjects = build_subject_set(project, file_list, list_start_abs, list_end_abs, SPAN, STEP)
    subject_set.add(subjects)

if not found_subjects_to_upload:
    print("No complete subject set could be built; check SPAN, STEP, subject_set_size and the number of files")
