In [2]:
#allow autoreload modules at every execution if requested
%load_ext autoreload

from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd

from cs_parser import CsParser, mappings
from star_parser import StarTabDf, StarParser

In [5]:
# Locate the CryoSPARC particle metadata (.cs) file and parse it.
work_folder = Path('/mnt/DATA/andrea/apoferritin')
particles_metadata_file = work_folder.joinpath("cryosparc_P105_J28_010_particles.cs")
# Stop right here if the file is missing, before any parsing is attempted.
assert particles_metadata_file.exists()

# `dirty_array` is whatever CsParser.parse_array() returns; later cells read
# its `cs_array` attribute.
parser = CsParser(particles_metadata_file)
dirty_array = parser.parse_array()

In [43]:
# Split the fields of the parsed array into two groups, because pandas does not
# like to have 2D / 3D arrays as column elements:
#   clean       -> scalar fields, loaded directly into the `flat` DataFrame
#   problematic -> sub-array (multidimensional) fields, to be unpacked separately
# NOTE(review): assumes `cs_array` is a numpy structured array — confirm in cs_parser.
a = dirty_array.cs_array
dtypes = a.dtype.fields
# Classify from the dtype instead of inspecting the first record: this also
# works when the array has no records, and `dtype.names` is the ordered tuple
# of field names (`dtype.fields` can additionally carry field titles as keys,
# which would misalign a positional lookup).
# NOTE(review): an object-dtype field whose elements are arrays would now be
# classified as clean — verify that no such field can occur in a .cs file.
field_names = a.dtype.names
problematic = [name for name in field_names if a.dtype[name].shape]
clean = [name for name in field_names if not a.dtype[name].shape]
index = np.arange(a.shape[0])
flat = pd.DataFrame(a[clean])
# get rid of some b'': decode the byte-string columns to str when they are present
for key in ['blob/path', 'ctf/type']:
    if key in flat.columns:
        flat[key] = flat[key].map(lambda raw: raw.decode('utf-8'))



Note: CryoSPARC's internal formats are not well documented; however, the following is known:

1 - blob/path indicates the path of the 3D mrc stack
2 - blob/idx is the z index of the given particle image in the stack

Angles are in radians.
Shifts are in Angstrom (A), not pixels; blob/psize_A contains the pixel size in A, which can be used to convert them to pixel shifts if necessary.

The angle convention is unclear, but the angles might be stored as Rodrigues coordinates.

There is a script that converts CryoSPARC data (.cs) to RELION (.star): /soft/pyem/pyem/csparc2star.py.
It often fails with a KeyError, hence I wrote my own parser.
However, when converting angles from .cs to RELION, pyem does the following:

(/soft/pyem/pyem/pyem/metadata.py, line 481)

log.debug("Converting Rodrigues coordinates to Euler angles")
df[star.Relion.ANGLES] = np.rad2deg(geom.rot2euler(geom.expmap(df[star.Relion.ANGLES].values)))

TODO: follow the pyem code for more information on the conventions.

More info on cs files and csg files:
https://guide.cryosparc.com/processing-data/tutorials-and-case-studies/manipulating-.cs-files-created-by-cryosparc
https://guide.cryosparc.com/processing-data/tutorials-and-case-studies/tutorial-data-management-in-cryosparc#use-case-manually-modify-cryosparc-outputs-and-metadata-for-continued-experimentation


The CryoSPARC discussion forum is also quite helpful; they might be able to answer your questions.


In [44]:
#we unpack each multidimensional field into single columns to be added back to our metadata
#we arbitrarily rename columns by adding their index to their name, i.e. anisomag_0, anisomag_1, etc.
#what is what, though, is left as an exercise for the reader ;-)
rascals = a[problematic]
index_column = flat.index
# NOTE(review): building the frame from the index itself seeds `reformed` with a
# column named 0 that holds the index values; kept as-is because cells that
# consume `reformed` are not visible here.
reformed = pd.DataFrame(index_column)
for key in problematic:
    #if the name of the column is poses, they are angles; otherwise they are shifts.
    #No angle/shift specific naming is applied: components are simply numbered.
    values = rascals[key]
    #flatten every axis after the record axis so that 2D and 3D fields are both
    #unpacked into one column per component (for 2D fields: key_0 .. key_{n-1})
    n_components = int(np.prod(values.shape[1:]))
    components = values.reshape(values.shape[0], n_components)
    for component in range(n_components):
        column_name = f'{key}_{component}'
        reformed[column_name] = components[:, component]

In [47]:
# Report the Python type held by each column, sampling the first row of megablob.
for column_name, sample in zip(megablob.columns, megablob.iloc[0, :]):
    print(column_name, type(sample))

uid <class 'numpy.uint64'>
blob/path <class 'str'>
blob/idx <class 'numpy.uint32'>
blob/psize_A <class 'numpy.float32'>
blob/sign <class 'numpy.float32'>
blob/import_sig <class 'numpy.uint64'>
ctf/type <class 'str'>
ctf/exp_group_id <class 'numpy.uint32'>
ctf/accel_kv <class 'numpy.float32'>
ctf/cs_mm <class 'numpy.float32'>
ctf/amp_contrast <class 'numpy.float32'>
ctf/df1_A <class 'numpy.float32'>
ctf/df2_A <class 'numpy.float32'>
ctf/df_angle_rad <class 'numpy.float32'>
ctf/phase_shift_rad <class 'numpy.float32'>
ctf/scale <class 'numpy.float32'>
ctf/scale_const <class 'numpy.float32'>
ctf/bfactor <class 'numpy.float32'>
alignments3D/split <class 'numpy.uint32'>
alignments3D/psize_A <class 'numpy.float32'>
alignments3D/error <class 'numpy.float32'>
alignments3D/error_min <class 'numpy.float32'>
alignments3D/resid_pow <class 'numpy.float32'>
alignments3D/slice_pow <class 'numpy.float32'>
alignments3D/image_pow <class 'numpy.float32'>
alignments3D/cross_cor <class 'numpy.float32'>
alig