In [1]:
from measure import get_database_interface
import yaml

In [2]:
import pandas as pd
from ftplib import *
from io import StringIO
import io

In [3]:
from tqdm.auto import tqdm

In [4]:
def get_df(iface, par_name):
    """Build a long-format dataframe of one measurement from the database.

    Requests ``par_name`` together with the per-fish metadata columns from
    ``iface.as_table``, drops rows where the measurement is missing, flattens
    the measurement column with three successive ``explode`` calls (so up to
    three levels of list nesting end up as one value per row) and renames the
    columns to ``measurement`` / ``line_id`` / ``fish_id`` / ``fish_sex`` /
    ``fish_age`` / ``fish_generation``.
    """
    source_columns = [par_name, 'line', 'id', 'sex', 'age', 'generation']
    target_columns = ['measurement', 'line_id', 'fish_id', 'fish_sex', 'fish_age', 'fish_generation']

    # Hand the interface its own copy of the column list.
    frame = pd.DataFrame(iface.as_table(list(source_columns)))
    frame = frame[frame[par_name].notna()]

    # Exploding a column of scalars is a no-op, so three passes are safe for
    # shallower nesting as well.
    for _ in range(3):
        frame = frame.explode(par_name)

    return frame.rename(columns=dict(zip(source_columns, target_columns)))

In [5]:
# Read the 'db' section of the processing config, then open the database
# interface from it once the file has been closed.
with open('measurement_configs/processing/medaka_mongo.yaml') as f:
    db_settings = yaml.safe_load(f)['db']
iface = get_database_interface(db_settings)

## Adding meta-information to the database

**NB!** Make sure you fully understand what you are doing before activating and executing the last cell of this section.

In [6]:
# Parse the workbook once; passing a list of sheet positions to `sheet_name`
# returns a dict keyed by those positions.
# NOTE(review): the mapping sheet 4 -> 2018-11, 3 -> 2019-05, 2 -> 2019-12,
# 1 -> 2020-12 is taken from the variable names — confirm against the workbook.
_sample_sheets = pd.read_excel('samplelist_complete.xlsx', sheet_name=[1, 2, 3, 4])
scan_201811 = _sample_sheets[4]
scan_201905 = _sample_sheets[3]
scan_201912 = _sample_sheets[2]
scan_202012 = _sample_sheets[1]

all_scans = pd.concat([scan_201811, scan_201905, scan_201912, scan_202012])

# Keep only samples marked as scanned (the sheets spell this in several ways).
all_scans = all_scans[all_scans['Scanned stained'].isin(['yes', 'Yes', 'Scanned', 'scanned'])]
# Explicit copy: the frame is modified below and must not be a view of a filtered slice.
all_scans = all_scans[['S.No', 'Line', 'Gen.', 'Sex', 'DOB', 'DOS']].copy()

# Age in days between 'DOB' and 'DOS'; unparsable 'DOB' entries become NaT and yield a missing age.
all_scans['age'] = (all_scans['DOS'] - pd.to_datetime(all_scans['DOB'], errors='coerce')).dt.days
all_scans = all_scans.drop(columns=['DOS', 'DOB'])

all_scans = all_scans.rename(columns={'S.No': 'id', 'Line': 'line', 'Gen.': 'generation', 'Sex': 'sex'})

# Ids must be numeric; rows with anything else in 'S.No' are dropped, and the
# remaining ids are stored as integer-formatted strings.
all_scans['id'] = pd.to_numeric(all_scans['id'], errors='coerce')
all_scans = all_scans[all_scans['id'].notna()].copy()
all_scans['id'] = all_scans['id'].astype(int).astype(str)

In [8]:
# Ids that appear both in the database and in the filtered sample list.
db_ids = {record['id'] for record in iface.as_table([])}
set_of_reliable_ids = db_ids & set(all_scans['id'])

## Generating the dataframes with all measurements available to be saved

In [9]:
from collections import Counter
from itertools import chain

def _transform_db(in_db, measurement_id):
    new_db = in_db[['line', 'id', 'sex', 'age', 'generation', measurement_id]]
    new_db = new_db.rename({o:n for o,n in zip([measurement_id, 'line', 'id', 'sex', 'age', 'generation'], 
                                               ['measurement', 'line_id', 'fish_id', 'fish_sex', 'fish_age', 'fish_generation'])}, axis=1)
    return new_db

def _db_statistics(possible_fields, attempted, whole_db):
    measured = 0
    samples_in_db = []
    for measurement_key in tqdm(possible_fields):
        df = _transform_db(whole_db, measurement_key)
        samples_id_current = list(df.dropna(how='any', subset=['measurement']).fish_id.unique())
        measured += df.dropna(how='any', subset=['measurement']).fish_id.nunique()
        samples_in_db.append(samples_id_current)

    print('on average samples measured per each metric: ', measured / len(possible_fields))
    samples_with_that_much_measurements = Counter(Counter(chain.from_iterable(samples_in_db)).values())

    samples_in_db = len(set.union(*[set(i) for i in samples_in_db]))
    print('in db', samples_in_db)

    mc_0 = attempted - samples_in_db
    mc_20 = sum([v for k,v in samples_with_that_much_measurements.items() if k < len(possible_fields)*0.2])
    mc_20_80 = sum([v for k,v in samples_with_that_much_measurements.items() if ((k > len(possible_fields)*0.2) 
                                                                                   and
                                                                                   (k < len(possible_fields)*0.8))])
    mc_80 = sum([v for k,v in samples_with_that_much_measurements.items() if ((k > len(possible_fields)*0.8) and k < len(possible_fields))])
    mc_100 = sum([v for k,v in samples_with_that_much_measurements.items() if k == len(possible_fields)])

    print('metrics coverage of 0/20/20-80/80/100:', f'{mc_0}/{mc_20}/{mc_20_80}/{mc_80}/{mc_100}')

import os
from glob import glob
import re

def _experiment_organ_statistics(experiment_addr, organ, scan_metainformation):
    """Print measurement-coverage statistics for one organ of one experiment.

    The samples of the experiment are the entries of ``experiment_addr`` whose
    name matches ``Medaka_*_*``; the digits after ``Medaka_`` are taken as the
    sample id. The whole table is read from the module-level ``iface`` and
    restricted to those ids. For every record key starting with ``organ + '.'``
    the function then prints

    * the overall coverage summary produced by ``_db_statistics``,
    * the number of missing values per metric,
    * per fish line, how many samples of the sample list have no value for
      any of the organ's metrics (count and ratio).

    Parameters
    ----------
    experiment_addr : str
        Directory that holds one ``Medaka_<id>_<...>`` entry per sample.
    organ : str
        Prefix of the metric names to analyse (the part before the first dot).
    scan_metainformation : pandas.DataFrame
        Sample list of the experiment with at least the columns ``Line`` and
        ``S.No``.
    """
    addrs = glob(os.path.join(experiment_addr, 'Medaka_*_*'))
    # Parse the id from the entry name only, so nothing in the parent path can
    # match. Entries without a numeric id are ignored for the id list but still
    # count towards the experiment size.
    id_matches = (re.search(r'Medaka_(\d+)_', os.path.basename(addr)) for addr in addrs)
    ids = [match.group(1) for match in id_matches if match is not None]

    attempted = len(addrs)
    print('experiment size: ', attempted)

    # Query the database once and reuse the records for both the table and the field names.
    records = list(iface.as_table([]))
    whole_db = pd.DataFrame(records)
    db_experiment = whole_db[whole_db.id.isin(ids)]

    organ_prefix = organ + '.'
    # Sorted list: deterministic order, and usable directly as a column selection.
    fields = sorted({key for rec in records for key in rec.keys() if key.startswith(organ_prefix)})

    _db_statistics(fields, attempted, db_experiment)

    print('Failures count per metric:')
    print(db_experiment[fields].isna().sum().sort_index(ascending=False))

    # Samples with at least one value for this organ, per fish line.
    count_ids_db = db_experiment.dropna(how='all', subset=fields).groupby('line').id.count()
    count_ids_meta = scan_metainformation.groupby('Line')['S.No'].count()
    # A line without any usable sample is absent from count_ids_db; give it an
    # explicit 0 so it is reported as a complete failure instead of NaN.
    all_lines = count_ids_meta.index.union(count_ids_db.index)
    count_ids_db = count_ids_db.reindex(all_lines, fill_value=0)

    line_errors = pd.DataFrame({'failure_ratio': 1 - count_ids_db / count_ids_meta,
                                'failure_count': count_ids_meta - count_ids_db})

    print('Failure counts and ratios per fish line:')
    print(line_errors.sort_values('failure_ratio', ascending=False))

In [10]:
# overall
# NOTE(review): the four summands are presumably the per-beamtime numbers of
# attempted samples — TODO confirm. They do not match the experiment sizes
# printed by the per-experiment cells below (262 / 173 / 237).
attempted = 340+232+169+182

# Query the database once and reuse the records for the field names and the table.
all_records = list(iface.as_table([]))
possible_fields = ({key for rec in all_records for key in rec.keys()}
                   - {'line', 'id', 'sex', 'age', 'generation'})
whole_db = pd.DataFrame(all_records)
_db_statistics(possible_fields, attempted, whole_db)

  0%|          | 0/372 [00:00<?, ?it/s]

on average samples measured per each metric:  463.48118279569894
in db 854
metrics coverage of 0/20/20-80/80/100: 69/140/390/289/35


In [11]:
# Data directories of the beamtime experiments, in chronological order.
# NOTE(review): absolute mount points — these only resolve on a machine where
# the LSDF shares are mounted at the same locations.
addr_2018_11 = '/mnt/LSDF/projects/code-vita/Medaka/2018_11'
addr_2019_05 = '/mnt/HD-LSDF/Medaka/201905_beamtime_medaka_stained'
addr_2019_12 = '/mnt/HD-LSDF/Medaka/201912_beamtime_medaka'
addr_2020_12 = '/mnt/HD-LSDF/Medaka/202012_beamtime_medaka'

## 2020_12

In [12]:
# Use the full option name: the bare "max_rows" pattern is ambiguous in pandas
# versions that also define styler.render.max_rows.
pd.set_option("display.max_rows", None) # show all rows
_experiment_organ_statistics(addr_2020_12, 'eyes', scan_202012)

experiment size:  262


  0%|          | 0/89 [00:00<?, ?it/s]

on average samples measured per each metric:  121.41573033707866
in db 138
metrics coverage of 0/20/20-80/80/100: 124/0/27/109/2
Failures count per metric:
eyes.retina.volume                         8
eyes.retina.surface_area                   8
eyes.retina.radius_minimal_sphere         16
eyes.retina.eccentricity_meridional       16
eyes.retina.distance_between_centers       8
eyes.retina.convex_volume                 16
eyes.retina.color_std_eroded               8
eyes.retina.color_std_dilated              8
eyes.retina.color_std                      8
eyes.retina.color_perc_99_eroded           8
eyes.retina.color_perc_99_dilated         16
eyes.retina.color_perc_99                 16
eyes.retina.color_perc_1_eroded            8
eyes.retina.color_perc_1_dilated          16
eyes.retina.color_perc_1                  16
eyes.retina.color_median_eroded            8
eyes.retina.color_median_dilated           8
eyes.retina.color_median                   8
eyes.retina.color_mean_eroded     

In [13]:
# Use the full option name: the bare "max_rows" pattern is ambiguous in pandas
# versions that also define styler.render.max_rows.
pd.set_option("display.max_rows", None) # show all rows
_experiment_organ_statistics(addr_2020_12, 'brain', scan_202012)

experiment size:  262


  0%|          | 0/173 [00:00<?, ?it/s]

on average samples measured per each metric:  87.5028901734104
in db 103
metrics coverage of 0/20/20-80/80/100: 159/0/36/66/1
Failures count per metric:
brain.torus_longuthing.volume                     43
brain.torus_longuthing.surface_area               43
brain.torus_longuthing.radius_minimal_sphere      89
brain.torus_longuthing.convex_volume              89
brain.torus_longuthing.color_std_dilated          43
brain.torus_longuthing.color_std                  43
brain.torus_longuthing.color_perc_99_dilated      89
brain.torus_longuthing.color_perc_99              89
brain.torus_longuthing.color_perc_1_dilated       89
brain.torus_longuthing.color_perc_1               89
brain.torus_longuthing.color_median_dilated       43
brain.torus_longuthing.color_median               43
brain.torus_longuthing.color_mean_dilated         43
brain.torus_longuthing.color_mean                 43
brain.torus_longuthing.color_average             144
brain.optical_tectum.volume                       43

In [14]:
# Use the full option name: the bare "max_rows" pattern is ambiguous in pandas
# versions that also define styler.render.max_rows.
pd.set_option("display.max_rows", None) # show all rows
_experiment_organ_statistics(addr_2020_12, 'heartkidney', scan_202012)

experiment size:  262


  0%|          | 0/88 [00:00<?, ?it/s]

on average samples measured per each metric:  91.9090909090909
in db 110
metrics coverage of 0/20/20-80/80/100: 152/0/48/60/2
Failures count per metric:
heartkidney.ventricle.volume                         36
heartkidney.ventricle.surface_area                   36
heartkidney.ventricle.radius_minimal_sphere          73
heartkidney.ventricle.eccentricity_meridional        73
heartkidney.ventricle.eccentricity_equatorial        73
heartkidney.ventricle.convex_volume                  73
heartkidney.ventricle.color_std_eroded               36
heartkidney.ventricle.color_std_dilated              36
heartkidney.ventricle.color_std                      36
heartkidney.ventricle.color_perc_99_eroded           36
heartkidney.ventricle.color_perc_99_dilated          73
heartkidney.ventricle.color_perc_99                  73
heartkidney.ventricle.color_perc_1_eroded            36
heartkidney.ventricle.color_perc_1_dilated           73
heartkidney.ventricle.color_perc_1                   73
heartki

In [15]:
# Use the full option name: the bare "max_rows" pattern is ambiguous in pandas
# versions that also define styler.render.max_rows.
pd.set_option("display.max_rows", None) # show all rows
_experiment_organ_statistics(addr_2020_12, 'liver', scan_202012)

experiment size:  262


  0%|          | 0/22 [00:00<?, ?it/s]

on average samples measured per each metric:  114.68181818181819
in db 125
metrics coverage of 0/20/20-80/80/100: 137/0/13/110/2
Failures count per metric:
liver.liver.volume                      21
liver.liver.surface_area                21
liver.liver.radius_minimal_sphere       34
liver.liver.eccentricity_meridional     34
liver.liver.eccentricity_equatorial     34
liver.liver.convex_volume               34
liver.liver.color_std_eroded            21
liver.liver.color_std_dilated           21
liver.liver.color_std                   21
liver.liver.color_perc_99_eroded        21
liver.liver.color_perc_99_dilated       34
liver.liver.color_perc_99               34
liver.liver.color_perc_1_eroded         21
liver.liver.color_perc_1_dilated        34
liver.liver.color_perc_1                34
liver.liver.color_median_eroded         21
liver.liver.color_median_dilated        21
liver.liver.color_median                21
liver.liver.color_mean_eroded           21
liver.liver.color_mean_dila

## 2019_05

In [16]:
# Use the full option name: the bare "max_rows" pattern is ambiguous in pandas
# versions that also define styler.render.max_rows.
pd.set_option("display.max_rows", None) # show all rows
_experiment_organ_statistics(addr_2019_05, 'eyes', scan_201905)

experiment size:  173


  0%|          | 0/89 [00:00<?, ?it/s]

on average samples measured per each metric:  150.73033707865167
in db 166
metrics coverage of 0/20/20-80/80/100: 7/0/20/145/1
Failures count per metric:
eyes.retina.volume                         0
eyes.retina.surface_area                   0
eyes.retina.radius_minimal_sphere          4
eyes.retina.eccentricity_meridional        4
eyes.retina.distance_between_centers       0
eyes.retina.convex_volume                  4
eyes.retina.color_std_eroded               0
eyes.retina.color_std_dilated              0
eyes.retina.color_std                      0
eyes.retina.color_perc_99_eroded           0
eyes.retina.color_perc_99_dilated          4
eyes.retina.color_perc_99                  4
eyes.retina.color_perc_1_eroded            0
eyes.retina.color_perc_1_dilated           4
eyes.retina.color_perc_1                   4
eyes.retina.color_median_eroded            0
eyes.retina.color_median_dilated           0
eyes.retina.color_median                   0
eyes.retina.color_mean_eroded       

In [17]:
# Use the full option name: the bare "max_rows" pattern is ambiguous in pandas
# versions that also define styler.render.max_rows.
pd.set_option("display.max_rows", None) # show all rows
_experiment_organ_statistics(addr_2019_05, 'brain', scan_201905)

experiment size:  173


  0%|          | 0/173 [00:00<?, ?it/s]

on average samples measured per each metric:  98.09248554913295
in db 113
metrics coverage of 0/20/20-80/80/100: 60/0/17/96/0
Failures count per metric:
brain.torus_longuthing.volume                     53
brain.torus_longuthing.surface_area               53
brain.torus_longuthing.radius_minimal_sphere      87
brain.torus_longuthing.convex_volume              87
brain.torus_longuthing.color_std_dilated          55
brain.torus_longuthing.color_std                  53
brain.torus_longuthing.color_perc_99_dilated      87
brain.torus_longuthing.color_perc_99              87
brain.torus_longuthing.color_perc_1_dilated       87
brain.torus_longuthing.color_perc_1               87
brain.torus_longuthing.color_median_dilated       55
brain.torus_longuthing.color_median               55
brain.torus_longuthing.color_mean_dilated         55
brain.torus_longuthing.color_mean                 55
brain.torus_longuthing.color_average             164
brain.optical_tectum.volume                       53

In [18]:
# Use the full option name: the bare "max_rows" pattern is ambiguous in pandas
# versions that also define styler.render.max_rows.
pd.set_option("display.max_rows", None) # show all rows
_experiment_organ_statistics(addr_2019_05, 'heartkidney', scan_201905)

experiment size:  173


  0%|          | 0/88 [00:00<?, ?it/s]

on average samples measured per each metric:  137.0681818181818
in db 151
metrics coverage of 0/20/20-80/80/100: 22/0/18/133/0
Failures count per metric:
heartkidney.ventricle.volume                         15
heartkidney.ventricle.surface_area                   15
heartkidney.ventricle.radius_minimal_sphere          33
heartkidney.ventricle.eccentricity_meridional        32
heartkidney.ventricle.eccentricity_equatorial        32
heartkidney.ventricle.convex_volume                  33
heartkidney.ventricle.color_std_eroded               18
heartkidney.ventricle.color_std_dilated              18
heartkidney.ventricle.color_std                      15
heartkidney.ventricle.color_perc_99_eroded           18
heartkidney.ventricle.color_perc_99_dilated          33
heartkidney.ventricle.color_perc_99                  33
heartkidney.ventricle.color_perc_1_eroded            18
heartkidney.ventricle.color_perc_1_dilated           33
heartkidney.ventricle.color_perc_1                   33
heartk

In [19]:
# Use the full option name: the bare "max_rows" pattern is ambiguous in pandas
# versions that also define styler.render.max_rows.
pd.set_option("display.max_rows", None) # show all rows
_experiment_organ_statistics(addr_2019_05, 'liver', scan_201905)

experiment size:  173


  0%|          | 0/22 [00:00<?, ?it/s]

on average samples measured per each metric:  151.22727272727272
in db 159
metrics coverage of 0/20/20-80/80/100: 14/0/2/153/4
Failures count per metric:
liver.liver.volume                       7
liver.liver.surface_area                 7
liver.liver.radius_minimal_sphere        9
liver.liver.eccentricity_meridional      9
liver.liver.eccentricity_equatorial      9
liver.liver.convex_volume                9
liver.liver.color_std_eroded             7
liver.liver.color_std_dilated            7
liver.liver.color_std                    7
liver.liver.color_perc_99_eroded         7
liver.liver.color_perc_99_dilated        9
liver.liver.color_perc_99                9
liver.liver.color_perc_1_eroded          7
liver.liver.color_perc_1_dilated         9
liver.liver.color_perc_1                 9
liver.liver.color_median_eroded          7
liver.liver.color_median_dilated         7
liver.liver.color_median                 7
liver.liver.color_mean_eroded            7
liver.liver.color_mean_dilate

## 2019_12

In [20]:
# Use the full option name: the bare "max_rows" pattern is ambiguous in pandas
# versions that also define styler.render.max_rows.
pd.set_option("display.max_rows", None) # show all rows
_experiment_organ_statistics(addr_2019_12, 'eyes', scan_201912)

experiment size:  237


  0%|          | 0/89 [00:00<?, ?it/s]

on average samples measured per each metric:  205.1573033707865
in db 216
metrics coverage of 0/20/20-80/80/100: 21/0/7/88/121
Failures count per metric:
eyes.retina.volume                        7
eyes.retina.surface_area                  7
eyes.retina.radius_minimal_sphere        13
eyes.retina.eccentricity_meridional      13
eyes.retina.distance_between_centers      7
eyes.retina.convex_volume                13
eyes.retina.color_std_eroded              9
eyes.retina.color_std_dilated             9
eyes.retina.color_std                     7
eyes.retina.color_perc_99_eroded          9
eyes.retina.color_perc_99_dilated        13
eyes.retina.color_perc_99                13
eyes.retina.color_perc_1_eroded           9
eyes.retina.color_perc_1_dilated         13
eyes.retina.color_perc_1                 13
eyes.retina.color_median_eroded           9
eyes.retina.color_median_dilated          9
eyes.retina.color_median                  9
eyes.retina.color_mean_eroded             9
eyes.retin

In [21]:
# Use the full option name: the bare "max_rows" pattern is ambiguous in pandas
# versions that also define styler.render.max_rows.
pd.set_option("display.max_rows", None) # show all rows
_experiment_organ_statistics(addr_2019_12, 'brain', scan_201912)

experiment size:  237


  0%|          | 0/173 [00:00<?, ?it/s]

on average samples measured per each metric:  152.26589595375722
in db 172
metrics coverage of 0/20/20-80/80/100: 65/0/20/116/36
Failures count per metric:
brain.torus_longuthing.volume                     51
brain.torus_longuthing.surface_area               51
brain.torus_longuthing.radius_minimal_sphere     116
brain.torus_longuthing.convex_volume             116
brain.torus_longuthing.color_std_dilated          59
brain.torus_longuthing.color_std                  51
brain.torus_longuthing.color_perc_99_dilated     116
brain.torus_longuthing.color_perc_99             116
brain.torus_longuthing.color_perc_1_dilated      116
brain.torus_longuthing.color_perc_1              116
brain.torus_longuthing.color_median_dilated       59
brain.torus_longuthing.color_median               59
brain.torus_longuthing.color_mean_dilated         59
brain.torus_longuthing.color_mean                 59
brain.torus_longuthing.color_average              53
brain.optical_tectum.volume                      

In [22]:
# Use the full option name: the bare "max_rows" pattern is ambiguous in pandas
# versions that also define styler.render.max_rows.
pd.set_option("display.max_rows", None) # show all rows
_experiment_organ_statistics(addr_2019_12, 'heartkidney', scan_201912)

experiment size:  237


  0%|          | 0/88 [00:00<?, ?it/s]

on average samples measured per each metric:  173.9318181818182
in db 194
metrics coverage of 0/20/20-80/80/100: 43/3/27/63/101
Failures count per metric:
heartkidney.ventricle.volume                        29
heartkidney.ventricle.surface_area                  29
heartkidney.ventricle.radius_minimal_sphere         71
heartkidney.ventricle.eccentricity_meridional       68
heartkidney.ventricle.eccentricity_equatorial       68
heartkidney.ventricle.convex_volume                 71
heartkidney.ventricle.color_std_eroded              34
heartkidney.ventricle.color_std_dilated             34
heartkidney.ventricle.color_std                     29
heartkidney.ventricle.color_perc_99_eroded          34
heartkidney.ventricle.color_perc_99_dilated         71
heartkidney.ventricle.color_perc_99                 71
heartkidney.ventricle.color_perc_1_eroded           34
heartkidney.ventricle.color_perc_1_dilated          71
heartkidney.ventricle.color_perc_1                  71
heartkidney.ventricl

In [23]:
# Use the full option name: the bare "max_rows" pattern is ambiguous in pandas
# versions that also define styler.render.max_rows.
pd.set_option("display.max_rows", None) # show all rows
_experiment_organ_statistics(addr_2019_12, 'liver', scan_201912)

experiment size:  237


  0%|          | 0/22 [00:00<?, ?it/s]

on average samples measured per each metric:  198.4090909090909
in db 204
metrics coverage of 0/20/20-80/80/100: 33/0/18/1/185
Failures count per metric:
liver.liver.volume                     19
liver.liver.surface_area               19
liver.liver.radius_minimal_sphere      37
liver.liver.eccentricity_meridional    21
liver.liver.eccentricity_equatorial    21
liver.liver.convex_volume              37
liver.liver.color_std_eroded           20
liver.liver.color_std_dilated          20
liver.liver.color_std                  19
liver.liver.color_perc_99_eroded       20
liver.liver.color_perc_99_dilated      37
liver.liver.color_perc_99              37
liver.liver.color_perc_1_eroded        20
liver.liver.color_perc_1_dilated       37
liver.liver.color_perc_1               37
liver.liver.color_median_eroded        20
liver.liver.color_median_dilated       20
liver.liver.color_median               20
liver.liver.color_mean_eroded          20
liver.liver.color_mean_dilated         20
liver.