In [1]:
from measure import get_database_interface
import yaml

In [2]:
import pandas as pd
from ftplib import *
from io import StringIO
import io

In [3]:
from tqdm.auto import tqdm

In [4]:
def get_df(iface, par_name):
    """Fetch one measurement plus fish metadata as a long-format DataFrame.

    The table is requested from ``iface.as_table`` for ``par_name`` and the
    metadata fields ``line``, ``id``, ``sex``, ``age`` and ``generation``.
    Rows whose ``par_name`` value is missing are discarded, the measurement
    column is exploded three times (unpacking up to three levels of nested
    list-like values), and the columns are renamed to ``measurement``,
    ``line_id``, ``fish_id``, ``fish_sex``, ``fish_age`` and
    ``fish_generation``.
    """
    source_columns = [par_name, 'line', 'id', 'sex', 'age', 'generation']
    target_columns = ['measurement', 'line_id', 'fish_id', 'fish_sex', 'fish_age', 'fish_generation']

    # Hand the interface its own copy of the field list.
    table = pd.DataFrame(iface.as_table(list(source_columns)))

    has_value = table[par_name].notna()
    table = table.loc[has_value]

    # Each pass unpacks one nesting level of the measurement values.
    for _ in range(3):
        table = table.explode(par_name)

    return table.rename(columns=dict(zip(source_columns, target_columns)))

In [5]:
# Read the 'db' section of the processing config, then build the database
# interface from it once the file has been closed.
with open('measurement_configs/processing/medaka_mongo.yaml') as f:
    db_config = yaml.safe_load(f)['db']
iface = get_database_interface(db_config)

## Adding meta-information to the database

**NB!** Make sure you fully understand what this section does before activating and executing its last cell.

In [6]:
SAMPLELIST_PATH = 'samplelist_complete.xlsx'
# Values of the 'Scanned stained' column that mark a sample as scanned.
SCANNED_LABELS = ['yes', 'Yes', 'Scanned', 'scanned']

# Parse the workbook once: with a list of positions, read_excel returns a dict
# of DataFrames keyed by sheet position.
scan_sheets = pd.read_excel(SAMPLELIST_PATH, sheet_name=[1, 2, 3, 4])
scan_201811 = scan_sheets[4]
scan_201905 = scan_sheets[3]
scan_201912 = scan_sheets[2]
scan_202012 = scan_sheets[1]

# One chained pipeline instead of repeated column assignment on filtered
# slices (which can raise SettingWithCopyWarning): every step returns a new frame.
all_scans = (
    pd.concat([scan_201811, scan_201905, scan_201912, scan_202012])
    # keep only samples that were actually scanned
    .loc[lambda t: t['Scanned stained'].isin(SCANNED_LABELS)]
    [['S.No', 'Line', 'Gen.', 'Sex', 'DOB', 'DOS']]
    # age in days between DOB and DOS; unparseable DOB values become NaN
    .assign(age=lambda t: (t['DOS'] - pd.to_datetime(t['DOB'], errors='coerce')).dt.days)
    .drop(columns=['DOS', 'DOB'])
    .rename(columns={'S.No': 'id', 'Line': 'line', 'Gen.': 'generation', 'Sex': 'sex'})
    # non-numeric sample numbers are dropped; the rest become integer-like strings
    .assign(id=lambda t: pd.to_numeric(t['id'], errors='coerce'))
    .dropna(subset=['id'])
    .assign(id=lambda t: t['id'].astype(int).astype(str))
)

In [7]:
# Ids that occur both in the database records and in the prepared sample list.
ids_in_database = {record['id'] for record in iface.as_table([])}
set_of_reliable_ids = ids_in_database & set(all_scans.id)

## Generating the dataframes with all measurements available to be saved

In [12]:
from collections import Counter
from itertools import chain

def _transform_db(in_db, measurement_id):
    new_db = in_db[['line', 'id', 'sex', 'age', 'generation', measurement_id]]
    new_db = new_db.rename({o:n for o,n in zip([measurement_id, 'line', 'id', 'sex', 'age', 'generation'], 
                                               ['measurement', 'line_id', 'fish_id', 'fish_sex', 'fish_age', 'fish_generation'])}, axis=1)
    return new_db

def _db_statistics(possible_fields, attempted, whole_db):
    measured = 0
    samples_in_db = []
    for measurement_key in tqdm(possible_fields):
        df = _transform_db(whole_db, measurement_key)
        samples_id_current = list(df.dropna(how='any', subset=['measurement']).fish_id.unique())
    #     print(measurement_key, len(samples_id_current))
        measured += df.dropna(how='any', subset=['measurement']).fish_id.nunique()
        samples_in_db.append(samples_id_current)

    print('on average samples measured per each metric: ', measured / len(possible_fields))
    samples_with_that_much_measurements = Counter(Counter(chain.from_iterable(samples_in_db)).values())

    samples_in_db = len(set.union(*[set(i) for i in samples_in_db]))
    print('in db', samples_in_db)

    mc_0 = attempted - samples_in_db
    mc_20 = sum([v for k,v in samples_with_that_much_measurements.items() if k < len(possible_fields)*0.2])
    mc_20_80 = sum([v for k,v in samples_with_that_much_measurements.items() if ((k > len(possible_fields)*0.2) 
                                                                                   and
                                                                                   (k < len(possible_fields)*0.8))])
    mc_80 = sum([v for k,v in samples_with_that_much_measurements.items() if ((k > len(possible_fields)*0.8) and k < len(possible_fields))])
    mc_100 = sum([v for k,v in samples_with_that_much_measurements.items() if k == len(possible_fields)])

    print('metrics coverage of 0/20/20-40/80/100:', f'{mc_0}/{mc_20}/{mc_20_80}/{mc_80}/{mc_100}')

import os
from glob import glob
import re

def _experiment_organ_statistics(experiment_addr, organ, scan_metainformation):
    """Print coverage and failure statistics of one organ for one experiment.

    Parameters
    ----------
    experiment_addr : str
        Directory whose entries named ``Medaka_<digits>_<suffix>`` define the
        attempted samples; the digits are used as the sample id.
    organ : str
        Metric prefix; every database field starting with ``organ + '.'`` is
        analysed.
    scan_metainformation : pandas.DataFrame
        Sample list with at least the columns ``Line`` and ``S.No``, used as
        the per-line reference count.

    The database records are read from the notebook-level ``iface``.
    Prints the experiment size, the output of ``_db_statistics``, the number
    of missing values per metric and the failure count/ratio per fish line.
    Returns ``None``.
    """
    addrs = glob(os.path.join(experiment_addr, 'Medaka_*_*'))
    # Parse the id from the entry name only, so a parent directory that happens
    # to look like 'Medaka_<digits>_' cannot be matched instead.
    ids = [re.findall(r'Medaka_(\d+)_', os.path.basename(addr))[0] for addr in addrs]

    attempted = len(addrs)
    print('experiment size: ', attempted)

    # Query the database once; the same records feed the table and the field discovery.
    records = list(iface.as_table([]))
    whole_db = pd.DataFrame(records)
    db_experiment = whole_db[whole_db.id.isin(ids)]

    organ_prefix = organ + '.'
    metadata_fields = {'line', 'id', 'sex', 'age', 'generation'}
    # Sorted list: deterministic order, usable directly as a column selector.
    fieldset = sorted({key for rec in records for key in rec.keys() if key.startswith(organ_prefix)}
                      - metadata_fields)
    if not fieldset:
        # Nothing to analyse (e.g. misspelled organ name); the statistics below need at least one metric.
        print(f"no '{organ_prefix}*' metrics found in the database")
        return

    _db_statistics(fieldset, attempted, db_experiment)

    print('Failures count per metric:')
    print(db_experiment[fieldset].isna().sum().sort_values(ascending=False))

    count_ids_db = db_experiment.dropna(how='all', subset=fieldset).groupby('line').id.count()
    count_ids_meta = scan_metainformation.groupby('Line')['S.No'].count()

    # A line without any successfully measured fish is absent from count_ids_db.
    # Count it as 0 successes so it shows up as a complete failure instead of NaN.
    # Lines known only to the database keep a NaN reference count.
    all_lines = count_ids_meta.index.union(count_ids_db.index)
    count_ids_db = count_ids_db.reindex(all_lines, fill_value=0)
    count_ids_meta = count_ids_meta.reindex(all_lines)

    line_errors = pd.DataFrame({'failure_ratio': 1 - count_ids_db / count_ids_meta,
                                'failure_count': count_ids_meta - count_ids_db})

    print('Failure counts and ratios per fish line:')
    print(line_errors.sort_values('failure_ratio', ascending=False))

In [13]:
# overall
# NOTE(review): 340+232 is hard-coded; presumably the sizes of two scan batches — TODO confirm and name them.
attempted = 340+232
# Query the database once; the same records feed the field discovery and the table.
all_records = list(iface.as_table([]))
possible_fields = ({key for rec in all_records for key in rec.keys()}
                   - {'line', 'id', 'sex', 'age', 'generation'})
whole_db = pd.DataFrame(all_records)
_db_statistics(possible_fields, attempted, whole_db)

  0%|          | 0/108 [00:00<?, ?it/s]

on average samples measured per each metric:  333.6111111111111
in db 538
metrics coverage of 0/20/20-40/80/100: 34/43/298/99/98


In [14]:
# Root directory of the 2018_11 experiment; passed below as `experiment_addr`,
# where its 'Medaka_*_*' entries are counted as attempted samples.
# NOTE(review): absolute, machine-specific mount path — confirm it is available before re-running.
addr_2018_11 = '/mnt/LSDF/projects/code-vita/Medaka/2018_11'

In [15]:
# Coverage and failure report for the 'eyes.*' metrics of the 2018_11 experiment.
_experiment_organ_statistics(addr_2018_11, 'eyes', scan_201811)

experiment size:  335


  0%|          | 0/29 [00:00<?, ?it/s]

on average samples measured per each metric:  316.37931034482756
in db 324
metrics coverage of 0/20/20-40/80/100: 11/0/0/135/189
Failures count per metric:
eyes.muscles.eccentricity_meridional     100
eyes.iris.eccentricity_meridional         83
eyes.lens.thickness_axial                 32
eyes.lens.radius_axial                     6
eyes.retina.eccentricity_meridional        5
eyes.muscles.distance_between_centers      1
eyes.nerve.volume                          1
eyes.lens.distance_between_centers         1
eyes.retina.color_average                  1
eyes.lens.surface_area                     1
eyes.retina.volume                         1
eyes.lens.volume                           1
eyes.lens.color_average                    1
eyes.lens.color_std                        1
eyes.muscles.color_average                 1
eyes.retina.color_std                      1
eyes.iris.color_average                    1
eyes.nerve.color_average                   1
eyes.iris.surface_area            

In [59]:
# Coverage and failure report for the 'brain.*' metrics of the 2018_11 experiment.
_experiment_organ_statistics(addr_2018_11, 'brain', scan_201811)

experiment size:  335


  0%|          | 0/49 [00:00<?, ?it/s]

on average samples measured per each metric:  198.75510204081633
in db 205
metrics coverage of 0/20/20-40/80/100: 130/0/6/78/121
Failures count per metric:
brain.optical_nerves.eccentricity_equatorial     200
brain.optical_nerves.eccentricity_meridional     200
brain.forebrain.eccentricity_meridional          144
brain.forebrain.eccentricity_equatorial          144
brain.cerebellum.eccentricity_meridional         139
brain.cerebellum.eccentricity_equatorial         139
brain.hindbrain.eccentricity_equatorial          135
brain.hindbrain.eccentricity_meridional          135
brain.midbrain.eccentricity_meridional           130
brain.midbrain.eccentricity_equatorial           130
brain.optical_tectum.eccentricity_equatorial     125
brain.optical_tectum.eccentricity_meridional     125
brain.cerebellum.surface_area                    120
brain.torus_longuthing.color_average             120
brain.midbrain.volume                            120
brain.epyphysis.color_average                    