This file is part of MADIP: Molecular Atlas Data Integration Pipeline

This file cleans metadata

Copyright 2021 Blue Brain Project / EPFL 

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
   

In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

import re

import pickle as pkl

import scipy.stats as st

from matplotlib.cbook import boxplot_stats

import timeit
from collections import Counter

from numpy.random import seed
from numpy.random import randint
from scipy.stats import ks_2samp
from scipy.stats import levene
from scipy.stats import kruskal

import scikit_posthocs as sp

In [2]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

%matplotlib inline
%config InlineBackend.figure_format = 'retina'



sns.set_style('whitegrid',{'axes.grid':False})

In [3]:
# Load the frame pickled at the path below and report its row count.
input_path = '../data/3_df_with_conc_PerSampleNorm.pkl'
with open(input_path, 'rb') as handle:
    df = pkl.load(handle)
print(len(df))


2131942


In [4]:
# Show rows whose 'gene_id_final' contains a ';' (presumably several ids joined in one field).
# regex=False matches the literal character; na=False makes missing ids count as
# "no match", so the mask stays purely boolean and .loc cannot fail on NaN entries.
has_semicolon = df['gene_id_final'].str.contains(';', regex=False, na=False)
df.loc[has_semicolon]

Unnamed: 0,gene_names,Uniprot,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units,gene_name_unified,Uniprot_unified,gene_id_final,log_raw_data,uniprot_from_gn,Uniprot_final,TheorPepNum,conc_uM,log_conc_uM,copyNum,totalProtein,totalVolume,sample_full_id


In [4]:
# Display the column labels of the loaded frame.
df.columns

Index(['gene_names', 'Uniprot', 'Study', 'Organism', 'location', 'Age_cat',
       'Age_days', 'condition', 'sample_id', 'molecular_weight_kDa',
       'raw_data', 'raw_data_units', 'gene_name_unified', 'Uniprot_unified',
       'gene_id_final', 'log_raw_data', 'uniprot_from_gn', 'Uniprot_final',
       'TheorPepNum', 'conc_uM', 'log_conc_uM', 'copyNum', 'totalProtein',
       'totalVolume', 'sample_full_id'],
      dtype='object')

### Clean metadata

In [5]:
# Studies that have at least one row with a missing 'Organism' value.
organism_missing = df['Organism'].isna()
df.loc[organism_missing, 'Study'].unique()

array([], dtype=object)

In [6]:
# Studies that have at least one row with a missing 'location' value.
location_missing = df['location'].isna()
df.loc[location_missing, 'Study'].unique()

array([], dtype=object)

In [7]:
# Studies that have at least one row with a missing 'Age_days' value.
age_days_missing = df['Age_days'].isna()
df.loc[age_days_missing, 'Study'].unique()

array([], dtype=object)

In [8]:
# Studies that have at least one row with a missing 'Age_cat' value.
age_cat_missing = df['Age_cat'].isna()
df.loc[age_cat_missing, 'Study'].unique()

array(['Hamezah 2019', 'Krogager 2018', 'Sharma 2015, cultured',
       'Geiger 2013', 'Hosp 2017, soluble', 'Hosp 2017, insoluble',
       'Sharma 2015, isolated', 'Wisniewski 2015', 'Kjell 2020',
       'Han 2014', 'Bai 2020', 'Hamezah 2018', 'Hasan 2020', 'Zhu 2018',
       'Hosp 2017, CSF', 'Fecher 2019', 'Carlyle 2017', 'Duda 2018'],
      dtype=object)

In [9]:
# Distinct 'Age_cat' labels currently present (NaN included).
age_cat_labels = df['Age_cat'].unique()
age_cat_labels

array([nan, 'post-mortem', 'adult', 'embr'], dtype=object)

In [10]:
# Age_cat definitions for mouse (C57BL/6J):

# embr: before birth
# infants/young: from birth to 21 days of age
# juvenile/young: from 3 weeks to 8 weeks old
# adults: after 2 months (here considered incl., according to the data source papers)

# mature adult: 3-6 months
# middle-aged: 10-14 months
# old: 18-24 months

# Sources:
# https://www.jax.org/news-and-insights/jax-blog/2017/november/when-are-mice-considered-old#
# https://www.researchgate.net/post/At_what_age_are_laboratory_mice_considered_adult2

# or according to the sources where explicitly stated

In [11]:
# Studies that already have an 'Age_cat' label on at least one row.
age_cat_present = df['Age_cat'].notna()
df.loc[age_cat_present, 'Study'].unique()

array(['Davis 2019', 'Guergues 2019', 'Chuang 2018', 'Itzhak 2017',
       'McKetney 2019', 'Fornasiero 2018', 'Beltran 2016'], dtype=object)

In [12]:
# Hamezah 2019: "Five-month-old mice were divided into four groups ... for a duration
# of 10 months" -> 15 months (15*30 days).
hamezah_2019 = df['Study'] == 'Hamezah 2019'
print(df.loc[hamezah_2019, 'Age_days'].unique())

# Category chosen per
# https://www.jax.org/news-and-insights/jax-blog/2017/november/when-are-mice-considered-old#
df.loc[hamezah_2019, 'Age_cat'] = 'middle-aged'


[476]


In [13]:
# Sharma 2015 (isolated): list the ages present, then label each age group.
sharma_isolated = df['Study'] == 'Sharma 2015, isolated'
print(df.loc[sharma_isolated, 'Age_days'].unique())

# Age_days -> Age_cat. Per the original notes, Age_days is the postnatal day + 21.
# Open question kept from the original notes for the 'young' groups:
# 'cultured cells? or isolated?'
sharma_isolated_age_cat = {
    29.0: 'young',  # 'P8'  (8 + 21)
    81.0: 'adult',  # 'P60' (60 + 21)
    26.0: 'young',  # 'P5'  (5 + 21)
    35.0: 'young',  # 'P14' (14 + 21)
    45.0: 'young',  # 'P24' (24 + 21)
}
for age_days, age_cat in sharma_isolated_age_cat.items():
    df.loc[sharma_isolated & (df['Age_days'] == age_days), 'Age_cat'] = age_cat


[29.0 81.0 26.0 35.0 45.0]


In [14]:
# Sharma 2015 (cultured): show the ages present; every row of the study is labelled 'embr'.
sharma_cultured = df['Study'] == 'Sharma 2015, cultured'
print(df.loc[sharma_cultured, 'Age_days'].unique())

df.loc[sharma_cultured, 'Age_cat'] = 'embr'

[0]


In [15]:
# Hosp 2017 (soluble fraction): show the ages present, then label each age group.
hosp_soluble = df['Study'] == 'Hosp 2017, soluble'
print(df.loc[hosp_soluble, 'Age_days'].unique())

hosp_soluble_age_cat = {56.0: 'young', 77.0: 'adult', 105.0: 'adult'}
for age_days, age_cat in hosp_soluble_age_cat.items():
    df.loc[hosp_soluble & (df['Age_days'] == age_days), 'Age_cat'] = age_cat

[56.0 77.0 105.0]


In [16]:
# Hosp 2017 (insoluble fraction): show the ages present, then label each age group.
hosp_insoluble = df['Study'] == 'Hosp 2017, insoluble'
print(df.loc[hosp_insoluble, 'Age_days'].unique())

hosp_insoluble_age_cat = {56.0: 'young', 77.0: 'adult', 105.0: 'adult'}
for age_days, age_cat in hosp_insoluble_age_cat.items():
    df.loc[hosp_insoluble & (df['Age_days'] == age_days), 'Age_cat'] = age_cat

[56.0 77.0 105.0]


In [17]:
# Hosp 2017 (CSF): show the ages present, then label each age group.
hosp_csf = df['Study'] == 'Hosp 2017, CSF'
print(df.loc[hosp_csf, 'Age_days'].unique())

hosp_csf_age_cat = {56.0: 'young', 77.0: 'adult', 105.0: 'adult'}
for age_days, age_cat in hosp_csf_age_cat.items():
    df.loc[hosp_csf & (df['Age_days'] == age_days), 'Age_cat'] = age_cat

[56.0 77.0 105.0]


In [18]:
# Geiger 2013: show the ages present; every row of the study is labelled 'adult'.
geiger_2013 = df['Study'] == 'Geiger 2013'
print(df.loc[geiger_2013, 'Age_days'].unique())

df.loc[geiger_2013, 'Age_cat'] = 'adult'

[91]


In [19]:
# Wisniewski 2015: show the ages present; every row of the study is labelled 'adult'.
wisniewski_2015 = df['Study'] == 'Wisniewski 2015'
print(df.loc[wisniewski_2015, 'Age_days'].unique())

df.loc[wisniewski_2015, 'Age_cat'] = 'adult'

[91]


In [20]:
# Han 2014: show the ages present; every row of the study is labelled 'embr'.
han_2014 = df['Study'] == 'Han 2014'
print(df.loc[han_2014, 'Age_days'].unique())

df.loc[han_2014, 'Age_cat'] = 'embr'

[0]


In [21]:
# Kjell 2020: show the ages present; every row of the study is labelled 'adult'.
kjell_2020 = df['Study'] == 'Kjell 2020'
print(df.loc[kjell_2020, 'Age_days'].unique())

df.loc[kjell_2020, 'Age_cat'] = 'adult'

[84]


In [22]:
# Krogager 2018: show the ages present; every row of the study is labelled 'adult'.
krogager_2018 = df['Study'] == 'Krogager 2018'
print(df.loc[krogager_2018, 'Age_days'].unique())

df.loc[krogager_2018, 'Age_cat'] = 'adult'

[112]


In [23]:
hamezah_2018 = df['Study'] == 'Hamezah 2018'
print(df.loc[hamezah_2018, 'Age_days'].unique())

# Hamezah 2018 used rats, whose lifespan differs from the mouse. Categories are set by
# correspondence to the mouse scale, using the rat lifespan reference
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3733029/
hamezah_2018_age_cat = {
    446.0: 'adult',         # '14 months': 365 + 2*30 + 21
    566.0: 'mature adult',  # '18 months': 365 + 365/2 + 21 (original note says 569; the value matched is 566)
    721.0: 'middle-aged',   # '23 months': 365*2 - 30 + 21
    841.0: 'old',           # '27 months': 365*2 + 30*3 + 21
}
for age_days, age_cat in hamezah_2018_age_cat.items():
    df.loc[hamezah_2018 & (df['Age_days'] == age_days), 'Age_cat'] = age_cat



[446.0 566.0 721.0 841.0]


In [24]:
# Duda 2018: show the ages present, then label each age group.
duda_2018 = df['Study'] == 'Duda 2018'
print(df.loc[duda_2018, 'Age_days'].unique())

duda_2018_age_cat = {51.0: 'young', 386.0: 'middle-aged'}
for age_days, age_cat in duda_2018_age_cat.items():
    df.loc[duda_2018 & (df['Age_days'] == age_days), 'Age_cat'] = age_cat

[51.0 386.0]


In [25]:
carlyle_2017 = df['Study'] == 'Carlyle 2017'
print(df.loc[carlyle_2017, 'Age_days'].unique())

# Carlyle 2017 is a human study, so human life-stage labels are used here.
carlyle_2017_age_cat = {
    386.0: 'child',
    631.0: 'child',
    996.0: 'child',
    1361.0: 'child',
    1726.0: 'child',
    3186.0: 'child',
    4281.0: 'adolescence',
    5741.0: 'adolescence',
    7201.0: 'adult',
    8661.0: 'adult',
    11216.0: 'adult',
    13406.0: 'adult',
    13771.0: 'adult',
    14866.0: 'adult',
}
for age_days, age_cat in carlyle_2017_age_cat.items():
    df.loc[carlyle_2017 & (df['Age_days'] == age_days), 'Age_cat'] = age_cat

[1726.0 5741.0 386.0 631.0 13771.0 11216.0 7201.0 14866.0 8661.0 3186.0
 996.0 13406.0 1361.0 4281.0]


In [26]:
zhu_2018 = df['Study'] == 'Zhu 2018'
print(df.loc[zhu_2018, 'Age_days'].unique())

# Original note: rat P17. Every row of the study is labelled 'young'.
df.loc[zhu_2018, 'Age_cat'] = 'young'

[38]


In [27]:
# Fecher 2019 (noted as adult in the original): show the ages present,
# then label every row of the study 'adult'.
fecher_2019 = df['Study'] == 'Fecher 2019'
print(df.loc[fecher_2019, 'Age_days'].unique())

df.loc[fecher_2019, 'Age_cat'] = 'adult'

[77]


In [28]:
# Bai 2020: show the ages present, then label the numeric age groups.
bai_2020 = df['Study'] == 'Bai 2020'
print(df.loc[bai_2020, 'Age_days'].unique())

bai_2020_age_cat = {
    111.0: 'adult',         # 3 months
    201.0: 'mature adult',  # 6 months
    386.0: 'middle-aged',   # 12 months
}
for age_days, age_cat in bai_2020_age_cat.items():
    df.loc[bai_2020 & (df['Age_days'] == age_days), 'Age_cat'] = age_cat

[111.0 201.0 386.0 'post-mortem']


In [29]:
# Bai 2020 rows whose 'Age_days' holds the string 'post-mortem' get that same label as category.
bai_post_mortem = (df['Study'] == 'Bai 2020') & (df['Age_days'] == 'post-mortem')
df.loc[bai_post_mortem, 'Age_cat'] = 'post-mortem'

In [30]:
# Hasan 2020: show the ages present; every row of the study is labelled 'mature adult'.
hasan_2020 = df['Study'] == 'Hasan 2020'
print(df.loc[hasan_2020, 'Age_days'].unique())

df.loc[hasan_2020, 'Age_cat'] = 'mature adult'

[156]


In [31]:
# Studies that contain rows with condition 'AD_severe'.
is_ad_severe = df['condition'] == 'AD_severe'
df.loc[is_ad_severe, 'Study'].unique()

array(['McKetney 2019'], dtype=object)

In [32]:
# Distinct 'condition' labels present (NaN included).
condition_labels = df['condition'].unique()
condition_labels

array(['WT', 'Alzheimer', 'control', 'SORT', nan, 'AD',
       'LPC: low pathology of plaques and tangles. AD',
       'HPC: high Ab pathology but no detectable cognitive defects. AD',
       'MCI: mild cognitive impairment with Ab pathology and a slight but measurable defect in cognition. AD',
       'AD: late-stage AD with high pathology scores of plaques and tangles',
       'PSP: progressive supranuclear palsy, another neurodegenerative disorder of tauopathy',
       'AD_severe', 'AD_intermediate', 'EAE', 'young', 'adult'],
      dtype=object)

In [33]:
# Do not replace condition names, as this may lead to duplicates.
# The renaming below is intentionally left disabled.


#df.loc[(df['condition'].isin(['WT','control', 'SORT','young', 'adult']) | (df['condition'].isna())),'condition' ] = "control"

#df.loc[df['condition'].isin(['Alzheimer','AD']),'condition' ] = "Alzheimer"

#df.loc[df['condition']=='EAE','condition' ] = "experimental autoimmune encephalomyelitis" # too long name for plots

In [33]:
# Persist the frame, now carrying the assigned 'Age_cat' labels.
output_path = '../data/4_df_agecat.pkl'
with open(output_path, 'wb') as handle:
    pkl.dump(df, handle)