This file is part of  MADIP: Molecular Atlas Data Integration Pipeline

Copyright 2021 Blue Brain Project / EPFL 

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import xlrd

import re


# To add a new data set (of a supported type) that is not among the studies provided by collect_protein_data,
# reshape it to the same format and units, following the code below.

from collect_protein_data import (
    get_hamezah_2019_dataframe,
    get_hamezah_2018_dataframe,
    get_chuang_2018_dataframe,
    get_duda_2018_dataframe,
    get_krogager_2018_dataframe,
    get_hosp_2017_dataframe,
    get_itzhak_2017_dataframe,
    get_beltran_2016_dataframe,
    get_sharma_2015_dataframe,
    get_wisniewski_2015_dataframe,
    get_han_2014_dataframe,
    get_geiger_2013_dataframe,
    get_bai_2020_dataframe,
    get_human_samples_bai_2020_dataframe,
    get_carlyle_2017_dataframe,
    get_davis_2019_dataframe,
    get_fecher_2019_dataframe,
    get_fornasiero_2018_dataframe,
    get_guergues_2019_dataframe,
    get_mcketney_2019_dataframe,
    get_hasan_2020_dataframe,
    get_zhu_2018_dataframe,
    get_kjell_2020_dataframe
)

# Lift pandas' display truncation so previews show every column (and every row).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Data format: the canonical column layout shared by all studies.
meta_columns = [
    # gene / protein identifiers
    'gene_names',
    'gene_name_unified',
    'Uniprot',
    'Uniprot_unified',
    # sample description
    'Study',
    'Organism',
    'location',
    'Age_cat',
    'Age_days',
    'condition',
    'sample_id',
    # measurement
    'molecular_weight_kDa',
    'raw_data',
    'raw_data_units',
]

# Empty frame that only carries the canonical columns.
df_all = pd.DataFrame(columns=meta_columns)

# Import data sets

This notebook loads the publicly available data from the referenced sources. Full references are provided in the paper *A Standardized Brain Molecular Atlas: a resource for systems modeling and simulation*.

Age in days is only approximate and not involved in the downstream analysis. Qualitative age category is defined based on the original data sources.

### Hamezah_2019

In [3]:
# Hamezah et al. 2019 (mouse hippocampus, medial prefrontal cortex and striatum).
# Load the study through the collect_protein_data loader and preview the first rows.

hamezah_2019_df = get_hamezah_2019_dataframe()
hamezah_2019_df.head()

Importing Hamezah 2019 pandas dataframe.


Unnamed: 0,Uniprot,gene_names,molecular_weight_kDa,location,condition,raw_data,Study,Organism,Age_days,raw_data_units
0,A0A068BEQ2;P50171;G3UX44,H2-KE6;HSD17B8,26.587,hippocampus,WT,3324025.0,Hamezah 2019,mouse,476,LFQintensity
1,A0A068BEQ2;P50171;G3UX44,H2-KE6;HSD17B8,26.587,hippocampus,Alzheimer,1736439.0,Hamezah 2019,mouse,476,LFQintensity
2,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,RAB11B;RAB11A,24.489,hippocampus,WT,42737520.0,Hamezah 2019,mouse,476,LFQintensity
3,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,RAB11B;RAB11A,24.489,hippocampus,Alzheimer,115394600.0,Hamezah 2019,mouse,476,LFQintensity
4,Q3V3A4;A0A068BGT0;Q8C754;G3UY33,VPS52,82.099,hippocampus,WT,1298037.0,Hamezah 2019,mouse,476,LFQintensity


##### Review gene names

In [4]:
# Keep only the rows that carry a gene name, so the string test below never sees NaN.
hamezah_2019_df_nn = hamezah_2019_df.dropna(subset=['gene_names']).copy()

# Distinct entries that list several gene names separated by ';'.
multi_gene_names = hamezah_2019_df_nn.loc[
    hamezah_2019_df_nn['gene_names'].str.contains(';'), 'gene_names'
].unique()

print(len(multi_gene_names))

multi_gene_names

240


array(['H2-KE6;HSD17B8', 'RAB11B;RAB11A', 'MT-CO2;COX2;MTCO2',
       'MT-CO3;COX3;COXIII', 'EEF1B2;EEF1B', '2310035C23RIK;KIAA1468',
       'UBC;GM8797;UBA52;GM1821;RPS27A;KXD1;UBB;GM5239',
       'HIST1H2AH;HIST1H2AA;HIST1H2AD;HIST3H2A;H2AFJ;HIST1H2AK;HIST1H2AF;HIST1H2AB;H2AFX',
       'EIF4A3;GM8994', 'SERPINA1C;SERPINA1A', 'SYT2;SYT II',
       'ND4;MT-ND4;MTND4', 'MT-ND5;ND5;MTND5;MT-ND6',
       'HIST1H2BJ;HIST1H2BK;HIST1H2BM;HIST1H2BR;LOC665622;HIST1H2BP;HIST1H2BC;HIST2H2BB;HIST1H2BH;HIST1H2BB;HIST1H2BF;HIST1H2BA',
       'HSPA1B;HSPA1A', 'NLGN3;MKIAA1480', 'RPL10;RPL10L', 'ETL4;SKT',
       'EPB4.1L1;EPB41L1', 'SET;BC085271', 'ATP6V0C;GM15487',
       'HBBT1;HBB-B1;HBB1;HBB-BS', 'VAMP2;VAMP3', 'GYK;GK', 'AIFM1;PDCD8',
       'SCAMP3;TU52', 'HIST2H4;HIST1H4A', 'H2AFV;H2AFZ', 'DDX3X;D1PAS1',
       'NPTXR;NPCD', 'GAPDH;GAPDH;GM3839;GM7293',
       'RPS2;GM6576;GM5786;GM18025;GM8225',
       'GM17669;RPL29;GM10709;GM3550;GM5218', 'AKR1B10;AKR1B8',
       'SIRPA;PTPNS1',
       'H3

### Hamezah_2018

In [5]:
# Load the Hamezah 2018 study through the collect_protein_data loader and preview the first rows.
hamezah_2018_df = get_hamezah_2018_dataframe()
hamezah_2018_df.head()

Importing pandas Hamezah 2018 dataframe.


Unnamed: 0,Uniprot,gene_names,molecular_weight_kDa,location,sample_id,raw_data,Study,Organism,Age_days,raw_data_units
0,A0JPJ7,OLA1,44.535,hippocampus,14months,6885834.0,Hamezah 2018,rat,446.0,LFQintensity
1,A0JPJ7,OLA1,44.535,hippocampus,18months,25711310.0,Hamezah 2018,rat,566.0,LFQintensity
2,A0JPJ7,OLA1,44.535,hippocampus,23months,9639989.0,Hamezah 2018,rat,721.0,LFQintensity
3,A0JPJ7,OLA1,44.535,hippocampus,27months,9605305.0,Hamezah 2018,rat,841.0,LFQintensity
4,A1A5R1,RBFOX2,45.55,hippocampus,14months,3089410.0,Hamezah 2018,rat,446.0,LFQintensity


### Chuang_2018

In [6]:
# Load the Chuang 2018 study through the collect_protein_data loader and preview the first rows.
chuang2018_df = get_chuang_2018_dataframe()
chuang2018_df.head()

Importing Chuang 2018 pandas dataframe.


Unnamed: 0,Uniprot,gene_names,molecular_weight_kDa,location,sample_id,raw_data,Study,Organism,Age_days,Age_cat,raw_data_units
0,Q63041,A1M,167.12,axon,Experiment1,7628500.0,Chuang 2018,rat,18,embr,iBAQ
1,Q63041,A1M,167.12,axon,Experiment2,634200.0,Chuang 2018,rat,18,embr,iBAQ
2,Q9JMI1,AACS,75.039,axon,Experiment1,716030.0,Chuang 2018,rat,18,embr,iBAQ
3,Q9JMI1,AACS,75.039,axon,Experiment2,27545.0,Chuang 2018,rat,18,embr,iBAQ
4,Q9R0Z7,AAGAB,34.363,axon,Experiment1,193420.0,Chuang 2018,rat,18,embr,iBAQ


### Duda_2018

In [7]:
# Load the Duda 2018 study through the collect_protein_data loader and preview the first rows.
# The previewed rows carry neither Uniprot ids nor molecular weights.
duda_2018_df = get_duda_2018_dataframe()
duda_2018_df.head()

Importing Duda 2018 pandas dataframe.


Unnamed: 0,gene_names,location,condition,raw_data,Study,Organism,Age_days,raw_data_units
0,KRT8,hippocampus,young,0.83246,Duda 2018,mouse,51.0,Mean concentration [mol/(g total protein)]
1,KRT8,hippocampus,adult,0.061566,Duda 2018,mouse,386.0,Mean concentration [mol/(g total protein)]
2,MYO5B,hippocampus,young,0.023429,Duda 2018,mouse,51.0,Mean concentration [mol/(g total protein)]
3,MYO5B,hippocampus,adult,0.002103,Duda 2018,mouse,386.0,Mean concentration [mol/(g total protein)]
4,PCDH17,hippocampus,young,0.526599,Duda 2018,mouse,51.0,Mean concentration [mol/(g total protein)]


### Krogager_2018

In [8]:
# Load the Krogager 2018 study through the collect_protein_data loader and preview the first rows.
# In the previewed rows 'control' samples are labelled striatum and 'SORT' samples neurons.
krogager_df = get_krogager_2018_dataframe()
krogager_df.head()

Importing Krogager 2018 pandas dataframe.


Unnamed: 0,Uniprot,gene_names,condition,raw_data,Study,Organism,Age_days,location,raw_data_units
0,Q9QZM0,UBQLN2,control,2529499.0,Krogager 2018,mouse,112,striatum,LFQintensity
1,Q9QZM0,UBQLN2,SORT,778229300.0,Krogager 2018,mouse,112,neurons,LFQintensity
2,Q7TQ95,LNP,control,1561304.0,Krogager 2018,mouse,112,striatum,LFQintensity
3,Q7TQ95,LNP,SORT,274460800.0,Krogager 2018,mouse,112,neurons,LFQintensity
4,P42225,STAT1,control,427396.3,Krogager 2018,mouse,112,striatum,LFQintensity


### Hosp_2017

In [9]:
# Load the Hosp 2017 study through the collect_protein_data loader (slow, per the loader's message)
# and preview the first rows. The Study label also names the fraction, e.g. "Hosp 2017, soluble".
hosp_3 = get_hosp_2017_dataframe()
hosp_3.head()

Importing Hosp 2017 pandas dataframe. This can last a while.


Unnamed: 0,Uniprot,gene_names,molecular_weight_kDa,sample_id,raw_data,Study,Organism,raw_data_units,Age_days,location
0,Q80Z24;D3Z4T6;A0A4W9,NEGR1,37.9,5wWTce1,249560000.0,"Hosp 2017, soluble",mouse,iBAQ,56.0,cerebellum
1,Q80Z24;D3Z4T6;A0A4W9,NEGR1,37.9,5wWTce2,244370000.0,"Hosp 2017, soluble",mouse,iBAQ,56.0,cerebellum
2,Q80Z24;D3Z4T6;A0A4W9,NEGR1,37.9,5wWTce3,134340000.0,"Hosp 2017, soluble",mouse,iBAQ,56.0,cerebellum
3,Q80Z24;D3Z4T6;A0A4W9,NEGR1,37.9,5wWTce4,74941000.0,"Hosp 2017, soluble",mouse,iBAQ,56.0,cerebellum
4,Q80Z24;D3Z4T6;A0A4W9,NEGR1,37.9,5wWTco1,154570000.0,"Hosp 2017, soluble",mouse,iBAQ,56.0,cortex


###  Itzhak 2017

In [10]:
# Load the Itzhak 2017 study through the collect_protein_data loader (slow, per the loader's message)
# and preview the first rows. The previewed values are cellular concentrations in nM, not intensities.
itzhakConc = get_itzhak_2017_dataframe()
itzhakConc.head()

Importing itzhak 2017 pandas dataframe. This can last a while.


Unnamed: 0,gene_names,Uniprot,location,raw_data,molecular_weight_kDa,raw_data_units,Study,Organism,Age_days,Age_cat
0,Slc20a2,Q80UP8,Plasma membrane,3.151791,70.865,Median cellular concentration [nM],Itzhak 2017,mouse,15,embr
1,Vcam1,P29533;Q3UPN1,Plasma membrane,7.825855,81.317,Median cellular concentration [nM],Itzhak 2017,mouse,15,embr
2,Sh3bp4,Q921I6,Plasma membrane,9.097981,107.58,Median cellular concentration [nM],Itzhak 2017,mouse,15,embr
3,Abcg2,Q7TMS5;S4R2E1;A0A0R4J0B6;D3Z150,Plasma membrane,12.400113,72.977,Median cellular concentration [nM],Itzhak 2017,mouse,15,embr
4,Sirpa,Q6P6I8;E0CYM8;P97797-2;P97797;A0A0R4J1Z7,Plasma membrane,20.724743,55.986,Median cellular concentration [nM],Itzhak 2017,mouse,15,embr


### Beltran 2016

In [11]:
# Load the Beltran 2016 study through the collect_protein_data loader and preview the first rows.
beltranfin = get_beltran_2016_dataframe()
beltranfin.head()

Importing beltran 2016 pandas dataframe


Unnamed: 0,Uniprot,gene_names,Organism,raw_data,raw_data_units,Study,Age_days,Age_cat,location
0,A0FGR8,ESYT2,human,222854.0,iBAQ,Beltran 2016,0,embr,ER
1,A1L0T0,ILVBL,human,335487.0,iBAQ,Beltran 2016,0,embr,subcellular not specified
2,A2RRP1,NBAS,human,154462.0,iBAQ,Beltran 2016,0,embr,ER
3,A2RU67,FAM234B,human,11101.0,iBAQ,Beltran 2016,0,embr,subcellular not specified
4,A3KMH1,VWA8,human,100274.82,iBAQ,Beltran 2016,0,embr,Mitochondria


### Sharma 2015

In [12]:
# The Sharma 2015 loader returns two frames. In the previews, the first is labelled
# "Sharma 2015, isolated" and the second "Sharma 2015, cultured".
sharma4o_df, sharma_df = get_sharma_2015_dataframe()
sharma4o_df.head()

Importing sharma 2015 pandas dataframe. This can last a while.


Unnamed: 0,Uniprot,gene_names,molecular_weight_kDa,sample_id,raw_data,Study,Organism,raw_data_units,Age_days,location
0,P60710,Actb,41.736,IsolatedAstrocytes,419180300000.0,"Sharma 2015, isolated",mouse,LFQintensity,29.0,astrocytes
1,P60710,Actb,41.736,IsolatedMicroglia,538359900000.0,"Sharma 2015, isolated",mouse,LFQintensity,29.0,microglia
2,P60710,Actb,41.736,IsolatedNeurons,313221000000.0,"Sharma 2015, isolated",mouse,LFQintensity,29.0,neurons
3,P60710,Actb,41.736,IsolatedOligodendrocytes,360640400000.0,"Sharma 2015, isolated",mouse,LFQintensity,29.0,oligodendrocytes
4,P60710,Actb,41.736,Brain,353823800000.0,"Sharma 2015, isolated",mouse,LFQintensity,81.0,brain


In [13]:
# Preview the second Sharma 2015 frame (Age_days is 0 in the previewed rows).
sharma_df.head()

Unnamed: 0,Uniprot,gene_names,molecular_weight_kDa,sample_id,raw_data,Study,Organism,raw_data_units,Age_days,location
0,Q5FWJ3;P20152,Vim,53.687,adultMicroglia1,175220000000,"Sharma 2015, cultured",mouse,LFQintensity,0,microglia
1,Q5FWJ3;P20152,Vim,53.687,adultMicroglia2,185940000000,"Sharma 2015, cultured",mouse,LFQintensity,0,microglia
2,Q5FWJ3;P20152,Vim,53.687,adultMicroglia3,190360000000,"Sharma 2015, cultured",mouse,LFQintensity,0,microglia
3,Q5FWJ3;P20152,Vim,53.687,youngMicroglia1,159400000000,"Sharma 2015, cultured",mouse,LFQintensity,0,microglia
4,Q5FWJ3;P20152,Vim,53.687,youngMicroglia2,145040000000,"Sharma 2015, cultured",mouse,LFQintensity,0,microglia


### Wiśniewski 2015

In [14]:
# Load the Wisniewski 2015 study through the collect_protein_data loader and preview the first rows.
# The previewed values are protein concentrations in mol/g protein, not intensities.
wisniewski_df = get_wisniewski_2015_dataframe()
wisniewski_df.head()

Importing Wisńiewski 2015 pandas dataframe


Unnamed: 0,Uniprot,gene_names,sample_id,raw_data,Study,Organism,location,Age_days,raw_data_units
0,Q923B0;E0CX39;E0CZ04;Q923B0-2,A2ld1,1,1.648892e-10,Wisniewski 2015,mouse,brain,91,Protein concentration (mol/g protein)
1,Q923B0;E0CX39;E0CZ04;Q923B0-2,A2ld1,2,1.384135e-10,Wisniewski 2015,mouse,brain,91,Protein concentration (mol/g protein)
2,Q923B0;E0CX39;E0CZ04;Q923B0-2,A2ld1,3,2.097264e-10,Wisniewski 2015,mouse,brain,91,Protein concentration (mol/g protein)
3,P58742,Aaas,1,3.229649e-11,Wisniewski 2015,mouse,brain,91,Protein concentration (mol/g protein)
4,P58742,Aaas,2,2.422095e-11,Wisniewski 2015,mouse,brain,91,Protein concentration (mol/g protein)


### Han 2014

In [15]:
# Load the Han 2014 study through the collect_protein_data loader and preview the first rows
# (Age_days is 0 in the previewed rows).
han_df = get_han_2014_dataframe()
han_df.head()

Importing Han 2014 pandas dataframe


Unnamed: 0,Uniprot,gene_names,sample_id,raw_data,Study,Organism,raw_data_units,Age_days,location
0,Q6PGH2;Q3TJZ7;Q3TM10;Q3U7P4,Hn1l,WCLset1tech1,2212400000,Han 2014,mouse,LFQintensity,0,astrocytes
1,Q6PGH2;Q3TJZ7;Q3TM10;Q3U7P4,Hn1l,WCLset1tech2,2409300000,Han 2014,mouse,LFQintensity,0,astrocytes
2,Q6PGH2;Q3TJZ7;Q3TM10;Q3U7P4,Hn1l,WCLset1tech3,2893000000,Han 2014,mouse,LFQintensity,0,astrocytes
3,Q6PGH2;Q3TJZ7;Q3TM10;Q3U7P4,Hn1l,WCLset2tech1,1880200000,Han 2014,mouse,LFQintensity,0,astrocytes
4,Q6PGH2;Q3TJZ7;Q3TM10;Q3U7P4,Hn1l,WCLset2tech2,1751900000,Han 2014,mouse,LFQintensity,0,astrocytes


In [16]:
# Rows without a Uniprot id are set aside so the substring search only runs on strings.
han_dfnn = han_df.dropna(subset=['Uniprot']).copy()

# 'D3YYU8D3Z0M8' is two accessions fused without the ';' separator.
han_dfnn[han_dfnn['Uniprot'].str.contains('D3YYU8D3Z0M8')].head()

#!!! pay attention to these Uniprot combined ids, see step_3_protein_conc_calc.ipynb

Unnamed: 0,Uniprot,gene_names,sample_id,raw_data,Study,Organism,raw_data_units,Age_days,location
48771,D3YYU8D3Z0M8;F7AD47;Q80WA6;E9PXH7;Q80TU8,Obsl1,WCLset1tech1,78643000,Han 2014,mouse,LFQintensity,0,astrocytes
48772,D3YYU8D3Z0M8;F7AD47;Q80WA6;E9PXH7;Q80TU8,Obsl1,WCLset1tech2,0,Han 2014,mouse,LFQintensity,0,astrocytes
48773,D3YYU8D3Z0M8;F7AD47;Q80WA6;E9PXH7;Q80TU8,Obsl1,WCLset1tech3,116670000,Han 2014,mouse,LFQintensity,0,astrocytes
48774,D3YYU8D3Z0M8;F7AD47;Q80WA6;E9PXH7;Q80TU8,Obsl1,WCLset2tech1,140180000,Han 2014,mouse,LFQintensity,0,astrocytes
48775,D3YYU8D3Z0M8;F7AD47;Q80WA6;E9PXH7;Q80TU8,Obsl1,WCLset2tech2,131280000,Han 2014,mouse,LFQintensity,0,astrocytes


### Geiger 2013

In [17]:
# Load the Geiger 2013 study through the collect_protein_data loader (slow, per the loader's message)
# and preview the first rows.
geiger_df = get_geiger_2013_dataframe()
geiger_df.head()

Importing Geiger 2013 pandas dataframe. This operation can last a while.


Unnamed: 0,Uniprot,gene_names,molecular_weight_kDa,location,raw_data,Study,Organism,raw_data_units,Age_days
0,F6VME3;P04370-10;P04370-5;F6RT34;P04370-4;P043...,Mbp,20.829,cortex,6573000000.0,Geiger 2013,mouse,IntensityL,91
1,F6VME3;P04370-10;P04370-5;F6RT34;P04370-4;P043...,Mbp,20.829,medulla,23493000000.0,Geiger 2013,mouse,IntensityL,91
2,F6VME3;P04370-10;P04370-5;F6RT34;P04370-4;P043...,Mbp,20.829,cerebellum,18276000000.0,Geiger 2013,mouse,IntensityL,91
3,F6VME3;P04370-10;P04370-5;F6RT34;P04370-4;P043...,Mbp,20.829,midbrain,4432600000.0,Geiger 2013,mouse,IntensityL,91
4,F6VME3;P04370-10;P04370-5;F6RT34;P04370-4;P043...,Mbp,20.829,olfactory bulb,30746000000.0,Geiger 2013,mouse,IntensityL,91


### Bai 2020

In [18]:
# Load the Bai 2020 samples through the collect_protein_data loader and preview the first rows
# (the previewed rows are mouse cortex).
bai2020_df = get_bai_2020_dataframe()
bai2020_df.head()

Importing Bai 2020 pandas dataframe.


Unnamed: 0,Uniprot,gene_names,sample_id,raw_data,Organism,location,raw_data_units,Study,Age_days,condition
0,Q19LI2;P04217,A1BG,WT3M1,31936.04,mouse,cortex,Protein Abundance (Summerized TMT Reporter Ion...,Bai 2020,111.0,control
1,Q19LI2;P04217,A1BG,WT3M2,32619.88,mouse,cortex,Protein Abundance (Summerized TMT Reporter Ion...,Bai 2020,111.0,control
2,Q19LI2;P04217,A1BG,WT6M1,23592.29,mouse,cortex,Protein Abundance (Summerized TMT Reporter Ion...,Bai 2020,201.0,control
3,Q19LI2;P04217,A1BG,WT6M2,29239.86,mouse,cortex,Protein Abundance (Summerized TMT Reporter Ion...,Bai 2020,201.0,control
4,Q19LI2;P04217,A1BG,WT6M3,31684.95,mouse,cortex,Protein Abundance (Summerized TMT Reporter Ion...,Bai 2020,201.0,control


In [19]:
# Load the human samples of Bai 2020 and preview the first rows.
# Note: Age_days holds the string 'post-mortem' in the previewed rows, so this column is not
# purely numeric once the studies are combined.
bai2020human_df = get_human_samples_bai_2020_dataframe()
bai2020human_df.head()

Importing human samples Bai 2020 pandas dataframe.


Unnamed: 0,Uniprot,gene_names,sample_id,raw_data,Organism,location,raw_data_units,Study,condition,Age_days
0,Q19LI2;P04217,A1BG,LPC1,1610961.0,human,cortex,Protein Abundance (Summerized TMT Reporter Ion...,Bai 2020,LPC: low pathology of plaques and tangles. AD,post-mortem
1,Q19LI2;P04217,A1BG,LPC2,1580507.0,human,cortex,Protein Abundance (Summerized TMT Reporter Ion...,Bai 2020,LPC: low pathology of plaques and tangles. AD,post-mortem
2,Q19LI2;P04217,A1BG,HPC1,1556383.0,human,cortex,Protein Abundance (Summerized TMT Reporter Ion...,Bai 2020,HPC: high Ab pathology but no detectable cogni...,post-mortem
3,Q19LI2;P04217,A1BG,HPC2,1359861.0,human,cortex,Protein Abundance (Summerized TMT Reporter Ion...,Bai 2020,HPC: high Ab pathology but no detectable cogni...,post-mortem
4,Q19LI2;P04217,A1BG,MCI1,1458553.0,human,cortex,Protein Abundance (Summerized TMT Reporter Ion...,Bai 2020,MCI: mild cognitive impairment with Ab patholo...,post-mortem


### Carlyle 2017

In [20]:
# Load the Carlyle 2017 study through the collect_protein_data loader and preview the first rows.
# The previewed rows carry gene names only (no Uniprot column).
carlyle2017_df_filt = get_carlyle_2017_dataframe()
carlyle2017_df_filt.head()

Importing Carlyle 2017 pandas dataframe.


Unnamed: 0,gene_names,sample_id,Organism,Study,raw_data,raw_data_units,location,Age_days
0,A1BG,118AMY,human,Carlyle 2017,9560032.0,LFQintensity,amygdala,1726.0
1,A2M,118AMY,human,Carlyle 2017,2720239000.0,LFQintensity,amygdala,1726.0
2,AADAT,118AMY,human,Carlyle 2017,38706800.0,LFQintensity,amygdala,1726.0
3,AAK1,118AMY,human,Carlyle 2017,2761356000.0,LFQintensity,amygdala,1726.0
4,AAMDC,118AMY,human,Carlyle 2017,8940352.0,LFQintensity,amygdala,1726.0


### Davis 2019

In [21]:
# Load the Davis 2019 study through the collect_protein_data loader and preview the first rows.
# Note: both Age_days and Age_cat hold the string 'post-mortem' in the previewed rows.
davis2019_df = get_davis_2019_dataframe()
davis2019_df.head()

Importing Davis 2019 pandas dataframe.


Unnamed: 0,Uniprot,gene_names,molecular_weight_kDa,sample_id,raw_data,Organism,Study,raw_data_units,location,Age_days,Age_cat
0,A0A024QZX5;A0A087X1N8;P35237,SERPINB6,43.024,BetzCapSP31,932190.0,human,Davis 2019,iBAQ,neurons,post-mortem,post-mortem
1,A0A024QZX5;A0A087X1N8;P35237,SERPINB6,43.024,BetzCapSP32,1242500.0,human,Davis 2019,iBAQ,neurons,post-mortem,post-mortem
2,A0A024QZX5;A0A087X1N8;P35237,SERPINB6,43.024,BetzCapSP33,1358200.0,human,Davis 2019,iBAQ,neurons,post-mortem,post-mortem
3,A0A024QZX5;A0A087X1N8;P35237,SERPINB6,43.024,PurkinjeCapSP31,66993.0,human,Davis 2019,iBAQ,neurons,post-mortem,post-mortem
4,A0A024QZX5;A0A087X1N8;P35237,SERPINB6,43.024,PurkinjeCapSP32,63420.0,human,Davis 2019,iBAQ,neurons,post-mortem,post-mortem


### Fecher 2019

In [22]:
# Load the Fecher 2019 study through the collect_protein_data loader and preview the first rows.
# The previewed index has gaps (0, 6, 8, ...); the index is rebuilt when the studies are combined.
fecher2019_df = get_fecher_2019_dataframe()
fecher2019_df.head()

Importing Fecher 2019 pandas dataframe.


Unnamed: 0,gene_names,sample_id,Organism,Study,raw_data_units,raw_data,location,Age_days
0,0610007P14Rik;ORF11,ICGFP5Purkinjecellmito,mouse,Fecher 2019,LFQintensity,369244000.0,mitochondria,77
6,2610507B11Rik;Kiaa0100,ICGFP5Purkinjecellmito,mouse,Fecher 2019,LFQintensity,65273730.0,mitochondria,77
8,4833439L19Rik;P33monox,ICGFP5Purkinjecellmito,mouse,Fecher 2019,LFQintensity,988043400.0,mitochondria,77
9,4932438A13Rik;Kiaa1109,ICGFP5Purkinjecellmito,mouse,Fecher 2019,LFQintensity,206276000.0,mitochondria,77
11,6430548M08Rik;Kiaa0513,ICGFP5Purkinjecellmito,mouse,Fecher 2019,LFQintensity,504402400.0,mitochondria,77


### Fornasiero 2018

In [23]:
# Load the Fornasiero 2018 study through the collect_protein_data loader and preview the first rows.
fornasiero2018 = get_fornasiero_2018_dataframe()

fornasiero2018.head()

Importing Fornasiero 2018 pandas dataframe.


Unnamed: 0,Uniprot,gene_names,raw_data,raw_data_units,Study,Organism,location,Age_cat,Age_days
0,O08553,Dpysl2;Crmp2;Ulip2,132605300.0,iBAQ,Fornasiero 2018,mouse,cortex,adult,168.0
1,O35643,Ap1b1;Adtb1,610001.3,iBAQ,Fornasiero 2018,mouse,cortex,adult,168.0
2,O54734,Ddost,14635.81,iBAQ,Fornasiero 2018,mouse,cortex,adult,168.0
3,O55131,Sept7;Cdc10,3060770.0,iBAQ,Fornasiero 2018,mouse,cortex,adult,168.0
4,O55143,Atp2a2,2589334.0,iBAQ,Fornasiero 2018,mouse,cortex,adult,168.0


### Guergues 2019

In [24]:
# Load the Guergues 2019 study through the collect_protein_data loader and preview the first rows.
guergues2019_df = get_guergues_2019_dataframe()
guergues2019_df.head()

Importing Guergues 2019 pandas dataframe.


Unnamed: 0,Uniprot,gene_names,molecular_weight_kDa,sample_id,raw_data,Organism,Study,raw_data_units,location,Age_cat,Age_days
0,P60710;E9Q1F2;E9Q5F4;G3UZ07,Actb;Actg1,41.736,STrap300K1,437100000000,mouse,Guergues 2019,LFQintensity,microglia,adult,77
1,P60710;E9Q1F2;E9Q5F4;G3UZ07,Actb;Actg1,41.736,STrap300K2,357320000000,mouse,Guergues 2019,LFQintensity,microglia,adult,77
2,P60710;E9Q1F2;E9Q5F4;G3UZ07,Actb;Actg1,41.736,STrap300K3,447270000000,mouse,Guergues 2019,LFQintensity,microglia,adult,77
3,P10126;D3YZ68,Eef1a1,50.113,STrap300K1,284050000000,mouse,Guergues 2019,LFQintensity,microglia,adult,77
4,P10126;D3YZ68,Eef1a1,50.113,STrap300K2,275170000000,mouse,Guergues 2019,LFQintensity,microglia,adult,77


### McKetney 2019

In [25]:
# Load the McKetney 2019 study through the collect_protein_data loader and preview the first rows.
# The preview contains raw_data values of 0; non-positive values are filtered out after combining.
McKetney2019_df = get_mcketney_2019_dataframe()
McKetney2019_df.head()

Importing McKetney 2019 pandas dataframe.


Unnamed: 0,Uniprot,gene_names,sample_id,raw_data,Organism,Study,raw_data_units,Age_cat,Age_days,location,condition
0,A0A0A0MS14,IGHV1-45,s146AMY,4172000,human,McKetney 2019,LFQintensity,adult,11315,amygdala,control
1,A0A0A0MS14,IGHV1-45,s146CNC,0,human,McKetney 2019,LFQintensity,adult,11315,striatum,control
2,A0A0A0MS14,IGHV1-45,s146CBM,0,human,McKetney 2019,LFQintensity,adult,11315,cerebellum,control
3,A0A0A0MS14,IGHV1-45,s146ECX,0,human,McKetney 2019,LFQintensity,adult,11315,cortex,control
4,A0A0A0MS14,IGHV1-45,s146MFG,0,human,McKetney 2019,LFQintensity,adult,11315,cortex,control


### Hasan 2020

In [26]:
# Load the Hasan 2020 study through the collect_protein_data loader and preview the first rows.
# The previewed frame has Uniprot ids but no gene_names column.
hasan2020_df = get_hasan_2020_dataframe()
hasan2020_df.head()

Importing  Hasan 2020 pandas dataframe.


Unnamed: 0,Uniprot,location,sample_id,raw_data,Organism,Study,raw_data_units,Age_days,condition
0,P16546,brainstem,EAE1,2374.8,mouse,Hasan 2020,tmt abundance,156,EAE
1,P16546,brainstem,EAE2,2441.7,mouse,Hasan 2020,tmt abundance,156,EAE
2,P16546,brainstem,EAE3,2461.8,mouse,Hasan 2020,tmt abundance,156,EAE
3,P16546,brainstem,CON1,2195.7,mouse,Hasan 2020,tmt abundance,156,control
4,P16546,brainstem,CON2,2616.6,mouse,Hasan 2020,tmt abundance,156,control


### Zhu 2018

In [27]:
# Load the Zhu 2018 study through the collect_protein_data loader and preview the first rows.
zhu2018_df = get_zhu_2018_dataframe()
zhu2018_df.head()

Importing Zhu 2018 pandas dataframe.


Unnamed: 0,Uniprot,gene_names,sample_id,Organism,Study,raw_data,raw_data_units,Age_days,location
0,E9PT29;A0A096MIX2,Ddx17,CTX1,rat,Zhu 2018,132380.399119,LFQ,38,cortex
1,E9PT29;A0A096MIX2,Ddx17,CTX2,rat,Zhu 2018,193479.340537,LFQ,38,cortex
2,E9PT29;A0A096MIX2,Ddx17,CTX3,rat,Zhu 2018,116379.707526,LFQ,38,cortex
3,E9PT29;A0A096MIX2,Ddx17,CTX4,rat,Zhu 2018,179290.559513,LFQ,38,cortex
4,E9PT29;A0A096MIX2,Ddx17,CC1,rat,Zhu 2018,17022.555306,LFQ,38,corpus callosum


### Kjell 2020

In [28]:
# Load the Kjell 2020 study through the collect_protein_data loader and preview the first rows.
# The Uniprot column is empty in the previewed rows.
kjell_2020_df = get_kjell_2020_dataframe()
kjell_2020_df.head()

Importing Kjell 2020 pandas dataframe.


Unnamed: 0,gene_names,sample_id,location,Uniprot,Organism,Study,Age_days,raw_data_units,raw_data
0,0610011F06Rik,cortexLMSS,cortex,,mouse,Kjell 2020,84,LFQ,147139800.0
1,0610037L13Rik,cortexLMSS,cortex,,mouse,Kjell 2020,84,LFQ,678765000.0
2,1600014C10Rik,cortexLMSS,cortex,,mouse,Kjell 2020,84,LFQ,62579840.0
3,1700021F05Rik,cortexLMSS,cortex,,mouse,Kjell 2020,84,LFQ,53377740.0
4,1700037H04Rik,cortexLMSS,cortex,,mouse,Kjell 2020,84,LFQ,1412729000.0


## Combine data

In [29]:
# Stack every study into one long-format table.
#
# An empty frame carrying `meta_columns` goes first so the combined table gets the
# canonical column order. It is built fresh here rather than reusing the current
# `df_all`, which keeps the cell idempotent: re-running it no longer appends every
# study a second time to an already populated `df_all`.
study_frames = [
    hamezah_2019_df,
    hamezah_2018_df,
    chuang2018_df,
    duda_2018_df,
    krogager_df,
    hosp_3,
    itzhakConc,
    beltranfin,
    sharma_df,
    sharma4o_df,
    wisniewski_df,
    han_df,
    geiger_df,
    bai2020_df,
    bai2020human_df,
    carlyle2017_df_filt,
    davis2019_df,
    fecher2019_df,
    fornasiero2018,
    guergues2019_df,
    McKetney2019_df,
    hasan2020_df,
    zhu2018_df,
    kjell_2020_df,
]

# ignore_index=True already produces a fresh 0..n-1 index, so no reset_index is needed.
df_all = pd.concat(
    [pd.DataFrame(columns=meta_columns), *study_frames],
    ignore_index=True,
    sort=False,
)

# Normalise letter case so identifiers and labels compare equal across studies.
df_all['gene_names'] = df_all['gene_names'].str.upper()
df_all['Uniprot'] = df_all['Uniprot'].str.upper()

df_all['Organism'] = df_all['Organism'].str.lower()
df_all['location'] = df_all['location'].str.lower()


In [30]:
# Sanity check: distinct organism labels after lower-casing.
df_all['Organism'].unique()

array(['mouse', 'rat', 'human'], dtype=object)

In [31]:
# Sanity check: studies with rows lacking an organism (an empty result means none).
df_all.loc[df_all['Organism'].isna(),'Study'].unique()

array([], dtype=object)

In [32]:
# Sanity check: studies with rows lacking a location (an empty result means none).
df_all.loc[df_all['location'].isna(),'Study'].unique()

array([], dtype=object)

In [33]:
# Sanity check: studies with rows lacking Age_days (an empty result means none).
# Non-numeric entries such as the string 'post-mortem' count as present here.
df_all.loc[df_all['Age_days'].isna(),'Study'].unique()

array([], dtype=object)

In [34]:
# List the studies that have rows without a qualitative age category (Age_cat).
df_all.loc[df_all['Age_cat'].isna(),'Study'].unique()

array(['Hamezah 2019', 'Hamezah 2018', 'Duda 2018', 'Krogager 2018',
       'Hosp 2017, soluble', 'Hosp 2017, CSF', 'Hosp 2017, insoluble',
       'Sharma 2015, cultured', 'Sharma 2015, isolated',
       'Wisniewski 2015', 'Han 2014', 'Geiger 2013', 'Bai 2020',
       'Carlyle 2017', 'Fecher 2019', 'Hasan 2020', 'Zhu 2018',
       'Kjell 2020'], dtype=object)

In [35]:
# Inspect the distinct location labels (already lower-cased) to spot synonyms to harmonise.
df_all['location'].unique()

array(['hippocampus', 'cortex', 'striatum', 'axon', 'neurons',
       'cerebellum', 'csf', 'plasma membrane', 'peroxisome',
       'nuclear pore complex/nuclear', 'mitochondrion', 'lysosome',
       'large protein complex', 'golgi', 'ergic/cisgolgi',
       'er_high_curvature', 'er', 'endosome', 'actin binding proteins',
       'subcellular not specified', 'mitochondria', 'cytoplasm',
       'microglia', 'astrocytes', 'oligodendrocytes', 'brain',
       'brainstem', 'corpus callosum', 'olfactory bulb', 'optic nerve',
       'thalamus', 'medulla', 'midbrain', 'amygdala', 'spinal cord',
       'subependymal zone', 'medial subependymal zone'], dtype=object)

In [36]:
# Check for a 'striatum ' label carrying a trailing space (an empty result means none is present).
df_all.loc[df_all['location']=='striatum ','location'].unique()

array([], dtype=object)

In [37]:
# Collapse synonymous location labels onto a single spelling (exact, whole-value matches only).
location_aliases = {
    'nuclear pore complex/nuclear': 'nucleus',
    'mitochondrion': 'mitochondria',
    'ergic/cisgolgi': 'golgi',
    'er_high_curvature': 'er',
}
df_all['location'] = df_all['location'].replace(location_aliases)

In [38]:
# Re-check that harmonising the labels left no row without a location (an empty result means none).
df_all.loc[df_all['location'].isna(),'Study'].unique()

array([], dtype=object)

In [39]:
# Distinct location labels after harmonisation.
df_all['location'].unique()

array(['hippocampus', 'cortex', 'striatum', 'axon', 'neurons',
       'cerebellum', 'csf', 'plasma membrane', 'peroxisome', 'nucleus',
       'mitochondria', 'lysosome', 'large protein complex', 'golgi', 'er',
       'endosome', 'actin binding proteins', 'subcellular not specified',
       'cytoplasm', 'microglia', 'astrocytes', 'oligodendrocytes',
       'brain', 'brainstem', 'corpus callosum', 'olfactory bulb',
       'optic nerve', 'thalamus', 'medulla', 'midbrain', 'amygdala',
       'spinal cord', 'subependymal zone', 'medial subependymal zone'],
      dtype=object)

In [40]:
# Row count before removing measurements without a value.
print(len(df_all))

# Discard rows whose raw_data is missing.
df_all = df_all.dropna(subset=['raw_data']).copy()

# Row count afterwards.
print(len(df_all))

2559290
2461156


In [41]:
# Row count before and after keeping only strictly positive measurements.
print(len(df_all))
df_all = df_all[df_all['raw_data'].gt(0)]
print(len(df_all))

2461156
2142774


In [42]:
# The row filters above left gaps in the index; rebuild a contiguous 0..n-1 index,
# then preview the combined table.
df_all = df_all.reset_index(drop=True)
df_all.head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units
0,H2-KE6;HSD17B8,,A0A068BEQ2;P50171;G3UX44,,Hamezah 2019,mouse,hippocampus,,476,WT,,26.587,3324025.0,LFQintensity
1,H2-KE6;HSD17B8,,A0A068BEQ2;P50171;G3UX44,,Hamezah 2019,mouse,hippocampus,,476,Alzheimer,,26.587,1736439.0,LFQintensity
2,RAB11B;RAB11A,,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,,Hamezah 2019,mouse,hippocampus,,476,WT,,24.489,42737520.0,LFQintensity
3,RAB11B;RAB11A,,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,,Hamezah 2019,mouse,hippocampus,,476,Alzheimer,,24.489,115394600.0,LFQintensity
4,VPS52,,Q3V3A4;A0A068BGT0;Q8C754;G3UY33,,Hamezah 2019,mouse,hippocampus,,476,WT,,82.099,1298037.0,LFQintensity


In [43]:
# Persist the combined, filtered table as a pickle (presumably read by later pipeline steps).
# NOTE(review): the relative path assumes the notebook runs from its own directory and that
# ../data exists — confirm.
with open('../data/1_df_all_9may2021.pkl','wb') as f:
    pkl.dump(df_all,f)