This file is part of MADIP: Molecular Atlas Data Integration Pipeline

This module provides helper functions for performing alignment of protein and gene IDs among different data sources.

Copyright 2021 Blue Brain Project / EPFL 

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

## Align IDs

In [1]:
import pandas as pd
import numpy as np

import re

import pickle as pkl

import networkx as nx
from collections import Counter

from itertools import chain

pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [192]:
# Import the helper module before reloading it: on a fresh kernel the name
# `protein_ids_alignment_helpers` is not bound yet, so calling reload() first raises NameError.
import importlib

import protein_ids_alignment_helpers

# reload picks up edits made to the helper module since it was first imported
importlib.reload(protein_ids_alignment_helpers)

<module 'protein_ids_alignment_helpers' from '/gpfs/bbp.cscs.ch/project/proj34/sh_ngvm/github_os_polina/MADIP/protein_ids_alignment_helpers.py'>

In [193]:
import protein_ids_alignment_helpers
from protein_ids_alignment_helpers import (
    process_uniprot_mapping_data, 
    get_gene_unified, 
    get_uniprot_unified,
    check_GN_consistency_within_studies,
    any_in,
    get_uniprot_raw_data,
    get_gene_id_final
)


In [3]:
# load data produced by step_1_collect_protein_data.ipynb jupyter notebook 
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load this file
# if it was produced by a trusted run of the previous step.

with open('../data/1_df_all_9may2021.pkl','rb') as f:
    df_all = pkl.load(f)

In [4]:
# Potential experimental contaminants carry a "CON__" marker in the Uniprot field
# (MaxQuant annotation). Rows without a Uniprot id are never treated as contaminants.
is_contaminant = (~df_all['Uniprot'].isna()) & (df_all['Uniprot'].str.contains("CON_"))

# number of contaminant rows
print(len(df_all.loc[is_contaminant]))

# row count before and after dropping the contaminant rows
print(len(df_all))
df_all = df_all.loc[~is_contaminant]
print(len(df_all))

490
2142774
2142284


In [5]:
# manually get Uniprot data from https://www.uniprot.org (latest done on 21july2020). 
#The queries are:
#(taxonomy:"Mus musculus (Mouse) [10090]" OR taxonomy:"Rattus norvegicus (Rat) [10116]" OR taxonomy:"Homo sapiens (Human) [9606]") AND reviewed:yes
#(taxonomy:"Mus musculus (Mouse) [10090]" OR taxonomy:"Rattus norvegicus (Rat) [10116]" OR taxonomy:"Homo sapiens (Human) [9606]") AND reviewed:no


uniprot_rev, uniprot_unrev, uniprot_rev_dict, uniprot_unrev_dict = process_uniprot_mapping_data()

# It will take a while.

uniprot_rev.head()

uniprot_rev:  45524
uniprot_unrev:  270950
check == 1.0: 1.0
check == 1.0: 1.0
rev gene names 50244
unrev gene names 63631
21407
229391


Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,gene_id_entry_name
0,P22071,3BHS1_RAT,reviewed,3 beta-hydroxysteroid dehydrogenase/Delta 5-->...,[HSD3B1],Rattus norvegicus (Rat),373,3BHS1
1,P15650,ACADL_RAT,reviewed,"Long-chain specific acyl-CoA dehydrogenase, mi...",[ACADL],Rattus norvegicus (Rat),430,ACADL
2,P07872,ACOX1_RAT,reviewed,Peroxisomal acyl-coenzyme A oxidase 1 (AOX) (E...,"[ACOX1, ACOX]",Rattus norvegicus (Rat),661,ACOX1
3,Q09137,AAPK2_RAT,reviewed,5'-AMP-activated protein kinase catalytic subu...,"[PRKAA2, AMPK, AMPK2]",Rattus norvegicus (Rat),552,AAPK2
4,Q64602,AADAT_RAT,reviewed,Kynurenine/alpha-aminoadipate aminotransferase...,"[AADAT, KAT2]",Rattus norvegicus (Rat),425,AADAT


In [6]:
# check for data formatting 
print(df_all.loc[(~df_all['gene_names'].isna()) & (df_all['gene_names'].str.contains('; ')),'Study'].unique())

df_all.loc[(~df_all['Uniprot'].isna()) & (df_all['Uniprot'].str.contains(' '))].head()

['Beltran 2016']


Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units


In [7]:
# clean formatting: replace "; " with ";" in the gene names of 'Beltran 2016'
# The row mask is built once. na=False keeps rows without gene names out of the mask
# instead of relying on how NaN is coerced inside a boolean `&`, and regex=False makes
# both the search and the replacement literal regardless of the pandas default.
needs_cleaning = (df_all['Study']=='Beltran 2016') & df_all['gene_names'].str.contains('; ', regex=False, na=False)

df_all.loc[needs_cleaning,'gene_names'] = df_all.loc[needs_cleaning,'gene_names'].str.replace('; ',';', regex=False)

# check for data formatting: studies that still contain "; " in gene_names
df_all.loc[df_all['gene_names'].str.contains('; ', regex=False, na=False),'Study'].unique()

array([], dtype=object)

In [8]:
# check for GN consistency within studies
# `multiids` is indexed by study name below; each value is a collection of gene_names entries
# (the helper itself is defined in protein_ids_alignment_helpers).

multiids = check_GN_consistency_within_studies(df_all)

# one line per study: study name and number of flagged entries
for study_name, flagged_ids in multiids.items():
    print(study_name, len(flagged_ids), sep=' \t ')

Hamezah 2019 	 45
Hamezah 2018 	 2
Chuang 2018 	 0
Duda 2018 	 0
Krogager 2018 	 0
Hosp 2017, soluble 	 10
Hosp 2017, CSF 	 4
Hosp 2017, insoluble 	 0
Itzhak 2017 	 0
Beltran 2016 	 0
Sharma 2015, cultured 	 25
Sharma 2015, isolated 	 14
Wisniewski 2015 	 13
Han 2014 	 0
Geiger 2013 	 12
Bai 2020 	 0
Carlyle 2017 	 0
Davis 2019 	 7
Fecher 2019 	 0
Fornasiero 2018 	 0
Guergues 2019 	 8
McKetney 2019 	 0
Hasan 2020 	 0
Zhu 2018 	 0
Kjell 2020 	 60


In [9]:
multiids['Hamezah 2019']

['SERPINA1C;SERPINA1A',
 'HSPA1B;HSPA1A',
 'VAMP2;VAMP3',
 'AIFM1;PDCD8',
 'NPTXR;NPCD',
 'AKR1B10;AKR1B8',
 'GM20390;NME1;NME2',
 'RAC1;RAC3',
 'ALYREF;ALYREF2;REFBP2',
 'ACOT1;ACOT2',
 'HIST1H1C;HIST1H1D',
 'ALDH1A1;ALDH1A7',
 'MUG1;MUG2',
 'PABPC1;PABPC6',
 'ARF3;ARF1;ARF2',
 'FGF12;FGF14',
 'KCNA2;KCNA3',
 'MYL12A;MYL12B;MYL9',
 'RHOA;RHOC',
 'STK24;STK26',
 'UBE2V2;UBE2V1;GM20431',
 'TPM3;TPM3-RS7',
 'NPM1;GM5611',
 'FAM120A;FAM120A',
 'MAPK10;MAPK8;MAPK9',
 'RBMXL1;RBMX',
 'TF;GM20425',
 'L1CAM;L1',
 'HNRNPA3;GM9242;GM6793',
 'ARF3;ARF1',
 'SRSF3;GM12355',
 'EIF2S3X;EIF2S3Y',
 'KRAS;HRAS',
 'EPB41L3;EPB4.1L3',
 'SERPINA1A;SERPINA1C',
 'RPL23A;GM6096',
 'MAPK10;MAPK9;MAPK8',
 'GABRB3;GABRB1;GABRB2',
 'CA2;CAR2',
 'FBL;FBLL1',
 'KCNA2;KCNA1;KCNA3',
 'PPP2CA;PPP2CB',
 'MYL12B;MYL12A;MYL9',
 'PDE10A MUS MUSCULUS;PDE10A',
 'GSPT1;GSPT2']

In [10]:
multiids['Hamezah 2018']

['SLK;STK10', 'KIF5A;KIF5B']

In [11]:
multiids['Hosp 2017, soluble']

['HM13;H13',
 'GM20498;SYNJ2BP',
 'RBM4;RBM14;RBM4B',
 'HBS1L;GM9923',
 'HNRNPA3;GM9242;GM8991',
 'RAB11B;RAB11A',
 'PALM2;GM20459',
 'RBM12;CPNE1',
 'EIF1B;EIF1',
 'PGM1;PGM2']

In [12]:
multiids['Hosp 2017, CSF']

['ACTB;ACTG1',
 'UBE2V1;GM20431;UBE2V2',
 'GM9242;HNRNPA3;GM8991;GM6793',
 'TPM3;TPM3-RS7']

In [13]:
multiids['Sharma 2015, cultured']

['TPM3;TPM3-RS7',
 'EPB4.1L2;EPB41L2',
 'ALDOA;ALDOART1',
 'PGM1;PGM2',
 'MAP4;MTAP4',
 'GM20521;PABPN1',
 'EPB41;EPB4.1',
 'ZBTB11;RPL24',
 'KIAA1468;2310035C23RIK',
 'SUMO2;SUMO3',
 'RBM12;CPNE1',
 'HM13;H13',
 'ANKS1A;ANKS1',
 'MRPS36;MRPS36-PS1',
 'SKT;ETL4',
 'RBFOX2;RBFOX1',
 'LILRB4;GP49A',
 'MLTK;B230120H23RIK',
 'HBB-B1;BETA-S',
 'PALM2;AKAP2',
 'ZFP451;ZNF451',
 'RNF8;FTSJD2',
 'CBX6;NPCD;NPTXR',
 'ZFML;ZNF638',
 'OTTMUSG00000005540;GM5431;GM12185;9930111J21RIK']

In [14]:
multiids['Sharma 2015, isolated']

['PGM1;PGM2',
 'RAB11B;RAB11A',
 'GM20431;UBE2V1',
 'GM9242;HNRNPA3;GM6793',
 '2310035C23RIK;KIAA1468',
 'ZNF207;ZFP207',
 'ANKS1A;ANKS1',
 'GM8730;RPLP0',
 'HM13;H13',
 'HDHD2;IER3IP1;GM10784',
 'RBM8;RBM8A',
 'CD1D1;CD1D2',
 'HBS1L;GM9923',
 'ICA;1300017J02RIK']

In [15]:
multiids['Wisniewski 2015']

['EPB4.1L2;EPB41L2',
 'EPB41L1;EPB4.1L1',
 'ETL4;SKT',
 'GM10094;SAP18',
 'GM20521;PABPN1',
 'GM8730;RPLP0',
 'H2AFZ;H2AFV',
 'HDHD2;IER3IP1',
 'HM13;H13',
 'MRPS36;MRPS36-PS1',
 'RAB1A;RAB1',
 'RBM12;CPNE1',
 'SUMO2;SUMO3']

In [16]:
multiids['Geiger 2013']

['EPB4.1L1;EPB41L1',
 'MRPS36;MRPS36-PS1',
 'EPB4.1L2;EPB41L2',
 'MAP4;MTAP4',
 '0610010K14RIK;BAP18',
 'RBM12;CPNE1',
 'RAB1A;RAB1',
 'SRP54;SRP54C',
 'EPB49;EPB4.9',
 'GM16709;IGH-VJ558',
 'HDHD2;IER3IP1',
 'RAB31;RAB22A']

In [17]:
multiids['Davis 2019']

['ATP5J2-PTCD1;ATP5J2',
 'TMEM189-UBE2V1;UBE2V1',
 'TPM3;DKFZP686J1372',
 'NME1-NME2;NME2;NME1',
 'RPL26;RPL26L1',
 'UBE2V2;UBE2V1',
 'NPTN;DKFZP566H1924']

In [18]:
multiids['Guergues 2019']

['ACTB;ACTG1',
 'SUMO2;SUMO3',
 'GM20431;UBE2V1',
 'PGM1;PGM2',
 'EPB4.1;EPB41',
 'ALDOA;ALDOART1',
 'HDHD2;IER3IP1',
 'MBNL1;MBNL2']

In [19]:
multiids['Kjell 2020']

['2310035C23RIK;KIAA1468',
 'ACAA1A;ACAA1B',
 'CLDND1;CLDN25',
 'CYB5A;CYB5',
 'DCDC2;DCDC2A',
 'EPB4.1L1;EPB41L1',
 'ETL4;SKT',
 'GAPDH;GM7293',
 'GM13695;GM13697;CWC22',
 'GM20521;PABPN1',
 'GM21985;SLC12A6',
 'H2AFV;H2AFZ',
 'HNRNPA3;GM9242',
 'LPPR3;BC005764',
 'MAP1LC3B;GM5612',
 'MOB1A;MOB1B',
 'NDC1;TMEM48',
 'PFDN6;H2-KE2',
 'PGM1;PGM2',
 'PSMD2;GM5422',
 'RBM12;CPNE1',
 'RBM4;RBM4B;RBM14',
 'RTCA;RTCA',
 'SRP54;SRP54C',
 'TDRP;2610019F03RIK',
 'BAI2;ADGRB2',
 'GM20498;SYNJ2BP',
 'ZNF207;ZFP207',
 'ADGRL2;LPHN2',
 'NPCD;NPTXR',
 'KIAA1468;2310035C23RIK',
 'ADGRL1;LPHN1',
 'GM9242;HNRNPA3;GM8991;GM6793',
 'COI;MTCO1',
 'MTCO2;MT-CO2',
 'MT-ATP6;MTATP6',
 'MT-ND2;MTND2',
 'MT-ND4;MTND4',
 'MTND5;MT-ND5',
 'MTATP8;MT-ATP8',
 'EGR1;EGR2;EGR3',
 'STAT5B;STAT5A',
 'RAB11B;RAB11A',
 'UBE2E3;UBE2E2;UBE2E1',
 'ARF3;ARF1',
 'SUMO2;SUMO3',
 'CYCS;GM10108',
 'DYNLL1;BC048507',
 'AARSD1;GM27029',
 'BAI1;ADGRB1',
 'NXPH1;NXPH2',
 'NPM1;GM5611',
 'DDX3X;D1PAS1',
 'CA4;CAR4',
 'LPHN3;ADGRL3',


In [20]:
df_all.loc[(df_all['gene_names']=='UCHL3;UCHL4') & (df_all['Study']=='Kjell 2020')].head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units
2138682,UCHL3;UCHL4,,Q9JKB1;P58321,,Kjell 2020,mouse,cortex,,84,,cortex1,,3758969000.0,LFQ
2138683,UCHL3;UCHL4,,Q9JKB1;P58321,,Kjell 2020,mouse,cortex,,84,,cortex2,,373703800.0,LFQ
2138684,UCHL3;UCHL4,,Q9JKB1;P58321,,Kjell 2020,mouse,cortex,,84,,cortex3,,14526970.0,LFQ
2138685,UCHL3;UCHL4,,Q9JKB1;P58321,,Kjell 2020,mouse,cortex,,84,,cortex4,,61994200.0,LFQ
2138686,UCHL3;UCHL4,,Q9JKB1;P58321,,Kjell 2020,mouse,olfactory bulb,,84,,olfbulb1,,4237247000.0,LFQ


In [21]:
df_all.loc[(df_all['gene_names']=='UCHL3') & (df_all['Study']=='Kjell 2020')].head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units
2054133,UCHL3,,,,Kjell 2020,mouse,cortex,,84,,cortexLMSS,,3520870000.0,LFQ
2060640,UCHL3,,,,Kjell 2020,mouse,olfactory bulb,,84,,olfbulbLMSS,,2940145000.0,LFQ
2067147,UCHL3,,,,Kjell 2020,mouse,subependymal zone,,84,,sezLMSS,,2768241000.0,LFQ
2073654,UCHL3,,,,Kjell 2020,mouse,medial subependymal zone,,84,,mezLMSS,,3224097000.0,LFQ


In [22]:
df_all.loc[(df_all['gene_names']=='UCHL4') & (df_all['Study']=='Kjell 2020')].head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units


In [23]:
df_all.loc[(df_all['gene_names']=='NFIA;NFIX') & (df_all['Study']=='Guergues 2019')].head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units
1575680,NFIA;NFIX,,B1AUB6;Q3TYK3;E9PUH7;B1AUB8;B1AUB9;D3YZ00;B1AU...,,Guergues 2019,mouse,microglia,adult,77,,STrap300K1,25.912,6641600.0,LFQintensity
1575681,NFIA;NFIX,,B1AUB6;Q3TYK3;E9PUH7;B1AUB8;B1AUB9;D3YZ00;B1AU...,,Guergues 2019,mouse,microglia,adult,77,,STrap300K2,25.912,17392000.0,LFQintensity
1575682,NFIA;NFIX,,B1AUB6;Q3TYK3;E9PUH7;B1AUB8;B1AUB9;D3YZ00;B1AU...,,Guergues 2019,mouse,microglia,adult,77,,STrap300K3,25.912,13059000.0,LFQintensity


In [24]:
df_all.loc[(df_all['gene_names']=='NFIA') & (df_all['Study']=='Guergues 2019')].head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units


In [25]:
df_all.loc[(df_all['gene_names']=='NFIX') & (df_all['Study']=='Guergues 2019')].head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units


In [26]:
# remove partial duplicates of type "GN1;GN2" .. "GN1" within studies, because we need unique, unambiguous ids
# All rows whose gene_names entry is listed in `multiids` for their own study are collected
# in one boolean mask and removed in a single step (no repeated in-place drops).
# A study that has no entry in `multiids` simply contributes no rows instead of raising KeyError.

df_all = df_all.reset_index(drop=True)

print(len(df_all))

is_flagged = pd.Series(False, index=df_all.index)
for study, flagged_ids in multiids.items():
    is_flagged |= (df_all['Study'] == study) & df_all['gene_names'].isin(flagged_ids)

df_all = df_all.loc[~is_flagged].reset_index(drop=True)

print(len(df_all))

2142284
2140373


In [27]:
# check for GN consistency within studies (re-run after the flagged entries were dropped)

multiids2 = check_GN_consistency_within_studies(df_all)

# one line per study: study name and number of entries that are still flagged
for study_name, flagged_ids in multiids2.items():
    print(study_name, len(flagged_ids), sep=' \t ')

Hamezah 2019 	 0
Hamezah 2018 	 0
Chuang 2018 	 0
Duda 2018 	 0
Krogager 2018 	 0
Hosp 2017, soluble 	 0
Hosp 2017, CSF 	 0
Hosp 2017, insoluble 	 0
Itzhak 2017 	 0
Beltran 2016 	 0
Sharma 2015, cultured 	 0
Sharma 2015, isolated 	 0
Wisniewski 2015 	 0
Han 2014 	 0
Geiger 2013 	 0
Bai 2020 	 0
Carlyle 2017 	 0
Davis 2019 	 0
Fecher 2019 	 0
Fornasiero 2018 	 0
Guergues 2019 	 0
McKetney 2019 	 0
Hasan 2020 	 0
Zhu 2018 	 0
Kjell 2020 	 0


In [28]:
#for check
df_all.loc[(~df_all['gene_names'].isna()) & (df_all['gene_names'].str.contains(','))].head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units


In [29]:
#for check
df_all.loc[(~df_all['gene_names'].isna()) & (df_all['gene_names'].str.contains('; '))].head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units


In [30]:
#for check
df_all.loc[(~df_all['Uniprot'].isna()) & (df_all['Uniprot'].str.contains(','))].head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units


In [31]:
#for check
df_all.loc[(~df_all['Uniprot'].isna()) & (df_all['Uniprot'].str.contains(' '))].head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units


In [32]:
# Distinct (gene_names, Uniprot, Study) combinations; only these are needed to align ids.
id_columns = ['gene_names','Uniprot','Study']
df_all_4ids = df_all[id_columns].copy().drop_duplicates(keep='first')
print(len(df_all_4ids))

# preview: the first three non-missing Uniprot entries, split into lists of single ids
has_uniprot = ~df_all_4ids['Uniprot'].isna()
df_all_4ids[has_uniprot]['Uniprot'].str.split(";")[0:3]

138583


0                         [A0A068BEQ2, P50171, G3UX44]
2    [Q78ZJ8, A0A068BFR3, P46638, G3UY29, E9Q3P9, F...
4                 [Q3V3A4, A0A068BGT0, Q8C754, G3UY33]
Name: Uniprot, dtype: object

In [35]:
a_uniprot = df_all_4ids[~df_all_4ids['Uniprot'].isna()]['Uniprot'].str.split(";") 

print(len(a_uniprot))

120827


In [41]:
# Normalise every Uniprot id list of a_uniprot in place:
#   * strip the "CON__" marker from ids that carry it
#   * drop the isoform index, i.e. transform PPPPPP-1 into PPPPPP
# NOTE(review): ids shorter than 6 characters are only printed for inspection; they are
# still kept in the list (this would include empty strings, if any entry contains them).

for id_list in a_uniprot:
    if not isinstance(id_list, list):
        continue
    for pos, raw_id in enumerate(id_list):
        cleaned_id = raw_id.replace("CON__", "")
        if len(cleaned_id) < 6:
            print(cleaned_id)
        # item assignment mutates the list objects stored in a_uniprot
        id_list[pos] = cleaned_id.split('-')[0]
            

















In [42]:
# "1536,1550d1535" shows up among the Uniprot ids but is not a Uniprot id.
# Remove its first occurrence from every id list that contains it (lists are edited in place).

for id_list in a_uniprot:
    if "1536,1550d1535" in id_list:
        id_list.remove("1536,1550d1535")

In [43]:
a_gn = df_all_4ids[~df_all_4ids['gene_names'].isna()]['gene_names'].str.split(";") 

In [44]:
print(len(a_uniprot))
print(len(a_gn))

print(len(df_all_4ids[~df_all_4ids['Uniprot'].isna()]['Uniprot'].str.split(";") ))

120827
126620
120827


In [45]:
# to check

for idx,elem in enumerate(a_uniprot):
    if isinstance(elem,list):        
        for i, s in enumerate(elem):
            if "-" in s:
                print(s)

In [46]:
print(len(df_all[~df_all['gene_names'].isna()]['gene_names'].unique()))

print(len(df_all[~df_all['gene_names'].isna()]['gene_names'].str.replace(",", ";").unique()))

16914
16914


In [47]:
# Build a graph of gene names: the names listed together in one "GN1;GN2;..." entry are
# chained by edges, so a connected component groups names that were reported together.
# `cc` counts in how many entries each gene name is mentioned.

cG = nx.Graph()
cc = Counter()

for pp in a_gn:
    # update in place: `cc = cc + Counter(pp)` rebuilt the whole counter for every entry,
    # which made this loop quadratic in the number of distinct names
    cc.update(pp)
    cG.add_nodes_from(pp)
    # link neighbouring names of the entry: (pp[0], pp[1]), (pp[1], pp[2]), ...
    cG.add_edges_from(zip(pp, pp[1:]))


In [48]:
prot_align_id_gn= {}

In [49]:
# Map every gene name to the most frequently mentioned name of its connected component.
# Names are removed from `cc` once mapped, so the loop stops when every name has a representative.
while cc:
    mc, mc_count = cc.most_common(1)[0]
    component = nx.node_connected_component(cG, mc)
    for n in component:
        prot_align_id_gn[n] = mc
        del cc[n]

In [50]:
df_g_s = df_all.loc[~df_all['gene_names'].isna(),['gene_names','Study']].drop_duplicates(keep='first')
print(len(df_g_s))
df_g_s = df_g_s.reset_index(drop=True)
df_g_s.head()

117353


Unnamed: 0,gene_names,Study
0,H2-KE6;HSD17B8,Hamezah 2019
1,RAB11B;RAB11A,Hamezah 2019
2,VPS52,Hamezah 2019
3,MT-CO2;COX2;MTCO2,Hamezah 2019
4,MT-CO3;COX3;COXIII,Hamezah 2019


In [51]:
# Count, for every single gene name, in how many rows of df_g_s
# (distinct gene_names / Study pairs) it is listed.
# Spaces are removed before the ";"-separated entry is split into single names.
# Unlike the Uniprot ids, gene names are not split on "-" here.

gn_study_dict = dict(Counter(
    gene_name
    for entry in df_g_s["gene_names"]
    for gene_name in entry.replace(" ","").split(";")
))

In [52]:
print(len(gn_study_dict)) # 16743

16743


In [53]:
df_all['gene_name_unified'].unique()

array([nan], dtype=object)

In [54]:
df_all = df_all.drop(columns='gene_name_unified')

In [55]:
# to get gene_names to gene_name_unified mapping using gn_study_dict (occurences dict)

df_all_gn = pd.DataFrame(df_all.loc[~(df_all['gene_names'].isna()),'gene_names'].copy())
df_all_gn = df_all_gn.drop_duplicates(keep='first')
df_all_gn = df_all_gn.reset_index(drop=True)

In [68]:
# make sure the index is a clean 0..n-1 range, because the loop below writes rows back by index label
df_all_gn = df_all_gn.reset_index(drop=True)

# placeholder column; it is filled row by row below
df_all_gn['gene_name_unified'] =  None

# For every distinct gene_names entry, store the value returned by get_gene_unified, which receives
# the row and the per-name occurrence counts (gn_study_dict).
# NOTE(review): presumably the helper picks the most frequent name of the entry - confirm in
# protein_ids_alignment_helpers.
for index,row in df_all_gn.iterrows():
    df_all_gn.loc[index,'gene_name_unified'] = get_gene_unified(index,row,gn_study_dict)

In [69]:
print(len(df_all))

df = pd.merge(df_all,df_all_gn,how='inner',on=['gene_names'])

print(len(df))

2140373
1864148


In [70]:
(len(df_all.loc[df_all['gene_names'].isna()]) + len(df))/len(df_all)

1.0

In [71]:
df_all['gene_name_unified'] = None

In [None]:
# make same columns order

In [72]:
df.columns

Index(['gene_names', 'Uniprot', 'Uniprot_unified', 'Study', 'Organism',
       'location', 'Age_cat', 'Age_days', 'condition', 'sample_id',
       'molecular_weight_kDa', 'raw_data', 'raw_data_units',
       'gene_name_unified'],
      dtype='object')

In [73]:
df_all.columns

Index(['gene_names', 'Uniprot', 'Uniprot_unified', 'Study', 'Organism',
       'location', 'Age_cat', 'Age_days', 'condition', 'sample_id',
       'molecular_weight_kDa', 'raw_data', 'raw_data_units',
       'gene_name_unified'],
      dtype='object')

In [74]:
# make same columns order
df = df[df_all.columns]

df.columns

Index(['gene_names', 'Uniprot', 'Uniprot_unified', 'Study', 'Organism',
       'location', 'Age_cat', 'Age_days', 'condition', 'sample_id',
       'molecular_weight_kDa', 'raw_data', 'raw_data_units',
       'gene_name_unified'],
      dtype='object')

In [75]:
df2 = pd.concat([df, df_all.loc[df_all['gene_names'].isna()]],sort=False )

df2 = df2.reset_index(drop=True)

In [76]:
#df_all['gene_name_unified'] = df_all['gene_names'].str.split(";", 1).str[0].replace(prot_align_id_gn)
# replace(" ","")

In [77]:
df_all = df2.copy()

In [78]:
df_all.columns

Index(['gene_names', 'Uniprot', 'Uniprot_unified', 'Study', 'Organism',
       'location', 'Age_cat', 'Age_days', 'condition', 'sample_id',
       'molecular_weight_kDa', 'raw_data', 'raw_data_units',
       'gene_name_unified'],
      dtype='object')

In [79]:
len(df_all['gene_name_unified'].unique()) #14537

14537

In [80]:
print(len(df_all.loc[df_all['gene_names'].isna(),'Study'].unique()))
print(len(df_all.loc[~df_all['gene_names'].isna(),'Study'].unique()))

21
24


In [81]:
len(df_all['Study'].unique())

25

In [82]:
# Build a graph of Uniprot ids: the ids listed together in one entry are chained by edges,
# so a connected component groups ids that were reported together.
# `ccuniprot` counts in how many entries each id is mentioned.
cGuniprot = nx.Graph()

ccuniprot = Counter()

for ppu in a_uniprot:
    # update in place: `ccuniprot = ccuniprot + Counter(ppu)` rebuilt the whole counter
    # for every entry, which made this loop quadratic in the number of distinct ids
    ccuniprot.update(ppu)
    cGuniprot.add_nodes_from(ppu)
    # link neighbouring ids of the entry: (ppu[0], ppu[1]), (ppu[1], ppu[2]), ...
    cGuniprot.add_edges_from(zip(ppu, ppu[1:]))


# number of entries, number of graph nodes, number of counted ids
print(len(a_uniprot))

print(len(cGuniprot.nodes()))

print(len(ccuniprot))

120827
61791
61791


In [83]:
prot_align_id_uniprot= {}

In [84]:
# Map every Uniprot id to the most frequently mentioned id of its connected component.
# Ids are removed from `ccuniprot` once mapped, so the loop stops when every id has a representative.
while ccuniprot:
    mcuniprot, mcuniprot_count = ccuniprot.most_common(1)[0]
    component_uniprot = nx.node_connected_component(cGuniprot, mcuniprot)
    for nuniprot in component_uniprot:
        prot_align_id_uniprot[nuniprot] = mcuniprot
        del ccuniprot[nuniprot]

# number of ids that received a representative
print(len(prot_align_id_uniprot))

61791


In [85]:
df_u_s = df_all.loc[~df_all['Uniprot'].isna(),['Uniprot','Study']].drop_duplicates(keep='first')
print(len(df_u_s)) #120899
df_u_s.head()

120819


Unnamed: 0,Uniprot,Study
0,A0A068BEQ2;P50171;G3UX44,Hamezah 2019
38,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,Hamezah 2019
40,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,Hamezah 2019
44,P46638,Krogager 2018
46,Q78ZJ8;P46638;G3UY29;E9Q3P9;F8WGS1;Q0PD45;P624...,"Sharma 2015, cultured"


In [86]:
print(len(df_u_s.loc[df_u_s['Uniprot'].str.contains("CON_")]))

# CON_ is contaminant protein according to MaxQuant annotation


0


In [87]:
# 
df_u_s.loc[df_u_s['Uniprot'].str.contains("1536,1550d1535")].head()

Unnamed: 0,Uniprot,Study


In [88]:
# Count, for every single Uniprot id, in how many rows of df_u_s
# (distinct Uniprot / Study pairs) it is listed.
# Spaces are removed, the entry is split on ";", and the isoform suffix ("-N") is dropped
# so that all isoforms are counted under their base accession.

uniprot_study_dict = dict(Counter(
    uniprot_id.split("-")[0]
    for entry in df_u_s["Uniprot"]
    for uniprot_id in entry.replace(" ","").split(";")
))

In [89]:
#df_all['Uniprot_unified'] = df_all['Uniprot'].str.split(";", 1).str[0].replace(prot_align_id_uniprot)

#df_all['Uniprot_unified'] =  #df_all['Uniprot'].str.split(";", 1).str[0]  # get most common


In [90]:
# to get Uniprot to Uniprot_unified mapping using uniprot_study_dict (occurrences dict)

# distinct, non-missing Uniprot entries with a clean 0..n-1 index (the loop below writes rows back by index label)
df_all_uni = pd.DataFrame(df_all.loc[~(df_all['Uniprot'].isna()),'Uniprot'].copy())
df_all_uni = df_all_uni.drop_duplicates(keep='first')
df_all_uni = df_all_uni.reset_index(drop=True)

# placeholder column; it is filled row by row below
df_all_uni['Uniprot_unified'] =  None

# NOTE(review): presumably get_uniprot_unified picks the most frequent id of the entry - confirm in
# protein_ids_alignment_helpers.
for index,row in df_all_uni.iterrows():
    df_all_uni.loc[index,'Uniprot_unified'] = get_uniprot_unified(index,row,uniprot_study_dict)

In [91]:
df_all_uni.columns

Index(['Uniprot', 'Uniprot_unified'], dtype='object')

In [92]:
df_all.columns

Index(['gene_names', 'Uniprot', 'Uniprot_unified', 'Study', 'Organism',
       'location', 'Age_cat', 'Age_days', 'condition', 'sample_id',
       'molecular_weight_kDa', 'raw_data', 'raw_data_units',
       'gene_name_unified'],
      dtype='object')

In [93]:
df_all['Uniprot_unified'].unique()

array([nan], dtype=object)

In [94]:
df_all = df_all.drop(columns = 'Uniprot_unified')

In [95]:
print(len(df_all))

df = pd.merge(df_all,df_all_uni,how='inner',on=['Uniprot'])

print(len(df))



2140373
1620258


In [96]:
df.columns

Index(['gene_names', 'Uniprot', 'Study', 'Organism', 'location', 'Age_cat',
       'Age_days', 'condition', 'sample_id', 'molecular_weight_kDa',
       'raw_data', 'raw_data_units', 'gene_name_unified', 'Uniprot_unified'],
      dtype='object')

In [97]:
df_all['Uniprot_unified'] = None

# make same columns order
df = df[df_all.columns]

df.columns

Index(['gene_names', 'Uniprot', 'Study', 'Organism', 'location', 'Age_cat',
       'Age_days', 'condition', 'sample_id', 'molecular_weight_kDa',
       'raw_data', 'raw_data_units', 'gene_name_unified', 'Uniprot_unified'],
      dtype='object')

In [98]:
df2 = pd.concat([df, df_all.loc[df_all['Uniprot'].isna()]],sort=False )

df2 = df2.reset_index(drop=True)

df_all = df2.copy()

In [99]:
#df_all['Uniprot_unified'] = df_all['Uniprot_unified'].str.split("-").str[0]


In [100]:
# Replace every unified Uniprot id by the representative id of its connected component.
# A per-value dictionary lookup is used instead of Series.replace(dict): replace() scans the
# whole column once per dictionary key, which is very slow for a mapping of this size.
# Values without an entry in the mapping (including missing values) are left unchanged,
# exactly as replace() would leave them.
df_all['Uniprot_unified'] = df_all['Uniprot_unified'].map(lambda uniprot_id: prot_align_id_uniprot.get(uniprot_id, uniprot_id))

In [101]:
len(df_all.loc[df_all['Uniprot_unified'].isna()])

520115

In [102]:
df_all.loc[(~df_all['Uniprot'].isna())&(df_all['Uniprot'].str.contains("-"))&(df_all['Study']!='Hosp 2017, soluble')].head()

Unnamed: 0,gene_names,Uniprot,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units,gene_name_unified,Uniprot_unified
44,RAB11B;RAB11A,Q15907;Q15907-2;H3BMH2;H3BSC1;P62491-2;P62491,Davis 2019,human,neurons,post-mortem,post-mortem,,BetzCapSP31,24.488,5906800.0,iBAQ,RAB11B,P46638
45,RAB11B;RAB11A,Q15907;Q15907-2;H3BMH2;H3BSC1;P62491-2;P62491,Davis 2019,human,neurons,post-mortem,post-mortem,,BetzCapSP32,24.488,12107000.0,iBAQ,RAB11B,P46638
46,RAB11B;RAB11A,Q15907;Q15907-2;H3BMH2;H3BSC1;P62491-2;P62491,Davis 2019,human,neurons,post-mortem,post-mortem,,BetzCapSP33,24.488,7468100.0,iBAQ,RAB11B,P46638
47,RAB11B;RAB11A,Q15907;Q15907-2;H3BMH2;H3BSC1;P62491-2;P62491,Davis 2019,human,neurons,post-mortem,post-mortem,,PurkinjeCapSP31,24.488,1095500.0,iBAQ,RAB11B,P46638
48,RAB11B;RAB11A,Q15907;Q15907-2;H3BMH2;H3BSC1;P62491-2;P62491,Davis 2019,human,neurons,post-mortem,post-mortem,,PurkinjeCapSP32,24.488,987940.0,iBAQ,RAB11B,P46638


In [103]:
print(len(df_all['Uniprot_unified'].unique())) # 18718

print(len(df_all.loc[df_all['Uniprot'].isna(),'Study'].unique()))
print(len(df_all.loc[~df_all['Uniprot'].isna(),'Study'].unique()))

print(len(df_all.loc[df_all['Uniprot_unified'].isna(),'Study'].unique()))
print(len(df_all.loc[~df_all['Uniprot_unified'].isna(),'Study'].unique()))

print(len(df_all.loc[(df_all['Uniprot_unified'].isna()) & (df_all['Uniprot'].isna()),'Study'].unique()))
print(len(df_all.loc[(~df_all['Uniprot_unified'].isna()) & (df_all['Uniprot'].isna()),'Study'].unique()))

print(len(df_all.loc[df_all['Uniprot'].isna() & df_all['gene_names'].isna(),'Study'].unique()))

print(len(df_all.loc[df_all['Uniprot_unified'].isna() & df_all['gene_name_unified'].isna(),'Study'].unique()))

18718
5
22
5
22
5
0
0
0


### Get Uniprot ids by gene_name_unified for missing Uniprot ids; get Gene Names by Uniprot_unified for missing Gene Names; use Uniprot-GN mapping to check GN-unified

In [104]:
#Data downloaded on 05june2020 is from 
#ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/
# is in /Users/polina/git/bbpMolAtlas/2020/data/ids_mapping_05june2020

# The three id-mapping tables are read the same way: no header row, tab separated,
# columns named (Uniprot, ID_type, ID), plus a column recording the organism.
idmapping_files = {
    'mouse': '../data/MOUSE_10090_idmapping.dat.gz',
    'rat': '../data/RAT_10116_idmapping.dat.gz',
    'human': '../data/HUMAN_9606_idmapping.dat.gz',
}

idmapping_tables = {}
for organism, idmapping_path in idmapping_files.items():
    idmapping_table = pd.read_csv(idmapping_path,header=None,sep='\t')
    idmapping_table.columns = ['Uniprot','ID_type','ID']
    idmapping_table['id_of_organism'] = organism
    idmapping_tables[organism] = idmapping_table

mouse_uniprot_ids = idmapping_tables['mouse']
rat_uniprot_ids = idmapping_tables['rat']
human_uniprot_ids = idmapping_tables['human']

#combine data for multiple organisms
uniprot_ids_mrh = pd.concat([mouse_uniprot_ids,rat_uniprot_ids,human_uniprot_ids],ignore_index=True,sort=True)

# ratio of the summed per-organism unique Uniprot ids to the unique ids of the combined table,
# followed by the number of rows of the combined table
n_unique_per_organism = sum(len(table['Uniprot'].unique()) for table in (mouse_uniprot_ids,rat_uniprot_ids,human_uniprot_ids))
print(n_unique_per_organism/len(uniprot_ids_mrh['Uniprot'].unique()),len(uniprot_ids_mrh))

#keep only needed id-types
needed_id_types = ['UniProtKB-ID', 'Gene_Name','GeneID','Gene_Synonym','GeneCards','HGNC']
print(len(uniprot_ids_mrh))
uniprot_ids_mrh = uniprot_ids_mrh.loc[uniprot_ids_mrh['ID_type'].isin(needed_id_types)].copy()
print(len(uniprot_ids_mrh))

uniprot_ids_mrh = uniprot_ids_mrh.reset_index(drop=True)

# upper-case the ID column
uniprot_ids_mrh['ID'] = uniprot_ids_mrh['ID'].str.upper()

# number of distinct Uniprot ids that have a Gene_Name row
len(uniprot_ids_mrh.loc[uniprot_ids_mrh['ID_type']=='Gene_Name','Uniprot'].unique())

1.0 9349861
9349861
828947


268298

In [105]:
uniprot_ids_mrh.head()

Unnamed: 0,ID,ID_type,Uniprot,id_of_organism
0,1433B_MOUSE,UniProtKB-ID,Q9CQV8,mouse
1,YWHAB,Gene_Name,Q9CQV8,mouse
2,54401,GeneID,Q9CQV8,mouse
3,1433E_MOUSE,UniProtKB-ID,P62259,mouse
4,YWHAE,Gene_Name,P62259,mouse


In [106]:
len(df_all.loc[df_all['Uniprot_unified'].isna() & (~df_all['gene_name_unified'].isna()),'gene_name_unified'].unique())

8083

In [107]:
df_all.loc[df_all['gene_name_unified']=='KRT8','Uniprot_unified'].unique()

array(['P11679', 'P05787', None], dtype=object)

In [108]:
len(df_all.loc[(~df_all['Uniprot_unified'].isna()) & (df_all['Uniprot_unified'].str.contains('CON_'))])

0

In [109]:
len(df_all.loc[(df_all['Study']=='Davis 2019') & (df_all['gene_name_unified'].isna()) ])

214

In [110]:
len(df_all.loc[(df_all['Study']=='Davis 2019') & (df_all['gene_name_unified'].isna()) ])/len(df_all.loc[(df_all['Study']=='Davis 2019')])

0.006543542074363992

In [111]:
df_all = df_all.reset_index(drop=True)

In [112]:
len(df_all.loc[df_all['gene_name_unified']!=df_all['gene_names']]) 

303482

In [113]:
df_all_withGN = df_all.loc[~df_all['gene_names'].isna()].copy()

In [114]:
len(df_all_withGN.loc[~df_all_withGN['gene_names'].str.contains(';'),'gene_names'].unique())

14449

In [115]:
len(df_all_withGN.loc[~df_all_withGN['gene_names'].str.contains(';'),'gene_name_unified'].unique())

14449

In [116]:
len(df_all_withGN.loc[df_all_withGN['gene_names'].str.contains(';'),'gene_names'].unique())

2465

In [117]:
len(df_all_withGN.loc[df_all_withGN['gene_names'].str.contains(';'),'gene_name_unified'].unique())

1902

In [118]:
len(df_all_withGN.loc[(df_all_withGN['gene_name_unified']!=df_all_withGN['gene_names']) & (~df_all_withGN['gene_names'].str.contains(';'))])

112

In [119]:
len(df_all_withGN.loc[(df_all_withGN['gene_name_unified']==df_all_withGN['gene_names']) & (~df_all_withGN['gene_names'].str.contains(';'))])

1836891

In [120]:
df_all_withGN.loc[df_all_withGN['gene_names'].str.contains('SRSF5;')].head(100)

Unnamed: 0,gene_names,Uniprot,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units,gene_name_unified,Uniprot_unified
1485240,SRSF5;HRS;SFRS5,O35326,Fornasiero 2018,mouse,cortex,adult,168,,,,555044.262627,iBAQ,SRSF5,O35326


In [121]:
df_all_withGN.loc[df_all_withGN['gene_names'].str.contains(';HRS')].head(100)

Unnamed: 0,gene_names,Uniprot,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units,gene_name_unified,Uniprot_unified
38154,HGS;HRS,Q99LI8,Fornasiero 2018,mouse,cortex,adult,168,,,,3818.936451,iBAQ,HGS,Q99LI8
1485240,SRSF5;HRS;SFRS5,O35326,Fornasiero 2018,mouse,cortex,adult,168,,,,555044.262627,iBAQ,SRSF5,O35326


In [122]:
len(uniprot_ids_mrh)

828947

In [123]:
uniprot_ids_mrh['ID_type'].unique()

array(['UniProtKB-ID', 'Gene_Name', 'GeneID', 'Gene_Synonym', 'GeneCards',
       'HGNC'], dtype=object)

In [124]:
len(uniprot_ids_mrh[uniprot_ids_mrh['ID_type']=='Gene_Name'])

268476

In [125]:
len(uniprot_ids_mrh.loc[uniprot_ids_mrh['ID_type']=='Gene_Name',['Uniprot','ID']].drop_duplicates(keep='first'))

268476

In [126]:
uniprot_ids_mrh_gnDupl = uniprot_ids_mrh.loc[uniprot_ids_mrh['ID_type']=='Gene_Name',['Uniprot','ID']]

print(len(uniprot_ids_mrh_gnDupl))
uniprot_ids_mrh_gnDupl = uniprot_ids_mrh_gnDupl.drop_duplicates(keep=False)

print(len(uniprot_ids_mrh_gnDupl))
uniprot_ids_mrh_gnDupl.head(10)

268476
268476


Unnamed: 0,Uniprot,ID
1,Q9CQV8,YWHAB
4,P62259,YWHAE
7,P68510,YWHAH
10,P61982,YWHAG
13,O70456,SFN
17,P68254,YWHAQ
20,P63101,YWHAZ
23,A2AIG8,ACCS
26,Q3UX83,ACCSL
30,Q6PD03,PPP2R5A


In [127]:
uniprot_ids_mrh_gnDupl_d = uniprot_ids_mrh_gnDupl.groupby('Uniprot').count()
print(len(uniprot_ids_mrh_gnDupl_d[uniprot_ids_mrh_gnDupl_d['ID']>1]))


90


In [128]:
uniprot_ids_mrh[(uniprot_ids_mrh['Uniprot'].isin(uniprot_ids_mrh_gnDupl_d[uniprot_ids_mrh_gnDupl_d['ID']>1].index.to_list())) & (uniprot_ids_mrh['ID_type']=='Gene_Name')].head(10)

Unnamed: 0,ID,ID_type,Uniprot,id_of_organism
11647,CYP3A41A,Gene_Name,Q9JMA7,mouse
11649,CYP3A41B,Gene_Name,Q9JMA7,mouse
13852,DEFA6,Gene_Name,P50704,mouse
13854,DEFA12,Gene_Name,P50704,mouse
15142,DPPA5B,Gene_Name,P85965,mouse
15143,DPPA5C,Gene_Name,P85965,mouse
23103,HIST2H2AA1,Gene_Name,Q6GSS7,mouse
23104,HIST2H2AA2,Gene_Name,Q6GSS7,mouse
23129,H2AL1A,Gene_Name,Q5M8Q2,mouse
23130,H2AL1C,Gene_Name,Q5M8Q2,mouse


In [129]:
# One row per Uniprot accession; the 'ID' cell holds the list of its distinct gene names.
uniprot_gn = (
    uniprot_ids_mrh
    .loc[uniprot_ids_mrh['ID_type'] == 'Gene_Name', ['Uniprot', 'ID']]
    .groupby('Uniprot')
    .agg(lambda names: names.unique().tolist())
)
uniprot_gn.head()

Unnamed: 0_level_0,ID
Uniprot,Unnamed: 1_level_1
A0A023GRW5,[RT1-N3]
A0A023HJ61,[RAB4A]
A0A023I7F4,[CYTB]
A0A023I7H2,[ND5]
A0A023I7H5,[ATP6]


In [130]:
# Uniprot accession -> list of gene names (the index of uniprot_gn supplies the keys).
uniprot_ids_mrh_dict = uniprot_gn['ID'].to_dict()

In [131]:
# Reduce the main table to its distinct combinations of the four ID columns.
id_columns = ['gene_names', 'gene_name_unified', 'Uniprot', 'Uniprot_unified']
df_fgn = df_all.loc[:, id_columns].drop_duplicates()

print(len(df_all))
print(len(df_fgn))

2140373
87028


In [132]:
# Gene names that the id-mapping dictionary lists for each unified accession;
# accessions absent from the dictionary get the 'NoMapping' placeholder.
df_fgn['gn_from_uniprot'] = (
    df_fgn['Uniprot_unified']
    .map(uniprot_ids_mrh_dict)
    .fillna('NoMapping')
)

df_fgn.head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,gn_from_uniprot
0,H2-KE6;HSD17B8,HSD17B8,A0A068BEQ2;P50171;G3UX44,P50171,[HSD17B8]
4,RAB11B;RAB11A,RAB11B,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,P46638,[RAB11B]
6,RAB11B;RAB11A,RAB11B,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,P46638,[RAB11B]
10,RAB11B;RAB11A,RAB11B,P46638,P46638,[RAB11B]
12,RAB11B;RAB11A,RAB11B,Q78ZJ8;P46638;G3UY29;E9Q3P9;F8WGS1;Q0PD45;P624...,P46638,[RAB11B]


In [133]:
# One row per gene name; the 'Uniprot' cell holds the list of its distinct accessions.
gn_uniprot = (
    uniprot_ids_mrh
    .loc[uniprot_ids_mrh['ID_type'] == 'Gene_Name', ['Uniprot', 'ID']]
    .groupby('ID')
    .agg(lambda accessions: accessions.unique().tolist())
)
gn_uniprot.head()

Unnamed: 0_level_0,Uniprot
ID,Unnamed: 1_level_1
0 BETA-2 GLOBIN,[Q62670]
0610009B22RIK,[Q8R3W2]
0610009L18RIK,"[Q9CVY3, Q9CWA9]"
0610010F05RIK,"[Z4YM60, Z4YN22, Z4YN77, M0QWF3]"
0610010K14RIK,"[A2CF83, F6XN97, A2CF80, F8WH46, H3BJI0, D3Z68..."


In [134]:
# Gene name -> list of Uniprot accessions (the index of gn_uniprot supplies the keys).
uniprot_gn_mrh_dict = gn_uniprot['Uniprot'].to_dict()

In [135]:
# Uniprot accessions that the id-mapping dictionary lists for each unified gene name;
# gene names absent from the dictionary get the 'NoMapping' placeholder.
df_fgn['uniprot_from_gn'] = (
    df_fgn['gene_name_unified']
    .map(uniprot_gn_mrh_dict)
    .fillna('NoMapping')
)

df_fgn.head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,gn_from_uniprot,uniprot_from_gn
0,H2-KE6;HSD17B8,HSD17B8,A0A068BEQ2;P50171;G3UX44,P50171,[HSD17B8],"[P50171, Q6MGB5, Q5BJM1, A0A0G2JV82, A0A0G2K6P..."
4,RAB11B;RAB11A,RAB11B,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,P46638,[RAB11B],"[P46638, Q78ZJ8, G3UY29, G3UZL4, G3UZD3, A0A06..."
6,RAB11B;RAB11A,RAB11B,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,P46638,[RAB11B],"[P46638, Q78ZJ8, G3UY29, G3UZL4, G3UZD3, A0A06..."
10,RAB11B;RAB11A,RAB11B,P46638,P46638,[RAB11B],"[P46638, Q78ZJ8, G3UY29, G3UZL4, G3UZD3, A0A06..."
12,RAB11B;RAB11A,RAB11B,Q78ZJ8;P46638;G3UY29;E9Q3P9;F8WGS1;Q0PD45;P624...,P46638,[RAB11B],"[P46638, Q78ZJ8, G3UY29, G3UZL4, G3UZD3, A0A06..."


In [136]:
# For every unified gene name: the number of distinct unified Uniprot accessions
# paired with it. Rows lacking either identifier are ignored.
gnu_count = (
    df_fgn[['gene_name_unified', 'Uniprot_unified']]
    .dropna()
    .drop_duplicates()
    .groupby('gene_name_unified')
    .count()
)


In [137]:
# Unified gene name -> number of distinct unified accessions.
gnu_count_dict = gnu_count['Uniprot_unified'].to_dict()

In [138]:
gnu_count.sort_values('Uniprot_unified',ascending=False).head(10)

Unnamed: 0_level_0,Uniprot_unified
gene_name_unified,Unnamed: 1_level_1
ANK3,8
MAP2,7
NCAM1,7
MACF1,7
ANK2,7
HLA-C,6
TUBB4A,6
GLS,6
GFAP,6
KCNMA1,5


In [139]:
# For every unified Uniprot accession: the number of distinct unified gene names
# paired with it. Rows lacking either identifier are ignored.
uniprot_u_count = (
    df_fgn[['gene_name_unified', 'Uniprot_unified']]
    .dropna()
    .drop_duplicates()
    .groupby('Uniprot_unified')
    .count()
)
uniprot_u_count.sort_values('gene_name_unified', ascending=False).head(10)


Unnamed: 0_level_0,gene_name_unified
Uniprot_unified,Unnamed: 1_level_1
P35922,38
Q91XY4,12
Q6PB90,11
Q9EP75,8
P01897,7
P22599,6
Q66JV4,5
P01942,5
B2RPU8,5
Q9D1F0,5


In [140]:
# Unified accession -> number of distinct unified gene names.
uniprot_u_count_dict = uniprot_u_count['gene_name_unified'].to_dict()

In [141]:
df_fgn.loc[df_fgn['gene_name_unified']=='HIST1H2BC','Uniprot_unified'].unique()

array(['P35922', 'P57053', None], dtype=object)

In [142]:
# Distinct (unified gene name, unified accession) pairs in which both identifiers are present.
gn_uniprot_unif_4dict = (
    df_fgn[['gene_name_unified', 'Uniprot_unified']]
    .dropna()
    .drop_duplicates()
)

In [143]:
len(gn_uniprot_unif_4dict)

19565

In [144]:
# Despite the "_dict" suffix this is a DataFrame: one row per unified accession,
# whose 'gene_name_unified' cell lists the distinct unified gene names paired with it.
gn_uniprot_unif_dict = (
    gn_uniprot_unif_4dict
    .groupby('Uniprot_unified')
    .agg(lambda names: names.unique().tolist())
)

In [145]:
gn_uniprot_unif_dict.head()

Unnamed: 0_level_0,gene_name_unified
Uniprot_unified,Unnamed: 1_level_1
A0A024QZ33,[NSRP1]
A0A024R341,[THOC7]
A0A024R368,[PRICKLE2]
A0A024R3W6,[NRP2]
A0A024R5I4,[SLCO2B1]


In [146]:
len(gn_uniprot_unif_dict)

17960

In [147]:
# Unified accession -> list of unified gene names.
gn_uniprot_unif_dict_l = gn_uniprot_unif_dict['gene_name_unified'].to_dict()

In [148]:
# DataFrame with one row per unified gene name; the 'Uniprot_unified' cell lists
# the distinct unified accessions paired with it.
uniprot_gn_unif_dict = (
    gn_uniprot_unif_4dict
    .groupby('gene_name_unified')
    .agg(lambda accessions: accessions.unique().tolist())
)

# Unified gene name -> list of unified accessions.
uniprot_gn_unif_dict_l = uniprot_gn_unif_dict['Uniprot_unified'].to_dict()


In [149]:
uniprot_gn_unif_dict.head()

Unnamed: 0_level_0,Uniprot_unified
gene_name_unified,Unnamed: 1_level_1
0610007P22RIK,[Q5HZH2]
0610009B22RIK,[Q9CQP2]
0610010K14RIK,[Q9DCT6]
0610011F06RIK,[Q9DCS2]
0610031J06RIK,[Q9JHJ3]


In [150]:
len(df_fgn)

87028

In [151]:
df_fgn = df_fgn.reset_index(drop=True)

In [152]:
df_fgn[(df_fgn['gene_names'].isna()) & (~df_fgn['gene_name_unified'].isna())].head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,gn_from_uniprot,uniprot_from_gn


In [153]:
df_fgn.head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,gn_from_uniprot,uniprot_from_gn
0,H2-KE6;HSD17B8,HSD17B8,A0A068BEQ2;P50171;G3UX44,P50171,[HSD17B8],"[P50171, Q6MGB5, Q5BJM1, A0A0G2JV82, A0A0G2K6P..."
1,RAB11B;RAB11A,RAB11B,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,P46638,[RAB11B],"[P46638, Q78ZJ8, G3UY29, G3UZL4, G3UZD3, A0A06..."
2,RAB11B;RAB11A,RAB11B,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,P46638,[RAB11B],"[P46638, Q78ZJ8, G3UY29, G3UZL4, G3UZD3, A0A06..."
3,RAB11B;RAB11A,RAB11B,P46638,P46638,[RAB11B],"[P46638, Q78ZJ8, G3UY29, G3UZL4, G3UZD3, A0A06..."
4,RAB11B;RAB11A,RAB11B,Q78ZJ8;P46638;G3UY29;E9Q3P9;F8WGS1;Q0PD45;P624...,P46638,[RAB11B],"[P46638, Q78ZJ8, G3UY29, G3UZL4, G3UZD3, A0A06..."


In [154]:
# NOTE(review): the cell value is wrapped in a list literal, so this expression
# evaluates to `list` whatever df_fgn.iloc[0,2] holds; the intent was presumably
# type(df_fgn.iloc[0,2]) — confirm.
type([df_fgn.iloc[0,2]])

list

In [155]:
type(df_fgn.iloc[0,4])

list

# Given

In [156]:
# main data 

print(len(df_all))
df_all.head()

2140373


Unnamed: 0,gene_names,Uniprot,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units,gene_name_unified,Uniprot_unified
0,H2-KE6;HSD17B8,A0A068BEQ2;P50171;G3UX44,Hamezah 2019,mouse,hippocampus,,476,WT,,26.587,3324025.0,LFQintensity,HSD17B8,P50171
1,H2-KE6;HSD17B8,A0A068BEQ2;P50171;G3UX44,Hamezah 2019,mouse,hippocampus,,476,Alzheimer,,26.587,1736439.0,LFQintensity,HSD17B8,P50171
2,H2-KE6;HSD17B8,A0A068BEQ2;P50171;G3UX44,Hamezah 2019,mouse,striatum,,476,WT,,26.587,19806900.0,LFQintensity,HSD17B8,P50171
3,H2-KE6;HSD17B8,A0A068BEQ2;P50171;G3UX44,Hamezah 2019,mouse,striatum,,476,Alzheimer,,26.587,32413690.0,LFQintensity,HSD17B8,P50171
4,RAB11B;RAB11A,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,Hamezah 2019,mouse,hippocampus,,476,WT,,24.489,42737520.0,LFQintensity,RAB11B,P46638


In [157]:
# df_fgn: only the ID-related columns extracted from df_all, with duplicate rows dropped

print(len(df_fgn))

df_fgn.head()

# gene_names  -  original gene names from the raw data
# gene_name_unified - obtained by mapping from graph connected components (the gene name with the highest number of occurrences across all data sets among the genes of the same connected component)

# Uniprot - original Uniprot IDs from the raw data
# Uniprot_unified - obtained by mapping from graph connected components 

# gn_from_uniprot - unreliable, because some studies report unrelated genes as one entry
# uniprot_from_gn - unreliable, because some studies report unrelated genes as one entry

# gene_id_final - still needed; it is derived in the "Needed" section of this notebook

87028


Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,gn_from_uniprot,uniprot_from_gn
0,H2-KE6;HSD17B8,HSD17B8,A0A068BEQ2;P50171;G3UX44,P50171,[HSD17B8],"[P50171, Q6MGB5, Q5BJM1, A0A0G2JV82, A0A0G2K6P..."
1,RAB11B;RAB11A,RAB11B,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,P46638,[RAB11B],"[P46638, Q78ZJ8, G3UY29, G3UZL4, G3UZD3, A0A06..."
2,RAB11B;RAB11A,RAB11B,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,P46638,[RAB11B],"[P46638, Q78ZJ8, G3UY29, G3UZL4, G3UZD3, A0A06..."
3,RAB11B;RAB11A,RAB11B,P46638,P46638,[RAB11B],"[P46638, Q78ZJ8, G3UY29, G3UZL4, G3UZD3, A0A06..."
4,RAB11B;RAB11A,RAB11B,Q78ZJ8;P46638;G3UY29;E9Q3P9;F8WGS1;Q0PD45;P624...,P46638,[RAB11B],"[P46638, Q78ZJ8, G3UY29, G3UZL4, G3UZD3, A0A06..."


In [158]:
# Uniprot-to-other IDs mapping. 
#Data downloaded on 05june2020 is from 
#ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/
# is in /Users/polina/git/bbpMolAtlas/2020/data/ids_mapping_05june2020

uniprot_ids_mrh.head()

Unnamed: 0,ID,ID_type,Uniprot,id_of_organism
0,1433B_MOUSE,UniProtKB-ID,Q9CQV8,mouse
1,YWHAB,Gene_Name,Q9CQV8,mouse
2,54401,GeneID,Q9CQV8,mouse
3,1433E_MOUSE,UniProtKB-ID,P62259,mouse
4,YWHAE,Gene_Name,P62259,mouse


In [159]:
# from uniprot_ids_mrh to make dict of lists:

#uniprot_gn = uniprot_ids_mrh.loc[uniprot_ids_mrh['ID_type']=='Gene_Name',['Uniprot','ID']].groupby('Uniprot').aggregate(lambda tdf: tdf.unique().tolist())
uniprot_gn.head()
# uniprot_ids_mrh_dict = pd.Series(uniprot_gn['ID'].values,index=uniprot_gn.index).to_dict()

Unnamed: 0_level_0,ID
Uniprot,Unnamed: 1_level_1
A0A023GRW5,[RT1-N3]
A0A023HJ61,[RAB4A]
A0A023I7F4,[CYTB]
A0A023I7H2,[ND5]
A0A023I7H5,[ATP6]


In [160]:
# from uniprot_ids_mrh make dict of lists:

#gn_uniprot = uniprot_ids_mrh.loc[uniprot_ids_mrh['ID_type']=='Gene_Name',['Uniprot','ID']].groupby('ID').aggregate(lambda tdf: tdf.unique().tolist())
gn_uniprot.head()
# uniprot_gn_mrh_dict =  pd.Series(gn_uniprot['Uniprot'].values,index=gn_uniprot.index).to_dict() 

Unnamed: 0_level_0,Uniprot
ID,Unnamed: 1_level_1
0 BETA-2 GLOBIN,[Q62670]
0610009B22RIK,[Q8R3W2]
0610009L18RIK,"[Q9CVY3, Q9CWA9]"
0610010F05RIK,"[Z4YM60, Z4YN22, Z4YN77, M0QWF3]"
0610010K14RIK,"[A2CF83, F6XN97, A2CF80, F8WH46, H3BJI0, D3Z68..."


In [161]:
# UniprotMainData
# Uniprot main data (reviewed), 21july2020

#(taxonomy:"Mus musculus (Mouse) [10090]" OR taxonomy:"Rattus norvegicus (Rat) [10116]" OR taxonomy:"Homo sapiens (Human) [9606]") AND reviewed:yes

uniprot_rev.head() # reviewed entries, the preferred source

# Entry - Uniprot accession
# Entry name - "AlmostGeneID_Organism" (almost, because in some cases it's a modified gene name)
# Gene names -  list of TRUE synonymous gene names
# gene_id_entry_name - AlmostGeneID, i.e. the part of Entry name before the "_" (e.g. 3BHS1 from 3BHS1_RAT)
# NOTE(review): this legend previously said split("-"); the displayed table shows "_" as the separator — confirm against the code that builds the column

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,gene_id_entry_name
0,P22071,3BHS1_RAT,reviewed,3 beta-hydroxysteroid dehydrogenase/Delta 5-->...,[HSD3B1],Rattus norvegicus (Rat),373,3BHS1
1,P15650,ACADL_RAT,reviewed,"Long-chain specific acyl-CoA dehydrogenase, mi...",[ACADL],Rattus norvegicus (Rat),430,ACADL
2,P07872,ACOX1_RAT,reviewed,Peroxisomal acyl-coenzyme A oxidase 1 (AOX) (E...,"[ACOX1, ACOX]",Rattus norvegicus (Rat),661,ACOX1
3,Q09137,AAPK2_RAT,reviewed,5'-AMP-activated protein kinase catalytic subu...,"[PRKAA2, AMPK, AMPK2]",Rattus norvegicus (Rat),552,AAPK2
4,Q64602,AADAT_RAT,reviewed,Kynurenine/alpha-aminoadipate aminotransferase...,"[AADAT, KAT2]",Rattus norvegicus (Rat),425,AADAT


In [164]:
# UniprotMainData_2
# Uniprot main data 2 (unreviewed), 21july2020

#(taxonomy:"Mus musculus (Mouse) [10090]" OR taxonomy:"Rattus norvegicus (Rat) [10116]" OR taxonomy:"Homo sapiens (Human) [9606]") AND reviewed:no


uniprot_unrev.head() # unreviewed entries

# Entry - Uniprot accession
# Entry name - "AlmostGeneID_Organism" (almost, because in some cases it's a modified gene name; in the rows shown it is the accession itself)
# Gene names -  list of TRUE synonymous gene names
# gene_id_entry_name - the part of Entry name before the "_" (e.g. D3ZMG0 from D3ZMG0_RAT)
# NOTE(review): this legend previously said split("-"); the displayed table shows "_" as the separator — confirm against the code that builds the column

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,gene_id_entry_name
0,D3ZMG0,D3ZMG0_RAT,unreviewed,Serine/threonine-protein kinase (EC 2.7.11.1),"[ULK1, ULK1_MAPPED, RCG_21843]",Rattus norvegicus (Rat),1051,D3ZMG0
1,A0A0G2K5E4,A0A0G2K5E4_RAT,unreviewed,DnaJ heat shock protein family (Hsp40) member ...,"[DNAJA3, RCG_49803]",Rattus norvegicus (Rat),480,A0A0G2K5E4
2,G3V6B1,G3V6B1_RAT,unreviewed,Transforming growth factor beta,"[TGFB2, RCG_20076]",Rattus norvegicus (Rat),442,G3V6B1
3,D3Z9Z0,D3Z9Z0_RAT,unreviewed,"Ankyrin 1 (Ankyrin 1, erythroid)","[ANK1, RCG_43073]",Rattus norvegicus (Rat),1707,D3Z9Z0
4,F1M670,F1M670_RAT,unreviewed,Formyl peptide receptor 2,[FPR2],Rattus norvegicus (Rat),351,F1M670


In [165]:
# from  Uniprot main data

#uniprot_rev_dict = pd.Series(uniprot_rev['Gene names'].values,index=uniprot_rev['Entry']).to_dict() 
#uniprot_unrev_dict = pd.Series(uniprot_unrev['Gene names'].values,index=uniprot_unrev['Entry']).to_dict() 

#uniprot_rev_genes = list(set([item for sublist in uniprot_rev['Gene names'].tolist() for item in sublist]))

#uniprot_unrev_genes = list(set([item for sublist in uniprot_unrev['Gene names'].tolist() for item in sublist]))

In [166]:
uniprot_rev.head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,gene_id_entry_name
0,P22071,3BHS1_RAT,reviewed,3 beta-hydroxysteroid dehydrogenase/Delta 5-->...,[HSD3B1],Rattus norvegicus (Rat),373,3BHS1
1,P15650,ACADL_RAT,reviewed,"Long-chain specific acyl-CoA dehydrogenase, mi...",[ACADL],Rattus norvegicus (Rat),430,ACADL
2,P07872,ACOX1_RAT,reviewed,Peroxisomal acyl-coenzyme A oxidase 1 (AOX) (E...,"[ACOX1, ACOX]",Rattus norvegicus (Rat),661,ACOX1
3,Q09137,AAPK2_RAT,reviewed,5'-AMP-activated protein kinase catalytic subu...,"[PRKAA2, AMPK, AMPK2]",Rattus norvegicus (Rat),552,AAPK2
4,Q64602,AADAT_RAT,reviewed,Kynurenine/alpha-aminoadipate aminotransferase...,"[AADAT, KAT2]",Rattus norvegicus (Rat),425,AADAT


In [167]:
#uniprot_rev.loc[uniprot_rev['Organism'].str.contains('Mus')].head()

In [168]:
df_fgn.head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,gn_from_uniprot,uniprot_from_gn
0,H2-KE6;HSD17B8,HSD17B8,A0A068BEQ2;P50171;G3UX44,P50171,[HSD17B8],"[P50171, Q6MGB5, Q5BJM1, A0A0G2JV82, A0A0G2K6P..."
1,RAB11B;RAB11A,RAB11B,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,P46638,[RAB11B],"[P46638, Q78ZJ8, G3UY29, G3UZL4, G3UZD3, A0A06..."
2,RAB11B;RAB11A,RAB11B,Q78ZJ8;A0A068BFR3;P46638;G3UY29;E9Q3P9;F8WGS1;...,P46638,[RAB11B],"[P46638, Q78ZJ8, G3UY29, G3UZL4, G3UZD3, A0A06..."
3,RAB11B;RAB11A,RAB11B,P46638,P46638,[RAB11B],"[P46638, Q78ZJ8, G3UY29, G3UZL4, G3UZD3, A0A06..."
4,RAB11B;RAB11A,RAB11B,Q78ZJ8;P46638;G3UY29;E9Q3P9;F8WGS1;Q0PD45;P624...,P46638,[RAB11B],"[P46638, Q78ZJ8, G3UY29, G3UZL4, G3UZD3, A0A06..."


In [169]:
# Count, for every individual gene name, the number of distinct
# (gene_names entry, Study) pairs in which it appears.

# Distinct (gene_names, Study) pairs that have a gene_names value.
df_fgn_gnstudy = (
    df_all.loc[df_all['gene_names'].notna(), ['gene_names', 'Study']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Split the ';'-separated entries and give every gene name its own row.
# str.split + DataFrame.explode (pandas >= 0.25) yields the same rows and the same
# repeated index as the former `.apply(pd.Series, 1).stack()` / `del` / `join`
# sequence, without building a wide intermediate frame and without passing a
# stray positional argument to Series.apply.
df_fgn_gnstudy = (
    df_fgn_gnstudy
    .assign(gene_names=df_fgn_gnstudy['gene_names'].str.split(';'))
    .explode('gene_names')
    .loc[:, ['Study', 'gene_names']]  # same column order the join used to produce
)

gn_study_count = df_fgn_gnstudy.groupby('gene_names').count()
gn_study_count.head()

Unnamed: 0_level_0,Study
gene_names,Unnamed: 1_level_1
,11
0610007P14RIK,6
0610007P22RIK,1
0610009B22RIK,2
0610010F05RIK,3


In [170]:
# Remove the empty-string entry from the per-gene-name counts; sizes are printed
# before and after.
print(len(gn_study_count))
gn_study_count = gn_study_count[gn_study_count.index != '']
print(len(gn_study_count))

16743
16742


In [171]:
gn_study_count.index

Index(['0610007P14RIK', '0610007P22RIK', '0610009B22RIK', '0610010F05RIK',
       '0610010K14RIK', '0610010O12RIK', '0610011F06RIK', '0610031J06RIK',
       '0610037L13RIK', '1110001A16RIK',
       ...
       'ZSWIM3', 'ZSWIM8', 'ZUBR1', 'ZW10', 'ZWILCH', 'ZWINT', 'ZYG11B', 'ZYX',
       'ZZEF1', 'ZZZ3'],
      dtype='object', name='gene_names', length=16742)

In [172]:
# Gene name -> value of the 'Study' count column.
gn_study_count_dict = gn_study_count['Study'].to_dict()

In [173]:
len(gn_study_count_dict) #16742

16742

# Needed

In [None]:
#df_fgn['gene_id_final']

# and subsequently mapped:

#df_all['gene_id_final']

In **df_fgn**:

1) **gene_name_unified** == **gene_names**    -->    **gene_id_final** = **gene_names** == **gene_name_unified**

2) multiple gene names per entry (i.e. **";"** in **gene_names**) and **gene_name_unified** in **gene_names.split(";")**    -->     **gene_id_final** = **gene_name_unified** 

3) **gene_name_unified** not in **gene_names**.split(";") 

- I look at **Uniprot** of this entry. 

**Uniprot**.split(";")

- IF **gene_name_unified** or any of **gene_names**.split(";") for given entry are in the values of Uniprot-GN dictionaries (**uniprot_rev_dict, uniprot_unrev_dict, ?uniprot_gn**) for given list of **Uniprot**.split(";") IDs of this entry, 
I either set **gene_id_final** to the found gene name if there is only one match, 
OR inspect this entry visually. 

IF one **Uniprot**.split(";") ID is found in the Reviewed dictionary and another one is found in the Unreviewed dictionary, I set **gene_id_final** according to the Reviewed dictionary.


4) For entries in the raw data (**df_fgn**) which have only **Uniprot** IDs and have neither **gene_names** nor **gene_name_unified**: 
I get corresponding gene names using Uniprot-GN dictionaries (**uniprot_rev_dict, uniprot_unrev_dict, uniprot_gn**),
and IF there are multiple gene names, I check which one/ones are in list of all GN from **df_fgn[gene_names].split(";")**  
and IF it's only one, then I set final gene name as this gene name, 
and IF there are many, I inspect visually this entry.


Potentially in some cases **df_fgn[gn_from_uniprot]** and **df_fgn[uniprot_from_gn]** can help in the future


In [194]:
df_fgn = df_fgn.reset_index(drop=True)

# Create the column up front so that every `row` handed to the helper already
# carries a (None) 'gene_id_final' entry, exactly as before.
df_fgn['gene_id_final'] = None

# Resolve the final gene id of every row with get_gene_id_final and assign the
# whole column once, instead of performing one scalar `.loc` write per row.
df_fgn['gene_id_final'] = [
    get_gene_id_final(
        index,
        row,
        uniprot_rev_dict,
        uniprot_unrev_dict,
        uniprot_ids_mrh_dict,
        gn_study_count_dict,
    )
    for index, row in df_fgn.iterrows()
]

In [195]:
len(df_fgn)

87028

In [196]:
len(df_fgn['gene_id_final'].unique()) # 14841

14841

In [197]:
len(df_fgn.loc[df_fgn['gene_id_final'].isna()]) 

355

In [198]:
df_fgn.loc[df_fgn['gene_id_final'].isna()].head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,gn_from_uniprot,uniprot_from_gn,gene_id_final
4744,,,D3Z7R4,D3Z7R4,NoMapping,NoMapping,
10783,,,Q6PIU9,Q6PIU9,NoMapping,NoMapping,
15573,,,E9PZA8,Q9R0N7,[SYT7],NoMapping,
15584,,,Q8CF96,Q9R0N7,[SYT7],NoMapping,
15885,,,S4R2H6,Q8C8R3,[ANK2],NoMapping,


In [199]:
len(df_fgn.loc[(df_fgn['gene_id_final'].isna()) & (df_fgn['gene_names'].isna())]) # -> drop these entries no GN found by Uniprot  

355

In [200]:
print(len(df_fgn))

# Keep a row when it has a final gene id or, failing that, at least an original
# gene_names value; rows lacking both are dropped.
has_final_id = df_fgn['gene_id_final'].notna()
has_gene_names = df_fgn['gene_names'].notna()
df_fgn = df_fgn.loc[has_final_id | has_gene_names]

print(len(df_fgn))

87028
86673


In [201]:
df_fgn.loc[df_fgn['gene_id_final'].isna()].head()

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,gn_from_uniprot,uniprot_from_gn,gene_id_final


In [None]:
# depending on the goal, this can be useful in some cases

#print(len(df_fgn.loc[(df_fgn['gene_id_final'].isna()) & (~df_fgn['gene_names'].isna())]))
#df_fgn.loc[(df_fgn['gene_id_final'].isna()) & (~df_fgn['gene_names'].isna())].head() # -> set gn_final to original gene name

#df_fgn.loc[(df_fgn['gene_id_final'].isna()) & (~df_fgn['gene_names'].isna()),'gene_id_final'] = df_fgn.loc[(df_fgn['gene_id_final'].isna()) & (~df_fgn['gene_names'].isna()),'gene_names'].copy()

In [202]:
# Snapshot of df_fgn taken right after the rows without any gene id were dropped.
# Composite 'gene_id_final' values (containing ';' or '@') are still present at
# this point; the manual-curation cells further down resolve them.
with open('../data/2_df_fgn_9may2021.pkl','wb') as f:
    pkl.dump(df_fgn,f)

In [203]:
# Number of rows whose gene_id_final contains each composite-id marker,
# followed by the total number of rows for comparison.
for marker in ('#', ';', '&', '@'):
    print(len(df_fgn.loc[df_fgn['gene_id_final'].str.contains(marker)]))

print(len(df_fgn))


0
33
0
83
86673


In [204]:
df_fgn = df_fgn.reset_index(drop=True)

### MANUAL CURATION for the entries containing "; &  @ ". See the get_gene_id_final from protein_ids_alignment_helpers.py for the details on how these composite ids were created

In [205]:
len(df_fgn.loc[(df_fgn['gene_id_final'].str.contains('&')) & (df_fgn['Uniprot']!=df_fgn['Uniprot_unified'])])

0

In [206]:
#df.loc[df['gene_id_final']== 'C2CD4CC2CD4 FAMILY',"gene_id_final"] = 'C2CD4C'
df_fgn.loc[df_fgn['gene_id_final']== 'C2CD4CC2CD4 FAMILY']

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,gn_from_uniprot,uniprot_from_gn,gene_id_final


In [396]:
df_fgn.loc[(df_fgn['gene_id_final'].str.contains('&')) & (df_fgn['Uniprot']==df_fgn['Uniprot_unified'])]

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,gn_from_uniprot,uniprot_from_gn,gene_id_final


In [207]:
# Occurrence weights for the non-composite final gene ids.
has_semicolon = df_fgn['gene_id_final'].str.contains(';')
has_at_sign = df_fgn['gene_id_final'].str.contains('@')

# Per gene id: non-null count of every other column ...
df_fgn_gene_id_final_counts = (
    df_fgn.loc[~has_semicolon & ~has_at_sign]
    .groupby('gene_id_final')
    .count()
)
# ... summed across those columns into a single weight.
df_fgn_gene_id_final_counts['sum'] = df_fgn_gene_id_final_counts.sum(axis=1)

# gene id -> weight
df_fgn_gene_id_final_counts_dict = df_fgn_gene_id_final_counts['sum'].to_dict()

In [208]:
df_fgn = df_fgn.reset_index(drop=True)

In [209]:
len(df_fgn.loc[df_fgn['gene_id_final'].str.contains(';')])

33

In [210]:
# Resolve composite ids of the form 'A;B;...': keep the candidate with the highest
# weight in df_fgn_gene_id_final_counts_dict (the first one wins ties). Rows whose
# candidates are all unknown to the dictionary are left untouched and reported
# as "check 1" for manual inspection.
semicolon_rows = df_fgn.loc[df_fgn['gene_id_final'].str.contains(';')]

for idx, row in semicolon_rows.iterrows():
    # distinct candidates, in order of first appearance
    candidates = list(dict.fromkeys(row['gene_id_final'].split(';')))
    weights = {gid: df_fgn_gene_id_final_counts_dict.get(gid, 0) for gid in candidates}

    # max() returns the first maximal item, which reproduces the tie-breaking
    best = max(candidates, key=weights.get)

    if weights[best] > 0:
        df_fgn.loc[idx, 'gene_id_final'] = best
    else:
        print("check 1 ", idx)
            


check 1  73497


In [211]:
len(df_fgn.loc[df_fgn['gene_id_final'].str.contains(';')])

1

In [212]:
df_fgn.loc[df_fgn['gene_id_final'].str.contains(';')]

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,gn_from_uniprot,uniprot_from_gn,gene_id_final
73497,,,B2RTM0,P62806,"[H4C1, H4C2, H4C3, H4C4, H4C6, H4C8, H4C9, H4C...",NoMapping,HIST2H4;HIST1H4C


In [213]:
# Manual curation of the one composite id left after the automatic pass: the row
# with Uniprot 'B2RTM0' and gene_id_final 'HIST2H4;HIST1H4C' is set to 'HIST2H4'.
df_fgn.loc[(df_fgn['gene_id_final'].str.contains(';')) & (df_fgn['gene_id_final']== 'HIST2H4;HIST1H4C') & (df_fgn['Uniprot']=='B2RTM0'),"gene_id_final"] = 'HIST2H4'
# original author's justification: "by Uniprot" (presumably the Uniprot entry of B2RTM0 — confirm)


In [214]:
df_fgn.loc[df_fgn['gene_id_final'].str.contains(';')]

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,gn_from_uniprot,uniprot_from_gn,gene_id_final


In [215]:
df_fgn.loc[df_fgn['gene_id_final'].str.contains('@')]

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,gn_from_uniprot,uniprot_from_gn,gene_id_final
47473,,,Q3UZV7;Q3UZV7-2;Q3UZV7-3;E9PUJ3;Q3UZV7-5,Q3UZV7,NoMapping,NoMapping,ELAPOR2@EIG121L@ELAPOR2@EIG121L@ELAPOR2@EIG121...
69990,,,Q3TYS2,Q3TYS2,[CYBC1],NoMapping,CYBC1@EROS
72851,,,Q6AYA6,Q6AYA6,[CYBC1],NoMapping,CYBC1@EROS
72858,,,Q5RJN4,Q5RJN4,[SHFL],NoMapping,SHFL@RYDEN
72865,,,A2AG58;A2AG58-2,A2AG58,[BCLAF3],NoMapping,BCLAF3@BCLAF3
72872,,,Q3TYS2;X1WI15,Q3TYS2,[CYBC1],NoMapping,CYBC1@EROS
72875,,,Q499E6;Q499E6-3;Q499E6-2,Q499E6,[C1ORF109],NoMapping,C1ORF109@C1ORF109@C1ORF109
72878,,,Q6PD19;Q6PD19-3,Q6PD19,[ARMH3],NoMapping,ARMH3@ARMH3
72885,,,Q8C779;Q8C779-2,Q8C779,[RADX],NoMapping,RADX@RADX
72888,,,Q8CAK3;Q8CAK3-2,Q8CAK3,[SHFL],NoMapping,SHFL@RYDEN@SHFL@RYDEN


In [406]:
len(df_fgn.loc[df_fgn['gene_id_final'].str.contains('@')])

83

In [407]:
df_fgn = df_fgn.reset_index(drop=True)

# Recompute the occurrence weights of the non-composite final gene ids.
has_semicolon = df_fgn['gene_id_final'].str.contains(';')
has_at_sign = df_fgn['gene_id_final'].str.contains('@')

# Per gene id: non-null count of every other column ...
df_fgn_gene_id_final_counts = (
    df_fgn.loc[~has_semicolon & ~has_at_sign]
    .groupby('gene_id_final')
    .count()
)
# ... summed across those columns into a single weight.
df_fgn_gene_id_final_counts['sum'] = df_fgn_gene_id_final_counts.sum(axis=1)

# gene id -> weight
df_fgn_gene_id_final_counts_dict = df_fgn_gene_id_final_counts['sum'].to_dict()

In [216]:
# Resolve composite ids of the form 'A@B@...': keep the candidate with the highest
# weight in df_fgn_gene_id_final_counts_dict (the first one wins ties). When no
# candidate is known to the dictionary, the first candidate is used and the row
# is reported as "check 1".
at_sign_rows = df_fgn.loc[df_fgn['gene_id_final'].str.contains('@')]

for idx, row in at_sign_rows.iterrows():
    # distinct candidates, in order of first appearance
    candidates = list(dict.fromkeys(row['gene_id_final'].split('@')))
    weights = {gid: df_fgn_gene_id_final_counts_dict.get(gid, 0) for gid in candidates}

    # max() returns the first maximal item; with all weights equal to zero this
    # is simply the first candidate, i.e. the fallback described above
    best = max(candidates, key=weights.get)
    df_fgn.loc[idx, 'gene_id_final'] = best

    if weights[best] == 0:
        print("check 1 ", idx)
            


check 1  47473
check 1  69990
check 1  72851
check 1  72858
check 1  72865
check 1  72872
check 1  72875
check 1  72885
check 1  72888
check 1  72908
check 1  72911
check 1  72919
check 1  72921
check 1  72925
check 1  72948
check 1  72960
check 1  72983
check 1  72984
check 1  72993
check 1  72995
check 1  73019
check 1  73021
check 1  73040
check 1  73048
check 1  73049
check 1  73106
check 1  73166
check 1  73184
check 1  74781
check 1  74799
check 1  75163
check 1  75195
check 1  75428
check 1  75617
check 1  75731
check 1  75940
check 1  75958
check 1  76245
check 1  76527
check 1  76538
check 1  76664
check 1  76689
check 1  76774
check 1  76824
check 1  76859
check 1  77028
check 1  77466
check 1  77477
check 1  77506
check 1  77514
check 1  77559
check 1  77572
check 1  77594
check 1  77596
check 1  77634
check 1  77831
check 1  77874
check 1  77877
check 1  77926
check 1  78008
check 1  78070
check 1  78193
check 1  78195
check 1  78206
check 1  78207
check 1  78304
check 1  7

In [217]:
len(df_fgn.loc[df_fgn['gene_id_final'].str.contains('@')])

0

In [218]:
df_fgn.loc[df_fgn['gene_id_final'].str.contains('@')]

Unnamed: 0,gene_names,gene_name_unified,Uniprot,Uniprot_unified,gn_from_uniprot,uniprot_from_gn,gene_id_final


In [219]:
# Save df_fgn after the composite ';' and '@' ids have been resolved.
# NOTE(review): this path differs from the earlier dump in this notebook
# ('../data/2_df_fgn_9may2021.pkl') only in letter case ("9May" vs "9may"); on a
# case-sensitive filesystem that produces two separate files — confirm which
# one the downstream steps are meant to read.
with open('../data/2_df_fgn_9May2021.pkl','wb') as f:
    pkl.dump(df_fgn,f)

In [None]:
# Possible improvement:

# for rat and human entries check if their gene_id_final exists in mouse entries
# if gene_id_final doesn't exist in mouse entries, try to find corresponding mouse gene name using Entry-name, Gene name, Uniprot from uniprot_rev, uniprot_unrev


In [220]:
len(df_fgn['gene_id_final'].unique())

14798

In [221]:
len(df_fgn.loc[~df_fgn['gene_id_final'].isna(),'gene_id_final'].unique()) # 14798 in the run shown; the former note "14823" presumably came from an earlier run

14798

In [222]:
df_fgn = df_fgn.reset_index(drop=True)

In [223]:
df_all.columns

Index(['gene_names', 'Uniprot', 'Study', 'Organism', 'location', 'Age_cat',
       'Age_days', 'condition', 'sample_id', 'molecular_weight_kDa',
       'raw_data', 'raw_data_units', 'gene_name_unified', 'Uniprot_unified'],
      dtype='object')

In [224]:
df_fgn.columns

Index(['gene_names', 'gene_name_unified', 'Uniprot', 'Uniprot_unified',
       'gn_from_uniprot', 'uniprot_from_gn', 'gene_id_final'],
      dtype='object')

In [225]:
print(len(df_all))
print(len(df_fgn))

2140373
86673


In [226]:
# Attach the per-ID-combination columns of df_fgn (including gene_id_final) to the
# measurements. The inner join keeps only df_all rows whose ID combination is
# still present in df_fgn.
shared_id_columns = ['gene_names', 'gene_name_unified', 'Uniprot', 'Uniprot_unified']
df = df_all.merge(df_fgn, how='inner', on=shared_id_columns)
len(df)

2132553

In [227]:
df.loc[df['gene_id_final'].isna()].head()

Unnamed: 0,gene_names,Uniprot,Study,Organism,location,Age_cat,Age_days,condition,sample_id,molecular_weight_kDa,raw_data,raw_data_units,gene_name_unified,Uniprot_unified,gn_from_uniprot,uniprot_from_gn,gene_id_final


In [228]:
# Sanity check on gene_id_final: the two numbers are equal when every merged row
# has a final gene id.
print(len(df))
print(len(df[df['gene_id_final'].notna()]))



2132553
2132553


In [229]:
# Left join with an indicator column, used to find the df_all rows that have no
# counterpart in df_fgn. The join keys are spelled out; they are the four columns
# the two frames have in common.
extra = df_all.merge(
    df_fgn,
    how='left',
    on=['gene_names', 'gene_name_unified', 'Uniprot', 'Uniprot_unified'],
    indicator=True,
)

extra['_merge'].unique()

[both, left_only]
Categories (2, object): [both, left_only]

In [230]:
len(extra.loc[extra['_merge']=='left_only'])

7820

In [231]:
len(extra.loc[(extra['_merge']=='left_only') & (~extra['gene_names'].isna()) ])

0

In [232]:
# Persist the merged table `df` (df_all rows joined with the df_fgn columns,
# including gene_id_final) as a pickle file.
with open('../data/2_df_best_alignedIDs_9May2021.pkl','wb') as f:
    pkl.dump(df,f)