## Generate map of entrez gene ids to protein complex(es)

**Input:** Protein complexes from CORUM.  

**Output:** Map of genes to protein complex membership

In [7]:
import pandas as pd
import numpy as np
import os

get_data_path = lambda folders, fname: os.path.normpath(os.environ['3RD_PARTY_DIR']+'/'+'/'.join(folders) +'/'+fname)
get_local_data_path = lambda folders, fname: os.path.normpath('../data/' +'/'.join(folders) +'/'+ fname)

file_CORUM_complexes = get_data_path(['CORUM'], 'allComplexes.txt')
file_complexes_map = get_local_data_path(['processed'], 'CORUM_entrez_map.csv')

In [2]:
all_complexes = pd.read_csv(file_CORUM_complexes, sep='\t')
all_complexes[:1]

Unnamed: 0,ComplexID,ComplexName,Organism,Synonyms,Cell line,subunits(UniProt IDs),subunits(Entrez IDs),Protein complex purification method,GO ID,GO description,FunCat ID,FunCat description,subunits(Gene name),Subunits comment,PubMed ID,Complex comment,Disease comment,SWISSPROT organism,subunits(Gene name syn),subunits(Protein name)
0,1,BCL6-HDAC4 complex,Human,,,P41182;P56524,604;9759,MI:0007-anti tag coimmunoprecipitation,GO:0006265;GO:0045892;GO:0051276;GO:0030183;GO...,DNA topological change;negative regulation of ...,10.01.09.05;11.02.03.04.03;42.10.03;43.03.07.0...,DNA conformation modification (e.g. chromatin)...,BCL6;HDAC4,,11929873,Transcriptional repression by BCL6 is thought ...,,Homo sapiens (Human);Homo sapiens (Human),BCL5 LAZ3 ZBTB27 ZNF51;KIAA0288,B-cell lymphoma 6 protein;Histone deacetylase 4


In [3]:
# Reduce down to human complexes
human_complexes = all_complexes[all_complexes.Organism == 'Human']
human_complexes = human_complexes[['ComplexID', 'ComplexName', 'subunits(Entrez IDs)']]
print('Num complexes:', human_complexes.shape[0])
print('Max num sub-units:', human_complexes['subunits(Entrez IDs)'].apply(lambda x: x.count(';')).max() + 1)
print('Expected df length:', sum(human_complexes['subunits(Entrez IDs)'].apply(lambda x: x.count(';') + 1)))
human_complexes[:2]

Num complexes: 2916
Max num sub-units: 143
Expected df length: 12216


Unnamed: 0,ComplexID,ComplexName,subunits(Entrez IDs)
0,1,BCL6-HDAC4 complex,604;9759
1,2,BCL6-HDAC5 complex,604;10014


In [4]:
# Note that some of the complexes include 'None' as one of the entrez ids, these should be filtered out, e.g.
display(human_complexes[human_complexes['ComplexID'] == 140])

Unnamed: 0,ComplexID,ComplexName,subunits(Entrez IDs)
114,140,E-box sequence-binding complex,4005;4808;8861;None


In [5]:
# Flatten the list of entrez ids associated with each complex id
complex_map = pd.DataFrame(human_complexes['subunits(Entrez IDs)'].apply(lambda x: pd.Series(x.split(';'))).stack().reset_index(level=1, drop=True))
complex_map = complex_map.reset_index().rename(columns={'index':'complex_id', 0:'entrez_id'})

# Verify: num complexes, max num sub-units, expected length
assert(complex_map.complex_id.nunique() == human_complexes.shape[0])
assert(complex_map.complex_id.value_counts().max() == 
       human_complexes['subunits(Entrez IDs)'].apply(lambda x: x.count(';')).max() + 1)
assert(sum(human_complexes['subunits(Entrez IDs)'].apply(lambda x: x.count(';') + 1)) == complex_map.shape[0])

# Filter out entrez ids == None and fix entrez id type
complex_map = complex_map[complex_map.entrez_id != 'None']
complex_map = complex_map.astype({'entrez_id':'int'})

print('Map length:', complex_map.shape[0])
complex_map[:4]

Map length: 12125


Unnamed: 0,complex_id,entrez_id
0,0,604
1,0,9759
2,1,604
3,1,10014


In [6]:
complex_map.to_csv(file_complexes_map)