This is a summary of the preprocessing steps that were taken. Running them produces one file per source database, each ready for serialization. Within every interaction, the gene IDs are ordered so that the lower ID and its corresponding symbol appear before the higher ID and its symbol. This ensures that any duplicates can be easily filtered out later.

In [None]:
# used pandas
import pandas as pd
import re


BioGRID

In [None]:
# Load the BioGRID tab3 export (tab-delimited) into a DataFrame.
# The archive can be downloaded from:
# https://downloads.thebiogrid.org/File/BioGRID/Release-Archive/BIOGRID-4.4.224/BIOGRID-ALL-4.4.224.tab3.zip
biogrid_path = 'biogrid path'
df = pd.read_csv(biogrid_path, sep="\t", low_memory=False)

In [None]:
# Remove metadata columns that are not needed downstream.
toDrop = [
    "#BioGRID Interaction ID",
    "BioGRID ID Interactor A",
    "BioGRID ID Interactor B",
    "Systematic Name Interactor A",
    "Systematic Name Interactor B",
    "Synonyms Interactor A",
    "Synonyms Interactor B",
    "Organism ID Interactor A",
    "Organism ID Interactor B",
    "SWISS-PROT Accessions Interactor A",
    "TREMBL Accessions Interactor A",
    "REFSEQ Accessions Interactor A",
    "SWISS-PROT Accessions Interactor B",
    "TREMBL Accessions Interactor B",
    "REFSEQ Accessions Interactor B",
    "Ontology Term IDs",
    "Ontology Term Names",
    "Ontology Term Categories",
    "Ontology Term Qualifier IDs",
    "Ontology Term Qualifier Names",
    "Ontology Term Types",
    "Organism Name Interactor A",
    "Organism Name Interactor B",
    "Score",
    "Modification",
    "Qualifications",
    "Tags",
]

df = df.drop(toDrop, axis="columns")


In [None]:
# Strip the "PUBMED:" prefix from the publication IDs.
# This is done in one vectorised pass instead of a per-row .loc loop.
# Only the literal prefix is removed, so a value that does not start
# with "PUBMED:" is left intact rather than losing its first seven
# characters, and missing values stay missing instead of raising.
df["Publication Source"] = df["Publication Source"].str.replace(
    r"^PUBMED:", "", regex=True
)

In [None]:
# Keep only PPIs (rows whose experimental system type mentions
# 'physical'), not protein-gene interactions.
is_physical = df["Experimental System Type"].str.contains('physical')
df = df.loc[is_physical]

In [None]:
# Remove self-interactions: rows where both interactors carry the same
# Entrez gene ID.
entrez_a = df['Entrez Gene Interactor A']
entrez_b = df['Entrez Gene Interactor B']
df = df[entrez_a.ne(entrez_b)]

In [None]:
# Convert every column to string.
# The original call discarded its result, so nothing was converted; the
# frame has to be rebound. astype(str) stringifies each value, so missing
# values become the text 'nan'.
df = df.astype(str)

In [None]:
# Order each interaction so the lower Entrez ID (and its symbol) sits in the
# "A" columns and the higher one in the "B" columns.
#
# The ID columns are the same 'Entrez Gene Interactor A'/'B' columns used by
# the self-interaction filter. Rows are swapped in one vectorised step;
# iterating the frame directly would yield column labels, not row labels.
id_a_col = "Entrez Gene Interactor A"
id_b_col = "Entrez Gene Interactor B"
sym_a_col = "Official Symbol Interactor A"
sym_b_col = "Official Symbol Interactor B"

# Compare IDs as numbers so that e.g. 9 sorts before 10 even when the
# columns hold strings; fall back to a text comparison for rows where an
# ID is not numeric.
num_a = pd.to_numeric(df[id_a_col], errors="coerce")
num_b = pd.to_numeric(df[id_b_col], errors="coerce")
unparsed = num_a.isna() | num_b.isna()
text_greater = df[id_a_col].astype(str) > df[id_b_col].astype(str)
swap = (num_a > num_b) | (unparsed & text_greater)

# Swap IDs and symbols together so each symbol stays with its ID.
df = df.assign(**{
    id_a_col: df[id_a_col].where(~swap, df[id_b_col]),
    id_b_col: df[id_b_col].where(~swap, df[id_a_col]),
    sym_a_col: df[sym_a_col].where(~swap, df[sym_b_col]),
    sym_b_col: df[sym_b_col].where(~swap, df[sym_a_col]),
})


In [None]:
# Write the cleaned BioGRID table. index=False matches the other exports in
# this notebook and keeps the leftover row labels (non-contiguous after
# filtering) out of the file.
df.to_csv("biogrid_ready_for_serializing.csv", index=False)

BioPlex3.0

In [None]:
# Load the BioPlex interaction table (tab-delimited).
# Source: https://bioplex.hms.harvard.edu/interactions.php
bioplex_path = "path to file"
df2 = pd.read_csv(bioplex_path, sep="\t")

In [None]:
# Meaning of the BioPlex score columns:
#   pW:   probability of a wrong ID
#   pNI:  probability of a non-interactor (non-specific background)
#   pInt: probability of an interactor
# Other data can be dropped as well if it is not needed.
#
# Note from Dr. Huttlin on the paper: "Though the algorithm returns
# "probabilities", I wouldn't take them too literally in that sense. They're
# best interpreted as numerical scores with pInt reflecting our confidence
# that a particular protein is a true interacting protein."

# Only the UniProt accession columns are removed here.
toDrop = ["UniprotA", "UniprotB"]
df2 = df2.drop(toDrop, axis="columns")

In [None]:
# Record the experimental system for every BioPlex row; the same label is
# assigned to each row of the new column.
df2["Experimental System"] = "Affinity-Purification Mass Spectrometry"

In [None]:
# Convert every column to string.
# The original call discarded its result, so nothing was converted; the
# frame has to be rebound. astype(str) stringifies each value, so missing
# values become the text 'nan'.
df2 = df2.astype(str)

In [None]:
# Give the pInt score column its descriptive name.
column_names = {"pInt": "Interaction Confidence"}
df2 = df2.rename(column_names, axis="columns")

In [None]:
# Remove self-interactions (same gene on both sides) - probably not
# necessary for this dataset.
not_self = df2["GeneA"].ne(df2["GeneB"])
df2 = df2[not_self]

In [None]:
# Order each interaction so the lower gene ID (and its symbol) sits in
# GeneA/SymbolA and the higher one in GeneB/SymbolB.
#
# Rows are swapped in one vectorised step; iterating the frame directly
# would yield column labels, not row labels.

# Compare IDs as numbers so that e.g. 9 sorts before 10 even when the
# columns hold strings; fall back to a text comparison for rows where an
# ID is not numeric.
num_a = pd.to_numeric(df2["GeneA"], errors="coerce")
num_b = pd.to_numeric(df2["GeneB"], errors="coerce")
unparsed = num_a.isna() | num_b.isna()
text_greater = df2["GeneA"].astype(str) > df2["GeneB"].astype(str)
swap = (num_a > num_b) | (unparsed & text_greater)

# Swap IDs and symbols together so each symbol stays with its ID.
df2 = df2.assign(
    GeneA=df2["GeneA"].where(~swap, df2["GeneB"]),
    GeneB=df2["GeneB"].where(~swap, df2["GeneA"]),
    SymbolA=df2["SymbolA"].where(~swap, df2["SymbolB"]),
    SymbolB=df2["SymbolB"].where(~swap, df2["SymbolA"]),
)

In [None]:
# Write the cleaned BioPlex table without the row index.
bioplex_out = "bioplex_ready_for_serializing.csv"
df2.to_csv(bioplex_out, index=False)

IID

In [None]:
# Load the IID interaction table (tab-delimited).
# Source: http://iid.ophid.utoronto.ca/
iid_path = "put iid path in"
df3 = pd.read_csv(iid_path, sep="\t")

In [None]:
# IID ships a lot of metadata that can be explored later; copy over only
# the essential columns.
essential_columns = [
    'symbol1',
    'symbol2',
    'methods',
    'pmids',
    'db_with_ppi',
    'evidence_type',
]
df3 = df3.loc[:, essential_columns].copy()

In [None]:
# Rename columns to the names shared with the other datasets.
column_names = {
    'methods': 'Experimental System',
    'db_with_ppi': 'Source Database',
}
df3 = df3.rename(column_names, axis="columns")

In [None]:
# IID is an aggregator, so some interactions do not list IID itself in their
# source database field. For simplicity, add "iid" to those rows (regex
# filtering on this field is used in the neo4j implementation).
#
# The column is 'Source Database' at this point (renamed from db_with_ppi).
# The tag is appended only where "iid" is NOT already present, so existing
# entries are not duplicated. Matching ignores case.
sources = df3['Source Database']
lacks_iid = ~sources.str.contains("iid", case=False, regex=False, na=False)
df3.loc[lacks_iid, 'Source Database'] = sources[lacks_iid].map(
    # A missing or empty source list becomes just "iid".
    lambda listed: f"{listed}|iid" if isinstance(listed, str) and listed else "iid"
)

In [None]:
# Remove self-interactions (same symbol on both sides) - probably not
# necessary for this dataset.
not_self = df3["symbol1"].ne(df3["symbol2"])
df3 = df3[not_self]

In [None]:
# Map gene symbols to Entrez gene IDs with PyPath.
# pip install https://github.com/saezlab/pypath.git
from pypath.utils import mapping


def _single_entrez(symbol):
    """Return one Entrez ID (as a string) for *symbol*, or "" if none is found.

    The lookup result is treated as a collection of IDs (the original notes
    describe it as a one-element set). If it holds several IDs, the
    numerically smallest is returned, so the outcome is deterministic and is
    never a concatenation of several IDs.
    """
    hits = mapping.map_name(symbol, 'genesymbol', 'entrez')
    if not hits:
        return ""
    # Shorter digit strings are smaller numbers; ties are broken lexically.
    return min((str(hit) for hit in hits), key=lambda text: (len(text), text))


# Look each distinct symbol up once instead of once per row.
distinct_symbols = pd.concat([df3['symbol1'], df3['symbol2']]).dropna().unique()
entrez_by_symbol = {symbol: _single_entrez(symbol) for symbol in distinct_symbols}

# The ID columns are named 'entrez 1' / 'entrez 2' (with a space), which is
# what the pair-ordering step reads. Symbols that could not be mapped get "".
df3 = df3.assign(**{
    'entrez 1': df3['symbol1'].map(entrez_by_symbol).fillna(""),
    'entrez 2': df3['symbol2'].map(entrez_by_symbol).fillna(""),
})

# Convert every column to string (the result must be rebound to take
# effect). Missing values become the text 'nan'.
df3 = df3.astype(str)



In [None]:
# Order each interaction so the lower Entrez ID (and its symbol) sits in
# 'entrez 1'/'symbol1' and the higher one in 'entrez 2'/'symbol2'.
#
# The symbol columns are lower-case ('symbol1', 'symbol2'), as selected from
# the IID file. Rows are swapped in one vectorised step; iterating the frame
# directly would yield column labels, not row labels.

# Compare IDs as numbers so that e.g. "9" sorts before "10"; fall back to a
# text comparison for rows where an ID is not numeric (e.g. unmapped).
num_1 = pd.to_numeric(df3["entrez 1"], errors="coerce")
num_2 = pd.to_numeric(df3["entrez 2"], errors="coerce")
unparsed = num_1.isna() | num_2.isna()
text_greater = df3["entrez 1"].astype(str) > df3["entrez 2"].astype(str)
swap = (num_1 > num_2) | (unparsed & text_greater)

# Swap IDs and symbols together so each symbol stays with its ID.
df3 = df3.assign(**{
    "entrez 1": df3["entrez 1"].where(~swap, df3["entrez 2"]),
    "entrez 2": df3["entrez 2"].where(~swap, df3["entrez 1"]),
    "symbol1": df3["symbol1"].where(~swap, df3["symbol2"]),
    "symbol2": df3["symbol2"].where(~swap, df3["symbol1"]),
})

In [None]:
# Write the cleaned IID table. This must be df3 (the IID frame); df2 holds
# the BioPlex data.
df3.to_csv("iid_ready_for_serializing.csv", index=False)

String

In [None]:
# Load the STRING links file (space-delimited).
# Download for Homo sapiens: https://string-db.org/cgi/download?sessionId=bsWgk1f8vBeG
string_path = "path to file"
df4 = pd.read_csv(string_path, sep=" ")


In [None]:
# STRING is very large, so all the mapping takes a lot of time, which is
# why the work is split over so many cells.
# Make sure you have done confidence score filtering first.

# Min-max scale combined_score and append it as 'normalized_score'.
combined = df4["combined_score"]
lowest, highest = combined.min(), combined.max()
norm_df = ((combined - lowest) / (highest - lowest)).to_frame(
    name="normalized_score"
)

# Attach the new column to the original frame.
df4 = pd.concat([df4, norm_df], axis=1)


In [None]:
# Find a confidence threshold. Remember all interactions are double counted,
# so divide the printed number by 2.
# This cell does not modify df4, so you can try different cut-offs before
# you commit to one.
x = 0.98
above_threshold = df4.loc[df4['normalized_score'] > x]

# Report how many rows pass the cut-off (not the size of the whole frame).
print(len(above_threshold))

In [None]:
# Apply the cut-off for real - make sure you have the right score.
x = 0.98
passes_cutoff = df4["normalized_score"].gt(x)
df4 = df4.loc[passes_cutoff]

In [None]:
# Map to Entrez IDs. STRING mapping files: https://string-db.org/mapping_files/entrez/

procDF = pd.read_csv("path to mapper file")

# Build the STRING -> Entrez lookup; if a STRING ID occurs more than once,
# the last row wins.
mapper = {}
for string_id, entrez_id in zip(procDF["STRING"], procDF["entrez"]):
    mapper[string_id] = entrez_id


In [None]:
# Replace STRING protein IDs with Entrez IDs. IDs with no entry in the
# mapper are set to '-1' so they can be filtered out afterwards.
#
# Whole columns are mapped at once instead of row by row, and the lookup no
# longer reuses `x`, which holds the score threshold.
df4 = df4.assign(
    protein1=df4["protein1"].map(lambda string_id: mapper.get(string_id, '-1')),
    protein2=df4["protein2"].map(lambda string_id: mapper.get(string_id, '-1')),
)

In [None]:
# Drop rows where either protein has no Entrez ID (marked '-1').
# Both columns are checked; testing protein1 twice would let rows with an
# unmapped protein2 through.
df4 = df4.loc[(df4['protein1'] != '-1') & (df4['protein2'] != '-1')]

In [None]:
# Fill the 'entrez 1' / 'entrez 2' columns - uses PyPath where needed.
# pip install https://github.com/saezlab/pypath.git
from pypath.utils import mapping


def _entrez_for(value):
    """Return an Entrez ID (as a string) for *value*, or "" if unresolved.

    An all-digit value is taken to be an Entrez ID already, since the
    STRING mapper step writes Entrez IDs into protein1/protein2, and is
    returned as is. Anything else is looked up as a gene symbol. If the lookup
    yields several IDs, the numerically smallest is returned, so the outcome
    is deterministic and is never a concatenation of several IDs.
    """
    text = str(value)
    if text.isdigit():
        return text
    hits = mapping.map_name(value, 'genesymbol', 'entrez')
    if not hits:
        return ""
    # Shorter digit strings are smaller numbers; ties are broken lexically.
    return min((str(hit) for hit in hits), key=lambda t: (len(t), t))


# Resolve each distinct value once instead of once per row.
distinct_proteins = pd.concat([df4['protein1'], df4['protein2']]).dropna().unique()
entrez_by_protein = {protein: _entrez_for(protein) for protein in distinct_proteins}

# The ID columns are named 'entrez 1' / 'entrez 2' (with a space), which is
# what the pair-ordering and de-duplication steps read.
df4 = df4.assign(**{
    'entrez 1': df4['protein1'].map(entrez_by_protein).fillna(""),
    'entrez 2': df4['protein2'].map(entrez_by_protein).fillna(""),
})

# Convert every column to string (the result must be rebound to take
# effect). Missing values become the text 'nan'.
df4 = df4.astype(str)


In [None]:
# Remove self-interactions (same protein on both sides) - probably not
# necessary for this dataset.
not_self = df4["protein1"].ne(df4["protein2"])
df4 = df4[not_self]

In [None]:
# Order each interaction so the lower Entrez ID (and its protein) sits in
# 'entrez 1'/'protein1' and the higher one in 'entrez 2'/'protein2'.
#
# Rows are swapped in one vectorised step; iterating the frame directly
# would yield column labels, not row labels.

# Compare IDs as numbers so that e.g. "9" sorts before "10"; fall back to a
# text comparison for rows where an ID is not numeric.
num_1 = pd.to_numeric(df4["entrez 1"], errors="coerce")
num_2 = pd.to_numeric(df4["entrez 2"], errors="coerce")
unparsed = num_1.isna() | num_2.isna()
text_greater = df4["entrez 1"].astype(str) > df4["entrez 2"].astype(str)
swap = (num_1 > num_2) | (unparsed & text_greater)

# Swap IDs and proteins together so each protein stays with its ID.
df4 = df4.assign(**{
    "entrez 1": df4["entrez 1"].where(~swap, df4["entrez 2"]),
    "entrez 2": df4["entrez 2"].where(~swap, df4["entrez 1"]),
    "protein1": df4["protein1"].where(~swap, df4["protein2"]),
    "protein2": df4["protein2"].where(~swap, df4["protein1"]),
})

In [None]:
# STRING double counts interactions; make sure to run this only after the
# pairs have been ordered. The first occurrence of each ID pair is kept.
repeated_pair = df4.duplicated(subset=["entrez 1", "entrez 2"])
df4 = df4[~repeated_pair]

In [None]:
# Write the cleaned STRING table without the row index.
string_out = "string_ready_for_serializing.csv"
df4.to_csv(string_out, index=False)