In [1]:
import pandas as pd
import re
from Bio import SeqIO
import pickle
import networkx as nx

In [2]:
# Load only the first two columns of the tab-separated interaction file; the
# following cell reads them as 'ID interactor A' and 'ID interactor B'.
diptab = pd.read_csv(
    "Hsapi20160430CR.txt",
    sep="\t",
    header=0,
    usecols=[0, 1],
    index_col=False,
)

In [3]:
# Reduce every interactor field to the DIP node id it starts with
# (e.g. "DIP-617N"). The pattern is compiled once and shared by both columns.
_DIP_ID_AT_START = re.compile(r'^DIP-[0-9]+N')


def _leading_dip_id(entry):
    """Return the DIP node id found at the very start of ``entry``.

    Args:
        entry: one raw value from an 'ID interactor' column.

    Returns:
        The matched id string, e.g. ``"DIP-617N"``.

    Raises:
        ValueError: if ``entry`` does not start with a DIP node id. Raised
            explicitly so the offending value is visible, instead of the
            opaque ``AttributeError`` from calling ``.group()`` on ``None``.
    """
    match = _DIP_ID_AT_START.search(entry)
    if match is None:
        raise ValueError(
            "interactor field does not start with a DIP node id: {!r}".format(entry)
        )
    return match.group()


p1 = [_leading_dip_id(entry) for entry in diptab['ID interactor A'].tolist()]
p2 = [_leading_dip_id(entry) for entry in diptab['ID interactor B'].tolist()]

In [4]:
# Pair the two id lists row by row: one row per listed interaction.
dip_interactions = pd.DataFrame(
    data={'Protein A': p1, 'Protein B': p2},
)

In [5]:
# Show the paired interactor ids (rendered as this cell's output).
dip_interactions

Unnamed: 0,Protein A,Protein B
0,DIP-617N,DIP-617N
1,DIP-493N,DIP-147N
2,DIP-582N,DIP-472N
3,DIP-1078N,DIP-51N
4,DIP-189N,DIP-199N
...,...,...
6908,DIP-50081N,DIP-29011N
6909,DIP-29011N,DIP-39768N
6910,DIP-39768N,DIP-48673N
6911,DIP-46409N,DIP-46411N


In [6]:
# Build the interaction graph: each row becomes an edge between the two ids.
# No create_using is passed, so networkx's default graph type is used.
dipgr = nx.from_pandas_edgelist(
    dip_interactions,
    source='Protein A',
    target='Protein B',
)

In [7]:
# Drop edges that connect a protein to itself (e.g. DIP-617N -- DIP-617N).
self_loops = list(nx.selfloop_edges(dipgr))
dipgr.remove_edges_from(self_loops)

In [8]:
# Graph size after self-loop removal: edge count, then node count.
print(dipgr.number_of_edges(), dipgr.number_of_nodes(), sep="\n")

6426
4491


In [9]:
# Map each DIP node id found in a FASTA record id to that record's sequence
# string. If the same id occurs more than once, the last record wins.
dip_id_pattern = re.compile(r'DIP-[0-9]+N')
dip_seq_dict = {}
for rec in SeqIO.parse("fasta20160501.seq", "fasta"):
    dip_id = dip_id_pattern.search(str(rec.id)).group()
    dip_seq_dict[dip_id] = str(rec.seq)

In [10]:
# Keep only sequences whose length is greater than 50 and at most 1200.
final_seq_dict_dip_le1200 = {
    dip_id: sequence
    for dip_id, sequence in dip_seq_dict.items()
    if 50 < len(sequence) <= 1200
}

In [11]:
# Graph nodes with no entry in the length-filtered sequence dict (sequence
# missing altogether or outside the accepted length range).
not_including_list = list(set(dipgr.nodes) - set(final_seq_dict_dip_le1200))
# Sanity check, shown as the cell output: every protein slated for removal is
# a node of the graph. Membership is tested on the graph itself, which avoids
# rebuilding list(dipgr.nodes) and scanning it once per protein.
all(protein in dipgr for protein in not_including_list)

True

In [12]:
# Number of graph nodes that will be removed for lack of a usable sequence.
len(not_including_list)

686

In [13]:
# Remove every node without a usable sequence, along with its incident edges.
# remove_nodes_from skips nodes that are already absent, so re-running this
# cell is harmless (remove_node would raise NetworkXError the second time).
dipgr.remove_nodes_from(not_including_list)

In [14]:
# Graph size after node removal: edge count first, node count second.
for count in (dipgr.number_of_edges(), dipgr.number_of_nodes()):
    print(count)

4839
3805


In [15]:
# Show the filtered graph's edges as a DataFrame (display only; not stored).
nx.to_pandas_edgelist(dipgr)

Unnamed: 0,source,target
0,DIP-617N,DIP-58525N
1,DIP-617N,DIP-58999N
2,DIP-617N,DIP-61424N
3,DIP-493N,DIP-421N
4,DIP-493N,DIP-922N
...,...,...
4834,DIP-56235N,DIP-53227N
4835,DIP-61775N,DIP-61776N
4836,DIP-37618N,DIP-41076N
4837,DIP-34542N,DIP-37882N


In [16]:
# Restrict the sequence dict to the proteins still present in the graph.
seq_dict = {
    node: final_seq_dict_dip_le1200[node]
    for node in dipgr.nodes
}

In [17]:
# Number of sequences kept: one entry per node remaining in the graph.
len(seq_dict)

3805

In [18]:
# Persist the results as a two-item list: [edge-list DataFrame, id -> sequence dict].
with open('dip_int_and_dict.pkl', 'wb') as pkl_file:
    edge_table = nx.to_pandas_edgelist(dipgr)
    pickle.dump([edge_table, seq_dict], pkl_file)