In [22]:
from Bio import SeqIO
from tqdm import tqdm
from itertools import groupby
import pandas as pd
import numpy as np

In [10]:
node_id_protein_id = pd.read_csv('nodeidx2proteinid.csv')

In [11]:
node_id_protein_id.head(5)

Unnamed: 0,node idx,protein id
0,0,4513.MLOC_25875.1
1,1,9823.ENSSSCP00000009454
2,2,9823.ENSSSCP00000026558
3,3,6239.F58B4.7
4,4,4577.GRMZM2G074351_P01


In [12]:
protein_id_seq_dict = dict()

fh = open('protein.sequences.v11.5.fa')

# ditch the boolean (x[0]) and just keep the header or sequence since
# we know they alternate.
faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))

for header in tqdm(faiter):
    # drop the ">"
    headerStr = header.__next__()[1:].strip()

    # join all sequence lines to one.
    seq = "".join(s.strip() for s in faiter.__next__())

    protein_id_seq_dict[headerStr] = seq

67592464it [06:51, 164073.38it/s]


In [15]:
seq_list = []

for index, row in tqdm(node_id_protein_id.iterrows()):
    seq_list.append(protein_id_seq_dict[row['protein id']])

576289it [00:38, 14957.57it/s]


In [16]:
node_id_protein_id['seq'] = seq_list

In [17]:
node_id_protein_id.head(5)

Unnamed: 0,node idx,protein id,seq
0,0,4513.MLOC_25875.1,MTVQMRRGGATCLSLVQVVAVVSYVVVLMASAGVRAQLRVGFYDSS...
1,1,9823.ENSSSCP00000009454,MAVAAALAGLQAEAKCPICLDSLHDPVTIQCGHNFCRRCIQRSWAE...
2,2,9823.ENSSSCP00000026558,MSLKWLSLLLLLQLTCYFSSGRCGKVLVWPMEYSHWINMKIILEEL...
3,3,6239.F58B4.7,MICVIVIFLISCAMIVSFCSKNSRKCERENGDAEERKNTLLMISDN...
4,4,4577.GRMZM2G074351_P01,MSGPFAETRRPPARRPLSSCRSAPLSDHPPPHSSPATGAHFHELAS...


In [18]:
node_id_protein_id.to_csv('node_id_protein_id.csv')

## Get ProtVec Embeddings

In [19]:
protvec_model = pd.read_csv('protVec_100d_3grams.csv', delimiter = '\t')

In [23]:
trigram_dict = {}

for idx, row in tqdm(protvec_model.iterrows()):
    trigram_dict[row['words']] = protvec_model.iloc[idx, 1:].values.astype(np.float)

trigram_list = set(trigram_dict.keys())

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  after removing the cwd from sys.path.
9048it [00:04, 1811.84it/s]


In [25]:
target_list = node_id_protein_id['seq'].tolist()

target_embeddings = np.zeros((len(target_list), 100)) ## 100 is the ProtVec embedding dimensions
length_of_target = [0 for _ in range(len(target_list))]

In [26]:
for idx, target in tqdm(enumerate(target_list)):
    
    n = 3
    split_by_three = [target[i : i + n] for i in range(0, len(target), n)]
    length_of_target[idx] = len(split_by_three)

    for trigram in split_by_three: 

        if len(trigram) == 2: 
            trigram = "X" + trigram

        elif len(trigram) == 1:
            trigram = "XX" + trigram

        if trigram in trigram_list:
            target_embeddings[idx, :] = target_embeddings[idx, :] + trigram_dict[trigram]


576289it [06:05, 1577.70it/s]


In [30]:
target_embeddings.shape

(576289, 100)

In [34]:
list(target_embeddings)[0]

array([-8.551412, -3.597922, -0.344051, -7.571492, -3.429436, -0.559949,
        1.056515, -3.411363, -1.23382 ,  8.935931, -0.227794, -2.177607,
       -2.04591 ,  0.279449,  0.550605, -1.279316, -1.358398, -1.880208,
       -1.21748 , -0.554456, -0.518071, -4.407914, -5.010958,  0.879433,
       -1.436518, -3.277022, -1.903488,  0.454888, -1.492889, -1.706882,
       -0.644654, -5.604222,  0.913387, -1.981428,  2.399694,  2.265022,
       -2.829976, -0.164854,  4.266591, -0.621724,  1.468068,  3.072114,
        3.082963, -0.450081, -1.237607,  1.161086, -4.056343,  0.140034,
       -2.031192,  1.960296, -1.879468, -3.944779,  0.644325, -5.3801  ,
        1.438516, -3.793947, -0.936373, -1.098297,  0.981725, -5.466712,
        1.626954, 10.535323, -0.941554,  2.717432,  2.472714,  3.074986,
        0.816342,  4.355083,  2.637897, -0.904921, -6.199516, -2.127933,
        2.497637,  0.394921, -3.854306, -4.414594,  1.344574, -4.179294,
        2.486423, -0.900568, -3.222019, -3.822983, 

In [33]:
node_id_protein_id['ProtVec'] = list(target_embeddings)

In [35]:
node_id_protein_id.to_csv('node_id_protein_id.csv')

In [36]:
from tempfile import TemporaryFile
#outfile = TemporaryFile()
np.save('protvec_embeddings.npy', target_embeddings)