In [1]:
import biovec
from Bio import SeqIO
import pandas as pd
from tqdm import tqdm

import numpy as np
import re

#from nltk.corpus import stopwords

from gensim.models import Word2Vec
from gensim.models import Phrases
from gensim.models.phrases import Phraser

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import logging

In [9]:
node_id_protein_id = pd.read_csv('nodeidx2proteinid.csv')

In [10]:
wanted_ids = set(node_id_protein_id['protein id'].tolist())

In [11]:
fasta_file = "protein.sequences.v11.5.fa" # Input fasta file

fasta_sequences = SeqIO.parse(open(fasta_file),'fasta')

seq_id_seq_dict = dict()

for seq in tqdm(fasta_sequences):
    if seq.id in wanted_ids:
        seq_id_seq_dict[seq.id] = seq.seq

67592464it [08:16, 136152.04it/s]


In [12]:
seq_list = []

for index, row in tqdm(node_id_protein_id.iterrows()):
    try:
        seq_list.append(seq_id_seq_dict[row['protein id']])
    except:
        #seq_list('NA')
        
node_id_protein_id['seq'] = seq_list

576289it [00:26, 22148.70it/s]


In [13]:
len(seq_list)

576289

In [21]:
seq_list = [str(x) for x in seq_list]

In [27]:
corpus = []

for seq in tqdm(seq_list):
    trigrams_list = []
    for i in range(len(seq)-2):
        trigrams_list.append(seq[i]+seq[i+1]+seq[i+2])
    corpus.append(trigrams_list)

100%|████████████████████████████████████████████████████████████████████████| 576289/576289 [02:31<00:00, 3803.81it/s]


In [28]:
num_of_sentences = len(corpus)
num_of_words = 0
for line in corpus:
    num_of_words += len(line)

print('Num of sentences - %s'%(num_of_sentences))
print('Num of words - %s'%(num_of_words))

Num of sentences - 576289
Num of words - 277466235


In [34]:
corpus_df = pd.DataFrame()

corpus_df['seq_trigrams'] = corpus

corpus_df.to_csv('ppa_corpus.csv')

In [3]:
corpus_df = pd.read_csv('ppa_corpus.csv')

corpus = corpus_df['seq_trigrams'].tolist()

In [6]:
# sg - skip gram |  window = size of the window | size = vector dimension
size = 100
window_size = 2 # sentences weren't too long, so
epochs = 100
min_count = 2
workers = 4

# train word2vec model using gensim
model = Word2Vec(corpus, sg=1,window=window_size,vector_size=size,min_count=min_count,workers=workers,epochs=epochs,sample=0.01)

In [5]:
model.save('ppa_protvec_model')

#model = Word2Vec.load('ppa_protvec_model')

In [29]:
all_protvec = np.zeros([len(seq_list),size])

for i in tqdm(range(len(seq_list))):
    all_protvec[i,:] = np.mean(model.wv[seq_list[i]], axis=0)

100%|█████████████████████████████████████████████████████████████████████████| 576289/576289 [09:56<00:00, 966.16it/s]


In [36]:
np.save('new_protvec.npy',all_protvec)

In [30]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score

In [31]:
k_clusters = 5

ppa_new_protvec_kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(all_protvec)

In [32]:
davies_bouldin_score(all_protvec,ppa_new_protvec_kmeans.labels_)

1.9851042792368045

In [33]:
ppa_node2vec = np.load('ogbl_ppa_node2vec.npy')

In [34]:
ppa_node2vec_kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(ppa_node2vec)

In [35]:
adjusted_mutual_info_score(ppa_node2vec_kmeans.labels_, ppa_new_protvec_kmeans.labels_)

0.03716810889563361