In [None]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument, Doc2VecVocab
from gensim.models import word2vec
from gensim.models.keyedvectors import KeyedVectors
from utils import embedding_tools

from collections import Counter

import pandas as pd
import numpy as np
import os
from pathlib import Path
from sklearn.utils import shuffle


In [None]:
# Forward slashes work on every OS; the previous backslash literal relied on
# invalid escape sequences (\p, \s, \S) and only resolved on Windows.
data_path = Path('data/protein_data/submitochondrial/SM766-20')
# NOTE(review): `sequences` is the whole shuffled DataFrame here, whereas the
# M983/M495 cells further down pass the 'sequence' column values -- confirm
# that embedding_tools.Corpus accepts a DataFrame.
# NOTE(review): shuffle() is called without random_state, so row order differs
# between runs.
sequences = shuffle(pd.read_csv(data_path / 'SM766-20.csv'))

In [None]:
def getVecs(model, sequences, k, mean=True):
    """Infer one pooled vector per sequence.

    Each sequence is turned into sentences with
    ``embedding_tools.seq_to_k_sentence(sequence, int(k))``; ``model.infer_vector``
    is called on every sentence and the stacked results are reduced along
    axis 0.

    Args:
        model: Object exposing ``infer_vector(sentence)``.
        sequences: Iterable of sequences to embed.
        k: Value forwarded (as ``int``) to ``seq_to_k_sentence``.
        mean: When equal to ``True`` the sentence vectors are averaged,
            otherwise they are summed.

    Returns:
        list: One pooled array per input sequence, in input order.
    """
    pooled = []
    for sequence in sequences:
        stacked = np.array([
            model.infer_vector(sentence)
            for sentence in embedding_tools.seq_to_k_sentence(sequence, int(k))
        ])
        pooled.append(stacked.mean(0) if mean == True else stacked.sum(0))
    return pooled

def get_vectors(dm, dbow, sequences, k):
    """Build per-sequence features by joining DM and DBOW vectors.

    ``getVecs`` is called once with ``dm`` and once with ``dbow`` (default
    pooling), and the two results are concatenated along axis 1 with the DM
    part first.

    Args:
        dm: Model passed to the first ``getVecs`` call.
        dbow: Model passed to the second ``getVecs`` call.
        sequences: Sequences to embed.
        k: Forwarded unchanged to ``getVecs``.

    Returns:
        numpy.ndarray: The concatenated vectors.
    """
    per_model = tuple(getVecs(model, sequences, k) for model in (dm, dbow))
    return np.concatenate(per_model, axis=1)


def train(sequences, k, window, size):
    """Train a DM and a DBOW Doc2Vec model on the same corpus.

    The corpus is built by ``embedding_tools.Corpus(sequences, options)``.
    Both models share the same hyper-parameters; the second one is created
    with ``dm=0``. Vocabularies are built for both models before either is
    trained.

    Args:
        sequences: Passed straight to ``embedding_tools.Corpus``.
        k: Stored under the ``'k'`` corpus option and echoed in the log line.
        window: Doc2Vec ``window`` value.
        size: Doc2Vec ``vector_size`` value.

    Returns:
        tuple: ``(dm, dbow)`` -- the two trained models.
    """
    # NOTE(review): 'overlop' looks like a misspelling of 'overlap', but the key
    # is consumed by embedding_tools.Corpus -- confirm there before renaming.
    corpus_options = dict(k=k, overlop=False, merge=True)
    doc2vec_options = dict(
        vector_size=size,
        min_count=5,
        epochs=20,
        window=window,
        workers=4,
        negative=5,
    )

    print('Transfer sequences to %s grams' % k)
    documents = embedding_tools.Corpus(sequences, corpus_options)

    dm = Doc2Vec(**doc2vec_options)
    dbow = Doc2Vec(dm=0, **doc2vec_options)
    pair = (dm, dbow)

    print('Training model....')
    for model in pair:
        model.build_vocab(documents)
    for model in pair:
        model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
    return dm, dbow

def train_and_save_model(sequences, k, window, size, file_path):
    """Train the DM/DBOW pair for one hyper-parameter setting and save both.

    The models are written to ``<file_path>/dm/<k>_<window>_<size>.pkl`` and
    ``<file_path>/dbow/<k>_<window>_<size>.pkl`` via each model's ``save``
    method. If either target file already exists, a notice is printed and
    nothing is trained.

    Args:
        sequences: Training sequences, forwarded to ``train``.
        k: Forwarded to ``train``; also part of the file name.
        window: Forwarded to ``train``; also part of the file name.
        size: Forwarded to ``train``; also part of the file name.
        file_path: Root output directory (``str`` or ``pathlib.Path``).

    Returns:
        None
    """
    file_path = Path(file_path)  # accept plain strings as well as Path objects
    name_list = [str(k), str(window), str(size)]
    file_name = '_'.join(name_list) + '.pkl'
    dm_file = file_path / 'dm' / file_name
    dbow_file = file_path / 'dbow' / file_name
    if dm_file.exists():
        print('Model dm has already exists!')
        return
    if dbow_file.exists():
        print('Model dbow has already exists!')
        return

    # Create the output folders before training: otherwise a missing directory
    # is only discovered by save(), after the whole training run has been spent.
    for target in (dm_file, dbow_file):
        target.parent.mkdir(parents=True, exist_ok=True)

    print('k\t\twindow\t\tsize')
    print('\t\t'.join(name_list))
    dm, dbow = train(sequences, k, window, size)
    print('Finished training! \nSaving model...')
    dm.save(str(dm_file))
    dbow.save(str(dbow_file))
    # Separate the two paths; they were previously glued together in the message.
    print('Finished saving! \nSaved at ' + str(dm_file) + ' and ' + str(dbow_file))

In [None]:
# Grid search over embedding size, k and window. train_and_save_model returns
# early when a model file for a combination already exists, so this cell can be
# re-run to resume an interrupted sweep.
# Forward slashes keep the path portable; the previous backslash literal relied
# on an invalid escape sequence (\d) and only resolved on Windows.
file_path = Path('output/doc2vec_models')
embedding_size = [16, 32, 64, 128, 256]
for size in embedding_size:
    for k in range(5, 8):
        for window in range(3, 8):
            train_and_save_model(sequences, k=k, window=window, size=size, file_path=file_path)

In [None]:
# Forward slashes keep the paths portable; the previous backslash literals
# relied on invalid escape sequences (\p, \s, \M) and only resolved on Windows.
m983_path = Path('data/protein_data/submitochondrial/M983.csv')
m495_path = Path('data/protein_data/submitochondrial/M495.csv')

# NOTE(review): shuffle() is called without random_state, so row order differs
# between runs.
m983 = shuffle(pd.read_csv(m983_path))
m495 = shuffle(pd.read_csv(m495_path))

# Keep only the 'sequence' column values of each dataset.
m983_sequences = m983['sequence'].values
m495_sequences = m495['sequence'].values

print(m983_sequences.shape, m495_sequences.shape)

In [None]:
# Pool both datasets into one training list (M983 entries first, then M495).
# This rebinds `sequences`, which an earlier cell assigned from the SM766-20 CSV.
sequences = [*m983_sequences, *m495_sequences]
len(sequences)

In [None]:
# Train a single configuration on the pooled M983 + M495 sequences.
# `train_sequences` was never defined anywhere in the notebook (NameError on a
# fresh kernel); the pooled list built in the previous cell is `sequences`.
# Forward slashes keep the path portable across operating systems.
file_path = Path('output/doc2vec_models/m983_m495')
train_and_save_model(sequences, k=3, window=6, size=64, file_path=file_path)

In [None]:
# Grid search over embedding size, k and window on the pooled M983 + M495
# sequences. train_and_save_model returns early when a model file for a
# combination already exists, so this cell can be re-run to resume a sweep.
# Forward slashes keep the path portable; the previous backslash literal relied
# on invalid escape sequences (\d, \m) and only resolved on Windows.
file_path = Path('output/doc2vec_models/m983_m495')
embedding_size = [16, 32, 64, 128, 256]
for size in embedding_size:
    for k in range(2, 8):
        for window in range(3, 8):
            train_and_save_model(sequences, k=k, window=window, size=size, file_path=file_path)