In [202]:
import pandas as pd
import numpy as np
#Importing gensim libraries for d2v embedding
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [203]:
#Functions to generate kmers

def seq_to_kmers(seq, k=3, overlap=False, **kwargs):
    """ Divide a string into lists of kmer strings.
    Parameters:
        seq (string): sequence to split
        k (int), default 3: length of each kmer; must be >= 1
        overlap (Boolean), default False: if True, slide a window one
            character at a time; if False, emit non-overlapping kmers
            once for each of the k possible starting offsets
        **kwargs: accepted and ignored
    Returns:
        List containing 1 list of kmers (overlap=True) or k lists of
            kmers (overlap=False). The inner lists are empty when seq
            is shorter than k.
    Raises:
        ValueError: if k is smaller than 1
    """
    # A non-positive k would otherwise yield an empty result or lists of
    # empty / nonsensical slices without any error.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Exclusive upper bound for kmer start positions: only full-length
    # kmers are produced, trailing fragments are dropped.
    stop = len(seq) - k + 1
    if overlap:
        return [[seq[i:i + k] for i in range(stop)]]
    # One list per reading offset 0..k-1, each stepping k characters.
    return [[seq[i:i + k] for i in range(offset, stop, k)]
            for offset in range(k)]


def seqs_to_kmers(seqs, k=3, overlap=False, **kwargs):
    """Split every sequence in an iterable into kmer lists.

    Each sequence is stripped of surrounding whitespace and passed to
    seq_to_kmers; the kmer lists it returns are concatenated in input
    order into one flat list.
    Parameters:
        seqs (iterable) containing strings
        k (int), default 3
        overlap (Boolean), default False
        **kwargs: accepted but not forwarded
    Returns:
        List of lists of kmers
    """
    return [
        kmer_list
        for raw_seq in seqs
        for kmer_list in seq_to_kmers(raw_seq.strip(), k=k, overlap=overlap)
    ]

## Data preprocessing:

In [215]:
# Load the orf1ab sequence + metadata table; the path is relative to the
# notebook's working directory.
dataset = pd.read_csv('data/orf1ab_df_seq_meta.csv')

In [222]:
# Display the loaded frame to check its columns before selecting from it.
dataset

Unnamed: 0.1,Unnamed: 0,Accession,Sequence,Release_Date,Species,Length,Geo_Location,Host,Isolation_Source,Collection_Date,GenBank_Title,Protein,Country,Host_agg
0,0,YP_009724389,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,2020-01-13T00:00:00Z,Severe acute respiratory syndrome-related coro...,7096,China,Homo sapiens,,2019-12,orf1ab polyprotein [Severe acute respiratory s...,orf1ab polyprotein,China,Homo
1,1,YP_009555238,MSKINKYGLELHWAPEFPWMFEDAEEKLDNPSSSEVDMICSTTAQK...,2019-02-21T00:00:00Z,Betacoronavirus 1,7095,USA,,,,Orf1ab [Human coronavirus OC43],Orf1ab,USA,
2,2,YP_002308478,MVKNVSKRSPIVLPQIQPPPLQLFIAVAAAEEGHPKDLKYLGNYNL...,2018-08-24T00:00:00Z,Bulbul coronavirus HKU11,6264,Hong Kong,Pycnonotus jocosus,,2007-01,orf1ab polyprotein [Bulbul coronavirus HKU11-934],orf1ab polyprotein,Hong Kong,Pycnonotus
3,3,YP_009513008,MSSATGEGSQGARATYRAALNNEKRHDHVALTVPCCGTEAKVTALS...,2018-08-24T00:00:00Z,Hedgehog coronavirus 1,7150,Germany,Erinaceus europaeus,feces,2012,orf1ab [Betacoronavirus Erinaceus/VMC/DEU/2012],orf1ab,Germany,Erinaceus
4,4,YP_009513020,MAKNKSKRDAIALPENVPPPLQLFIHVAAAEEGHPKVTTYLGNYNL...,2018-08-24T00:00:00Z,Coronavirus HKU15,6267,China: Hong Kong,Sus scrofa,,2010,replicase polyprotein [Porcine coronavirus HKU15],replicase polyprotein,China,Sus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2710,2710,QIU78777,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,2020-04-06T00:00:00Z,Severe acute respiratory syndrome-related coro...,7096,Spain,Homo sapiens,,2020-03-10,ORF1ab polyprotein [Severe acute respiratory s...,ORF1ab polyprotein,Spain,Homo
2711,2711,QIU78717,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,2020-04-06T00:00:00Z,Severe acute respiratory syndrome-related coro...,7096,Spain,Homo sapiens,,2020-03-10,ORF1ab polyprotein [Severe acute respiratory s...,ORF1ab polyprotein,Spain,Homo
2712,2712,QIU78705,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,2020-04-06T00:00:00Z,Severe acute respiratory syndrome-related coro...,7096,Spain,Homo sapiens,,2020-03-09,ORF1ab polyprotein [Severe acute respiratory s...,ORF1ab polyprotein,Spain,Homo
2713,2713,QIU78741,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,2020-04-06T00:00:00Z,Severe acute respiratory syndrome-related coro...,7096,Spain,Homo sapiens,,2020-03-10,ORF1ab polyprotein [Severe acute respiratory s...,ORF1ab polyprotein,Spain,Homo


In [205]:
#Removing sequences that don't start with Methionine
# Casting to str first means missing values become text and fail the
# startswith('M') check instead of raising.
starts_with_met = dataset['Sequence'].astype(str).str.startswith('M')
dataset = dataset[starts_with_met]

In [None]:
# Select columns by label instead of by position. The frame displayed
# above starts with two index-like columns ('Unnamed: 0.1', 'Unnamed: 0'),
# so the positional picks 1 / 2 / 7 / 10 land on 'Unnamed: 0', 'Accession',
# 'Geo_Location' and 'Collection_Date' rather than on the accession,
# sequence and host columns. Labels stay correct however many leading
# columns the CSV carries.
ids = dataset['Accession'].values    # used as the document tags
seqs = dataset['Sequence'].values    # strings that get split into k-mers
host = dataset['Host'].values
hosts = list(host)
# NOTE(review): 'GenBank_Title' follows the same one-column shift applied to
# the three picks above. If `sp` is meant to hold the species label, use
# dataset['Species'] instead -- confirm against downstream use.
sp = dataset['GenBank_Title'].values

In [206]:
#Generating k-mers of length k
# With overlap=True, seq_to_kmers returns exactly one k-mer list per
# sequence, so kmers[i] lines up with seqs[i] (and therefore with ids[i]).
k=4
kmers= seqs_to_kmers(seqs, k=k, overlap=True)

## Doc2Vec model:

In [207]:
#Generating tagged documents to embed
# One TaggedDocument per id: the words are the k-mer list at the same
# position, the tag is a one-element tuple holding that id.
documents = [
    TaggedDocument(kmers[pos], (seq_id,))
    for pos, seq_id in enumerate(ids)
]

In [211]:
vector_sizes=[10,50,100,200,300]
epochs=[5,10,20,50,100,200]

# Document vectors of every trained configuration, keyed by
# (vector_size, epochs). `model` is reassigned on each iteration, so
# without this only the last configuration's result would still be
# reachable after the loop.
doc_vectors = {}

for vs in vector_sizes:
    for ep in epochs:
        #Define doc2vec model (min_count=1 keeps every k-mer in the vocabulary)
        model = Doc2Vec(vector_size=vs, min_count=1, epochs=ep, window=5)
        #Building model vocab
        model.build_vocab(documents)
        #training model
        print(f'Training d2v model with vector size:{vs} and epochs:{ep}...')
        model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
        #Keep this configuration's embeddings so the sweep can be compared afterwards
        doc_vectors[(vs, ep)] = model.docvecs.vectors_docs
        #To persist the full model as well, enable the line below (the target directory must exist):
        #model.save(f'models/ep{ep}_vs{vs}.d2v')

50 5
done!
50 20
done!


In [196]:
#Getting embedded vectors
# `model` is whichever Doc2Vec instance was assigned most recently, so X
# holds the document vectors of that single configuration only.
# NOTE(review): `docvecs.vectors_docs` is the older gensim attribute path --
# confirm it matches the installed gensim version.
X= model.docvecs.vectors_docs