In [1]:
from Bio import SeqIO
import numpy as np

In [2]:
# Path (relative to the notebook) of the FASTA file that split_datasets reads.
input_fasta = '../data/cleaned_enzyme_topts_v1.fasta'

In [39]:
def split_datasets(input_fasta,split=0.9):
    """Filter a FASTA file by sequence length and write a train/test split.

    Records longer than 2000 residues are dropped, the remaining records are
    shuffled with a fixed seed (212) so the split is reproducible, and the
    first ``split`` fraction is written to ``<base>_train.fasta`` while the
    rest goes to ``<base>_test.fasta``. ``<base>`` is ``input_fasta`` with a
    trailing ``.fasta`` extension removed.

    Parameters
    ----------
    input_fasta : str
        Path to a fasta file with topts and ogts of all enzymes.
    split : float, optional
        Fraction of the (filtered, shuffled) records assigned to the training
        file; the remainder goes to the test file. Defaults to 0.9.

    Returns
    -------
    None
        The function prints a short summary and writes the two FASTA files.
    """
    max_residues = 2000
    extension = '.fasta'

    # 1. remove those enzymes longer than 2000 residues
    recs = [rec for rec in SeqIO.parse(input_fasta,'fasta') if len(rec.seq)<=max_residues]
    print('Number of sequnces:', len(recs))
    # Slice instead of range(5) so inputs with fewer than 5 records do not fail.
    print('ori',[rec.id for rec in recs[:5]])

    # 2. random split it into train and test
    # A private RandomState seeded with 212 yields the same permutation as
    # np.random.seed(212) + np.random.shuffle, but leaves the global NumPy
    # generator (and any seed the caller set) untouched.
    rng = np.random.RandomState(212)
    shuffled_index = np.arange(len(recs))
    rng.shuffle(shuffled_index)
    print(shuffled_index)
    # Reorder with plain list indexing; converting the records to a NumPy
    # array is unnecessary and NumPy may try to interpret sequence-like
    # records as nested arrays.
    recs = [recs[i] for i in shuffled_index]
    print('shuffuled',[rec.id for rec in recs[:5]])
    split_point = int(split*len(recs))
    training_seqs = recs[:split_point]
    test_seqs     = recs[split_point:]

    print('Number of training seqs',len(training_seqs))
    print('Number of test seqs',len(test_seqs))

    # save fasta files
    # Strip only a trailing extension: str.replace would also remove '.fasta'
    # occurring elsewhere in the path (e.g. in a directory name).
    if input_fasta.endswith(extension):
        base = input_fasta[:-len(extension)]
    else:
        base = input_fasta
    trainfile = base+'_train.fasta'
    testfile = base+'_test.fasta'

    # Context managers close both handles even if writing fails.
    with open(trainfile,'w') as ftrain, open(testfile,'w') as ftest:
        SeqIO.write(training_seqs,ftrain,'fasta')
        SeqIO.write(test_seqs,ftest,'fasta')

In [40]:
# Write the 90/10 train/test FASTA files; their paths are derived from input_fasta.
split_datasets(input_fasta,split=0.9)

Number of sequnces: 1896
ori ['K9L4P7', 'Q9L7P2', 'Q5JDG9', 'A0A0K2RV92', 'Q7S3R5']
[ 633   79  778 ... 1250  573  280]
shuffuled ['Q96XN9', 'Q7BPX6', 'Q70GL3', 'P9WJN3', 'P24228']
Number of training seqs 1706
Number of test seqs 190


In [42]:
# Show the first lines of the training FASTA file to check its contents.
!less ../data/cleaned_enzyme_topts_v1_train.fasta|head

>Q96XN9 ogt=75;topt=80.0
MAKLITLGEILIEFNALSPGPLRHVSYFEKHVAGSEANYCVAFIKQGNECGIIAKVGDDE
FGYNAIEWLRGQGVDVSHMKIDPSAPTGIFFIQRHYPVPLKSESIYYRKGSAGSKLSPED
VDEEYVKSADLVHSSGITLAISSTAKEAVYKAFEIASNRSFDTNIRLKLWSAEEAKREIL
KLLSKFHLKFLITDTDDSKIILGESDPDKAAKAFSDYAEIIVMKLGPKGAIVYYDGKKYY
SSGYQVPVEDVTGAGDALGGTFLSLYYKGFEMEKALDYAIVASTLNVMIRGDQENLPTTK
DIETFLREMKK
>Q7BPX6 ogt=31;topt=38.0
MAINLDWENLGFSYRNLPFRYIARFKDGKWSAGELTGDNQLHISESSPALHYGQQGFEGL
KAYRTKDGSIQLFRPDQNAARLQKTARRLCMAEVSTEMFIDAVKQVVKANKDFVPPYGTG


In [43]:
# Show the first lines of the test FASTA file to check its contents.
!less ../data/cleaned_enzyme_topts_v1_test.fasta|head

>B8Y3Y0 ogt=nan;topt=45.0
MKNKVQLIAYVDRISGGGFRKLHALLTGPLAEIFGGAHLLPFFTPIDGADAGFDPSDHTQ
VDPRLGTWDDVRILGGAIELVADLIVNHVSSSSPQFIDYSKKGSDSLYAGMFLTYDRVFP
EGAREADILRIYRPRPTLPFSPVTLSSRERKLLWTTFNPEQVDIDVRHPEAEAYLHSILK
KFQAAGIRMIRLDAVGYAIKKPGASCFMIPETFDFIAELTEKARALGIEVLVEIHSHYRK
QIEIARQVDWVYDFALPPLVLHALFASDPHPLAQWLSISPRNAVTVLDTHDGIGVIDVGA
DAEGNPGLLSPAAIDSLVETIHSRSQGQSREATGAAANNLDLYQVNCTFLDALGGREPDY
LIARALQFFAPGIPQVYYVGLLGGTNDMDLLGRSGVGRDINRHYYTDAEIDAALARPLVR
TLIALIRLRNTHPAFAGEFDVSVPAATQIRLRWQRQEHWIELHVDLSIPKASITGTGIHP
ITIPGAADAGAPS
