In [24]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import random

In [19]:
# Load the combined positive and negative FASTA files into sets of unique,
# upper-cased sequence strings.
#
# Each file is read inside a `with` block so its handle is closed as soon as
# the records have been consumed (the bare `open(...)` used previously was
# never closed).
with open('../data/combined/toxin-0.fasta') as posHandle:
    posSeqsFA = SeqIO.parse(posHandle, 'fasta')
    # The parser is lazy, so it must be drained while the handle is still open.
    posSeqs = {str(record.seq).upper() for record in posSeqsFA}

with open('../data/combined/nontoxin-0.fasta') as negHandle:
    negSeqsFA = SeqIO.parse(negHandle, 'fasta')
    negSeqs = {str(record.seq).upper() for record in negSeqsFA}

# Number of distinct sequences in each class.
print(len(posSeqs))
print(len(negSeqs))



80000
28315


In [20]:
def _loadLabelledSeqs(path, isNegative):
    """Read a FASTA file and split its sequences by label.

    Args:
        path: Location of the FASTA file to read.
        isNegative: Callable that receives a record's description line and
            returns True when the record belongs in the negative set.

    Returns:
        A ``(positives, negatives)`` tuple of sets of sequence strings.

    Sequences are upper-cased: the combined-dataset sequences are upper-cased
    when they are loaded, and the overlap filter compares by exact string
    membership, so mixed-case entries here would otherwise never match.
    """
    positives, negatives = set(), set()
    # The context manager guarantees the handle is closed after parsing.
    with open(path) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            sequence = str(record.seq).upper()
            if isNegative(record.description):
                negatives.add(sequence)
            else:
                positives.add(sequence)
    return positives, negatives


def _proteinIsNegative(description):
    """Return True when a protein header's second tab-separated field is '0'."""
    # The two-name unpack deliberately raises ValueError on a header that does
    # not have exactly two tab-separated fields, rather than mislabelling it.
    _, label = description.split('\t')
    return label == '0'


def _peptideIsNegative(description):
    """Return True when a peptide header's first space-separated field is '|non-toxin'."""
    # Same strictness as above: exactly two space-separated fields are expected.
    label, _ = description.split(' ')
    return label == '|non-toxin'


# Protein training split.
toxibtlProteinTrainPosSeqs, toxibtlProteinTrainNegSeqs = _loadLabelledSeqs(
    '../data/toxibtl/protein/train.fa', _proteinIsNegative)

# Protein test split.
toxibtlProteinTestPosSeqs, toxibtlProteinTestNegSeqs = _loadLabelledSeqs(
    '../data/toxibtl/protein/test.fa', _proteinIsNegative)

# Peptide set (single file, label carried in the first header field).
toxibtlPeptidePosSeqs, toxibtlPeptideNegSeqs = _loadLabelledSeqs(
    '../data/toxibtl/peptide/peptide.fasta', _peptideIsNegative)

# Every sequence seen in any of the three files, per class.
toxibtlPosSeqs = toxibtlProteinTrainPosSeqs.union(toxibtlProteinTestPosSeqs).union(toxibtlPeptidePosSeqs)
toxibtlNegSeqs = toxibtlProteinTrainNegSeqs.union(toxibtlProteinTestNegSeqs).union(toxibtlPeptideNegSeqs)

In [25]:
# Keep only sequences shorter than 50 residues that do not already appear in
# the same-class ToxIBTL set.
maxLenExclusive = 50

posNotInIBTL = [
    candidate for candidate in posSeqs
    if len(candidate) < maxLenExclusive and candidate not in toxibtlPosSeqs
]
negNotInIBTL = [
    candidate for candidate in negSeqs
    if len(candidate) < maxLenExclusive and candidate not in toxibtlNegSeqs
]

In [26]:
# Report how many candidates remain in each class (positives first).
for remaining in (posNotInIBTL, negNotInIBTL):
    print(len(remaining))

1521
18931


In [29]:
# Draw a fixed-size random subset of each class for evaluation.
#
# Reproducibility: the candidate lists are built by iterating over sets of
# strings, whose order can differ between kernel sessions, and the module-level
# RNG is unseeded. Sorting the population and sampling from a dedicated seeded
# generator makes the same sequences come out on every Restart & Run All.
sampleSeed = 0
sampleSize = 200
sampleRng = random.Random(sampleSeed)

# random.sample raises ValueError if fewer than sampleSize candidates exist.
posToTest = sampleRng.sample(sorted(posNotInIBTL), sampleSize)
negToTest = sampleRng.sample(sorted(negNotInIBTL), sampleSize)

In [33]:
# Wrap each sampled sequence in a SeqRecord whose id is its position in the
# sample and whose description is empty.
posToTestFA = [
    SeqRecord(Seq(residues), id=str(position), description='')
    for position, residues in enumerate(posToTest)
]
negToTestFA = [
    SeqRecord(Seq(residues), id=str(position), description='')
    for position, residues in enumerate(negToTest)
]

# Write one FASTA file per class into the current directory.
for outPath, records in (("./pos.fasta", posToTestFA), ("./neg.fasta", negToTestFA)):
    with open(outPath, "w") as f:
        SeqIO.write(records, f, "fasta")