In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

## ToxIBTL Dataset Analysis

In [2]:
# Read the ToxIBTL protein training set and split its sequences by label.
# Each FASTA description must split on a tab into exactly two fields; the
# second field is the label, where '0' means negative and anything else positive.
toxibtlProteinTrainPosSeqs = set()
toxibtlProteinTrainNegSeqs = set()

# One entry per extra occurrence of a sequence already seen under the same label.
toxibtlProteinTrainPosRepeats = []
toxibtlProteinTrainNegRepeats = []

# The context manager closes the file once parsing is done (the parser reads
# lazily, so the loop has to stay inside the `with` block).
with open('../data/toxibtl/protein/train.fa') as handle:
    toxibtlProteinTrainSeqs = SeqIO.parse(handle, 'fasta')
    for fasta in toxibtlProteinTrainSeqs:
        _, isPos = fasta.description.split('\t')
        seq = str(fasta.seq)
        if isPos == '0':
            if seq in toxibtlProteinTrainNegSeqs:
                toxibtlProteinTrainNegRepeats.append(seq)
            else:
                toxibtlProteinTrainNegSeqs.add(seq)
        else:
            if seq in toxibtlProteinTrainPosSeqs:
                toxibtlProteinTrainPosRepeats.append(seq)
            else:
                toxibtlProteinTrainPosSeqs.add(seq)

In [3]:
# unique positive / negative sequence counts in the protein training set
print(len(toxibtlProteinTrainPosSeqs), len(toxibtlProteinTrainNegSeqs), sep='\n')

4331
5519


In [4]:
# number of repeated occurrences per label in the protein training set
print(len(toxibtlProteinTrainPosRepeats), len(toxibtlProteinTrainNegRepeats), sep='\n')

82
152


In [5]:
# Read the ToxIBTL protein test set and split its sequences by label.
# Each FASTA description must split on a tab into exactly two fields; the
# second field is the label, where '0' means negative and anything else positive.
toxibtlProteinTestPosSeqs = set()
toxibtlProteinTestNegSeqs = set()

# One entry per extra occurrence of a sequence already seen under the same label.
toxibtlProteinTestPosRepeats = []
toxibtlProteinTestNegRepeats = []

# The context manager closes the file once parsing is done (the parser reads
# lazily, so the loop has to stay inside the `with` block).
with open('../data/toxibtl/protein/test.fa') as handle:
    toxibtlProteinTestSeqs = SeqIO.parse(handle, 'fasta')
    for fasta in toxibtlProteinTestSeqs:
        _, isPos = fasta.description.split('\t')
        seq = str(fasta.seq)
        if isPos == '0':
            if seq in toxibtlProteinTestNegSeqs:
                toxibtlProteinTestNegRepeats.append(seq)
            else:
                toxibtlProteinTestNegSeqs.add(seq)
        else:
            if seq in toxibtlProteinTestPosSeqs:
                toxibtlProteinTestPosRepeats.append(seq)
            else:
                toxibtlProteinTestPosSeqs.add(seq)

In [6]:
# unique positive / negative sequence counts in the protein test set
print(len(toxibtlProteinTestPosSeqs), len(toxibtlProteinTestNegSeqs), sep='\n')

59
670


In [7]:
# number of repeated occurrences per label in the protein test set
print(len(toxibtlProteinTestPosRepeats), len(toxibtlProteinTestNegRepeats), sep='\n')

0
0


In [8]:
# Read the ToxIBTL peptide set and split its sequences by label.
# Each FASTA description must split on a single space into exactly two fields;
# the first field is the label, where '|non-toxin' means negative and anything
# else positive.
toxibtlPeptidePosSeqs = set()
toxibtlPeptideNegSeqs = set()

# A peptide counts as a repeat when the same sequence was already seen under
# the same label in this file OR in the protein train/test sets read earlier.
# Such sequences are therefore NOT added to the peptide sets.
toxibtlPeptidePosRepeats = []
toxibtlPeptideNegRepeats = []

# The context manager closes the file once parsing is done (the parser reads
# lazily, so the loop has to stay inside the `with` block).
with open('../data/toxibtl/peptide/peptide.fasta') as handle:
    toxibtlPeptideSeqs = SeqIO.parse(handle, 'fasta')
    for fasta in toxibtlPeptideSeqs:
        isPos, _ = fasta.description.split(' ')
        seq = str(fasta.seq)
        if isPos == '|non-toxin':
            if seq in toxibtlPeptideNegSeqs or seq in toxibtlProteinTrainNegSeqs or seq in toxibtlProteinTestNegSeqs:
                toxibtlPeptideNegRepeats.append(seq)
            else:
                toxibtlPeptideNegSeqs.add(seq)
        else:
            if seq in toxibtlPeptidePosSeqs or seq in toxibtlProteinTrainPosSeqs or seq in toxibtlProteinTestPosSeqs:
                toxibtlPeptidePosRepeats.append(seq)
            else:
                toxibtlPeptidePosSeqs.add(seq)

In [9]:
# peptide sequences not already present in the protein sets, per label
print(len(toxibtlPeptidePosSeqs), len(toxibtlPeptideNegSeqs), sep='\n')

1303
1841


In [10]:
# peptide occurrences that duplicate an earlier peptide or a protein-set sequence
print(len(toxibtlPeptidePosRepeats), len(toxibtlPeptideNegRepeats), sep='\n')

629
91


The following conclusions can be drawn:
* There are repeat sequences in the protein train set. This may affect training of model weights.
* There are data points in the peptide dataset that are also in the protein dataset. Given that Wei et al. do not provide the actual train-test split they used, it's possible that the test set they used had data points that were used to train the model. This prevents a proper evaluation of the model.

## ToxinPred Dataset Analysis

In [11]:
# pool every ToxIBTL split (protein train, protein test, peptide) per label
toxibtlPosSeqs = toxibtlProteinTrainPosSeqs | toxibtlProteinTestPosSeqs | toxibtlPeptidePosSeqs
toxibtlNegSeqs = toxibtlProteinTrainNegSeqs | toxibtlProteinTestNegSeqs | toxibtlPeptideNegSeqs

In [12]:
# sanity check: sizes of the pooled ToxIBTL positive / negative sets
print(len(toxibtlPosSeqs), len(toxibtlNegSeqs), sep='\n')

5693
8030


In [13]:
# read in train data: every line of each ToxinPred "main" file becomes one list entry
toxinpredTrainPosSeqs1 = []
toxinpredTrainPosSeqs2 = []
toxinpredTrainNegSeqs1 = []
toxinpredTrainNegSeqs2 = []

# (file path, list that receives its lines), positives first and then negatives
for path, bucket in (
    ('../data/toxinpred/toxinpred-main-pos-1.txt', toxinpredTrainPosSeqs1),
    ('../data/toxinpred/toxinpred-main-pos-2.txt', toxinpredTrainPosSeqs2),
    ('../data/toxinpred/toxinpred-main-neg-1.txt', toxinpredTrainNegSeqs1),
    ('../data/toxinpred/toxinpred-main-neg-2.txt', toxinpredTrainNegSeqs2),
):
    with open(path) as f:
        lines = f.read().splitlines()
    bucket.extend(lines)

In [14]:
# merge the two files of each label into a single de-duplicated set
toxinpredTrainPosSeqs = set(toxinpredTrainPosSeqs1).union(toxinpredTrainPosSeqs2)
toxinpredTrainNegSeqs = set(toxinpredTrainNegSeqs1).union(toxinpredTrainNegSeqs2)

In [15]:
# unique ToxinPred training sequences per label
print(len(toxinpredTrainPosSeqs), len(toxinpredTrainNegSeqs), sep='\n')

1805
16043


In [16]:
# ToxinPred training sequences absent from the same-label ToxIBTL pool
posTrainNotInIBTL = [seq for seq in toxinpredTrainPosSeqs if seq not in toxibtlPosSeqs]
negTrainNotInIBTL = [seq for seq in toxinpredTrainNegSeqs if seq not in toxibtlNegSeqs]

In [17]:
# how many ToxinPred training sequences are new relative to ToxIBTL
print(len(posTrainNotInIBTL), len(negTrainNotInIBTL), sep='\n')

987
14968


In [18]:
# duplicate count within each training file: total lines minus distinct lines
print(
    *(
        len(entries) - len(set(entries))
        for entries in (
            toxinpredTrainPosSeqs1,
            toxinpredTrainPosSeqs2,
            toxinpredTrainNegSeqs1,
            toxinpredTrainNegSeqs2,
        )
    ),
    sep='\n',
)

0
0
0
0


In [19]:
# read in test data: every line of each ToxinPred "ind" file becomes one list entry
toxinpredTestPosSeqs1 = []
toxinpredTestPosSeqs2 = []
toxinpredTestNegSeqs1 = []
toxinpredTestNegSeqs2 = []

# created empty here; nothing in this cell populates them
toxinpredTestPosRepeatSeqs = set()
toxinpredTestNegRepeatSeqs = set()

# (file path, list that receives its lines), positives first and then negatives
for path, bucket in (
    ('../data/toxinpred/toxinpred-ind-pos-1.txt', toxinpredTestPosSeqs1),
    ('../data/toxinpred/toxinpred-ind-pos-2.txt', toxinpredTestPosSeqs2),
    ('../data/toxinpred/toxinpred-ind-neg-1.txt', toxinpredTestNegSeqs1),
    ('../data/toxinpred/toxinpred-ind-neg-2.txt', toxinpredTestNegSeqs2),
):
    with open(path) as f:
        lines = f.read().splitlines()
    bucket.extend(lines)

In [20]:
# merge the two files of each label into a single de-duplicated set
toxinpredTestPosSeqs = set(toxinpredTestPosSeqs1).union(toxinpredTestPosSeqs2)
toxinpredTestNegSeqs = set(toxinpredTestNegSeqs1).union(toxinpredTestNegSeqs2)

In [21]:
# unique ToxinPred test sequences per label
print(len(toxinpredTestPosSeqs), len(toxinpredTestNegSeqs), sep='\n')

303
1298


In [22]:
# ToxinPred test sequences absent from the same-label ToxIBTL pool
posTestNotInIBTL = toxinpredTestPosSeqs.difference(toxibtlPosSeqs)
negTestNotInIBTL = toxinpredTestNegSeqs.difference(toxibtlNegSeqs)

In [23]:
# how many ToxinPred test sequences are new relative to ToxIBTL
print(len(posTestNotInIBTL), len(negTestNotInIBTL), sep='\n')

57
1214


In [24]:
# duplicate count within each test file: total lines minus distinct lines
print(
    *(
        len(entries) - len(set(entries))
        for entries in (
            toxinpredTestPosSeqs1,
            toxinpredTestPosSeqs2,
            toxinpredTestNegSeqs1,
            toxinpredTestNegSeqs2,
        )
    ),
    sep='\n',
)

0
0
0
0


In [25]:
# check overlap in train / test split
# For each file pair, collect the sequences that occur in both the test list
# and the matching train list. Set intersection gives the same result as
# scanning the train list once per test sequence, without the quadratic cost.
toxinpredPosOverlaps1 = set(toxinpredTestPosSeqs1).intersection(toxinpredTrainPosSeqs1)
toxinpredPosOverlaps2 = set(toxinpredTestPosSeqs2).intersection(toxinpredTrainPosSeqs2)
toxinpredNegOverlaps1 = set(toxinpredTestNegSeqs1).intersection(toxinpredTrainNegSeqs1)
toxinpredNegOverlaps2 = set(toxinpredTestNegSeqs2).intersection(toxinpredTrainNegSeqs2)

In [26]:
# number of sequences shared between train and test, per file pair
print(
    *(
        len(shared)
        for shared in (
            toxinpredPosOverlaps1,
            toxinpredPosOverlaps2,
            toxinpredNegOverlaps1,
            toxinpredNegOverlaps2,
        )
    ),
    sep='\n',
)

0
0
0
0


From the above, we can conclude the following:
* The ToxinPred dataset contains training examples that were not found in the ToxIBTL dataset.

## ToxDL Dataset Analysis

In [27]:
# pool the ToxinPred train and test sets per label
toxinpredPosSeqs = toxinpredTrainPosSeqs | toxinpredTestPosSeqs
toxinpredNegSeqs = toxinpredTrainNegSeqs | toxinpredTestNegSeqs

In [28]:
# sanity check: sizes of the pooled ToxinPred positive / negative sets
print(len(toxinpredPosSeqs), len(toxinpredNegSeqs), sep='\n')

2108
17331


In [29]:
# Read the ToxDL train and validation files into one pair of label sets.
# Each FASTA description must split on a tab into exactly two fields; the
# second field is the label, where '0' means negative and anything else positive.
toxdlTrainPosSeqs = set()
toxdlTrainNegSeqs = set()

# Distinct sequences that occur more than once under the same label
# (across both files, since the sets are shared).
toxdlTrainPosRepeats = set()
toxdlTrainNegRepeats = set()

toxdlTrainValFiles = ['../data/toxdl/train.fa', '../data/toxdl/valid.fa']
for file in toxdlTrainValFiles:
    # The context manager closes each file once it has been parsed (the parser
    # reads lazily, so the inner loop has to stay inside the `with` block).
    with open(file) as handle:
        toxdlSeqs = SeqIO.parse(handle, 'fasta')
        for fasta in toxdlSeqs:
            _, isPos = fasta.description.split('\t')
            seq = str(fasta.seq)
            if isPos == '0':
                if seq in toxdlTrainNegSeqs:
                    toxdlTrainNegRepeats.add(seq)
                toxdlTrainNegSeqs.add(seq)
            else:
                if seq in toxdlTrainPosSeqs:
                    toxdlTrainPosRepeats.add(seq)
                toxdlTrainPosSeqs.add(seq)

In [30]:
# unique ToxDL train + validation sequences per label
print(len(toxdlTrainPosSeqs), len(toxdlTrainNegSeqs), sep='\n')

4356
5796


In [31]:
# distinct sequences that were repeated in ToxDL train + validation, per label
print(len(toxdlTrainPosRepeats), len(toxdlTrainNegRepeats), sep='\n')

69
85


In [32]:
# Read the two ToxDL test files into one pair of label sets.
# Each FASTA description must split on a tab into exactly two fields; the
# second field is the label, where '0' means negative and anything else positive.
toxdlTestPosSeqs = set()
toxdlTestNegSeqs = set()

# Distinct sequences that occur more than once under the same label
# (across both files, since the sets are shared).
toxdlTestPosRepeats = set()
toxdlTestNegRepeats = set()

toxdlTestFiles = ['../data/toxdl/bacteria1.fa', '../data/toxdl/test.fa']
for file in toxdlTestFiles:
    # The context manager closes each file once it has been parsed (the parser
    # reads lazily, so the inner loop has to stay inside the `with` block).
    with open(file) as handle:
        toxdlSeqs = SeqIO.parse(handle, 'fasta')
        for fasta in toxdlSeqs:
            _, isPos = fasta.description.split('\t')
            seq = str(fasta.seq)
            if isPos == '0':
                if seq in toxdlTestNegSeqs:
                    toxdlTestNegRepeats.add(seq)
                toxdlTestNegSeqs.add(seq)
            else:
                if seq in toxdlTestPosSeqs:
                    toxdlTestPosRepeats.add(seq)
                toxdlTestPosSeqs.add(seq)

In [33]:
# unique ToxDL test sequences per label
print(len(toxdlTestPosSeqs), len(toxdlTestNegSeqs), sep='\n')

239
1047


In [34]:
# distinct sequences that were repeated in the ToxDL test files, per label
print(len(toxdlTestPosRepeats), len(toxdlTestNegRepeats), sep='\n')

0
5


In [35]:
# ToxDL test sequences that also appear in ToxDL train/val with the same label
toxdlPosOverlaps = [seq for seq in toxdlTestPosSeqs if seq in toxdlTrainPosSeqs]
toxdlNegOverlaps = [seq for seq in toxdlTestNegSeqs if seq in toxdlTrainNegSeqs]

# ToxDL test sequences found in neither the ToxIBTL nor the ToxinPred pool of that label
toxdlPosNotInPrev = {
    seq for seq in toxdlTestPosSeqs
    if not (seq in toxibtlPosSeqs or seq in toxinpredPosSeqs)
}
toxdlNegNotInPrev = {
    seq for seq in toxdlTestNegSeqs
    if not (seq in toxibtlNegSeqs or seq in toxinpredNegSeqs)
}

In [36]:
# ToxDL train/test overlap counts per label
print(len(toxdlPosOverlaps), len(toxdlNegOverlaps), sep='\n')

0
0


In [37]:
# ToxDL test sequences that are new relative to ToxIBTL and ToxinPred, per label
print(len(toxdlPosNotInPrev), len(toxdlNegNotInPrev), sep='\n')

180
377


From the above code, we can conclude that:
* There are data points in the ToxDL dataset that are not in either the ToxIBTL or ToxinPred datasets.
* There are some repeat data points in the ToxDL training dataset, which could affect training.

## Combine datasets and write to new file

In [38]:
# pool the ToxDL train/val and test sets per label
toxdlPosSeqs = toxdlTrainPosSeqs | toxdlTestPosSeqs
toxdlNegSeqs = toxdlTrainNegSeqs | toxdlTestNegSeqs

In [39]:
# size of the pooled ToxDL negative set
print(f'{len(toxdlNegSeqs)}')

6843


In [104]:
# Merge the three datasets per label; identical sequences collapse to one entry.
# NOTE(review): nothing here removes a sequence that is labelled positive in one
# dataset and negative in another, so it would end up in both lists. Confirm
# whether such conflicts should be dropped.
allPosSeqs = toxibtlPosSeqs.union(toxinpredPosSeqs, toxdlPosSeqs)
allNegSeqs = toxibtlNegSeqs.union(toxinpredNegSeqs, toxdlNegSeqs)

# Sort before numbering: the iteration order of a set of strings can change
# between interpreter runs (string hash randomization), which would make the
# record ids differ from one run to the next.
# description='' keeps Biopython's default '<unknown description>' placeholder
# out of the FASTA header lines.
# Ids restart at 0 for each label, so they are only unique within one list.
posSeqs = [SeqRecord(Seq(seq), id=str(i), description='') for i, seq in enumerate(sorted(allPosSeqs))]
negSeqs = [SeqRecord(Seq(seq), id=str(i), description='') for i, seq in enumerate(sorted(allNegSeqs))]

print(len(posSeqs))
print(len(negSeqs))

# Writing the CD-HIT input files is disabled; uncomment to regenerate them.
# with open("../cdhit/input/posSeqs.fasta", "w") as f:
#     SeqIO.write(posSeqs, f, "fasta")
# with open("../cdhit/input/negSeqs.fasta", "w") as f:
#     SeqIO.write(negSeqs, f, "fasta")

6942
24857
