# Give Data Unique SeqIDs

It turns out that some programs (MMseqs2, cough cough) rely on sequence IDs for important things, so duplicate IDs cause problems. This notebook redefines the sequence IDs so that every one is unique.

In [39]:
import random
import string
from Bio import SeqIO
import pandas as pd
from pathlib import Path
import pickle

In [18]:
class RandSeqID():
    """Hand out random 6-character IDs that never repeat within one generator.

    Every ID that has been issued is remembered in ``used_seqs`` so that a
    later call cannot return it again. The set can be saved to and restored
    from a pickle file to keep IDs unique across sessions.
    """

    def __init__(self, used_seqs = None):
        """Create a generator.

        Args:
            used_seqs: Set of IDs that must not be issued again. When omitted
                (or empty), a fresh empty set is used.
        """
        if not used_seqs:
            used_seqs = set()
        self.used_seqs = used_seqs
        # IDs are drawn from upper/lower-case ASCII letters and digits.
        self.alphabet = string.ascii_letters + string.digits

    def write_used_seqs(self, filename):
        """Pickle the set of already-issued IDs to ``filename``."""
        with open(filename, 'wb') as file:
            pickle.dump(self.used_seqs, file)

    def read_used_seqs(self, filename):
        """Replace ``used_seqs`` with the set pickled in ``filename``.

        NOTE: unpickling can execute arbitrary code; only load files that
        were produced by ``write_used_seqs`` from a trusted source.
        """
        with open(filename, 'rb') as file:
            # pickle.load needs the open file object, not the path string.
            self.used_seqs = pickle.load(file)

    def get_new_seq_id(self):
        """Return a new 6-character ID and record it as used.

        Random candidates are drawn until one is found that has not been
        issued before.
        """
        while True:
            seq_id = "".join(random.choices(self.alphabet, k=6))
            if seq_id not in self.used_seqs:
                self.used_seqs.add(seq_id)
                return seq_id

# One shared generator, so IDs stay unique across every file renamed with it.
seqGen = RandSeqID()

# Go through each file and rename the sequences with new, unique IDs.

## Nontoxin Work

In [37]:
# Non-toxin FASTA files; each is rewritten in place with fresh record IDs.
nontoxin_fasta_files = [
    'nontoxin-0.fasta',
    'nontoxin-1.fasta',
    'nontoxin-2.fasta',
    'nontoxin-3.fasta',
]

for nontoxin_file in nontoxin_fasta_files:
    print(f"Processing {nontoxin_file}...")
    # Read every record into memory first, because the same file is
    # overwritten by the write below.
    records = list(SeqIO.parse(nontoxin_file, 'fasta'))
    for record in records:
        # New unique ID; name and description are blanked out.
        record.id = f"nontox-{seqGen.get_new_seq_id()}"
        record.name = ''
        record.description = ''
    SeqIO.write(records, nontoxin_file, 'fasta')

Processing nontoxin-0.fasta...
Processing nontoxin-1.fasta...
Processing nontoxin-2.fasta...
Processing nontoxin-3.fasta...


## Toxin Work

In [38]:
# Toxin FASTA files; each is rewritten in place with fresh record IDs.
toxin_fasta_files = [
    'toxin-0.fasta',
    'toxin-1.fasta',
    'toxin-2.fasta',
]

for toxin_file in toxin_fasta_files:
    print(f"Processing {toxin_file}...")
    # Read every record into memory first, because the same file is
    # overwritten by the write below.
    records = list(SeqIO.parse(toxin_file, 'fasta'))
    for record in records:
        # New unique ID; name and description are blanked out.
        record.id = f"tox-{seqGen.get_new_seq_id()}"
        record.name = ''
        record.description = ''
    SeqIO.write(records, toxin_file, 'fasta')

Processing toxin-0.fasta...
Processing toxin-1.fasta...
Processing toxin-2.fasta...


## Write the Set of Names

In [46]:
# Persist every issued ID so that a later session can avoid reusing them.
# Uses the generator's own helper instead of duplicating the pickle logic.
seqGen.write_used_seqs('usedSeqs.pickle')