# Sequence format

Format and linearize GenBank sequences of interest to send to the Quon lab for R-loop prediction calculations. Each sequence needs to start at its promoter and end at its terminator sequence, if one is present.

In [1]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils.CheckSum import seguid
from pathlib import Path
from datetime import datetime

# Run date as YYYY-MM-DD; interpolated into the name of the FASTA file written at the end.
current_date = datetime.today().date().isoformat()

Class to linearize GenBank records with respect to the promoter (and the terminator, if present). The start of the sequence becomes the TSS, and anything upstream of the TSS is appended to the end of the sequence.

In [2]:
class SeqFormater():
    """Linearize a GenBank record so that it starts at a labelled promoter.

    Parameters
    ----------
    record
        Record whose ``features`` carry ``'label'`` qualifiers.
    promoter_label
        Label of the feature whose start becomes position 0 of the output.
    terminator_label
        Optional label of a terminator feature. It is stored on the instance
        but not used by any method of this class.
    """

    def __init__(self, record, promoter_label, terminator_label=None):
        self.record = record
        self.promoter_label = promoter_label
        # Previously accepted and silently discarded; keep it so callers can
        # read back what they passed in.
        self.terminator_label = terminator_label

    @staticmethod
    def _labelled_features(record):
        """Map each feature label of ``record`` to its feature.

        Features without a ``'label'`` qualifier are skipped; when a label
        occurs more than once the last feature wins.
        """
        return {
            feature.qualifiers['label'][0]: feature
            for feature in record.features
            if 'label' in feature.qualifiers
        }

    def _find_promoter(self, record):
        """Return the promoter feature of ``record``.

        Raises
        ------
        ValueError
            If no feature of ``record`` is labelled ``self.promoter_label``.
        """
        labelled = self._labelled_features(record)
        if self.promoter_label not in labelled:
            raise ValueError(
                f'promoter label {self.promoter_label!r} is missing from the '
                f'record features; available labels: {sorted(labelled)}'
            )
        return labelled[self.promoter_label]

    @property
    def feature_labels(self):
        """Dictionary of feature label -> feature for ``self.record``."""
        return self._labelled_features(self.record)

    @property
    def promoter(self):
        """Promoter feature of ``self.record`` (``ValueError`` if absent)."""
        return self._find_promoter(self.record)

    def _promoter_oriented_record(self):
        """Return the record read in the direction of the promoter.

        ``self.record`` is left untouched: for a promoter on the -1 strand a
        reverse-complemented copy is returned, otherwise the record itself.
        """
        if self.promoter.strand == -1:
            # id/name/description are dropped by default when reverse
            # complementing; keep them so the output id retains the name.
            return self.record.reverse_complement(
                id=True, name=True, description=True
            )
        return self.record

    def _rotate_to_promoter(self, oriented_record):
        """Rotate the sequence of ``oriented_record`` to begin at the promoter."""
        promoter_start = int(self._find_promoter(oriented_record).location.start)
        sequence = oriented_record.seq
        # Everything upstream of the promoter start is moved to the end.
        return sequence[promoter_start:] + sequence[:promoter_start]

    def _linearize_with_respect_to_promoter(self):
        """Return the sequence rotated (and, for a -1 strand promoter, reverse
        complemented) so that it begins at the promoter start.

        Does not modify ``self.record``, so repeated calls give the same
        result and coordinates read from the record beforehand stay valid.
        """
        return self._rotate_to_promoter(self._promoter_oriented_record())

    def to_transcribable_record(self):
        """Build a new record holding the promoter-first sequence.

        The id is ``<record name>-<SEGUID>``, where the checksum is taken over
        the promoter-oriented (not yet rotated) sequence; the description is
        empty.
        """
        oriented_record = self._promoter_oriented_record()
        return SeqRecord(
            self._rotate_to_promoter(oriented_record),
            f'{self.record.name}-{seguid(oriented_record.seq)}',
            description=''
        )

Define the promoter and terminators of each series of plasmids.

In [3]:
# Input locations, relative to the notebook's working directory.
# The two *_dir entries name the T7 initiation and termination series.
init_dir = 'sequences/T7_initiation_series'
term_dir = 'sequences/T7_termination_series'
# Individual GenBank files handled in addition to the two series.
pFC8 = 'sequences/pFC8.gb'
pFC53 = 'sequences/pFC53.gb'

In [4]:
# Maps each input path to the list of positional arguments that is unpacked
# into SeqFormater after the record; here that is only the promoter label.
# NOTE(review): the backslash-separated labels ('T7\+1\Site', 'T3\promoter')
# presumably mirror how the labels are spelled in those GenBank files — confirm.
input_params = {
    init_dir: ['T7 +1 Site'],
    term_dir: ['T7\\+1\\Site'],
    pFC8: ['T3\\promoter'],
    pFC53: ['T3 Promoter']
}

In [5]:
def read_genbank_files(path, params_list):
    """Read GenBank file(s) and wrap each record in a ``SeqFormater``.

    Parameters
    ----------
    path
        Either a directory, in which case every regular file directly inside
        it with the ``.gb`` suffix is read, or the path of a single GenBank
        file.
    params_list
        Positional arguments unpacked into ``SeqFormater`` after the record
        (the promoter label, optionally followed by the terminator label).

    Returns
    -------
    list
        One ``SeqFormater`` per file read. For a directory the files are
        processed in sorted path order.
    """
    path = Path(path)
    if path.is_dir():
        # iterdir() yields entries in arbitrary order; sort so the order of
        # the returned formaters (and of anything written from them) is
        # reproducible between runs and machines.
        genbank_files = sorted(
            each_file
            for each_file in path.iterdir()
            if each_file.suffix == '.gb' and each_file.is_file()
        )
    else:
        genbank_files = [path]

    return [
        SeqFormater(SeqIO.read(each_file, 'genbank'), *params_list)
        for each_file in genbank_files
    ]

In [6]:
# One formater per plasmid of each T7 series.
init_formaters = read_genbank_files(init_dir, input_params[init_dir])
term_formaters = read_genbank_files(term_dir, input_params[term_dir])
# The two stand-alone plasmids, pFC8 first and pFC53 second.
other_formaters = [
    formater
    for plasmid_path in (pFC8, pFC53)
    for formater in read_genbank_files(plasmid_path, input_params[plasmid_path])
]

'LOCUS       Exported                3993 bp DNA     circular SYN 06-NOV-2017\n'
Found locus 'Exported' size '3993' residue_type 'DNA'
Some fields may be wrong.


In [7]:
def get_all_variable_region_sequences(formaters):
    """Extract the 'Variable region' feature from each formater's record.

    Returns a list with one extracted region per formater, in input order.
    A formater without a feature labelled 'Variable region' raises KeyError.
    """
    return [
        formater.feature_labels['Variable region'].extract(formater.record)
        for formater in formaters
    ]

Currently we do not know which initiator sequence will be used in the termination series. Therefore, the function below replaces the placeholder in each plasmid of the termination series with each of the VR inserts in turn.

In [8]:
def permute_strong_inits(t7_term_formaters):
    """Build one transcribable record per (plasmid, variable region) pair.

    For every formater, the span of its 'Placeholder strong initiator'
    feature is replaced in turn by each variable region collected from all
    the formaters, and the result is linearized with
    ``to_transcribable_record``.

    Parameters
    ----------
    t7_term_formaters
        Formaters whose records carry both a 'Variable region' and a
        'Placeholder strong initiator' labelled feature.

    Returns
    -------
    list
        ``len(t7_term_formaters) ** 2`` records, grouped by plasmid.

    Notes
    -----
    Only the sequence is substituted; feature coordinates on the record are
    not shifted when an insert differs in length from the placeholder.
    Each formater's record and sequence are restored before returning.
    """
    variable_regions = get_all_variable_region_sequences(t7_term_formaters)
    permuted = []
    for each_form in t7_term_formaters:
        placeholder = each_form.feature_labels['Placeholder strong initiator']
        s = int(placeholder.location.start)
        e = int(placeholder.location.end)

        # Always splice into the pristine sequence. Splicing into the
        # previously substituted one would misplace the cut at ``e`` as soon
        # as an insert differs in length from the placeholder.
        template_record = each_form.record
        template_seq = template_record.seq
        try:
            for each_vr in variable_regions:
                # Re-attach the template in case linearization swapped
                # ``each_form.record`` for a reverse-complemented copy.
                each_form.record = template_record
                template_record.seq = template_seq[:s] + each_vr.seq + template_seq[e:]
                permuted.append(each_form.to_transcribable_record())
        finally:
            # Leave the formater as it was found so the cell can be re-run.
            template_record.seq = template_seq
            each_form.record = template_record
    return permuted

In [9]:
# Every termination-series plasmid combined with every variable region insert.
term_series_records = permute_strong_inits(term_formaters)
# Displayed as the cell output: number of records generated.
len(term_series_records)

961

In [10]:
# Initiation-series plasmids are linearized as they are, one record per plasmid.
init_series_records = [f.to_transcribable_record() for f in init_formaters]
# Displayed as the cell output: number of records generated.
len(init_series_records)

31

In [11]:
# Linearized records for the formaters held in other_formaters.
others = [f.to_transcribable_record() for f in other_formaters]

Combine both series and the other plasmids into one list and write all included sequences to a FASTA file.

In [12]:
# Termination series first, then the initiation series, then the remaining plasmids.
all_records = [*term_series_records, *init_series_records, *others]
# SeqIO.write returns the number of records written, shown as the cell output.
output_fasta = f'prediction_substrates_{current_date}.fa'
SeqIO.write(all_records, output_fasta, 'fasta')

994