# Sequence format

Format and linearize bank sequences of interest to send to Quon lab for R-loop prediction calculations. Start of sequences need to be promoters and end at terminator sequences if present.

In [145]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from pathlib import Path
from datetime import datetime

Class to linearize Genbank records with respect to the promoter and terminator if present. Start of sequence becomes the TSS and anything upstream of the TSS gets appended to the end of the sequence.

In [151]:
class SeqFormater():
    
    def __init__(self, record, promoter_label, terminator_label=None):
        self.record = record
        self.promoter_label = promoter_label
        self.terminator_label = terminator_label
    
    @property
    def feature_labels(self):
        return {f.qualifiers['label'][0]: f for f in self.record.features}
    
    @property
    def promoter(self):
        if self.promoter_label in self.feature_labels:
            return self.feature_labels[self.promoter_label]
        else:
            raise ValueError
    
    @property
    def terminator(self):
        if self.terminator_label:

            if self.terminator_label in self.feature_labels:
                return self.feature_labels[self.terminator_label]
            else:
                raise ValueError
        else:
            return None
        
    def _linearize_with_respect_to_promoter(self):
        promoter_start = int(self.promoter.location.start)
        if self.terminator:
            terminator_end = int(self.terminator.location.end)
            sequence = self.record.seq[promoter_start:terminator_end]
        else:
            sequence = self.record.seq[promoter_start:] + self.record.seq[:promoter_start]
        return sequence
    
    def to_linear_record(self):
        return SeqRecord(
            self._linearize_with_respect_to_promoter(),
            self.record.name,
            description=''
        )

Define the promoter and terminators of each series of plasmids.

In [144]:
input_params = {
    'sequences/T7_initiation_series': ['T7 +1 Site'],
    'sequences/T7_termination_series': ['T7\\+1\\Site'],
    'sequences/Tac_termination_series': ['tac\\promoter', 'T1T2\\terminators'],
    'sequences/Tac_initiation_series': ['tac\\promoter', 'T1T2\\terminators'],
}

In [152]:
linearized_records = []

for each_input in input_params:
    path = Path(each_input)
    
    if path.is_dir():  # get all files in the directory
        for each_file in path.iterdir():
            if each_file.suffix == '.gb':
                record = SeqIO.read(each_file, 'genbank')
                linearized_records.append(
                    SeqFormater(record, *input_params[each_input]).to_linear_record()
                )
    else:
        record = SeqIO.parse(each_input, 'genbank')
        linearized_records.append(
            SeqFormater(record, *input_params[each_input])
        )

Write linearized records to fasta file.

In [153]:
output_file = f'linearized_plasmids_{datetime.now().strftime("%m-%d-%Y")}.fa'
SeqIO.write(linearized_records, output_file, 'fasta')

124

In [155]:
for r in linearized_records:
    print(len(r.seq), r.id)

3322 T7_init_VR-6.ins
3322 T7_init_VR-28.in
3322 T7_init_VR-22.in
3322 T7_init_VR-23.in
3322 T7_init_VR-19.in
3322 T7_init_VR-7.ins
3322 T7_init_VR-31.in
3322 T7_init_VR-10.in
3322 T7_init_VR-15.in
3322 T7_init_VR-1.ins
3322 T7_init_VR-16.in
3322 T7_init_VR-3.ins
3322 T7_init_VR-29.in
3322 T7_init_VR-4.ins
3322 T7_init_VR-9.ins
3322 T7_init_VR-20.in
3322 T7_init_VR-11.in
3322 T7_init_VR-26.in
3322 T7_init_VR-17.in
3322 T7_init_VR-24.in
3322 T7_init_VR-21.in
3322 T7_init_VR-14.in
3322 T7_init_VR-18.in
3322 T7_init_VR-27.in
3322 T7_init_VR-25.in
3322 T7_init_VR-2.ins
3322 T7_init_VR-5.ins
3322 T7_init_VR-12.in
3322 T7_init_VR-13.in
3322 T7_init_VR-8.ins
3322 T7_init_VR-30.in
3039 T7_term_VR-10.in
3039 T7_term_VR-21.in
3039 T7_term_VR-22.in
3039 T7_term_VR-17.in
3039 T7_term_VR-24.in
3039 T7_term_VR-14.in
3039 T7_term_VR-2.ins
3039 T7_term_VR-11.in
3039 T7_term_VR-23.in
3039 T7_term_VR-26.in
3039 T7_term_VR-30.in
3039 T7_term_VR-19.in
3039 T7_term_VR-18.in
3039 T7_term_VR-25.in
3039 T7_te

In [156]:
y = '/home/ethan/Documents/predict-seqs/sequences/Tac_termination_series/tac_term_T7_term_VR-1.gb'
y1 = SeqIO.read(y, 'genbank')

In [162]:
y1.features.

[SeqFeature(FeatureLocation(ExactPosition(5), ExactPosition(34), strand=-1), type='promoter'),
 SeqFeature(FeatureLocation(ExactPosition(535), ExactPosition(541), strand=1), type='misc_feature'),
 SeqFeature(FeatureLocation(ExactPosition(1044), ExactPosition(1877), strand=-1), type='CDS'),
 SeqFeature(FeatureLocation(ExactPosition(1781), ExactPosition(1787), strand=1), type='misc_feature'),
 SeqFeature(FeatureLocation(ExactPosition(2566), ExactPosition(2938), strand=-1), type='terminator'),
 SeqFeature(FeatureLocation(ExactPosition(2944), ExactPosition(2959), strand=1), type='CDS'),
 SeqFeature(FeatureLocation(ExactPosition(2944), ExactPosition(2959), strand=1), type='primer_bind'),
 SeqFeature(FeatureLocation(ExactPosition(2959), ExactPosition(3159), strand=1), type='CDS'),
 SeqFeature(FeatureLocation(ExactPosition(3169), ExactPosition(3369), strand=-1), type='misc'),
 SeqFeature(FeatureLocation(ExactPosition(3349), ExactPosition(3369), strand=-1), type='primer_bind')]

In [167]:
str(y1.seq[2938:34])

''

In [169]:
l = list(range(10))
l

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [182]:
y1.features[0].location.start

ExactPosition(5)

In [None]:
def seq_linked_list(seq):
    
    linked = {}
    

In [188]:
y1.features

[SeqFeature(FeatureLocation(ExactPosition(5), ExactPosition(34), strand=-1), type='promoter'),
 SeqFeature(FeatureLocation(ExactPosition(535), ExactPosition(541), strand=1), type='misc_feature'),
 SeqFeature(FeatureLocation(ExactPosition(1044), ExactPosition(1877), strand=-1), type='CDS'),
 SeqFeature(FeatureLocation(ExactPosition(1781), ExactPosition(1787), strand=1), type='misc_feature'),
 SeqFeature(FeatureLocation(ExactPosition(2566), ExactPosition(2938), strand=-1), type='terminator'),
 SeqFeature(FeatureLocation(ExactPosition(2944), ExactPosition(2959), strand=1), type='CDS'),
 SeqFeature(FeatureLocation(ExactPosition(2944), ExactPosition(2959), strand=1), type='primer_bind'),
 SeqFeature(FeatureLocation(ExactPosition(2959), ExactPosition(3159), strand=1), type='CDS'),
 SeqFeature(FeatureLocation(ExactPosition(3169), ExactPosition(3369), strand=-1), type='misc'),
 SeqFeature(FeatureLocation(ExactPosition(3349), ExactPosition(3369), strand=-1), type='primer_bind')]

In [228]:
seq = 'ATG<------CATGGGTTGGGGCCCADSFADAFCADFASDFASDFASDFADFASFADSFSADF<t-------FFGFGFG'

In [229]:
p_end = 11
p_start = 3
t_s = 63
if p_end - t_s > 0:
    print(seq[t_s:p_end])
else:
    print(
        seq[t_s:len(seq)-1]+seq[0:p_end]
    )

<t-------FFGFGFATG<------C
