# Reverse complementing final sequences

Variable region definition file has option of creating the reverse complement of a sequence. We only want to do this after the best selected sequence is selected and any adapters have been 'ligated' on. Here I am just playing with Biopython to find best way to implement taking reverse complement of a selected sequence's fasta and tsv files.

In [1]:
from Bio.Seq import Seq
from Bio import SeqIO
# paths to selected sequences
f = '../output/initiation_regions/files/init-1/rankedSeqs/init-1.top_seq.fasta'
t = '../output/initiation_regions/files/init-1/rankedSeqs/init-1.top_seq.tsv'

Read the fasta record using Biopython

In [2]:
record = SeqIO.read(f, 'fasta')
record.description

'VR_init-1-33_initiation_region_1_GCskew:0.1_GCcontent:0.4_ATskew:0_ATcontent:0.6_Clustered:False'

Update description to mark as the reverse complement.

In [3]:
def get_RC_description(record):
    split = record.description.split('_')
    split[1] = f'RC-{split[1]}'
    description = '_'.join(split)
    return description

get_RC_description(record)


'VR_RC-init-1-33_initiation_region_1_GCskew:0.1_GCcontent:0.4_ATskew:0_ATcontent:0.6_Clustered:False'

In [9]:
def RC_record(record):
    rc_record = record.reverse_complement()
    rc_record.description = get_RC_description(record)
    return rc_record

rc = RC_record(record)
assert rc.seq != record.seq
rc

SeqRecord(seq=Seq('AGACCTGTCACTTGACCTATTTTCGTAGTATTAGAGTCATGTGTCGCATAGCTT...ACT'), id='<unknown id>', name='<unknown name>', description='VR_RC-init-1-33_initiation_region_1_GCskew:0.1_GCcontent:0.4_ATskew:0_ATcontent:0.6_Clustered:False', dbxrefs=[])

Bio.SeqRecord.SeqRecord

Read tsv file using pandas

In [5]:
import pandas as pd
table = pd.read_table(t, sep='\t')
table

Unnamed: 0,name,id_num,description,GC_content,GC_skew,AT_content,AT_skew,Cluster_length,Clustered_nucleotide,Clustering method,Sequence
0,init-1,33,VR_init-1-33_initiation_region_1_GCskew:0.1_GC...,0.4,0.1,0.6,0.0,,,find_available_random_range,AGTGGCTAAACTTCTATTTTTATACTGTATCATCATATGTTACGAT...


In [16]:
def update_tsv_record(table, RC_record):
    # should only be one record
    table.at[0, 'description'] = RC_record.description
    table.at[0, 'GC_skew'] = table.at[0, 'GC_skew'] * -1
    table.at[0, 'AT_skew'] = table.at[0, 'AT_skew'] * -1
    table.at[0, 'Sequence'] = str(RC_record.seq)
    table.at[0, 'name'] = f'RC-{table.at[0, "name"]}'
    return table

table = pd.read_table(t, sep='\t')  # reread the table to prevent double updates
update_tsv_record(table, rc)




Unnamed: 0,name,id_num,description,GC_content,GC_skew,AT_content,AT_skew,Cluster_length,Clustered_nucleotide,Clustering method,Sequence
0,RC-init-1,33,VR_RC-init-1-33_initiation_region_1_GCskew:0.1...,0.4,-0.1,0.6,-0.0,,,find_available_random_range,AGACCTGTCACTTGACCTATTTTCGTAGTATTAGAGTCATGTGTCG...
