# Pre-processing read alignments

**Summary:**

This notebook compiles the scripts used in the pre-processing step of our analysis to produce a data table (referred to in other notebooks as **dataset**) that is used across all downstream analyses. 

It is organized in three main parts:<br>

- **1. Parse genomic and transcriptomic alignments:**<br>
Scripts used for extracting alignment features such as the start & end positions of each read, the reference gene and isoform to which the read is mapped, etc. Also features a script for correcting genomic start & end positions based on transcriptomic alignments. 

- **2. Splice Leader (SL) search:**<br>
Method used for identifying splice leader sequences in transcriptomic reads (see **Figure 2 and Figure 3 notebooks**)

- **3. Hairpin search:**<br>
Method used for identifying hairpin structure (SL mimic) in non-SL reads (see **Figure 4 and Figure 5 notebooks**)


---
<br>



## Import libraries

In [1]:
import pandas as pd
import pysam
import re
from re import search
import json
import pyranges as pr
import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq
import parasail

<br>

# 1. Parse genomic and transcriptomic alignments

## 1.a. Parsing genomic alignments

In [2]:
# Extract features from genomic alignments (read name, chromosome, genomic start & end positions)

def genomic_stats(input_file, output_file):
    """Tabulate the primary genomic alignments of a BAM file.

    One row is produced per read that is mapped, neither secondary nor
    supplementary, and carries a sequence. The table is written to
    ``output_file`` as tab-separated text with the columns ``read``,
    ``chromosome``, ``genomic_start`` and ``genomic_end``.

    Parameters
    ----------
    input_file : str
        Path of the BAM file holding the genomic alignments.
    output_file : str
        Path of the .tsv file to write.
    """

    name = []
    chrom = []
    start = []
    end = []

    # The context manager closes the BAM handle even if parsing fails midway
    with pysam.AlignmentFile(input_file, 'rb') as alignment:

        for read in alignment:

            # Only take primary, mapped reads with a sequence into account
            if read.is_supplementary or read.is_secondary or read.is_unmapped or read.seq is None:
                continue

            name.append(read.query_name)
            chrom.append(alignment.get_reference_name(read.reference_id))
            # reference_start is 0-based (see pysam doc): +1 makes it 1-based
            start.append(read.reference_start + 1)
            # reference_end points one past the last aligned base, so it is
            # used unchanged as the 1-based inclusive end
            end.append(read.reference_end)

    # Build dataframe with lists
    table = pd.DataFrame(dict(read=name, chromosome=chrom, genomic_start=start, genomic_end=end))

    # Save dataframe as a .tsv file
    table.to_csv(output_file, sep='\t', index=None)


In [3]:
# Runs to process: SSP_1..SSP_6, SL1_1 and NP_1..NP_5
runs = [f'SSP_{n}' for n in range(1, 7)] + ['SL1_1'] + [f'NP_{n}' for n in range(1, 6)]

# Write one '<run>-genomic_stats.tsv' table per run in the working directory
for ID in runs:
    genomic_stats(
        f'/Volumes/elegans/rna_sequencing/{ID}/{ID}-genome_sorted.bam',
        f'{ID}-genomic_stats.tsv',
    )

<br>

## 1.b. Parsing transcriptomic alignments

In [4]:
# Convert isoform name into gene name
# ex: MTCE.35.1 -> MTCE.35

def isoform_to_gene(isoform):
    """Return the gene name contained in an isoform name.

    The gene name is the first ``<word characters>.<digits>`` stretch of the
    isoform name, e.g. ``MTCE.35.1`` -> ``MTCE.35``.

    Parameters
    ----------
    isoform : str
        Isoform (transcript) name.

    Returns
    -------
    str or None
        The gene name, or None when the isoform name holds no such stretch.
    """

    # The dot is escaped so that only a literal '.' separates the two parts
    # (an unescaped '.' would accept any character, e.g. 'ABC123')
    match = re.search(r"\w+\.\d+", isoform)

    return match.group(0) if match is not None else None

In [5]:
# Extract features from transcriptomic alignments (read name, isoform, gene, transcriptomic start & end positions, etc.)

def transcriptomic_stats(input_file, output_file, min_softclip=80):
    """Tabulate the primary transcriptomic alignments of a BAM file.

    One row is produced per read that is mapped, neither secondary nor
    supplementary, and carries a sequence. The table is written to
    ``output_file`` as tab-separated text with the columns ``read``, ``gene``,
    ``isoform``, ``transcriptomic_start``, ``transcriptomic_end``,
    ``read_orientation`` and ``softclip``.

    Parameters
    ----------
    input_file : str
        Path of the BAM file holding the transcriptomic alignments.
    output_file : str
        Path of the .tsv file to write.
    min_softclip : int, optional
        The ``softclip`` flag is 1 when the soft-clip at the start of the
        CIGAR string is strictly longer than this value, 0 otherwise.
    """

    name = []
    isoform = []
    start = []
    end = []
    orientation = []
    softclip = []

    # The context manager closes the BAM handle even if parsing fails midway
    with pysam.AlignmentFile(input_file, 'rb') as transcriptome:

        for read in transcriptome:

            # Only take primary, mapped reads with a sequence into account
            if read.is_unmapped or read.is_secondary or read.is_supplementary or read.seq is None:
                continue

            name.append(read.query_name)
            isoform.append(transcriptome.get_reference_name(read.reference_id))
            start.append(read.reference_start + 1)
            end.append(read.reference_end)

            # Reads aligned to the reverse strand of a transcript are antisense
            orientation.append('antisense' if read.is_reverse else 'sense')

            # Length of the soft-clip opening the CIGAR string (0 when absent)
            cigar = read.cigarstring
            sc = search(r'^(\d*)S', cigar) if cigar is not None else None
            length = int(sc.group(1)) if sc else 0

            softclip.append(1 if length > min_softclip else 0)

    # Generate dataframe
    dataframe = pd.DataFrame(dict(read=name, isoform=isoform, transcriptomic_start=start,
                                  transcriptomic_end=end, softclip=softclip, read_orientation=orientation))

    # Find gene_ID with isoform value
    dataframe['gene'] = dataframe['isoform'].apply(isoform_to_gene)

    # Set columns order
    dataframe = dataframe[['read', 'gene', 'isoform', 'transcriptomic_start', 'transcriptomic_end',
                           'read_orientation', 'softclip']]

    # Save dataframe
    dataframe.to_csv(output_file, sep='\t', index=None)



In [6]:
# Runs to process
runs = [
    'SSP_1', 'SSP_2', 'SSP_3', 'SSP_4', 'SSP_5', 'SSP_6',
    'SL1_1',
    'NP_1', 'NP_2', 'NP_3', 'NP_4', 'NP_5',
]

# Write one '<run>-transcriptome_stats.tsv' table per run in the working directory
for ID in runs:
    transcriptomic_stats(
        f'/Volumes/elegans/rna_sequencing/{ID}/{ID}-transcriptome_sorted.bam',
        f'{ID}-transcriptome_stats.tsv',
    )

<br>

## 1.c. Create summary table

In [7]:
# Runs whose genomic and transcriptomic tables are combined
runs = [f'SSP_{n}' for n in range(1, 7)] + ['SL1_1'] + [f'NP_{n}' for n in range(1, 6)]

# One combined table per run; they are stacked once the loop is done
dflist = []

for ID in runs:

    # Per-run tables produced in sections 1.a and 1.b, indexed by read name
    genome_stats = pd.read_csv(f'{ID}-genomic_stats.tsv', sep='\t').set_index('read')
    transcriptome_stats = pd.read_csv(f'{ID}-transcriptome_stats.tsv', sep='\t').set_index('read')

    # Put both tables side by side, aligned on the read name
    final = pd.concat([genome_stats, transcriptome_stats], join='outer', axis=1)

    # A read missing from one of the two tables has empty coordinates there:
    # keep only the reads mapped against both genome and transcriptome
    final = final.dropna(subset=['genomic_start', 'transcriptomic_start'])

    # Turn the read name back into a column and tag the rows with their run
    final = final.rename_axis('read').reset_index().assign(run=ID)

    dflist.append(final[['read', 'gene', 'isoform', 'chromosome', 'genomic_start', 'genomic_end',
                         'transcriptomic_start', 'transcriptomic_end', 'read_orientation', 'softclip', 'run']])

stats = pd.concat(dflist, ignore_index=True)


In [8]:
# Build dictionary of gene orientation (gene name -> strand)
strand = pd.read_csv('/Volumes/elegans/rna_sequencing/ref/gene&strand.tsv', sep='\t')
strands = dict(zip(strand['gene'], strand['strand']))

# True for reads whose gene lies on the '+' strand
# (a gene missing from the strand table raises a KeyError)
on_plus_strand = stats['gene'].map(lambda gene: strands[gene] == '+').astype(bool)

# Set start and end correctly: for genes on the other strand the two
# coordinates are swapped. Both new columns are derived from copies of the
# ORIGINAL values; overwriting 'genomic_start' first and then reading it back
# to compute the end would give such reads identical start and end.
leftmost = stats['genomic_start'].copy()
rightmost = stats['genomic_end'].copy()

stats['genomic_start'] = leftmost.where(on_plus_strand, rightmost)
stats['genomic_end'] = rightmost.where(on_plus_strand, leftmost)

# reorder columns
stats = stats[['read', 'gene', 'isoform', 'chromosome',
               'genomic_start', 'genomic_end', 'transcriptomic_start', 'transcriptomic_end',
               'read_orientation', 'softclip', 'run']]

# save result
stats.to_csv('dataset.tsv', sep='\t', index=None)

In [9]:
## TO REMOVE
# Writes the same table to the same file as the previous cell
stats.to_csv(path_or_buf='dataset.tsv', sep='\t', index=None)

<br>

## 1.d. Correction of genomic coordinates based on transcriptome alignments

In [10]:
# Load the GTF annotation with PyRanges and work on its DataFrame form
gtf = pr.read_gtf('/Volumes/elegans/rna_sequencing/ref/c_elegans.PRJNA13758.WS270.canonical_geneset.gtf').df

# transcript_id -> strand, taken from the 'transcript' features
strand = gtf.loc[gtf['Feature'] == 'transcript'].set_index('transcript_id')['Strand'].to_dict()

# {'Start': {exon_id: start}, 'End': {exon_id: end}} for every row having an exon_id
gtf = gtf.loc[gtf['exon_id'].notna(), ['exon_id', 'Start', 'End']].set_index('exon_id').to_dict()

# Load the transcriptome-relative coordinates of the exons of each transcript
with open('/Volumes/elegans/rna_sequencing/ref/transcript_exons.json', 'r') as dico:
    exonics_positions = json.load(dico)

In [11]:
# This script converts a transcriptomic position into a genomic position based on a given isoform
def transcriptome_based_correction(transcript, transcriptome_position):
    """Convert a position on a transcript into a genomic position.

    Relies on the module-level tables ``exonics_positions`` (transcript ->
    {exon number: [first, last] position on the transcript}), ``strand``
    (transcript -> strand) and ``gtf`` (genomic 'Start' / 'End' per exon id,
    where exon ids are built as ``<transcript>.e<exon number>``).

    Parameters
    ----------
    transcript : str
        Isoform name; a KeyError is raised when it is absent from the tables.
    transcriptome_position : number
        Position along the transcript.

    Returns
    -------
    number
        The genomic position, or ``np.nan`` when no exon of the transcript
        contains the position.
    """

    exons = exonics_positions[transcript]

    # Every exon is examined, whatever the order in which they are stored
    for exon, bounds in exons.items():

        if not bounds[0] <= transcriptome_position <= bounds[1]:
            continue

        # offset of the position from the first base of the exon
        delta = transcriptome_position - bounds[0]

        # NOTE(review): PyRanges is understood to store Start 0-based and End
        # exclusive - confirm that the offsets of the JSON table account for it
        if strand[transcript] == '+':
            return gtf['Start'][f'{transcript}.e{exon}'] + delta

        # on the other strand the transcript runs backwards from the exon End
        return gtf['End'][f'{transcript}.e{exon}'] - delta

    # no exon contains the position (also covers an empty exon table);
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2
    return np.nan

In [12]:
# Transcriptomic alignment positions are used to refine genomic positions
def correct_genomic_position(input_file):
    """Add transcriptome-derived genomic coordinates to a dataset table.

    Reads the tab-separated table ``input_file``, converts the
    ``transcriptomic_start`` and ``transcriptomic_end`` of every row into
    ``corrected_genomic_start`` and ``corrected_genomic_end`` with
    ``transcriptome_based_correction`` and returns the rows having a gene,
    without the transcriptomic coordinate columns.
    """

    table = pd.read_csv(input_file, sep='\t')

    # project both ends of every alignment from the isoform onto the genome
    for side in ('start', 'end'):
        table[f'corrected_genomic_{side}'] = table.apply(
            lambda row, column=f'transcriptomic_{side}': transcriptome_based_correction(row['isoform'], row[column]),
            axis=1,
        )

    # rows without a gene name are discarded
    with_gene = table[table['gene'].notnull()]

    # column order of the returned table
    ordered_columns = ['read', 'gene', 'isoform', 'chromosome', 'genomic_start', 'genomic_end',
                       'corrected_genomic_start', 'corrected_genomic_end',
                       'read_orientation', 'softclip', 'run']

    return with_gene[ordered_columns]

In [13]:
# Apply the transcriptome-based correction to the table saved in section 1.c
corrected_dataset = correct_genomic_position(input_file='dataset.tsv')

# Target type of every column, grouped by kind
col_types = {
    **dict.fromkeys(['read', 'gene', 'isoform', 'chromosome', 'read_orientation', 'run'], object),
    **dict.fromkeys(['genomic_start', 'genomic_end', 'corrected_genomic_start', 'corrected_genomic_end'], float),
    'softclip': int,
}

corrected_dataset = corrected_dataset.astype(col_types)

# Save the table; this overwrites the 'dataset.tsv' file that was just read
corrected_dataset.to_csv('dataset.tsv', sep='\t', index=None)

---

<br>

# 2. Splice Leader (SL) search

This section of the notebook corresponds to the code used for searching splice leader (SL) sequences in the different reads obtained from the sequencing of *C. elegans* transcriptome.

It is composed of three sub-sections:<br>
**a.** Extraction of the last 100bp of the 5' soft-clip sequence.<br>
**b.** Definition of the algorithm used for identifying a SL sequence.<br>
**c.** Inclusion of the SL search result to the dataset table.

## 2.a. Extracting 5' soft-clip sequences

In [14]:
# Extract last 100bp of 5' soft-clip sequence and first 2bp of primary alignment from transcriptomic alignments
# 

def five_prime_softclip(input_file, output_file, max_softclip=100, overhang=2):
    """Write the sequence preceding each primary alignment to a FASTA file.

    For every read that is mapped, neither secondary nor supplementary, and
    carries a sequence, the record holds at most ``max_softclip`` bases
    located right before the aligned part of the read, followed by the first
    ``overhang`` aligned bases.

    Parameters
    ----------
    input_file : str
        Path of the BAM file holding the transcriptomic alignments.
    output_file : str
        Path of the FASTA file to write.
    max_softclip : int, optional
        Maximal number of bases kept before the alignment (default 100).
    overhang : int, optional
        Number of aligned bases appended to the record (default 2).
    """

    # Both handles are closed by the context manager, even on error
    with pysam.AlignmentFile(input_file, 'rb') as alignments, open(output_file, 'w') as fasta:

        for read in alignments:

            if read.is_secondary or read.is_supplementary or read.is_unmapped or read.seq is None:
                continue

            # index of the first aligned base in the read
            # (equivalent to the soft-clip length)
            start = read.query_alignment_start

            # a sequence shorter than max_softclip is taken from its first base
            window_start = max(0, start - max_softclip)
            seq = read.seq[window_start:start + overhang]

            fasta.write(f'>{read.query_name}\n{seq}\n')

In [15]:
# Runs to process
runs = [
    'SSP_1', 'SSP_2', 'SSP_3', 'SSP_4', 'SSP_5', 'SSP_6',
    'SL1_1',
    'NP_1', 'NP_2', 'NP_3', 'NP_4', 'NP_5',
]

# Write one '<run>-five_prime_softclip.fasta' file next to each alignment file
for ID in runs:
    five_prime_softclip(
        input_file=f'/Volumes/elegans/rna_sequencing/{ID}/{ID}-transcriptome_sorted.bam',
        output_file=f'/Volumes/elegans/rna_sequencing/{ID}/{ID}-five_prime_softclip.fasta',
    )

## 2.b. SL Search algorithm

In [16]:
# Scoring scheme of the SL search
DEFAULT_ALIGN_PARAMS = dict(
    match=1,
    mismatch=-1,
    gap_open=2,    # penalty to create a gap
    gap_extend=1,  # penalty to extend a gap (must have created before)
)


# Performs pairwise semi-global alignment
def semi_global_alignment(reference, query, params=DEFAULT_ALIGN_PARAMS):
    """Align two nucleotide sequences with parasail's ``sg_trace_striped_32``.

    ``params`` provides the 'match' and 'mismatch' scores of the "ACGT"
    substitution matrix and the 'gap_open' and 'gap_extend' penalties.
    ``reference`` and ``query`` are handed to parasail in that order.
    Returns the parasail result object.
    """

    scoring_matrix = parasail.matrix_create("ACGT", params['match'], params['mismatch'])

    return parasail.sg_trace_striped_32(reference, query, params['gap_open'], params['gap_extend'], scoring_matrix)

In [17]:
# For each read, all known SL1 and SL2 sequences are matched against their 5' soft-clip sequence.
# A match is returned if the alignment score is higher than or equal to 70% (sensitivity parameter) of the sequence length
# ex: a 10bp sequence will be accepted if the returned score is higher than or equal to 7.

# If no match is returned, the evaluated SL sequence is shortened (on the 5' side) and the match is performed again.
# ex: sequence 'AAATTGTGTGTGT' returned no match, we then search 'ATTGTGTGTGT'

# The process is repeated until we reach a minimal sequence of 7bp. If no match is found, the SL is considered not found.
# If various SL sequences return the same score they are equally accepted

def search_splice_leaders(input_file, output_file, sensitivity=0.7,
                          sl_fasta='/Volumes/elegans/rna_sequencing/ref/SL_sequences.fasta',
                          min_length=7):
    """Search a splice leader (SL) in every sequence of a FASTA file.

    Each known SL is aligned against each read sequence with
    ``semi_global_alignment``. The SL is accepted when the alignment score
    reaches ``sensitivity`` times the SL length; otherwise the SL is
    shortened one base at a time on its 5' side and aligned again, down to
    ``min_length`` bases. Among the accepted SL of a read, the best scoring
    one is reported; SL sharing the best score are separated by their
    distance to the end of the read sequence, and those still tied are all
    reported, joined with ' / '.

    Reads without any accepted SL are left out of the output.

    Parameters
    ----------
    input_file : str
        FASTA file of the sequences to search.
    output_file : str
        Path of the .tsv file to write (columns ``read``, ``SL``, ``score``,
        ``distance_to_start``).
    sensitivity : float, optional
        Minimal score, as a fraction of the length of the searched sequence.
    sl_fasta : str, optional
        FASTA file of the known SL sequences.
    min_length : int, optional
        Shortest SL suffix that is searched.
    """

    # The handle of the SL reference is closed as soon as it has been parsed
    with open(sl_fasta) as seq_file:
        splice_leaders = {record.id: str(record.seq) for record in SeqIO.parse(seq_file, "fasta")}

    reads = {}

    for record in SeqIO.parse(input_file, "fasta"):

        read_seq = str(record.seq)
        read_length = len(record.seq)

        # accepted SL of this read: name -> (score, distance to the read end)
        hits = {}

        for sl_name, sl_seq in splice_leaders.items():

            sl_length = len(sl_seq)

            # the full-length SL is always tried, even when shorter than min_length
            shortest = min(min_length, sl_length)

            # full length first, then progressively shorter suffixes
            for length in range(sl_length, shortest - 1, -1):

                aln = semi_global_alignment(read_seq, sl_seq[-length:])

                if aln.score >= sensitivity * length:
                    # end_query is taken as the last aligned position on the
                    # first sequence given to parasail, i.e. the read
                    # (TODO confirm against the parasail documentation)
                    hits[sl_name] = (aln.score, read_length - (int(aln.end_query) + 1))
                    break

        if not hits:
            continue

        # best score, then smallest distance among the SL reaching it
        top_score = max(score for score, _ in hits.values())
        tied = {sl: dist for sl, (score, dist) in hits.items() if score == top_score}
        small_dist = min(tied.values())
        closest_match = [sl for sl, dist in tied.items() if dist == small_dist]

        reads[record.id] = (' / '.join(closest_match), top_score, small_dist)

    # Passing the columns here keeps the table valid when no read matched
    final = pd.DataFrame.from_dict(reads, orient='index', columns=['SL', 'score', 'distance_to_start'])
    final.index.name = 'read'
    final.to_csv(output_file, sep='\t', index=True)

In [18]:
# Runs to process
runs = [f'SSP_{n}' for n in range(1, 7)] + ['SL1_1'] + [f'NP_{n}' for n in range(1, 6)]

# Search the soft-clip sequences of each run; results go to '<run>-SL_search.tsv'
for ID in runs:
    search_splice_leaders(
        input_file=f'/Volumes/elegans/rna_sequencing/{ID}/{ID}-five_prime_softclip.fasta',
        output_file=f'{ID}-SL_search.tsv',
    )

## 2.c. Processing the result

In [19]:
runs = ['SSP_1', 'SSP_2', 'SSP_3', 'SSP_4', 'SSP_5', 'SSP_6', 'SL1_1', 'NP_1', 'NP_2', 'NP_3', 'NP_4', 'NP_5']

# Open the SL search result of every run and stack them in a single pass:
# growing a dataframe inside the loop copies all previous rows at each step
# and starts from an empty frame
sl_result = pd.concat([pd.read_csv(f'{ID}-SL_search.tsv', sep='\t') for ID in runs], axis=0)

In [20]:
# Attach the SL search result to the dataset; reads without a result keep
# empty SL columns (left join on the read name)
dataset_SL = pd.merge(corrected_dataset, sl_result, how='left', on='read')

In [21]:
# Returns a binary result (FOUND / NOT FOUND) based on the detection of a SL with high confidence or not
# Robust SL are defined based on their alignment score (score of 10 or higher)
# Or based on how close they are found to the start of the alignment

def robust_SL(SL, score, dist):
    """Tell whether a splice leader match is trusted.

    Returns 'FOUND' when ``SL`` is truthy and either ``score`` is above 9 or
    ``dist`` is below 3, 'NOT FOUND' otherwise.
    """

    # nothing was matched for this read
    if not SL:
        return 'NOT FOUND'

    # high score, or match ending right next to the start of the alignment
    if score > 9 or dist < 3:
        return 'FOUND'

    return 'NOT FOUND'

In [22]:
# Flag every read with the outcome of robust_SL
dataset_SL['ROBUST_SL'] = dataset_SL.apply(
    lambda row: robust_SL(row['SL'], row['score'], row['distance_to_start']),
    axis=1,
)

In [23]:
def robust_variant(SL, score, dist):
    """Tell whether the matched splice leader variant is trusted.

    Returns 'FOUND' when ``SL`` is truthy and either ``score`` is above 9, or
    ``score`` is above 7 with ``dist`` below 3; 'NOT FOUND' otherwise.
    """

    # nothing was matched for this read
    if not SL:
        return 'NOT FOUND'

    # any match scoring 10 or more, or a match scoring 8 or 9 that ends
    # immediately next to the start of the alignment
    if score > 9 or (score > 7 and dist < 3):
        return 'FOUND'

    return 'NOT FOUND'

In [24]:
# Flag every read with the outcome of robust_variant
dataset_SL['ROBUST_VARIANT'] = dataset_SL.apply(
    lambda row: robust_variant(row['SL'], row['score'], row['distance_to_start']),
    axis=1,
)

In [25]:
# Save the dataset completed with the SL search columns
dataset_SL.to_csv(path_or_buf='dataset_+SL.tsv', sep='\t', index=None)

---

<br>

# 3. Hairpin Search

This section of the notebook corresponds to the code used for searching hairpin sequences (referred to as **hairpin mimics**) in non-SL reads.


**a.** Definition of the algorithm used for identifying hairpin mimics.<br>
**b.** Processing result of the search algorithm and integration to the dataset table.<br>
**c.** Determination of SL and hairpin mimic events in each gene.<br>

## 3.a. Hairpin mimic search algorithm

In [26]:
runs = [
    'SSP_1', 'SSP_2', 'SSP_3', 'SSP_4', 'SSP_5', 'SSP_6',
    'SL1_1',
    'NP_1', 'NP_2', 'NP_3', 'NP_4', 'NP_5',
]

# Open the transcriptomic alignment file of every run and keep the handles
# in a dictionary (run name -> open file) for easier manipulation
run_files = {
    ID: pysam.AlignmentFile(f'/Volumes/elegans/rna_sequencing/{ID}/{ID}-transcriptome_sorted.bam', 'rb')
    for ID in runs
}

In [27]:
# Scoring scheme of the hairpin search. Running this cell replaces the
# DEFAULT_ALIGN_PARAMS and semi_global_alignment defined in section 2.b:
# here a mismatch scores 0 instead of -1.
DEFAULT_ALIGN_PARAMS = dict(
    match=1,
    mismatch=0,
    gap_open=2,    # penalty to create a gap
    gap_extend=1,  # penalty to extend a gap (must have created before)
)


def semi_global_alignment(reference, query, params=DEFAULT_ALIGN_PARAMS):
    """Align two nucleotide sequences with parasail's ``sg_trace_striped_32``.

    ``params`` provides the 'match' and 'mismatch' scores of the "ACGT"
    substitution matrix and the 'gap_open' and 'gap_extend' penalties.
    ``reference`` and ``query`` are handed to parasail in that order.
    Returns the parasail result object.
    """

    scoring_matrix = parasail.matrix_create("ACGT", params['match'], params['mismatch'])

    return parasail.sg_trace_striped_32(reference, query, params['gap_open'], params['gap_extend'], scoring_matrix)

In [28]:
# Results are streamed to a .tsv file, one line per evaluated read
with open('hairpin_search.tsv', 'w+') as out:

    # header
    out.write('read\tHAIRPIN_SCORE\tHAIRPIN_SEARCH\n')

    for ID in runs:

        # alignment file opened beforehand for this run
        alignments = run_files[ID]

        for read in alignments:

            # only evaluate primary alignments carrying a sequence
            if read.is_unmapped or read.is_secondary or read.is_supplementary or read.seq is None:
                continue

            name = read.query_name

            # reads without qualities or not aligned in reverse get no line at all
            if read.query_qualities is None or not read.is_reverse:
                continue

            start = read.query_alignment_start

            # unaligned end too short: no search is performed, a score of 0 is reported
            if start <= 20:
                out.write(f'{name}\t{0}\tNOT FOUND\n')
                continue

            # reverse complement of the 13 bases preceding the alignment
            # plus its first 2 aligned bases
            softclip_region = str(Seq(read.seq[start-13:start+2]).reverse_complement())

            # the 40 bases that follow in the aligned sequence
            aligned_region = str(read.seq[start+2:start+42])

            # perform semi-global alignment and retrieve score
            aln = semi_global_alignment(aligned_region, softclip_region)
            score = aln.score

            # a match requires a score of at least 12 for the 15bp searched
            if score >= 12:
                out.write(f'{name}\t{score}\tFOUND\n')
            else:
                out.write(f'{name}\t{score}\tNOT FOUND\n')


## 3.b. Processing the result

In [29]:
# Load the table written by the hairpin search
hairpin_result = pd.read_csv('hairpin_search.tsv', sep='\t')

# Attach it to the dataset holding the SL columns; reads without a hairpin
# search line keep empty hairpin columns (left join on the read name)
dataset_SL_hairpin = pd.merge(dataset_SL, hairpin_result, how='left', on='read')

# Save the complete dataset
dataset_SL_hairpin.to_csv(path_or_buf='dataset_+SL_+hairpin.tsv', sep='\t', index=None)

## 3.c. Determination of SL and hairpin mimic events in each detected gene

In [None]:
# Compute stats per gene per start positions (%SL / %hairpin / %unidentified)
# One output row is produced per (gene, corrected genomic start) pair and the
# table is saved as 'SL_&_mimic_positions.tsv'.

# Parallel accumulators: each list receives exactly one value per group
_GENE = []
_POS = []
_TOTAL = []
_SUBTOTAL = []
_SL = []
_SLpercent = []
_HAIRPIN = []
_HAIRPINpercent = []
_UNIDENTIFIED = []
_UNIDENTIFIEDpercent = []

_SL1 = []
_SL2 = []
_RATIO = []

# NOTE(review): with pandas' default groupby settings, rows whose gene or
# corrected_genomic_start is missing should not form any group - confirm this is intended
for (gene, position), reads in dataset_SL_hairpin.groupby(['gene','corrected_genomic_start']):

    # gene
    _GENE.append(gene)
    
    # position
    _POS.append(int(position))
    
    # Total of reads at a given position
    total = len(reads)
    _TOTAL.append(total)
    
    # Sense reads from SL1_1 experiment are not used for counting SL1/SL2/Hairpin percentages
    # (kept: every read of the other runs + the antisense reads of SL1_1)
    reads = pd.concat([reads[reads['run'] != 'SL1_1'], 
                       reads[(reads['run'] == 'SL1_1') & (reads['read_orientation'] == 'antisense')]])
    
    # count nb of reads used for measuring percentages
    percent_tot = len(reads)
    _SUBTOTAL.append(percent_tot)
    
    if percent_tot > 0:
    
        #### count SL robust (no matter the variant)
        sl = reads[reads['ROBUST_SL'] == 'FOUND']
        _SL.append(len(sl))
        sl_percent = round(len(sl) / percent_tot * 100,  2)
        _SLpercent.append(sl_percent)
        
        
        # get details on SL1 and SL2 -> subgroup of ROBUST SL for which we have a ROBUST VARIANT
        variants = reads[reads['ROBUST_VARIANT'] == 'FOUND']
        # count SL1
        # NOTE(review): this is a substring test - a name such as 'SL10' or a tie
        # written 'SL1 / SL2' would also be counted as SL1; verify against the
        # names used in SL_sequences.fasta
        _sl1 = len(variants[variants['SL'].str.contains('SL1')])
        _SL1.append(_sl1)
        # count SL2 
        # (every robust variant whose name does not contain 'SL1')
        _sl2 = len(variants[~variants['SL'].str.contains('SL1')])
        _SL2.append(_sl2)
        # measure SL2/SL1 ratio
        # (computed as SL2 / (SL1 + SL2), i.e. the fraction of SL2 among robust variants)
        if _sl1+_sl2 > 0:
            ratio = _sl2/(_sl1+_sl2)
        else:
            ratio = None
        _RATIO.append(ratio)
        

        #### Hairpin positive reads
        # antisense reads without a robust SL for which the hairpin search succeeded
        sl_reads = list(sl['read'])
        hairpin = reads[(reads['read_orientation'] == 'antisense') & (reads['read'].isin(sl_reads) == False) & (reads['HAIRPIN_SEARCH']=='FOUND')]
        _HAIRPIN.append(len(hairpin))
        hairpin_percent = round(len(hairpin) / percent_tot * 100, 2)
        _HAIRPINpercent.append(hairpin_percent)

        #### Unidentified reads
        # reads that are neither robust SL reads nor hairpin positive reads
        hairpin_reads = list(hairpin['read'])
        unidentified = reads[reads['read'].isin(sl_reads+hairpin_reads) == False]
        _UNIDENTIFIED.append(len(unidentified))
        unidentified_percent = round(len(unidentified) / percent_tot * 100, 2)
        _UNIDENTIFIEDpercent.append(unidentified_percent)
    
    else:
        # no read left after the SL1_1 filter: every count is 0
        _SLpercent.append(0)
        _SL.append(0)

        _SL1.append(0)
        _SL2.append(0)
        _RATIO.append(None)
        
        _HAIRPIN.append(0)
        _HAIRPINpercent.append(0)
        
        # NOTE(review): the percentage is set to 100 although the count is 0 - confirm this is intended
        _UNIDENTIFIED.append(0)
        _UNIDENTIFIEDpercent.append(100)

    
    
# create dataframe
result = pd.DataFrame(dict(gene=_GENE, position=_POS, total=_TOTAL, evaluated=_SUBTOTAL, SL=_SL, SL1=_SL1, SL2=_SL2, hairpin=_HAIRPIN, unidentified=_UNIDENTIFIED,
                           SLpercent=_SLpercent, hairpinpercent=_HAIRPINpercent, unidentifiedpercent=_UNIDENTIFIEDpercent, SL2_ratio=_RATIO))

# rename the percentage columns to their final '%...' names (same column order as above)
result.columns = ['gene', 'position', 'total', 'evaluated', 'SL', 'SL1', 'SL2', 'hairpin', 'unidentified', '%SL', '%hairpin', '%unidentified', 'SL2_ratio']

result = result.sort_values(['gene','position'])

# save table
result.to_csv('SL_&_mimic_positions.tsv', sep='\t', index=None)