# Design oligo sequences of STRs used for lentiMPRA

### need to account for strandedness 24.07.30 😭

In [1]:
# imports
import argparse
import hashlib
import numpy as np
import pandas as pd
import os
import pyfaidx
import random
import subprocess
import sys

from trtools.utils import utils

In [2]:
# Global variables for oligo construction. These cannot change
# R2 primer-(var + genomic context)-minP-spacer-filler

# FIVE_PRIME_ADAPT = 'AGGACCGGATCAACT'

R2_PRIMER = 'GTGCTCTTCCGATCT'
GLOBAL_FILLER_SEQ = 'TAACCAGGCGTGTTAGCTGCTGTGCTGTCCTACGAGTAAACAGTAGAGTCC\
    GTGGGCACGCGAGCACGGTGAGTCGACTCTGGCCTCATCACCATTTAGTTTGCGCAAGCGCTCTTTTTA\
        TAGGACCTGTCTTACATCCCTCATTAACGGAATCGATTACCGGCTAGCGTTGAAATGGAGAAACCG\
            GCTTGCAGTCGAAA'
MIN_PROMOTER = 'CACTAGAGGGTATATAATGGAAGCTCGACTTCCAGCTTGGCAATCCGGTACTGT'
SPACER = 'GCAAAGTGAACACATCGCTAAGCGAAAGCTAAG'
PROBE_LEN = 300

MIN_STR_LEN = 0
MAX_STR_LEN = 105

MAX_ALLELE_LEN = 5
MIN_FILLER_LEN = 0
REQUIRED_ELTS_LEN = len(R2_PRIMER) + len(MIN_PROMOTER) + len(SPACER)

MAX_VREG = PROBE_LEN - (REQUIRED_ELTS_LEN)

In [3]:
def create_probes (path_genome, path_bed_file):
    """
    creates oligo library to be used for MPRA experiment to study STRs
    parameter:
        1. path_genome
            - path to the genome of the organism of interest
        2. path_bed_file
            - path to the bed file 
                > must contain chr, start_str, end_str, rpt_unit
    output:
    """

    # create an empty list to be converted to a dataframe later
    array_probes_list = []

    # load in fasta files using pyfaidx and bed files using pandas
    genome = pyfaidx.Fasta(path_genome)
    bed_df = pd.read_csv(path_bed_file)

    bed_df['len_str'] = bed_df['end_str'] - bed_df['start_str']

    chromosome = bed_df['chrchrom']
    start = bed_df['start_str']
    end = bed_df['end_str']
    rptUnit = bed_df['rpt_unit']

    # figure out what is the maximum context length that can be generated

    # add in a column called max_str_size which is the longest str region that
    # will be generated (5 * lenght of rpt unit) + length of the str region
    bed_df['max_str_size'] = (MAX_ALLELE_LEN * bed_df['rpt_unit']).str.len() + \
        bed_df['len_str']
    
    # max_rpt_unit_size is the maximum of the max_str_size column
    max_rpt_unit_size = np.max(bed_df['max_str_size'])
    
    # chooses the smaller length between the maximum length of str and maximum
    # repeat unit length
    max_bp_len = np.min([MAX_STR_LEN, max_rpt_unit_size])

    # gives the maximum amount of context that can be generated; all oligos
    # follow this maximum
    ### could also generate maximum for each oligo (unsure if we want that...)
    max_context_size = PROBE_LEN - REQUIRED_ELTS_LEN - MIN_FILLER_LEN - \
        max_bp_len
    
    copy_num_variation = ['m5', 'p5']

    for (chrom, str_start, str_end, rpt) in zip(chromosome, start, end, rptUnit):
        left_flank = str(genome[chrom][str_start-int(np.ceil(max_context_size/2)):str_start]).upper()
        str_refseq = str(genome[chrom][str_start:str_end]).upper()
        right_flank = str(genome[chrom][str_end:str_end+int(np.floor(max_context_size/2))]).upper()

        ref_seq = chrom+','+str(str_start)+','+str(str_end)+','+rpt+\
                    ',ref,'+left_flank+','+str_refseq+','+right_flank
    

        for copy_num_var in copy_num_variation:
            if copy_num_var == "p5":
                str_p5seq = str_refseq + (5*rpt)
                p5_seq = chrom+','+str(str_start)+','+str(str_end)+','+rpt+\
                    ',p5,'+left_flank+','+str_p5seq+','+right_flank
                array_probes_list.append(p5_seq)
            if copy_num_var == "m5":
                str_m5seq = str_refseq[0:-1*(len(rpt) * 5)]
                m5_seq = chrom+','+str(str_start)+','+str(str_end)+','+rpt+\
                    ',m5,'+left_flank+','+str_m5seq+','+right_flank
                array_probes_list.append(m5_seq)

        array_probes_list.append(ref_seq)


    split_strings = [string.split(',') for string in array_probes_list]
    column_headers = ['chr', 'str start', 'str end', 'motif', 'allele', 'left', 
                    'str', 'right']
    context_df = pd.DataFrame(split_strings, columns=column_headers)
    context_df = context_df[context_df['str'].str.len() <= 100]

    context_df['probeseq'] = context_df['left'] + context_df['str'] + context_df['right']

    vreg_len = (context_df['probeseq']).str.len()
    vreg = context_df['probeseq']

    filler_len = []
    oligo_list = []
    size_check = []

    for i in vreg_len:
        filler_len.append(PROBE_LEN-i-REQUIRED_ELTS_LEN)


    for j, seq in zip(filler_len, vreg):
        assert(j >= MIN_FILLER_LEN)
        filler_seq = GLOBAL_FILLER_SEQ[0:j]
        size_check.append(R2_PRIMER + seq + MIN_PROMOTER + 
                          SPACER + filler_seq)
        oligo_list.append(R2_PRIMER + '\t' + seq + '\t' + MIN_PROMOTER + '\t' + 
                          SPACER + '\t' + filler_seq)
    
    for line in size_check:
        assert(len(line) == PROBE_LEN)
    
    split_strings = [string.split('\t') for string in oligo_list]

    column_headers = ['R2 primer', 'probeseq', 'minP', 'spacer', 
                    'filler']
    oligo_df = pd.DataFrame(split_strings, columns=column_headers)

    test_df = pd.merge(left=context_df, right=oligo_df)
    test_df['final_oligo'] = test_df['R2 primer'] + test_df['probeseq'] + \
        test_df['minP'] + test_df['spacer'] + test_df['filler']

    test_df = test_df.drop_duplicates(subset='final_oligo')

    return(test_df)


In [4]:
oligos = create_probes(path_genome='/Users/user/Desktop/MPRA/lentiSTR design/hg38.fa', 
                       path_bed_file='/Users/user/Desktop/MPRA/oligos_v1.bed')

In [5]:
oligos

Unnamed: 0,chr,str start,str end,motif,allele,left,str,right,probeseq,R2 primer,minP,spacer,filler,final_oligo
0,chr21,5129000,5129018,T,m5,GCTTGATCTGGGTTAGTCTGGGGGCAAGTGAGCCGCCGATTCTGTT...,TTTTTTTTTTTTT,GAGACGGAGTTTCCCTCTTGTTGCCCGGGCTGGAGTGCAATGGCGC...,GCTTGATCTGGGTTAGTCTGGGGGCAAGTGAGCCGCCGATTCTGTT...,GTGCTCTTCCGATCT,CACTAGAGGGTATATAATGGAAGCTCGACTTCCAGCTTGGCAATCC...,GCAAAGTGAACACATCGCTAAGCGAAAGCTAAG,TAACCAGGCGTGTTAGCTGCTGTGCTGTCCTACGAGTAAACAGTAG...,GTGCTCTTCCGATCTGCTTGATCTGGGTTAGTCTGGGGGCAAGTGA...
1,chr21,5129000,5129018,T,p5,GCTTGATCTGGGTTAGTCTGGGGGCAAGTGAGCCGCCGATTCTGTT...,TTTTTTTTTTTTTTTTTTTTTTT,GAGACGGAGTTTCCCTCTTGTTGCCCGGGCTGGAGTGCAATGGCGC...,GCTTGATCTGGGTTAGTCTGGGGGCAAGTGAGCCGCCGATTCTGTT...,GTGCTCTTCCGATCT,CACTAGAGGGTATATAATGGAAGCTCGACTTCCAGCTTGGCAATCC...,GCAAAGTGAACACATCGCTAAGCGAAAGCTAAG,TAACCAGGCGTGTTAGCTGCTGTGCTGTCCTACGAGTAAACAGTAG...,GTGCTCTTCCGATCTGCTTGATCTGGGTTAGTCTGGGGGCAAGTGA...
2,chr21,5129000,5129018,T,ref,GCTTGATCTGGGTTAGTCTGGGGGCAAGTGAGCCGCCGATTCTGTT...,TTTTTTTTTTTTTTTTTT,GAGACGGAGTTTCCCTCTTGTTGCCCGGGCTGGAGTGCAATGGCGC...,GCTTGATCTGGGTTAGTCTGGGGGCAAGTGAGCCGCCGATTCTGTT...,GTGCTCTTCCGATCT,CACTAGAGGGTATATAATGGAAGCTCGACTTCCAGCTTGGCAATCC...,GCAAAGTGAACACATCGCTAAGCGAAAGCTAAG,TAACCAGGCGTGTTAGCTGCTGTGCTGTCCTACGAGTAAACAGTAG...,GTGCTCTTCCGATCTGCTTGATCTGGGTTAGTCTGGGGGCAAGTGA...
3,chr21,6468992,6469013,TTG,m5,CCTCCCTCAGAGCCTTCAGCCAGCCTGCGGGAGAGAAAGATGATAA...,GTTGTT,TGAGACAGAGTCTGTGTCACACAGGCTGGAGTGCAATGGTGAGATC...,CCTCCCTCAGAGCCTTCAGCCAGCCTGCGGGAGAGAAAGATGATAA...,GTGCTCTTCCGATCT,CACTAGAGGGTATATAATGGAAGCTCGACTTCCAGCTTGGCAATCC...,GCAAAGTGAACACATCGCTAAGCGAAAGCTAAG,TAACCAGGCGTGTTAGCTGCTGTGCTGTCCTACGAGTAAACAGTAG...,GTGCTCTTCCGATCTCCTCCCTCAGAGCCTTCAGCCAGCCTGCGGG...
4,chr21,6468992,6469013,TTG,p5,CCTCCCTCAGAGCCTTCAGCCAGCCTGCGGGAGAGAAAGATGATAA...,GTTGTTTTTGTTGTTGTTGTTTTGTTGTTGTTGTTG,TGAGACAGAGTCTGTGTCACACAGGCTGGAGTGCAATGGTGAGATC...,CCTCCCTCAGAGCCTTCAGCCAGCCTGCGGGAGAGAAAGATGATAA...,GTGCTCTTCCGATCT,CACTAGAGGGTATATAATGGAAGCTCGACTTCCAGCTTGGCAATCC...,GCAAAGTGAACACATCGCTAAGCGAAAGCTAAG,TAACCAGGCGTGTTAGCTGCTGTGCTGTCCTACGAGTAAACAGTAG...,GTGCTCTTCCGATCTCCTCCCTCAGAGCCTTCAGCCAGCCTGCGGG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598,chr3,136449901,136449918,A,p5,AACTTTTTAGTAGTTATCTCATACTTATCAATTTCTGAAACTATTG...,TTTTTTTTTTTTTTTTTAAAAA,AGAGACAGGGTCTTGCACAGGCTGGAGTACAAATGGAGCGATCATA...,AACTTTTTAGTAGTTATCTCATACTTATCAATTTCTGAAACTATTG...,GTGCTCTTCCGATCT,CACTAGAGGGTATATAATGGAAGCTCGACTTCCAGCTTGGCAATCC...,GCAAAGTGAACACATCGCTAAGCGAAAGCTAAG,TAACCAGGCGTGTTAGCTGCTGTGCTGTCCTACGAGTAAACAGTAG...,GTGCTCTTCCGATCTAACTTTTTAGTAGTTATCTCATACTTATCAA...
599,chr3,136449901,136449918,A,ref,AACTTTTTAGTAGTTATCTCATACTTATCAATTTCTGAAACTATTG...,TTTTTTTTTTTTTTTTT,AGAGACAGGGTCTTGCACAGGCTGGAGTACAAATGGAGCGATCATA...,AACTTTTTAGTAGTTATCTCATACTTATCAATTTCTGAAACTATTG...,GTGCTCTTCCGATCT,CACTAGAGGGTATATAATGGAAGCTCGACTTCCAGCTTGGCAATCC...,GCAAAGTGAACACATCGCTAAGCGAAAGCTAAG,TAACCAGGCGTGTTAGCTGCTGTGCTGTCCTACGAGTAAACAGTAG...,GTGCTCTTCCGATCTAACTTTTTAGTAGTTATCTCATACTTATCAA...
600,chr10,44390336,44390351,A,m5,GATCACACCACTGCACTTCAGTCTAGGTGACAAGAGCAAAACTCTG...,AAAAAAAAAA,GGATATTATGAAGGCTACAGGGGAAGAGATGCATAGGGCGAGGTAT...,GATCACACCACTGCACTTCAGTCTAGGTGACAAGAGCAAAACTCTG...,GTGCTCTTCCGATCT,CACTAGAGGGTATATAATGGAAGCTCGACTTCCAGCTTGGCAATCC...,GCAAAGTGAACACATCGCTAAGCGAAAGCTAAG,TAACCAGGCGTGTTAGCTGCTGTGCTGTCCTACGAGTAAACAGTAG...,GTGCTCTTCCGATCTGATCACACCACTGCACTTCAGTCTAGGTGAC...
601,chr10,44390336,44390351,A,p5,GATCACACCACTGCACTTCAGTCTAGGTGACAAGAGCAAAACTCTG...,AAAAAAAAAAAAAAAAAAAA,GGATATTATGAAGGCTACAGGGGAAGAGATGCATAGGGCGAGGTAT...,GATCACACCACTGCACTTCAGTCTAGGTGACAAGAGCAAAACTCTG...,GTGCTCTTCCGATCT,CACTAGAGGGTATATAATGGAAGCTCGACTTCCAGCTTGGCAATCC...,GCAAAGTGAACACATCGCTAAGCGAAAGCTAAG,TAACCAGGCGTGTTAGCTGCTGTGCTGTCCTACGAGTAAACAGTAG...,GTGCTCTTCCGATCTGATCACACCACTGCACTTCAGTCTAGGTGAC...
