In [None]:
import gc
import itertools
import matplotlib.pyplot as plt
import numpy as np
import random

import pandas as pd
import rpy2.rinterface as rinterface
import rpy2.robjects as robjects
import tqdm
import seaborn as sns
import scipy.stats as stats

from itertools import compress
from Bio import motifs
from Bio.Seq import Seq #, IUPAC
from collections import Counter
from os import listdir
from os.path import join
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from scipy.stats import ks_2samp
from statistics import mean, median

# Load the rpy2 IPython extension so R code can be run from notebook cells.
%load_ext rpy2.ipython

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Switch on rpy2's global pandas <-> R conversion.
# NOTE(review): global activation is reportedly deprecated in recent rpy2 releases
# in favour of local converters - confirm against the rpy2 version in use.
pandas2ri.activate()
# Turn matplotlib interactive mode off, so figures are not redrawn after every plotting command.
plt.ioff()

In [None]:
def readin_fastq(core_path,filename):
    """Read the sequence lines of a FASTQ file into a one-column DataFrame.

    The file is parsed line by line (one text line per row) and every 4th line
    starting at the second one - i.e. the sequence line of each 4-line FASTQ
    record - is kept.

    Parameters
    ----------
    core_path : str
        Directory that contains the FASTQ file.
    filename : str
        Name of the FASTQ file inside ``core_path``.

    Returns
    -------
    pandas.DataFrame
        The sequence rows of the file; the index holds the original line
        numbers (1, 5, 9, ...) and the sequences are Python strings.

    Notes
    -----
    FASTQ is not a CSV dialect, so the CSV conveniences of ``read_csv`` are
    switched off explicitly:

    * ``quoting=csv.QUOTE_NONE`` - a ``"`` is a legal quality character and must
      not open a quoted field that swallows the following lines;
    * ``skip_blank_lines=False`` - an empty (zero-length) read must still occupy
      its line, otherwise all later records fall out of the 4-line stride;
    * ``keep_default_na=False`` / ``dtype=str`` - lines such as ``NA`` stay
      strings instead of becoming float NaN.
    """
    import csv  # local import: only the QUOTE_NONE constant is needed here

    seqsraw = pd.read_csv(
        join(core_path, filename),
        sep="\t",  # a separator that does not occur in FASTQ lines -> one column per line
        header=None,
        dtype=str,
        quoting=csv.QUOTE_NONE,
        keep_default_na=False,
        skip_blank_lines=False,
    )
    # line 2 of every 4-line record (0-based rows 1, 5, 9, ...) is the read sequence
    seqs = seqsraw.iloc[1::4, :]
    return seqs

In [None]:
# Directory that contains the FASTQ files of the library
core_path = '/path_to_fastq_files/'

# os.listdir() returns entries in arbitrary order. Sort them so that the i-th gDNA
# file and the i-th mRNA file belong to the same replicate on every run.
# NOTE: the sort is lexicographic ("rep10" sorts before "rep2"); zero-pad replicate
# numbers in the file names if there are 10 or more replicates.
files = sorted(listdir(core_path))
filesx = [name for name in files if "libname" in name] # e.g. 5p7 (use a string specific to the library fastq files)
ffiles = [name for name in filesx if "R1" in name] # for EXTRA-seq data, the relevant BC is in read1, only for the STARR-seq library 3.5 is the BC in read2 due to a different assembly strategy.

# split files into genomic (g) DNA and mRNA samples
dnafiles = [name for name in ffiles if 'gDNA' in name]
print(dnafiles)
rnafiles = [name for name in ffiles if 'mRNA' in name]
print(rnafiles)

repl = 5 # define number of replicates (corresponds to the number of fastq files for that library)

# Fail early with a clear message instead of an IndexError when the files are loaded.
if min(len(dnafiles), len(rnafiles)) < repl:
    raise ValueError(
        "expected at least {} gDNA and {} mRNA read-1 FASTQ files in {!r}, found {} gDNA and {} mRNA".format(
            repl, repl, core_path, len(dnafiles), len(rnafiles)
        )
    )


In [None]:
# Replicate i is taken from position i of rnafiles / dnafiles, so both lists
# must already be ordered by replicate before this cell is run!

# read sequences of the first `repl` mRNA files, one DataFrame per replicate
mRNA_seqs = [
    readin_fastq(core_path, rnafiles[rep_idx])
    for rep_idx in range(repl)
]
# read sequences of the first `repl` gDNA files, one DataFrame per replicate
gDNA_seqs = [
    readin_fastq(core_path, dnafiles[rep_idx])
    for rep_idx in range(repl)
]


In [None]:
## function to extract and count BCs

pri_flank = 'GACAGACC' # upstream sequence of the fixed exogenous primer that was placed upstream of the barcode in the 5'UTR
loxp= 'ATAACTTC' # loxP sequence which is located downstream of the barcode sequence

def count_BC(fastq,le,ri,start,end,coln = ['prim_flank','prim_flank_sh', 'lox_fl', 'BC']):
    """Count barcodes (BCs) in reads whose flanking sequences are both correct.

    A read contributes its barcode ``read[start:end]`` only if the bases directly
    upstream of ``start`` equal ``le`` and the bases directly downstream of
    ``end`` equal ``ri``.

    Parameters
    ----------
    fastq : pandas.DataFrame
        Reads as returned by ``readin_fastq``; the sequences are taken from the
        first column.
    le : str
        Expected primer flank immediately upstream of the barcode (``pri_flank``).
    ri : str
        Expected loxP flank immediately downstream of the barcode (``loxp``).
    start : int
        0-based position where the BC starts in the read. This is 20 for
        EXTRA-seq libraries and needs to be adjusted for the STARR-seq libraries
        (according to the respective design).
    end : int
        0-based position one past the last BC base; depends on the number of Ns
        used in the barcode (N = 10, 12 or 14).
    coln : list of str
        Kept for backward compatibility only. It used to name the columns of an
        intermediate table and never influenced the returned counts.

    Returns
    -------
    pandas.DataFrame
        One row per distinct barcode in order of first appearance, with column
        ``'index'`` (barcode sequence) and column ``0`` (read count). Both
        columns are present even when no read passes the flank check.

    Raises
    ------
    ValueError
        If ``le`` is longer than ``start``, i.e. the upstream flank cannot fit in
        front of the barcode (a negative slice start would silently wrap around
        to the end of the read).
    """
    if start < len(le):
        raise ValueError(
            "upstream flank of length {} does not fit before barcode start {}".format(len(le), start)
        )

    # flank windows are sized from the flanks themselves (8 bp for pri_flank / loxp)
    le_from = start - len(le)
    ri_to = end + len(ri)

    # a frame without any column has no reads to look at
    reads = fastq.iloc[:, 0].tolist() if fastq.shape[1] else []

    # Single pass over plain Python strings; per-row .iloc lookups are orders of
    # magnitude slower on millions of reads. Non-string entries (e.g. NaN) are skipped.
    counts = Counter(
        read[start:end]
        for read in reads
        if isinstance(read, str)
        and read[le_from:start] == le   # keep only BC seqs where the primer flank is correct
        and read[end:ri_to] == ri       # keep only BC seqs where the loxP flank is correct
    )

    if not counts:
        # from_dict() on an empty mapping would drop the count column; keep the schema stable
        return pd.DataFrame({'index': pd.Series(dtype=object), 0: pd.Series(dtype='int64')})

    cdf = pd.DataFrame.from_dict(counts, orient='index').reset_index() # count BCs
    return cdf

In [None]:
# Example for a library with 12 bp barcodes (i.e. the 5.7 library):
# the barcode is the read slice 20:32, one count table per gDNA replicate.
DFg = [
    count_BC(
        gdna_reads,
        pri_flank,
        loxp,
        20,
        32,
        coln=['prim_flank', 'prim_flank_sh', 'lox_fl', 'BC'],
    )
    for gdna_reads in gDNA_seqs
]


### Save to file


In [None]:
# Example: write the barcode counts of the first gDNA replicate to CSV.
# The file name includes information about PCR cycle numbers during amplification.
DFg[0].to_csv(
    path_or_buf='/lib_path/count_data_BC12bp_read1_flank_checked_RMCE_5p7_19_8_7_cyc_gDNA_rep1_11_20_2023.csv',
)