Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import random
import csv
import math
import sys
import os
import numpy as np
import pathlib
from pathlib import Path


'''
Inputs: a BED-format file (with or without strand information), an output directory, a logical flag indicating whether the input has strand information, and the sizes used to adjust the BED regions.

Primary outputs: the set of new BED regions built from each input size, the corresponding merged sites, the merged sites reduced to dyads, and the dyads extended to the input "footprint" size.

The outputs are then used for bedtools calculations to provide:
- the original BED entries overlapped by each merged "footprint", printed together in a new file (-C flag)
- counts of the number of original dyads involved in each merged "footprint" interval
'''

# Run configuration. The values are hard-coded for notebook use; the trailing
# "sys.argv" notes show the command-line argument each one would be read from.

# BED-format file of dyads or regions (the midpoint is calculated during
# conversion); dyads are required for the coverage calculations.
infile = "/content/drive/MyDrive/Abf1_all_sf_MACS3_peaks_dyads.bed" # sys.argv[1]
# Directory where output files are written and new sub-directories are made.
outdir = "/content/drive/MyDrive/calcs" # sys.argv[2]
# Logical flag as a string ("TRUE" or "FALSE"): does the input file carry
# strand info that should be copied? Used for the range extension only.
strand = "FALSE" # sys.argv[3]
# Values used as symmetric modifiers of the "footprint" size: each value s
# produces the interval midpoint - s .. midpoint + s.
sizes = [0,1,2,3,4,5,6,7,8,9,10] # list(sys.argv[4:])

def sort_key(line):
    """Return a sort key for one BED line, similar to a bash-style sort.

    The key is ``(chrom, start, end)`` with both coordinates converted to
    integers. Lines with fewer than three whitespace-separated fields, or
    with non-integer coordinates, get an empty tuple so that all invalid
    lines sort together.
    """
    columns = line.split()
    try:
        chrom, start, end = columns[0], int(columns[1]), int(columns[2])
    except (IndexError, ValueError):
        return ()
    return chrom, start, end

# Create the output directory. An existing directory is accepted so that the
# cell can be re-run without failing.
os.makedirs(outdir, exist_ok=True)

# Derive the output names from the input file name:
#   out_root - path prefix shared by the output files written below
#   name     - first two '_'-separated tokens of the input stem, used to
#              label the sites written to the BED files
out_name = Path(infile).stem
out_root = str(outdir + '/' + out_name)
sep = '_'
name_list = out_name.split(sep)[0:2]
name = sep.join(name_list)
srtout = str(out_root + '_sorted.bed')

# Read the input; the with-block closes the file, no explicit close() needed.
with open(infile) as inlines:
    lines = inlines.readlines()

# A final line without a trailing newline would be glued to the next record
# once sorting moves it away from the end of the file, so terminate every line.
lines = [ln if ln.endswith('\n') else ln + '\n' for ln in lines]

# Order by chromosome, start, end; invalid lines are grouped together.
lines.sort(key=sort_key)

with open(srtout, 'w') as sortfile:
    sortfile.writelines(lines)

# Build new BED regions based on input dyads and requested intervals.
# For every requested size s, each sorted entry is replaced by the interval
# midpoint - s .. midpoint + s and written to "<out_root>_<2*s>.bed".
print('Converting dyads to ' + str(len(sizes)) + ' interval(s)')

# The sorted entries are the same for every size, so read them only once.
# Blank rows are dropped; they carry no coordinates.
with open(srtout) as sorted_bed:
    entries = [row for row in csv.reader(sorted_bed, delimiter='\t') if row]

for s in sizes:
    half = int(s)  # int() because the sizes may be given as strings
    width = str(2 * half)
    bed_name = str(out_root + '_' + width + '.bed')
    # The with-block guarantees the file is flushed and closed before the
    # later steps read it back.
    with open(bed_name, 'w') as new_bed:
        for i, e in enumerate(entries):
            chrom = e[0]
            # Midpoint of the entry, rounded down (coordinates are non-negative).
            m_p = (int(e[1]) + int(e[2])) // 2
            if half <= m_p:
                b_v = str(m_p - half)
                a_v = str(m_p + half)
            else:
                # Extending by the full size would give a negative start, so
                # the interval is clipped at 0 and kept symmetric around m_p.
                b_v = str(0)
                a_v = str(m_p * 2)
                print('Negative entry detected and fixed')

            site = name + '_' + width + '_site_' + str(i + 1)
            # The name column is always followed by a tab, which leaves an
            # empty column after it (output layout kept as it was).
            fields = [chrom, b_v, a_v, site, '']
            if strand == 'TRUE':
                # NOTE(review): the strand is copied from the 5th input column;
                # standard BED6 stores it in the 6th - confirm the input layout.
                fields.append(e[4])
            new_bed.write('\t'.join(fields) + '\n')

print('Dyads converted to BED intervals from requested ranges')

# Merge overlapping regions in each of the new BED files (a pure-Python
# equivalent of a bedtools-style merge). Intervals on the same chromosome are
# joined when the end of one overlaps or touches the start of the next.
# Each "<stem>.bed" in outdir yields "merged_intervals/<stem>_merged.bed"
# containing three tab-separated columns: chromosome, start, end.
merdir = str(outdir + '/merged_intervals')
os.makedirs(merdir, exist_ok=True)
for bname in sorted(os.listdir(outdir)):
    bfile = outdir + '/' + bname
    # Only the interval BED files are merged: skip sub-directories, non-BED
    # files and the sorted copy of the input.
    if not os.path.isfile(bfile):
        continue
    if not bname.endswith('.bed') or bname.endswith('_sorted.bed'):
        continue
    mname = Path(bfile).stem
    mout = str(merdir + '/' + mname + '_merged.bed')

    with open(bfile) as fileb:
        # Sorting by (chromosome, start, end) makes every group of
        # overlapping intervals contiguous, so one pass can merge them.
        intervals = sorted(
            (row[0], int(row[1]), int(row[2]))
            for row in csv.reader(fileb, delimiter='\t')
            if len(row) >= 3
        )

    merged = []
    for chrom, start, end in intervals:
        last = merged[-1] if merged else None
        if last is not None and last[0] == chrom and last[2] >= start:
            # Overlaps or touches the current merged interval: extend it.
            last[2] = max(last[2], end)
        else:
            merged.append([chrom, start, end])

    with open(mout, 'w') as merfile:
        merfile.writelines(
            '\t'.join([chrom, str(start), str(end)]) + '\n'
            for chrom, start, end in merged
        )

print('Overlapping intervals merged for all BEDs')

# Remake dyads after merging overlaps.
# Every file in merdir yields "merged_dyads/<stem>_dyads.bed", where each
# merged region is replaced by the 1-bp interval midpoint .. midpoint + 1.
dydir = str(outdir + '/merged_dyads')
os.makedirs(dydir, exist_ok=True)
for mfile in sorted(os.listdir(merdir)):
    mset = merdir + '/' + mfile
    # Check before opening the output so no empty file is left behind for
    # entries that are not regular files.
    if not os.path.isfile(mset):
        continue
    dname = Path(mset).stem
    dout = str(dydir + '/' + dname + '_dyads.bed')
    with open(mset) as merval, open(dout, 'w') as dyfile:
        # The files are tab-separated; rows without coordinates are skipped.
        regions = [row for row in csv.reader(merval, delimiter='\t') if len(row) >= 3]
        for r, p in enumerate(regions):
            chrm = p[0]
            # Midpoint of the region, rounded down (coordinates are non-negative).
            mc = (int(p[1]) + int(p[2])) // 2
            site = name + '_site_' + str(r + 1)
            # The trailing empty field keeps the tab after the name column.
            dyfile.write('\t'.join([chrm, str(mc), str(mc + 1), site, '']) + '\n')

print('Dyads calculated from merged regions for all BEDs')

# Make footprints using the region sizes defined at input and used to merge.
# Every file in dydir yields "new_prints/<stem>_prints.bed", where each dyad
# is extended symmetrically by the size encoded in the file name.
fpdir = str(outdir + '/new_prints')
os.makedirs(fpdir, exist_ok=True)
for dfile in sorted(os.listdir(dydir)):
    dset = dydir + '/' + dfile
    if not os.path.isfile(dset):
        continue
    fpname = Path(dset).stem
    fpout = str(fpdir + '/' + fpname + '_prints.bed')
    info = Path(fpout).stem
    # The size is the 4th '_'-separated token from the end of the output stem
    # (followed by three suffix tokens such as merged, dyads and prints).
    # NOTE(review): if that token is the full interval width (2 * size), the
    # footprints span twice the width of the merged intervals - confirm intent.
    fv = int(info.split('_')[-4])
    with open(dset) as dyval, open(fpout, 'w') as fpfile:
        # The files are tab-separated; rows without coordinates are skipped.
        dyads = [row for row in csv.reader(dyval, delimiter='\t') if len(row) >= 2]
        for t, q in enumerate(dyads):
            chrd = q[0]
            mid = int(q[1])
            if fv <= mid:
                ud = mid - fv
                dd = mid + fv
            else:
                # A negative start is not a valid BED coordinate: clip at 0
                # and keep the footprint symmetric, as the interval step does.
                ud = 0
                dd = mid * 2
            site = name + '_site_' + str(t + 1)
            # The trailing empty field keeps the tab after the name column.
            fpfile.write('\t'.join([chrd, str(ud), str(dd), site, '']) + '\n')

# Check how well each set of footprints in a directory covers the original dyad set.
# For every footprint file two tables are written to "dyad_cov":
#   <stem>_cov_all_dy.tsv - one row per (footprint, contained dyad) pair
#   <stem>_counts.tsv     - number of dyads contained in each footprint
ovdir = str(outdir + '/dyad_cov')
os.makedirs(ovdir, exist_ok=True)

# Load the original dyads once, grouped by chromosome and sorted by position,
# with integer coordinates so that comparisons are numeric.
dyads_by_chr = {}
with open(infile) as dyin:
    for g in csv.reader(dyin, delimiter='\t'):
        if len(g) < 3:
            continue  # blank or incomplete row
        dyads_by_chr.setdefault(g[0], []).append((int(g[1]), int(g[2])))
for positions in dyads_by_chr.values():
    positions.sort()

for ffile in sorted(os.listdir(fpdir)):
    fpset = fpdir + '/' + ffile
    if not os.path.isfile(fpset):
        continue
    ovname = Path(fpset).stem
    ovout = str(ovdir + '/' + ovname + '_cov_all_dy.tsv')
    ctout = str(ovdir + '/' + ovname + '_counts.tsv')
    with open(fpset) as fpin, open(ctout, 'w') as counts, open(ovout, 'w') as ovlp:
        for f in csv.reader(fpin, delimiter='\t'):
            if len(f) < 4:
                continue  # blank or incomplete row
            chrj = f[0]
            rup = int(f[1])
            rdown = int(f[2])
            dipc = 0  # Dyad-in-print counter for this footprint
            for mp, ddown in dyads_by_chr.get(chrj, []):
                if mp > rdown:
                    # Dyads are sorted, so none of the remaining ones can
                    # fall inside this footprint.
                    break
                if mp >= rup:  # the dyad is contained in the entry
                    dipc = dipc + 1
                    ovlp.write('\t'.join([chrj, str(rup), str(rdown), str(f[3]),
                                          chrj, str(mp), str(ddown)]) + '\n')
            # Record the number of dyads each print overlapped in the count set
            counts.write(str(f[3]) + '\t' + str(dipc) + '\n')
print('Footprint coverage of original dyads and counts calculated for all sets')

print('Run complete - Happy plotting!')


Converting dyads to 11 interval(s)
Dyads converted to BED intervals from requested ranges
Overlapping intervals merged for all BEDs
Dyads calculated from merged regions for all BEDs
Footprint coverage of original dyads and counts calculated for all sets
Run complete - Happy plotting!
