Skip to content
This repository has been archived by the owner on Oct 23, 2023. It is now read-only.

Commit

Permalink
Merge branch 'activeDev' of https://github.com/CCBR/Pipeliner into ac…
Browse files Browse the repository at this point in the history
…tiveDev
  • Loading branch information
jlac committed Apr 10, 2019
2 parents 50fed72 + 282b258 commit 4863a7b
Show file tree
Hide file tree
Showing 11 changed files with 1,601 additions and 1,733 deletions.
89 changes: 89 additions & 0 deletions Results-template/Scripts/jaccard_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/usr/bin/env python3

"""
Name: jaccard_score.py
Created by: Tovah Markowitz
Date: 1/23/19
Purpose: To do all pairwise comparisons of bed/peak files given. Uses bedtools
to calculate a jaccard score for every comparison. All data is saved in a
single tab-delimited file.
"""

##########################################
# Modules
import optparse
from pybedtools import BedTool

##########################################
# Functions

def split_infiles(infiles):
# breaks the infile string with space-delimited file names and creates a list
infileList = infiles.split(" ")
return(infileList)

def loop_jaccard(infileList, genomefile):
""" Running bedtools. Uses two loops to do all possible pairwise comparisons of files
in a list. Reads in a bedtools approved file types, sorts the files, and calculates a
jaccard score.
"""
nfiles = len(infileList)
out = []
for z in range(nfiles):
fileA = infileList[z]
a = BedTool(fileA)
a = a.sort(g=genomefile)
for y in range(z+1,nfiles):
fileB = infileList[y]
b = BedTool(fileB)
b = b.sort(g=genomefile)
j = a.jaccard(b,g=genomefile)
j["fileA"] = fileA.split("/")[-1]
j["fileB"] = fileB.split("/")[-1]
if len(out) == 0:
keylist = list(j.keys())
keylist.sort()
out.append( "\t".join(keylist) )
tmp = [ str(j[key]) for key in keylist ]
out.append( "\t".join(tmp) )
return(out)

def write_out(out, outFile):
f = open(outFile, 'w')
f.write( "\n".join(out) )
f.close()

##########################################
# Main

def main():
desc="""
This function takes a space-delimited list of files (bed, bedgraph, gff, gtf, etc.)
and calculates all possible pairwise jaccard scores. From bedtools: 'Jaccard is the
length of the intersection over the union. Values range from 0 (no intersection) to
1 (self intersection)'. The columns of the output file are: fileA, fileB,
intersection, jaccard, n_intersections, and union-intersection.
"""

parser = optparse.OptionParser(description=desc)

parser.add_option('-i', dest='infiles', default='', help='A space-delimited list of \
input files for analysis.')
parser.add_option('-o', dest='outfile', default='', help='The name of the output file \
where all the jaccard score information will be saved.')
parser.add_option('-g', dest='genomefile', default='', help='The name of the .genome file.')

(options,args) = parser.parse_args()
infiles = options.infiles
outfile = options.outfile
genomefile = options.genomefile

infileList = split_infiles(infiles)
out = loop_jaccard(infileList, genomefile)
write_out(out, outfile)

if __name__ == '__main__':
main()


Loading

0 comments on commit 4863a7b

Please sign in to comment.