-
Notifications
You must be signed in to change notification settings - Fork 0
/
test-fragement-count-distribution.py
248 lines (197 loc) · 9.34 KB
/
test-fragement-count-distribution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#!/usr/bin/env python
# coding=utf8
"""
%prog <BAM> <BED> <TXT> <INT> [options]
Based on the input <BAM> file DE-peaks will be sub-sampled in the provided <BED> regions, <TXT> chromosmes to be used and their length tab separated,
<INT> defines the number of simulated replicates of the two samples
Creates a histogram of all read count distributions based on regions from <BED> of <INT> replicates for two samples, for all samples, for sample1 and sample2 and creates an MA-plot based on the supplied beta-values
@author: Thomas Eder
"""
from __future__ import print_function
from __future__ import division
import sys
import math
import pysam
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from builtins import range
from future.utils import lrange
from optparse import OptionParser
from random import random, randint, randrange
from scipy.stats import beta, laplace
from numpy.random import multivariate_normal, choice
from pybedtools import BedTool
MAX_TEST = 10000 #number of maximal test iterations, can influence runtime
#classes
class HelpfulOptionParser(OptionParser):
"""An OptionParser that prints full help on errors."""
def error(self, msg):
self.print_help(sys.stderr)
self.exit(2, "\n%s: error: %s\n" % (self.get_prog_name(), msg))
#helper functions
def _callback_list(option, opt, value, parser): #get a list of options and save it
try:
tmplist = list(map(lambda x: int(x), value.split(',')))
setattr(parser.values, option.dest, tmplist)
except Exception as e:
print("got exception: %r, please check input parameters" % (e,))
exit()
def _callback_matrix(option, opt, value, parser): #get a matrix of options and save it
try:
final_option=[]
for entry in value.split(';'):
final_option.append(list(map(lambda x: int(x), entry.split(','))))
setattr(parser.values, option.dest, final_option)
except Exception as e:
print("got exception: %r, please check input parameters" % (e,))
exit()
def _callback_list_float(option, opt, value, parser): #get a list of float options and save it
try:
tmplist = list(map(lambda x: float(x), value.split(',')))
setattr(parser.values, option.dest, tmplist)
except Exception as e:
print("got exception: %r, please check input parameters" % (e,))
exit()
def _scale_laplace(nfrags, read_frac, scale):
"""Scale beta result based on Laplace distribution"""
loc = 0.5 #fixed
top_nfrgs_scale = laplace.pdf(0.5, loc, scale)#make sure at postion 0.5 is 100%
scale_factor = 1 / top_nfrgs_scale
nfrags_scale = laplace.pdf(read_frac, loc, scale) *scale_factor #the bigger the difference from beta the smaller the sample...
if(nfrags_scale == 0):
nfrags = nfrags
else:
nfrags = int(nfrags *nfrags_scale) #new fragment scaling based on beta result
return nfrags
def _scale_lognorm(nfrags, read_frac, sigma, loc, scale):
"""Scale number_frags result based on Lognorm distribution"""
#loc=scale*-1
top_beta_scale = sp.stats.lognorm.pdf(1,s=sigma,loc=loc, scale=scale)#to get to 100% at size 10
scale_factor=1/top_beta_scale #this is the factor to set beta_frac to 100% at 1
beta_scale = sp.stats.lognorm.pdf(nfrags,s=sigma,loc=loc, scale=scale) *scale_factor
if(beta_scale == 0):
read_frac = read_frac
else:
read_frac = ((read_frac-0.5)*beta_scale)+0.5 #center and scale it based on beta_scale from exponential distri, so reduce to 0.5 for high values
return read_frac
def _scale_exp( nfrags, read_frac, loc, scale):
"""Scale number_frags result based on Exponential distribution"""
top_beta_scale = sp.stats.expon.pdf(loc,loc=loc, scale=scale)#to get to 100% at size loc
scale_factor=1/top_beta_scale #this is the factor to set beta_frac to 100% at loc
beta_scale = sp.stats.expon.pdf(nfrags,loc=loc, scale=scale) *scale_factor
read_frac_old = read_frac
if(beta_scale == 0):
read_frac = read_frac
else:
read_frac = ((read_frac-0.5)*beta_scale)+0.5 #center and scale it based on beta_scale from exponential distribution, we do this to reduce the read_frac towards 0.5 for high values
#print("nfrags: %s read_frac_in: %s scaling_factor: %s read_frac_out: %s" % (nfrags ,read_frac_old, beta_scale, read_frac))
return read_frac
#main
if __name__ == '__main__':
#handle input arguments
parser = HelpfulOptionParser(usage=__doc__)
parser.add_option("-c", "--chrom", default='chr19', dest="chromosome", type="string", help="Chromosome used for simulation [default: %default]")
parser.add_option("--frag-count-scaling", default="none", dest="frag_count_scaling", type="string", help="Scaling of fragment distribution, no scaling, scaling of beta result based on fragment counts (with exp) or scaling of fragment counts based on beta result (with laplace) : none , frag , beta [default: %default]")
parser.add_option("--frag-count-lp-scale", default=0.1, dest="frag_count_lp_sc", type="float", help="Scale for Laplace distribution if frag-count-scaling is frag [default: %default]")
parser.add_option("--frag-count-ex-loc", default=10, dest="frag_count_ex_lo", type="float", help="Loc for exponential distribution if frag-count-scaling is beta [default: %default]")
parser.add_option("--frag-count-ex-scale", default=100, dest="frag_count_ex_sc", type="float", help="Scale for exponential distribution if frag-count-scaling is beta [default: %default]")
parser.add_option("--beta", default=[0.5, 0.5], dest="beta_values", type="string", action='callback', callback=_callback_list_float, help="Alpha and Beta of Beta-distribution [default: %default]")
(options, args) = parser.parse_args()
num_args = 4
#read in required input arguments
if len(args) != num_args:
parser.error("At minimum %s parameters are requred to run this tool" % num_args)
input_bam_file = args[0]
inbed_filename = args[1]
in_chromsizes = args[2]
num_replicates = int(args[3])
#get sample bam file...
input_bam = pysam.AlignmentFile(input_bam_file,'rb')
subsample_chrom = options.chromosome
#read bed
input_bed = BedTool(inbed_filename)
region_count=1
fragment_counts=[]
sample_counts=[]
sumcounts=[]
#for each bed entry get reads...
sample1_replicate_counts = [0] * num_replicates
sample2_replicate_counts = [0] * num_replicates
sample1_replicate_list = [[] for i in lrange(num_replicates)]
sample2_replicate_list = [[] for i in lrange(num_replicates)]
multi_factor = 2*num_replicates #per read multiply its use, so that we can definitely fill 2 samples with x replicates
for region in input_bed:
sample1_count = 0
sample2_count = 0
#beta distribution to get aprox. same size or down regulated read number for sample x, here we deside which sample to prefer
read_frac = np.random.beta(options.beta_values[0], options.beta_values[1])
#get number of reads in this region, name id nfragments like in DCSsim
number_frags = input_bam.count(region.chrom, region.start, region.end)
#now choose which way we want to select the number of fragments
if( options.frag_count_scaling == "none"):
#no scaling, no changes in nfrags and beta
number_frags = number_frags
read_frac = read_frac
elif (options.frag_count_scaling == "beta") :#nfrags scaling via Laplace based on beta result, beta not changed
number_frags = _scale_laplace(number_frags, read_frac, options.frag_count_lp_sc)
elif (options.frag_count_scaling == "frag") :#nfrags scaling via lognorm distribution based on fragment counts, number_frags not changed
read_frac = _scale_exp(number_frags, read_frac, options.frag_count_ex_lo, options.frag_count_ex_sc)
else:
print("Unknown scaling method, %s, please choose 'none','frag' or 'beta', exiting now" % (frag_count_scaling))
exit(1)
#for each region change height based on beta distribution
read_counter=0
read_pool=[]
#first get all reads of this region
for read in input_bam.fetch(region.chrom, region.start, region.end):
read_pool.append(read)
#then randomly select on of the reads for x-times
for _ in lrange(number_frags):
for _ in lrange(multi_factor): #do this x times
if(read_counter > number_frags): #check if we meet calculated and scaled (if turned on) number of fragments/reads
break
read_counter+=1
#get one random read out of region-pool
random_index = randrange(len(read_pool))
read = read_pool[random_index]
if random() <= read_frac: #add to sample 1 or sample 2 and to list
sample1_count += 1
else:
sample2_count += 1
else:
continue
break
#save result for this protein
sample_counts.append([sample1_count,sample2_count])
sumcounts.append(number_frags)
#MA-plot
xma=[]
yma=[]
counts1=[]
counts2=[]
for entry in sample_counts:
counts1.append(entry[0])
counts2.append(entry[1])
if(entry[1] > 0 and entry[0] > 0):
x=math.log(entry[0]+entry[1]/2,2)#log2 mean
xma.append(x)
y=math.log(entry[0]/entry[1],2)#log2 fold-change
yma.append(y)
print("max: sum %s s1 %s s2 %s" % (max(sumcounts),max(counts1), max(counts2)))
#hist and ma-plot
fixed_range=[0,200]
fixed_bins=500
fig, axs = plt.subplots(4)
axs[0].set_title('fragments all samples')
axs[0].hist(sumcounts,density=False, bins=fixed_bins, range=fixed_range)
axs[1].set_title('fragments sample1')
axs[1].hist(counts1,density=False, bins=fixed_bins, range=fixed_range)
axs[2].set_title('fragments sample2')
axs[2].hist(counts2,density=False, bins=fixed_bins, range=fixed_range)
axs[3].set_title('MA-plot')
axs[3].scatter(xma, yma)
plt.subplots_adjust(hspace = 0.4)
plt.show()
#plt.scatter(xma, yma)
#plt.show()