/
compute_mutation_probabilities.py
executable file
·183 lines (147 loc) · 7.29 KB
/
compute_mutation_probabilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/env python
# Load required modules
import sys, os, argparse, json, numpy as np, multiprocessing as mp, random
from collections import defaultdict
# Load the weighted exclusivity test
this_dir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(this_dir)
from wext import *
# Argument parser
def get_parser():
parser = argparse.ArgumentParser()
parser.add_argument('-mf', '--mutation_file', type=str, required=True)
parser.add_argument('-wf', '--weights_file', type=str, required=False, default=None)
parser.add_argument('-pd', '--permutation_directory', type=str, required=False)
parser.add_argument('-np', '--num_permutations', type=int, required=True)
parser.add_argument('-si', '--start_index', type=int, required=False, default=1)
parser.add_argument('-q', '--swap_multiplier', type=int, required=False, default=100)
parser.add_argument('-nc', '--num_cores', type=int, required=False, default=1)
parser.add_argument('-s', '--seed', type=int, required=False, default=None)
parser.add_argument('-v', '--verbose', type=int, required=False, default=1, choices=range(5))
return parser
def permute_matrices_wrapper(args): return permute_matrices(*args)
def permute_matrices(edge_list, max_swaps, max_tries, seeds, verbose,
m, n, num_edges, indexToGene, indexToPatient):
# Initialize our output
observed = np.zeros((m, n))
permutations = []
for seed in seeds:
# Permute the edge list
permuted_edge_list = bipartite_edge_swap(edge_list, max_swaps, max_tries, seed, verbose,
m, n, num_edges)
# Recover the mapping of mutations from the permuted edge list
geneToCases = defaultdict(set)
indices = []
for edge in permuted_edge_list:
gene, patient = indexToGene[edge[0]], indexToPatient[edge[1]]
geneToCases[gene].add(patient)
indices.append( (edge[0]-1, edge[1]-1) )
# Record the permutation
observed[zip(*indices)] += 1.
geneToCases = dict( (g, list(cases)) for g, cases in geneToCases.iteritems() )
permutations.append( dict(geneToCases=geneToCases, permutation_number=seed) )
return observed/float(len(seeds)), permutations
def postprocess_weight_matrix(P, r, s):
assert np.shape(P)==(len(r), len(s))
# Find indices corresponding to entries of weight matrix with same marginals
marginals_to_indices = defaultdict(list)
for i, r_i in enumerate(r):
for j, s_j in enumerate(s):
marginals_to_indices[(r_i, s_j)].append((i, j))
# Average weights over entries of weight matrix with same marginals
P_mean = np.zeros(np.shape(P))
for marginals, indices in marginals_to_indices.items():
mean_value = float(sum(P[i, j] for i, j in indices))/float(len(indices))
for i, j in indices:
P_mean[i, j] = mean_value
return P_mean
def run( args ):
# Do some additional argument checking
if not args.weights_file and not args.permutation_directory:
sys.stderr.write('You must set the weights file or permutation directory, '\
'otherwise nothing will be output.')
sys.exit(1)
# Load mutation data
if args.verbose > 0:
print '* Loading mutation data...'
mutation_data = load_mutation_data( args.mutation_file )
genes, all_genes, patients, geneToCases, patientToMutations, params, hypermutators = mutation_data
geneToObserved = dict( (g, len(cases)) for g, cases in geneToCases.iteritems() )
patientToObserved = dict( (p, len(muts)) for p, muts in patientToMutations.iteritems() )
geneToIndex = dict( (g, i+1) for i, g in enumerate(all_genes) )
indexToGene = dict( (i+1, g) for i, g in enumerate(all_genes) )
patientToIndex = dict( (p, j+1) for j, p in enumerate(patients) )
indexToPatient = dict( (j+1, p) for j, p in enumerate(patients) )
edges = set()
for gene, cases in geneToCases.iteritems():
for patient in cases:
edges.add( (geneToIndex[gene], patientToIndex[patient]) )
edge_list = np.array(sorted(edges), dtype=np.int)
# Run the bipartite edge swaps
if args.verbose > 0:
print '* Permuting matrices...'
m = len(all_genes)
n = len(patients)
num_edges = len(edges)
max_swaps = int(args.swap_multiplier*num_edges)
max_tries = 10**9
if args.seed is not None:
random.seed(args.seed)
seeds = random.sample(xrange(1, 2*10**9), args.num_permutations)
# Run the bipartite edge swaps in parallel if more than one core indicated
num_cores = min(args.num_cores if args.num_cores != -1 else mp.cpu_count(), args.num_permutations)
if num_cores != 1:
pool = mp.Pool(num_cores)
map_fn = pool.map
else:
map_fn = map
wrapper_args = [ (edge_list, max_swaps, max_tries, seeds[i::num_cores], 0, m,
n, num_edges, indexToGene, indexToPatient) for i in range(num_cores) ]
results = map_fn(permute_matrices_wrapper, wrapper_args)
if num_cores != 1:
pool.close()
pool.join()
# Create the weights file
if args.weights_file:
if args.verbose > 0:
print '* Saving weights file...'
# Allow for small accumulated numerical errors
tol = 1e3*max(m, n)*args.num_permutations*np.finfo(np.float64).eps
# Merge the observeds
observeds = [ observed for observed, _ in results ]
P = np.add.reduce(observeds) / float(len(observeds))
# Verify the weights
for g, obs in geneToObserved.iteritems():
assert( np.abs(P[geneToIndex[g]-1].sum() - obs) < tol)
for p, obs in patientToObserved.iteritems():
assert( np.abs(P[:, patientToIndex[p]-1].sum() - obs) < tol)
# Construct mutation matrix to compute marginals
A = np.zeros(np.shape(P), dtype=np.int)
for i, j in edge_list:
A[i-1, j-1] = 1
r = np.sum(A, 1)
s = np.sum(A, 0)
# Post-process weight matrix to assign same weight to entries with same marginals
P = postprocess_weight_matrix(P, r, s)
# Verify the weights again
for g, obs in geneToObserved.iteritems():
assert( np.abs(P[geneToIndex[g]-1].sum() - obs) < tol)
for p, obs in patientToObserved.iteritems():
assert( np.abs(P[:, patientToIndex[p]-1].sum() - obs) < tol)
# Add pseudocounts to entries with no mutations observed; unlikely or impossible after post-processing step
P[P == 0] = 1./(2. * args.num_permutations)
# Output to file.
# The rows/columns preserve the order given by the mutation file.
np.save(args.weights_file, P)
# Save the permuted mutation data
if args.permutation_directory:
output_prefix = args.permutation_directory + '/permuted-mutations-{}.json'
if args.verbose > 0:
print '* Saving permuted mutation data...'
for _, permutation_list in results:
for permutation in permutation_list:
# Output in adjacency list format
with open(output_prefix.format(permutation['permutation_number']), 'w') as OUT:
permutation['params'] = params
json.dump( permutation, OUT )
if __name__ == '__main__': run( get_parser().parse_args(sys.argv[1:]) )