#!/usr/bin/env python
# encoding: utf-8
"""
utils_math.py
Created by Loic Matthey on 2013-09-08.
Copyright (c) 2013 Gatsby Unit. All rights reserved.
"""

import numpy as np
import scipy.stats as spst
import scipy.interpolate as spint

from utils_fitting import fit_gaussian_mixture


############################ MATH AND STATS ##################################

def argmin_indices(array):
return np.unravel_index(np.nanargmin(array), array.shape)
def argmax_indices(array):
return np.unravel_index(np.nanargmax(array), array.shape)
def nanmean(array, axis=None):
if axis is not None:
return np.ma.masked_invalid(array).mean(axis=axis)
else:
return np.ma.masked_invalid(array).mean()
def nanmedian(array, axis=None):
if axis is not None:
return np.ma.extras.median(np.ma.masked_invalid(array), axis=axis)
else:
return np.ma.extras.median(np.ma.masked_invalid(array))
def nanstd(array, axis=None):
if axis is not None:
return np.ma.masked_invalid(array).std(axis=axis)
else:
return np.ma.masked_invalid(array).std()
def nanstats(array, axis=None):
return dict(mean=nanmean(array, axis), std=nanstd(array, axis), median=nanmedian(array, axis))
def dropnan(array):
'''
Take an array, put it in a MaskedArray with ~np.isfinite masking.
Returns the compressed() 1D array.
'''
return np.ma.masked_invalid(array).compressed()
def sample_invgamma(alpha, beta):
'''
    Sample from an inverse-gamma distribution. NumPy's gamma sampler is
    parameterised by shape/scale rather than alpha/beta, hence the 1/beta
    and the final inversion.
'''
return 1./np.random.gamma(alpha, 1./beta)
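
# Illustrative usage sketch:
#   samples = np.array([sample_invgamma(3., 2.) for _ in range(10000)])
#   samples.mean()   # ~= beta / (alpha - 1) = 2. / (3. - 1.) = 1.0, valid for alpha > 1
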
def sample_log_bernoulli(lp1, lp0):
'''
    Sample a Bernoulli variable from log-transformed probabilities lp1 and lp0.
'''
if (lp0-lp1) < -500:
p1 = 1.
elif (lp0-lp1) > 500:
p1 = 0.
else:
p1 = 1./(1+np.exp(lp0-lp1))
return np.random.rand() < p1
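
# Illustrative worked example:
#   with lp1 = np.log(0.9) and lp0 = np.log(0.1),
#   p1 = 1 / (1 + exp(lp0 - lp1)) = 1 / (1 + 1/9) = 0.9,
#   so sample_log_bernoulli(np.log(0.9), np.log(0.1)) returns True ~90% of the time.
# The +/-500 guard simply avoids overflow in exp() for extreme log-odds.
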
def sample_discrete_logp(log_prob, K=2):
'''
    Sample a discrete index from unnormalised log-probabilities.
    Exponentiate and renormalise (softmax-style, thanks Sam Roweis),
    shifting by a constant first to avoid numerical under/overflow.
'''
b = - np.log(K) - np.max(log_prob)
prob = np.exp(log_prob+b)/np.sum(np.exp(log_prob+b))
cum_prob = np.cumsum(prob)
return np.where(np.random.rand() < cum_prob)[0][0] # Slightly faster than np.find
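
# Illustrative worked example:
#   sample_discrete_logp(np.log([0.1, 0.6, 0.3]), K=3)
#   here b = -log(3) - max(log_prob); the shift cancels in the normalisation, so
#   prob recovers [0.1, 0.6, 0.3] and cum_prob = [0.1, 0.7, 1.0]. The returned
#   index is then 0, 1 or 2 with those probabilities.
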
def tril_set(array, vector_input, check_sizes=False):
'''
    Set the strictly lower-triangular part of array (i > j, row-major order)
    from vector_input. Sizes are only checked when check_sizes=True.
'''
if check_sizes:
num_elements = np.sum(np.fromfunction(lambda i, j: i > j, array.shape))
assert vector_input.size == num_elements, "Wrong number of inputs, need %d" % num_elements
array[np.fromfunction(lambda i, j: i > j, array.shape)] = vector_input
def triu_set(array, vector_input, check_sizes=False):
'''
    Set the strictly upper-triangular part of array (i < j, row-major order)
    from vector_input. Sizes are only checked when check_sizes=True.
'''
if check_sizes:
num_elements = np.sum(np.fromfunction(lambda i, j: i < j, array.shape))
assert vector_input.size == num_elements, "Wrong number of inputs, need %d" % num_elements
array[np.fromfunction(lambda i, j: i < j, array.shape)] = vector_input
def triu_2_tril(array):
'''
Copy the upper triangular part of an array into its lower triangular part
'''
    # Use the transpose so that element (i, j) of the lower triangle receives
    # element (j, i) of the upper triangle, whatever the matrix size (a plain
    # mask-to-mask copy only lines up correctly for matrices up to 3x3).
    array[np.fromfunction(lambda i, j: i > j, array.shape)] = array.T[np.fromfunction(lambda i, j: i > j, array.shape)]
def powerlaw(x, amp, index):
'''
Computes the powerlaw:
y = amp * (x**index)
'''
return amp * (x**index)
def mean_std_distrib(xdata, ydata):
'''
    Compute the mean and standard deviation of a distribution given as
    (possibly unnormalised) densities/weights ydata evaluated at points xdata.
'''
mean_fit = np.sum(ydata*xdata)/np.sum(ydata)
std_fit = np.sqrt(np.abs(np.sum((xdata - mean_fit)**2*ydata)/np.sum(ydata)))
return mean_fit, std_fit
def skewness_distrib(xdata, ydata):
'''
Compute the skewness of a distribution
'''
distrib = ydata / np.trapz(ydata, xdata)
mu, sigma = mean_std_distrib(xdata, distrib)
skewness = np.trapz(distrib*(xdata - mu)**3./sigma**3., xdata)
return skewness
def kurtosis_distrib(xdata, ydata):
'''
Compute the kurtosis for a distribution
'''
distrib = ydata / np.trapz(ydata, xdata)
mu, _ = mean_std_distrib(xdata, distrib)
kurtosis = np.trapz(distrib*(xdata - mu)**4., xdata)/np.trapz(distrib*(xdata - mu)**2., xdata)**2.
return kurtosis
def bimodality_coefficient(xdata, ydata):
'''
Compute Sarle's bimodality coefficient
    beta = (skewness^2 + 1) / kurtosis
    beta is:
    1.0 for a Bernoulli distribution
    1/3 (~0.33) for a Gaussian
    close to 0 for heavy-tailed distributions
'''
skewness = skewness_distrib(xdata, ydata)
kurtosis = kurtosis_distrib(xdata, ydata)
return (skewness**2. + 1.)/kurtosis
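
# Illustrative sanity check: for a Gaussian, skewness = 0 and kurtosis = 3, so
# beta = (0 + 1) / 3 ~= 0.33; a strongly bimodal (Bernoulli-like) density pushes
# beta towards 1.
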
def ashman_d(xdata, ydata):
'''
Compute Ashman D coefficient for bimodality
    Compares the positions of the two modes relative to their spreads:
    D = sqrt(2) * |mu_1 - mu_2| / sqrt(sigma_1^2 + sigma_2^2)
    For a two-component mixture, D > 2 is required for a clean separation of the components.
'''
mixture_params = fit_gaussian_mixture(xdata, ydata, should_plot=False, return_fitted_data=False)
D = 2**0.5 * np.abs(mixture_params[1] - mixture_params[3])/np.sqrt(mixture_params[2]**2. + mixture_params[4]**2.)
return D
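
# Illustrative interpretation note: with equal component spreads
# sigma_1 = sigma_2 = sigma, D reduces to |mu_1 - mu_2| / sigma, i.e. the
# separation of the means in units of the common standard deviation.
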
def aic(K, LL):
'''
Compute Akaike Information Criterion
K: number of parameters
LL: loglikelihood of model
'''
return 2*K - 2.*LL
def bic(K, LL, N):
'''
Compute Bayesian Information Criterion
K: number of parameters
    LL: loglikelihood of model
    N: number of datapoints
'''
return -2.*LL + K*np.log(N)
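
# Illustrative worked example: a model with K = 3 parameters, LL = -100 and
# N = 50 datapoints gives
#   aic(3, -100.)     = 2*3 + 200       = 206.0
#   bic(3, -100., 50) = 200 + 3*log(50) ~= 211.7
# BIC penalises extra parameters more heavily than AIC once log(N) > 2, i.e. N > ~7.4.
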
def KL_div(P, Q, axis=None):
'''
    Compute the KL divergence between P and Q, assuming they are discrete
    distributions summing to 1. Not symmetric, obviously:
    D_KL[P || Q] = \sum_i P_i log(P_i / Q_i)
'''
return np.nansum(P*(np.log(P) - np.log(Q)), axis=axis)
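
# Illustrative worked example:
#   P = np.array([0.5, 0.5]); Q = np.array([0.9, 0.1])
#   KL_div(P, Q) = 0.5*log(0.5/0.9) + 0.5*log(0.5/0.1) ~= 0.51 nats,
#   while KL_div(Q, P) ~= 0.37 nats, illustrating the asymmetry.
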
def histogram_binspace(data, bins=20, norm='density', bound_x=np.pi):
'''
    Compute a histogram over [-bound_x, bound_x], given either a number of bins
    or an explicit set of bin edges. Returns (bar_heights, bin_centres, bins).
'''
if np.isscalar(bins):
x = np.linspace(-bound_x, bound_x, bins)
else:
x = bins
bins = bins.size
# np.histogram wants the left-right boundaries
# x_edges = x - bound_x/bins
# x_edges = np.r_[x_edges, -x_edges[0]]
if norm == 'max':
bar_heights, _ = np.histogram(data, bins=x)
bar_heights = bar_heights.astype('float')/np.max(bar_heights.astype('float'))
elif norm == 'sum':
bar_heights, _ = np.histogram(data, bins=x)
bar_heights = bar_heights.astype('float')/np.sum(bar_heights.astype('float'))
elif norm == 'density':
bar_heights, _ = np.histogram(data, bins=x, density=True)
else:
        raise ValueError("norm '%s' undefined" % norm)
return bar_heights, x[:-1] + bound_x/(bins-1), bins
def empirical_cdf_samples(samples, bins=41, norm='density'):
'''
    Given a set of samples, compute the histogram and its empirical CDF.
    Useful for K-S-style plots.
'''
result = dict(samples=samples.copy())
result['hist'], result['x'], result['bins'] = \
histogram_binspace(samples, bins=bins, norm=norm)
result['ecdf'] = np.cumsum(result['hist'])
result['ecdf'] /= np.max(result['ecdf'])
return result
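
# Illustrative usage sketch, with placeholder arrays samples_a / samples_b assumed
# to live in the same [-pi, pi] range so the default binning matches:
#   ecdf_a = empirical_cdf_samples(samples_a)
#   ecdf_b = empirical_cdf_samples(samples_b)
#   ks_stat = np.max(np.abs(ecdf_a['ecdf'] - ecdf_b['ecdf']))  # binned K-S statistic
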
def combine_pval_fisher_method(pvalues):
'''
    Combine p-values using Fisher's method:
    f_score = -2 \sum_{i=1}^K log(p_i)
    f_score ~ Chi^2_{2K} under the null that all K individual nulls are true.
'''
fscore = np.sum(-2.*np.log(pvalues))
return spst.chi2.sf(fscore, 2*pvalues.size)
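
# Illustrative worked example:
#   pvalues = np.array([0.04, 0.03])
#   fscore  = -2*(log(0.04) + log(0.03)) ~= 13.45
#   combined p = chi2.sf(13.45, df=4) ~= 0.009, stronger than either test alone.
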
def interpolate_data_2d(all_points, data, param1_space_int=None, param2_space_int=None, interpolation_numpoints=200, interpolation_method='linear', mask_when_nearest=True, show_scatter=True, show_colorbar=True, mask_x_condition=None, mask_y_condition=None):
    '''
    Interpolate scattered 2D data (all_points: N x 2 coordinates, data: N values)
    onto a regular grid using scipy.interpolate.griddata.
    Note: show_scatter and show_colorbar are currently unused here.
    '''
    # Construct the interpolation
if param1_space_int is None:
param1_space_int = np.linspace(all_points[:, 0].min(), all_points[:, 0].max(), interpolation_numpoints)
if param2_space_int is None:
param2_space_int = np.linspace(all_points[:, 1].min(), all_points[:, 1].max(), interpolation_numpoints)
data_interpol = spint.griddata(all_points, data, (param1_space_int[None, :], param2_space_int[:, None]), method=interpolation_method)
if interpolation_method == 'nearest' and mask_when_nearest:
# Let's mask the points outside of the convex hull
# The linear interpolation will have nan's on points outside of the convex hull of the all_points
data_interpol_lin = spint.griddata(all_points, data, (param1_space_int[None, :], param2_space_int[:, None]), method='linear')
# Mask
data_interpol[np.isnan(data_interpol_lin)] = np.nan
# Mask it based on some conditions
if mask_x_condition is not None:
data_interpol[mask_x_condition(param1_space_int), :] = 0.0
if mask_y_condition is not None:
data_interpol[:, mask_y_condition(param2_space_int)] = 0.0
return data_interpol
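
# Minimal smoke-test sketch: exercises a few of the helpers above on made-up
# synthetic data (every value below is illustrative only).
if __name__ == '__main__':
    noisy = np.random.randn(1000)
    noisy[::50] = np.nan
    print('nanstats: %s' % (nanstats(noisy),))

    # Circular data in [-pi, pi), histogrammed with the default bound_x=np.pi.
    angles = np.random.vonmises(0.0, 2.0, size=1000)
    heights, centres, nb_bins = histogram_binspace(angles, bins=31, norm='density')
    print('histogram mode near 0: %.3f' % centres[np.argmax(heights)])

    P = np.array([0.5, 0.5])
    Q = np.array([0.9, 0.1])
    print('KL[P || Q] = %.3f nats' % KL_div(P, Q))

    # Scattered (x, y) -> value samples interpolated onto a regular 50x50 grid.
    pts = np.random.rand(200, 2)
    vals = np.sin(2 * np.pi * pts[:, 0]) * np.cos(2 * np.pi * pts[:, 1])
    grid = interpolate_data_2d(pts, vals, interpolation_numpoints=50)
    print('interpolated grid shape: %s' % (grid.shape,))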