-
Notifications
You must be signed in to change notification settings - Fork 11
/
findMI_KSG_stddev.m
115 lines (94 loc) · 4.62 KB
/
findMI_KSG_stddev.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
function [errorEstimate] = findMI_KSG_stddev(MIs, N, do_plot)
% function [errorEstimate] = findMI_KSG_stddev(MIs, N, do_plot)
%
% MIs - the output from findMI_KSG_subsampling. Cell array; see detailed
% description in findMI_KSG_subsampling, values in bits.
%
% N - scalar, total size of the data set: how many data points were there
% in the original X and Y variables, for which we are estimating I(X;Y)?
%
% do_plot -- binary scalar. If 1, plots for estimation of MI error bars will
% automatically be generated.
%
% errorEstimate - a scalar, the estimate of the standard deviation of the KSG
% mutual information estimator, in bits, at the full data set size N
%
%-----------------------------------------------------------------------------------------
% Copyright 2018 Caroline Holmes, Ilya Nemenman
%-----------------------------------------------------------------------------------------
% This program is free software: you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation, either version 3 of the License, or
% (at your option) any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
%
% You should receive a copy of the GNU General Public License
% along with this program. See also <http://www.gnu.org/licenses/>.
%-----------------------------------------------------------------------------------------
% Please reference
%
% Holmes, C.M. & Nemenman, I. Estimation of mutual information for
% real-valued data with error bars and controlled bias.
% Submitted, 2018.
%
% A. Kraskov, H. Stogbauer, and P. Grassberger,
% Estimating mutual information.
% Phys. Rev. E 69 (6) 066138, 2004
%
% in your published research.
listSplitSizes = MIs(:,1); % the list of partition sizes
MIs = MIs(:,2); %now MIs only contains the mutual information data, and not
%the number of partitions
n = length(MIs); % number of different partition sizes used to partition the
%data
Nsamples = zeros(n, 1);
for i = 1:n
Nsamples(i) = floor(N./(listSplitSizes{i,1})); %how many data points were actually in each subset for each partition size
end
Xvals = 1./Nsamples; %because we want to look at dependence on 1/N, not N.
xActual = Xvals(1); %'actual' meaning with using the entire available data set
Xvals = Xvals(2:end); %We only want to do linear fits for where subsets of
%the data areused (where there is a nonzero variance)
listVariances = zeros(n - 1,1); %n-1 because there is no variance in the first row, which we assume is where the entire data set was used
for i = 2:n
listVariances(i - 1) = var(MIs{i,1});
end
%now need to fit line with slope = 1 to the above: we want the line to be
%of the form that log(variance) = log(Xvals) + b, and we only want to find
%b
%we are only using this line as a visual guide , so we will do this without
%worrying about how to properly weigh the points.
b = mean(log10(listVariances) -log10(Xvals));
%create the line we'll plot
Xs = linspace(xActual./2,max(Xvals), 100);
Ys = Xs .* (10^b); %and then we'll plot everything in log space
%The following estimated the error for the actual mutual information. We
%won't do this by projecting along the fitted line, but rather using a
%chi^2 distribution as described in the paper, although the two answers
%shouldn't be too different.
k = cell2mat(listSplitSizes(2:end)); %because we can't use the first entry here (listSplitSizes = 1) to help us predict the actual variance.
variancePredicted = sum((k-1)./k.*listVariances)./sum((k-1)); %The estimated variance of the mutual information at the full N
Sml=variancePredicted*N;
varS = 2*Sml^2/sum((k-1)); %Estimated variance of the variance of the estimate of the mutual information at full N
stdvar = sqrt(varS/N^2); %the error bars on our estimate of the variance
if do_plot == 1
figure
hold on
plot(Xvals, listVariances, 'x')
plot(Xs,Ys)
plot(xActual, variancePredicted, 'ks','MarkerFaceColor','k') %draws the projected point
plot([xActual xActual], [variancePredicted - stdvar, variancePredicted + stdvar],'k-')
ax = gca;
ax.XScale = 'log';
ax.YScale = 'log';
xlim([min(Xs) max(Xs)]);
xlabel('1/N')
ylabel('Variance of MI, in bits')
title('Estimating error bars')
legend('Variances from splitting data', '1/N scaling fit to variances from split data','Estimated variance for full data size')
end
errorEstimate = variancePredicted^.5;