/
predict_behavior_from_actflow.m
221 lines (186 loc) · 9.81 KB
/
predict_behavior_from_actflow.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
function [output]=predict_behavior_from_actflow(behav_outcome,use_rest,activity,use_pca,zscore_features)
%Author: Ravi Mill, rdm146@rutgers.edu
%Last update: 22 June 2020
%DESCRIPTION: predicts individual differences in a behavioral outcome measure (Y) from
%brain imaging features (X; task activations or restFC) via the leave-one-subject-out crossvalidated
%regression approach described in Mill, Gordon, Balota & Cole (2020):
%"Predicting dysfunctional age-related task activations from resting-state network alterations".
%INPUTS:
%behav_outcome: numeric vector, containing a to-be-predicted behavioral outcome measure (Y in the
%regression model) across subjects; dimensions=number of subjects,1
%use_rest: numeric binary, conveying number of tasks/states for input features;
%0=input features are task activations/vectorized FC from multiple
%tasks (separate models are then built from each task's features, as well as using cross-task
%features (1st principal component across tasks, estimated for each subject);
%1=features are task activations/FC from a single task/state (e.g. restFC)
%activity: numeric array, containing brain imaging features (X) that will be used to model behav_outcome;
%dimensions vary according to number of tasks/states set in use_rest:
%if use_rest=0 (multiple tasks), dimensions (3d)=number of activations/connections, number of tasks, number of subjects
%if use_rest=1 (single task e.g. restFC), dimensions(2d)=number of activations/connections, number of subjects
%use_pca: numeric binary; 1=use PCA dimensionality reduction when training
%the regression model (recommended if num_features > num_subs), number
%of PCs retained i.e. ncomps will then equal the rank of the training
%features (trainX);
%0=do not use PCA dimensionality reduction (recommended if num_features < num_subs)
%zscore_features: numeric, sets whether zscore normalization is applied during model training;
%0=no zscoring; 1=zscore features within each subject (this is noncircular); 2=zscore
%features (X) across subjects (using mean/stdev from training set to prevent circularity), and
%zscore the behav outcome (Y; again using mean/stdev from the train set);
%*note: zscore_features==2 is recommended for most applications.
%OUTPUTS:
%output: struct containing prediction accuracy for the trained model i.e.
%overlap of the predicted and actual behavioral outcome measure
%(Pearson r, and accompanying p value); note that the specific fields contained in output
%will depend on whether input features span multiple tasks/states, or
%a single/task state (determined by 'use_rest');
%e.g. if use_rest=0 (with two input task states), then the following outputs
%will be generated: task1_r, task1_p, task2_r, task2_p,
%PC1_crosstask_r, PC1_crosstask_p
%recommended usage for regional activation (num regions > num subs): [output]=predict_behavior_from_actflow(behav_outcome,0,activity,1,2)
%recommended usage for network-averaged activation (num networks < num subs): [output]=predict_behavior_from_actflow(behav_outcome,0,activity,0,2)
%recommended usage for regional restFC (num connections > num subs): [output]=predict_behavior_from_actflow(behav_outcome,1,activity,1,2)
%recommended usage for network-averaged restFC (e.g. GBC; num networks < num subs): [output]=predict_behavior_from_actflow(behav_outcome,1,activity,0,2)
%extract info from inputs
num_subs=length(behav_outcome);
if use_rest==1
num_tasks=1;
else
num_tasks=size(activity,2);
end
num_features=size(activity,1);
%predict behavioral outcome from input features via leave-one-subject-out crossval
%select number of models to run based on use_rest (i.e. whether
%single/multiple task state features were input)
if use_rest==0
%multiple tasks, including cross-task
cv_loops=num_tasks+1;
elseif use_rest==1
%single task only
cv_loops=1;
end
for t=1:cv_loops
%set output substruct name
if t==num_tasks+1
%predict from cross-task features
t_name='PC1_crosstask';
%compute cross-task features for each subject by taking 1st PC (latent variable approach)
act=zeros(num_subs,num_features);
for i=1:num_subs
i_sub=activity(:,:,i);
%demean prior to pca
demean_trainX=repmat(mean(i_sub,1),size(i_sub,1),1);
i_sub=i_sub-demean_trainX;
%run pca and take 1st PC
[PCALoadings,PCAScores,~,~,explained] = pca(i_sub);
act(i,:)=PCAScores(:,1);
end
else
t_name=['task',num2str(t)];
%pull out activity for this specific task
if num_tasks>1
act=squeeze(activity(:,t,:))';
else
act=activity';
end
end
%init behav variables
actual_test_Y=[];
pred_test_Y=[];
for s=1:num_subs
%set up LOSO crossval loop
s_inds=1:num_subs;
s_inds(s)=[];
%set up train and test variables, factoring in zscore inputs
if zscore_features==1
%within subject zscore, applied to features (X) only
t_act=zscore(act,0,2);
train_X=[t_act(s_inds,:),ones(num_subs-1,1)];
train_Y=behav_outcome(s_inds);
test_X=t_act(s,:);
test_Y=behav_outcome(s);
elseif zscore_features==2
%zscore features across subjects using training set mean/std, applied
%to features (X) and behav outcomes (Y)
X_mean=mean(act(s_inds,:),1);
X_std=std(act(s_inds,:));
Y_mean=mean(behav_outcome(s_inds));
Y_std=std(behav_outcome(s_inds));
%loop through X subtracting appropriate mean/std from each
%element, applies this normalisation to both train and test features
t_act=act;
for rr=1:num_features
t_act(:,rr)=(t_act(:,rr)-X_mean(rr))/X_std(rr);
end
%separate features into train/test (also add constant to trainX)
train_X=[t_act(s_inds,:),ones(num_subs-1,1)];
test_X=t_act(s,:);
%zscore behav Y, and separate into train/test
train_Y=(behav_outcome(s_inds)-Y_mean)/Y_std;
test_Y=(behav_outcome(s)-Y_mean)/Y_std;
elseif zscore_features==0
%no zscoring applied
t_act=act;
train_X=[t_act(s_inds,:),ones(num_subs-1,1)];
test_X=t_act(s,:);
train_Y=behav_outcome(s_inds);
test_Y=behav_outcome(s);
end
%train model, factoring in whether pca is applied to training features
if use_pca==1
%use pca regression
train_X=train_X(:,1:(end-1)); %remove constant ones prior to pca
%set max number of PCs, depends on num obs vs num feature vars
if num_subs>num_features
numComponents=num_features;
else
numComponents=min(size(train_X,1)-1,size(train_X,2)-1);
end
%demean prior to PCA
demean_trainX=repmat(mean(train_X,1),size(train_X,1),1);
train_X=train_X-demean_trainX;
%run pca
[PCALoadings,PCAScores,~,~,explained] = pca(train_X,'NumComponents',numComponents);
%check for rank deficiency of PCAScores (resulting from rank deficiency of input train_X),
%and if so adjust PCAScores accordingly
PCA_rank=rank(PCAScores);
if PCA_rank<numComponents
PCAScores=PCAScores(:,1:PCA_rank);
PCALoadings=PCALoadings(:,1:PCA_rank);
end
%regress after mean-centering trainY
%*note that mean-centering train_Y has no effect if zscoring was done across subs (i.e. zscore_features=2))
%also can ignore warning re lack of constant for PCAScores (given that PCAScores will be zero-mean)
b1=regress(train_Y-mean(train_Y),PCAScores);
%transform from PCA space back to original
beta=PCALoadings*b1;
%append constant to PCRbeta, see here for info https://stackoverflow.com/questions/11342759/constant-term-in-matlab-principal-component-regression-pcr-analysis
%*note appending constant has no effect if applied to data that has been zscored across subs (i.e. zscore_features=2)
beta=[beta;mean(train_Y)-mean(train_X)*beta];
elseif use_pca==0
%in cases of num obs < num features (or rank deficiency due to other reasons e.g. collinearity), this 'regress' call will
%use matlab's 'basic' regress solution (zero out redundant features using QR decomposition) which is a valid way
%to deal with rank deficiency, but one that will generate a warning
%*note that mean-centering train_Y below has no effect if data was zscored across features (i.e. zscore_features=2)
beta=regress(train_Y-mean(train_Y),train_X);
end
%apply trained model to test data
%apply same transformation to test data i.e. center test_X according to demean_trainX
%*note this has no effect if test_X/Y have already been zscored across subs (i.e. zscore_features=2)
demean_trainX=repmat(mean(train_X,1),size(train_X,1),1);
if use_pca==0
demean_trainX(:,end)=[]; %remove constant if present (only the case for non pca)
end
test_X=test_X-demean_trainX(1,:);
test_Y=test_Y-mean(train_Y);
%compute predY based on trained model
pred_Y=[test_X 1]*beta;
%store pred_Y, and actual_Y
pred_test_Y=[pred_test_Y;pred_Y];
actual_test_Y=[actual_test_Y;test_Y];
end
%compute model accuracy i.e. pred-to-actual behav overlap (i.e. yhat with y), and store
[r,p]=corrcoef(actual_test_Y,pred_test_Y);
output.([t_name,'_r'])=r(1,2);
output.([t_name,'_p'])=p(1,2);
end
end