This repository has been archived by the owner on Oct 29, 2019. It is now read-only.
/
draw_string_david_combination_network.m
executable file
·182 lines (144 loc) · 7.3 KB
/
draw_string_david_combination_network.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
%%%
% Run this file to see a demo of how to combine STRING network interactions
% with DAVID functional annotation clustering.
% STRING: go to the string website (http://string-db.org), click multiple
% names, enter your list of gene symbols, click save (disk icon below
% network), and save the "Text Summary (TXT - simple tab delimited
% flatfile)" on your local drive.
% DAVID: go to the david website (http://david.abcc.ncifcrf.gov/tools.jsp),
% select "upload" on the left panel, (step 1) paste your list of gene symbols into
% the field, (step 2) select "official_gene_symbol" as gene identifier,
% (step 3) clik on "gene list", and (step 4) submit list. On the left, be
% sure to select the correct species, and clik the "Select Species"
% buttion. Then click on the "functional annotation clustering" link, and
% click on the "functional annotation clustering" button. Set the
% classification stringency to "high" and press "rerun using options". Now
% click the "download file" link
% This exmaply only works if gene symbols are fed into DAVID, as STRING
% returns gene symbols.
% Berend Snijder, 2011.
%%%%%%%%%%%%%%%%%%%%%%%%%%
% a couple of parameters %
%%%%%%%%%%%%%%%%%%%%%%%%%%
% minimum global string interaction score
intMinString = 0.9;
% minimum within-david-cluster string interaction score
intWithinDavidMinString = 0;
% minimum david cluster overlap score
intMinDavidOverlap = 0.9;
% minimum david cluster enrichment score
intMinDavidEnrichment = 0.3;
% wether to run demo mode, or ask the user to locate the files...
boolRunDemo = true;
% boolRunDemo = false;
%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%
% Get the string network and the corresponding gene id and symbol seed
% lists.
if boolRunDemo
strSTRINGFile = which('new_string9_genesymbols.txt');
else
strSTRINGFile = '';
end
[matStringNetwork,matStringIdentifiers,cellStringGeneSymbols,cellENSPs] = parseStringNetwork(strSTRINGFile);
% Load the david functional annotation clustering data.
if boolRunDemo
strDAVIDFile = which('new_david_gene_symbols_high.txt');
else
strDAVIDFile = '';
end
[cellLabelPerCluster,cellGenesPerCluster,cellGenesPerClassPerCluster,matEnrichmentScorePerCluster,matOverlapMap]= parseDavidFAC(strDAVIDFile);
% we need to combine the genes listed in STRING file and those in the
% DAVID file to get to a minimal list of all present gene identifiers.
% Note that genes regularly disappear due to gene identifier mapping
% mismatches.
cellSeedGeneSymbols = unique(cat(1,cellGenesPerCluster{:},cellStringGeneSymbols));
% reformat string network according to full seed gene list
[~,b]=ismember(cellStringGeneSymbols,cellSeedGeneSymbols);
matStringNetwork2 = zeros(numel(cellSeedGeneSymbols));
matStringNetwork2(b,b) = matStringNetwork;
matStringNetwork = matStringNetwork2;
% identify kinase (or other 'special') cluster ids, that you want to
% disband.
matKinaseClusterIDs = find(cellfun(@(x) ~isempty(regexpi(x,'Nucleotide.*binding')),cellLabelPerCluster) | ...
cellfun(@(x) ~isempty(regexpi(x,'phosphate metabolic process')),cellLabelPerCluster) | ...
cellfun(@(x) ~isempty(regexpi(x,'phosphorylation')),cellLabelPerCluster));
% initialize combined DAVID and STRING graph
intDAVIDNodes = size(matOverlapMap,1);
intSTRINGNodes = size(matStringNetwork,1);
matCombinedGraph = zeros(intDAVIDNodes+intSTRINGNodes);
% add in string network above total threshold
matGRAPH1 = triu(matStringNetwork);
matGRAPH1(matGRAPH1<intMinString) = 0;
matCombinedGraph(1:intSTRINGNodes,1:intSTRINGNodes) = matGRAPH1;
% now we need to add the links between the genes and the clusters, but of
% course only to the strongest enriched networks...
matStringDavidLinks = zeros(intSTRINGNodes,intDAVIDNodes);
% add gene to david links, but skip those for kinase-like classes
cellAdjustedGenesPerCluster = cellGenesPerCluster;
cellAdjustedGenesPerCluster(matKinaseClusterIDs) = {[]};
cellGenesPerClassPerCluster(matKinaseClusterIDs,:) = {[]};
matGeneClusterID = nan(numel(cellSeedGeneSymbols),1);
cellGeneShape = cell(numel(cellSeedGeneSymbols),1);
matEmptyClassPerClusterIX = cellfun(@isempty,cellGenesPerClassPerCluster);
matClassPerClusterCount = sum(~matEmptyClassPerClusterIX,2);
for i = 1:numel(cellSeedGeneSymbols)
% this is the alternative function, look for specificity index of gene
% in cluster (i.e. how many
matPerClassPerClusterGeneCount = NaN(size(cellGenesPerClassPerCluster));
matPerClassPerClusterGeneCount(~matEmptyClassPerClusterIX) = single(cellfun(@(x) ismember(cellSeedGeneSymbols{i},x),cellGenesPerClassPerCluster(~matEmptyClassPerClusterIX)));
% different metrics according to which we could score... the first
% cluster with the highest mean presence of all classes is preferred.
matMeans = sum(matPerClassPerClusterGeneCount==1,2)./matClassPerClusterCount;
matMeans(isnan(matMeans))=0;
% "max" finds the indices of the maximum values of A, and returns them
% in output vector I. If there are several identical maximum values,
% the index of the first one (i.e. highest enriched) found is
% returned.
[intMaxValue,intCrossIX] = max(matMeans);
% if we don't find a maximum, set to [], i.e. flag it's unclustered
if intMaxValue == 0; intCrossIX = []; end
intCrossIX2 = intCrossIX;
matStringDavidLinks(i,intCrossIX) = 2;
% annotate david cluster to gene node mapping
if isempty(intCrossIX)
matGeneClusterID(i) = -1;
else
matGeneClusterID(i) = intCrossIX;
end
% create gene node shapes
if ismember(intCrossIX2,matKinaseClusterIDs)
cellGeneShape{i} = 'kinase';
elseif isempty(intCrossIX)
cellGeneShape{i} = 'homeless';
else
cellGeneShape{i} = 'regular';
end
end
%%%
% and add string interactions that are above a lower threshold for all
% genes present in the same david cluster.
for i = unique(matGeneClusterID')
a = find(matGeneClusterID==i);
matGRAPH1 = triu(matStringNetwork(a,a));
matGRAPH1(matGRAPH1<intWithinDavidMinString) = 0;
matCombinedGraph(a,a) = matGRAPH1;
end
%%%
%%%
% add links between genes and david classes
matCombinedGraph(1:intSTRINGNodes,intSTRINGNodes+1:end) = matStringDavidLinks;
% % keep a edge-feature to annotate gene-david links
matGeneToDavidEdges = zeros(size(matCombinedGraph));
matGeneToDavidEdges(1:intSTRINGNodes,intSTRINGNodes+1:end) = matStringDavidLinks;
% merge all labels for final combined graph
cellCombinedLabels = [cellSeedGeneSymbols;cellLabelPerCluster];
matCombinedClusterIDs = [matGeneClusterID;(1:intDAVIDNodes)'];
cellCombinedNodeShapes = [cellGeneShape;repmat({'cluster'},[intDAVIDNodes,1])];
% generate an attribute per gene that lists the class per gene
cellClusterPerGene = cell(size(cellCombinedLabels));
cellClusterPerGene(matCombinedClusterIDs>0) = cellLabelPerCluster(matCombinedClusterIDs(matCombinedClusterIDs>0));
cellClusterPerGene(matCombinedClusterIDs<=0) = {'Unclustered'};
% call cytoscape with predefined example vizmap file
strVizmapFile = which('vizmap_combined.props');
cytoscape(matCombinedGraph,cellCombinedLabels,matCombinedClusterIDs,cellCombinedNodeShapes,matGeneToDavidEdges,cellClusterPerGene,strVizmapFile)