-
Notifications
You must be signed in to change notification settings - Fork 14
/
tkfst.h
267 lines (206 loc) · 9.11 KB
/
tkfst.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
#ifndef TKFST_INCLUDED
#define TKFST_INCLUDED
#include "hsm/em_matrix.h"
#include "indiegram/tripletscfg.h"
/// Loop rate matrix.
/*
* Hardcoded in.
*/
struct TKFST_loop_matrix : EM_matrix
{
/// constructor
TKFST_loop_matrix();
/// Show pi, x, m, U, Uinv.
void show (ostream& o) const;
};
/// Stem rate matrix.
/*
* Hardcoded in.
*/
struct TKFST_stem_matrix : EM_matrix
{
/// constructor
TKFST_stem_matrix();
/// Show pi, x, m, U, Uinv.
void show (ostream& o) const;
};
/// The TKFST parameters.
struct TKFST_Params
{
public:
/// singlet transitions probs
inline double K1() const { return lambda_1 / mu_1; }
inline double K2() const { return lambda_2 / mu_2; }
/// branch transition probs (alpha, beta, gamma)
inline double a1 (double t) const { return exp (-mu_1 * t); }
inline double b1 (double t) const { return lambda_1 * (1 - exp((lambda_1-mu_1)*t)) / (mu_1 - lambda_1*exp((lambda_1-mu_1)*t)); }
inline double g1 (double t) const { return 1 - mu_1 * (1 - exp((lambda_1-mu_1)*t)) / ((1-exp(-mu_1*t)) * (mu_1 - lambda_1*exp((lambda_1-mu_1)*t))); }
inline double a2 (double t) const { return exp (-mu_2 * t); }
inline double b2 (double t) const { return lambda_2 * (1 - exp((lambda_2-mu_2)*t)) / (mu_2 - lambda_2*exp((lambda_2-mu_2)*t)); }
inline double g2 (double t) const { return 1 - mu_2 * (1 - exp((lambda_2-mu_2)*t)) / ((1-exp(-mu_2*t)) * (mu_2 - lambda_2*exp((lambda_2-mu_2)*t))); }
/// stem construction probability
inline double pS() const { return stem_prob; }
/// constructor
TKFST_Params();
/// accessor methods
vector<Prob> loop_equil_dist();
array2d<Prob> loop_conditional_subst_mat (double t);
vector<Prob> stem_equil_dist();
array2d<Prob> stem_conditional_subst_mat (double t);
/// Dump all params.
void dump_params (ostream& o) const;
private:
/// read in parameters from file
void init_params();
double birth_rate_loop;
double death_rate_loop;
double birth_rate_stem;
double death_rate_stem;
double stem_prob;
double& lambda_1;
double& mu_1;
double& lambda_2;
double& mu_2;
TKFST_loop_matrix loop_matrix;
TKFST_stem_matrix stem_matrix;
};
struct TKFST_Triplet_Probs : TKFST_Params
{
public:
/// loop emission probs
inline Prob p1 (int xl) const { return _emit_loop[xl]; }
/// loop match probs
inline Prob m1_x (int wl, int xl) const { return _match_loop_x (wl, xl); }
inline Prob m1_y (int wl, int yl) const { return _match_loop_y (wl, yl); }
inline Prob m1_z (int wl, int zl) const { return _match_loop_z (wl, zl); }
/// stem emissions probs
/*
* xlr is the emit_idx hash for emission (xl, xr)
*/
inline Prob p2 (int xlr) const { return _emit_stem[xlr]; }
/// stem match probs
inline Prob m2_x (int wlr, int xlr) const { return _match_stem_x (wlr, xlr); }
inline Prob m2_y (int wlr, int ylr) const { return _match_stem_y (wlr, ylr); }
inline Prob m2_z (int wlr, int zlr) const { return _match_stem_z (wlr, zlr); }
// Dump all probability distributions.
void dump_trans_probs (ostream& o) const;
void dump_emit_probs (ostream& o) const;
private:
const double depth_x, depth_y, depth_z;
public:
/// branch lengths
const double& t;
const double& u;
const double& v;
// constructor
TKFST_Triplet_Probs (double t, double u, double v);
private:
/// Singlet emit probs
vector<Prob> _emit_loop;
vector<Prob> _emit_stem;
/// Branch match probs
/*
* Conditionally normalized.
*/
array2d<Prob> _match_loop_x;
array2d<Prob> _match_stem_x;
array2d<Prob> _match_loop_y;
array2d<Prob> _match_stem_y;
array2d<Prob> _match_loop_z;
array2d<Prob> _match_stem_z;
};
struct TKFST_Triplet_SCFG : Triplet_SCFG, TKFST_Triplet_Probs
{
public:
// Triplet_SCFG statistics:
// 231 = 169+61+1 effective total states
// 61 Bifurcation states (effectively deterministic)
// (61 possibly non-deterministic bifurc states)
// 106 Emit states
// 62 Null states
// 169 (non-bifurcation) states in TM
// 1790 transitions in TM
// 169^2 = 28561 (for comparison)
/// number of states (not counting Start and End)
static const int num_states = 229;
/// Triplet_SCFG state enumeration
/*
* Automatically generated by 'tripletSCFG.pl' --write.
*/
enum Triplet_states {
IL_DL_DL_DL = 0, IL_DL_DL_L = 1, IL_DL_DL_e = 2, IL_DL_L_WL = 3,
IL_DL_L_e = 4, IL_DL_ML_L = 5, IL_DL_e_DL = 6, IL_DL_e_L = 7,
IL_DL_e_e = 8, IL_L_WL_WL = 9, IL_L_WL_e = 10, IL_L_e_WL = 11,
IL_L_e_e = 12, IL_ML_DL_L = 13, IL_ML_L_WL = 14, IL_ML_L_e = 15,
IL_ML_ML_L = 16, IL_ML_e_L = 17, IL_e_DL_DL = 18, IL_e_DL_L = 19,
IL_e_DL_e = 20, IL_e_L_WL = 21, IL_e_L_e = 22, IL_e_ML_L = 23,
IL_e_e_DL = 24, IL_e_e_L = 25, IL_e_e_e = 26, IS_DS_DS_DS = 27,
IS_DS_DS_e = 28, IS_DS_e_DS = 29, IS_DS_e_e = 30, IS_e_DS_DS = 31,
IS_e_DS_e = 32, IS_e_e_DS = 33, IS_e_e_e = 34, L_L_L_L = 35,
L_L_L_WL = 36, L_L_L_e = 37, L_L_WL_WL = 38, L_L_WL_e = 39,
L_L_e_L = 40, L_L_e_WL = 41, L_L_e_e = 42, L_e_L_L = 43,
L_e_L_WL = 44, L_e_L_e = 45, L_e_e_L = 46, L_e_e_e = 47,
S_S_S_S = 48, S_S_S_e = 49, S_S_e_S = 50, S_S_e_e = 51,
S_e_S_S = 52, S_e_S_e = 53, S_e_e_S = 54, S_e_e_e = 55,
e_Li_e_e = 56, e_Si_e_e = 57, e_e_Li_e = 58, e_e_Si_e = 59,
e_e_e_Li = 60, e_e_e_Si = 61, IL_IL_WL_WL = 62, IL_IL_WL_e = 63,
IL_IL_e_WL = 64, IL_IL_e_e = 65, IL_ML_DL_DL = 66, IL_ML_DL_e = 67,
IL_ML_e_DL = 68, IL_ML_e_e = 69, L_IL_WL_WL = 70, L_IL_WL_e = 71,
L_IL_e_WL = 72, L_IL_e_e = 73, e_ILi_e_e = 74, IS_IS_WS_WS = 75,
IS_IS_WS_e = 76, IS_IS_e_WS = 77, IS_IS_e_e = 78, IS_MS_DS_DS = 79,
IS_MS_DS_e = 80, IS_MS_e_DS = 81, IS_MS_e_e = 82, S_IS_WS_WS = 83,
S_IS_WS_e = 84, S_IS_e_WS = 85, S_IS_e_e = 86, e_ISi_e_e = 87,
IS_MS_MS_DS = 88, IS_MS_MS_e = 89, IS_MS_MS_MS = 90, IS_MS_DS_MS = 91,
IS_MS_e_MS = 92, IL_ML_ML_DL = 93, IL_ML_ML_e = 94, IL_ML_ML_ML = 95,
IL_ML_DL_ML = 96, IL_ML_e_ML = 97, IL_DL_IL_WL = 98, IL_DL_IL_e = 99,
IL_DL_ML_DL = 100, IL_DL_ML_e = 101, IL_ML_IL_WL = 102, IL_ML_IL_e = 103,
IL_e_IL_WL = 104, IL_e_IL_e = 105, IL_e_ML_DL = 106, IL_e_ML_e = 107,
L_L_IL_WL = 108, L_L_IL_e = 109, L_e_IL_WL = 110, L_e_IL_e = 111,
e_e_ILi_e = 112, IS_DS_IS_WS = 113, IS_DS_IS_e = 114, IS_DS_MS_DS = 115,
IS_DS_MS_e = 116, IS_MS_IS_WS = 117, IS_MS_IS_e = 118, IS_e_IS_WS = 119,
IS_e_IS_e = 120, IS_e_MS_DS = 121, IS_e_MS_e = 122, S_S_IS_WS = 123,
S_S_IS_e = 124, S_e_IS_WS = 125, S_e_IS_e = 126, e_e_ISi_e = 127,
IS_DS_MS_MS = 128, IS_e_MS_MS = 129, IL_DL_ML_ML = 130, IL_e_ML_ML = 131,
IL_DL_DL_IL = 132, IL_DL_DL_ML = 133, IL_DL_ML_IL = 134, IL_DL_e_IL = 135,
IL_DL_e_ML = 136, IL_ML_DL_IL = 137, IL_ML_ML_IL = 138, IL_ML_e_IL = 139,
IL_e_DL_IL = 140, IL_e_DL_ML = 141, IL_e_ML_IL = 142, IL_e_e_IL = 143,
IL_e_e_ML = 144, L_L_L_IL = 145, L_L_e_IL = 146, L_e_L_IL = 147,
L_e_e_IL = 148, e_e_e_ILi = 149, IS_DS_DS_IS = 150, IS_DS_DS_MS = 151,
IS_DS_MS_IS = 152, IS_DS_e_IS = 153, IS_DS_e_MS = 154, IS_MS_DS_IS = 155,
IS_MS_MS_IS = 156, IS_MS_e_IS = 157, IS_e_DS_IS = 158, IS_e_DS_MS = 159,
IS_e_MS_IS = 160, IS_e_e_IS = 161, IS_e_e_MS = 162, S_S_S_IS = 163,
S_S_e_IS = 164, S_e_S_IS = 165, S_e_e_IS = 166, e_e_e_ISi = 167,
L_L_L_BiSiL = 168, BiSL_BmSL_BmSL_BpeL = 169, IL_ML_ML_BiSiL = 170, e_e_e_BiSiLi = 171,
L_e_BiSiL_e = 172, L_e_BiSiL_WL = 173, L_L_BiSiL_e = 174, BiSL_BpeL_e_BmSL = 175,
L_BiSiL_e_WL = 176, BiSL_BpeL_BpeL_e = 177, BiSL_BmSL_e_BpeL = 178, BiSL_BmSL_e_BmSL = 179,
IL_e_BiSiL_WL = 180, BiSL_BmSL_BpeL_BmSL = 181, BiSL_BmSL_BpeL_e = 182, IL_BiSiL_e_e = 183,
BiSL_e_BpeL_e = 184, BiSL_e_BmSL_BpeL = 185, L_BiSiL_e_e = 186, e_e_BiSiLi_e = 187,
e_BiSiLi_e_e = 188, BiSL_BmSL_BmSL_e = 189, IL_e_DL_BiSiL = 190, IL_DL_DL_BiSiL = 191,
IL_BiSiL_WL_WL = 192, L_L_e_BiSiL = 193, L_BiSiL_WL_WL = 194, BiSL_BmSL_BpeL_BpeL = 195,
L_BiSiL_WL_e = 196, IL_e_BiSiL_e = 197, IL_ML_BiSiL_e = 198, BiSL_e_BpeL_BpeL = 199,
BiSL_BpeL_BpeL_BmSL = 200, BiSL_BpeL_BpeL_BpeL = 201, L_e_e_BiSiL = 202, BiSL_BpeL_BmSL_e = 203,
BiSL_BpeL_e_BpeL = 204, BiSL_BmSL_BmSL_BmSL = 205, BiSL_e_BmSL_e = 206, IL_ML_BiSiL_WL = 207,
IL_DL_ML_BiSiL = 208, IL_BiSiL_e_WL = 209, L_L_BiSiL_WL = 210, BiSL_BpeL_BmSL_BmSL = 211,
BiSL_e_BmSL_BmSL = 212, IL_DL_e_BiSiL = 213, IL_e_e_BiSiL = 214, IL_ML_e_BiSiL = 215,
BiSL_BpeL_BmSL_BpeL = 216, BiSL_BpeL_e_e = 217, BiSL_e_BpeL_BmSL = 218, IL_BiSiL_WL_e = 219,
BiSL_e_e_BmSL = 220, BiSL_e_e_BpeL = 221, BiSL_e_e_e = 222, IL_ML_DL_BiSiL = 223,
BiSL_BmSL_e_e = 224, IL_e_ML_BiSiL = 225, IL_DL_BiSiL_WL = 226, L_e_L_BiSiL = 227,
IL_DL_BiSiL_e = 228, };
// constructor
TKFST_Triplet_SCFG (const double t, const double u, const double v);
private:
/// state initialization
void init_null_states();
void init_emit_unpaired_states();
void init_emit_paired_states();
void init_bifurc_states();
void init_ancestral_state_types();
/// transition initialization
void init_transition_scores();
/// transition initialization
/*
* Init transitions to account for bifurcations with possibly empty children.
*/
void init_direct_transition_scores();
};
#endif /* TKFST_INCLUDED */