-
Notifications
You must be signed in to change notification settings - Fork 0
/
CYP2D6.Rmd
251 lines (218 loc) · 10.4 KB
/
CYP2D6.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
```{r}
library(dplyr)
d6.pos <- read.table('pos_CYP2D6.txt',header=F) %>% rename(POS=V1) %>% distinct
d6.rsid <- read.table('rsid_CYP2D6.txt', header=F) %>% rename(RSID=V1)
# load vcf files
vcf<-read.table(gzfile('CYP2D6.vcf.gz'), stringsAsFactors = FALSE)[1:3] %>% rename(CHROM=V1,POS=V2,ID=V3)
# check how many SNPs were covered in phased vcf files
d6.vcf <- vcf %>% merge(.,d6.pos, by='POS')
```
# check which SNPs belong to one star allele in CYP2C19 using CPIC reference table
```{r}
ref <- readxl::read_excel('CYP2D6_allele_definition_table.xlsx', sheet=1) %>% .[,1:ncol(.)-1]
d6.pos <- rbind(c('POS'), d6.pos)
ref.pos <- ref %>%
rbind(d6.pos$POS, .) %>%
`colnames<-`(.[1, ]) %>%
.[8:nrow(.),] %>%
rename(allele=POS)
cand.pos <- intersect(as.character(d6.vcf$POS), names(ref.pos))
# include alleles that only contain SNPs in the candidate SNP list, this step could select star allele that do not contain any SNPs in the SNP list (such as *13, *5 so need to remove these snps in later steps)
allele1 <- ref.pos[apply(ref.pos, 1, function(x) {
non_na <- names(ref.pos)[which(!is.na(x))][-1]
allele_with_other <- names(ref.pos)[which(x=='S'|x=='Y'|x=='M'|x=='R'|x=='K'|x=='W')]
allele_names <- setdiff(non_na, allele_with_other)
all_present <- all(allele_names %in% cand.pos)
return(all_present)
}), ] %>% as.data.frame
# select SNPs shown in the vcf file from the reference, because all included SNPs have rsid, change this to colnames
rsid <- ref %>% rbind(d6.pos$POS, .) %>%
.[, c(names(.)[1], names(intersect(d6.vcf$POS, .[1,])))] %>%
`colnames<-`(.[6, ]) %>%
.[8:nrow(.),] %>%
rename(allele=rsID)
# remove alleles that have all NAs in the listed SNPs
allele2 <- rsid %>% .[rowSums(is.na(.)) != ncol(.)-1, 1]
allele.sel <- intersect(allele1$allele, allele2$allele)
# check functions of selected alleles
func <- readxl::read_excel("CYP2D6_allele_functionality_reference.xlsx", sheet=1)
func <- func %>%
`colnames<-`(.[1, ]) %>%
.[-1,1:ncol(.)-1] %>%
subset(`Allele/cDNA/rsID`%in% allele.sel)
func$`Activity Value (Optional)`
func$`Allele Clinical Functional Status (Required)`
```
# manually call CYP2D6 from genotype data
```{r}
# extract allele information (position, alternativeSNP ..)
info <- ref %>%
rbind(d6.pos$POS, .) %>%
.[, c(names(.)[1], names(intersect(d6.vcf$POS, .[1,])))] %>%
`colnames<-`(.[1, ]) %>%
.[2:6,]
d6.vcf <- info[5,-1] %>%
t %>%
as.data.frame %>%
tibble::rownames_to_column(., "POS") %>%
rename(RSID=V1)
vcf<-read.table(gzfile('CYP2D6.vcf.gz'), stringsAsFactors = FALSE) %>%
filter(V2 %in% d6.vcf$POS) %>%
merge(d6.vcf[,c('POS','RSID')], ., by.x='POS', by.y='V2') %>%
t %>%
as.data.frame() %>%
.[c(2,5,6,11:nrow(.)),] %>%
`colnames<-`(.[1, ]) %>%
.[-1, ]
# only leave genotype from vcf
vcf <- apply(vcf, 2, function(x) {do.call(rbind, strsplit(x,':'))[,1]}) %>% as.data.frame
# separate diplotype to haplotype, the star alleles are ordered from the lowest activity score to the highest
vcf <- vcf %>% mutate(hap_1='*1', hap_2='*1')
# *4 rs3892097
vcf$hap_1[which(do.call(rbind, strsplit(vcf$rs3892097, '\\|'))[,1] =='1' &
vcf$hap_1 =='*1')] <- '*4'
vcf$hap_2[which(do.call(rbind, strsplit(vcf$rs3892097, '\\|'))[,2] =='1' &
vcf$hap_2 =='*1')] <- '*4'
# *3 rs35742686
vcf$hap_1[which(do.call(rbind, strsplit(vcf$rs35742686, '\\|'))[,1] =='1' &
vcf$hap_1 =='*1')] <- '*3'
vcf$hap_2[which(do.call(rbind, strsplit(vcf$rs35742686, '\\|'))[,2] =='1' &
vcf$hap_2 =='*1')] <- '*3'
# *6 rs5030655
vcf$hap_1[which(do.call(rbind, strsplit(vcf$rs5030655, '\\|'))[,1] =='1' &
vcf$hap_1 =='*1')] <- '*6'
vcf$hap_2[which(do.call(rbind, strsplit(vcf$rs5030655, '\\|'))[,2] =='1' &
vcf$hap_2 =='*1')] <- '*6'
# *69 rs1135840 rs28371725 rs16947 rs1065852
vcf$hap_1[which(do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,1] =='1' &
do.call(rbind, strsplit(vcf$rs28371725, '\\|'))[,1] =='1' &
do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,1] == '1' &
do.call(rbind, strsplit(vcf$rs1065852, '\\|'))[,1] == '1' &
vcf$hap_1 =='*1')] <- '*69'
vcf$hap_2[which(do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,2] =='1'
& do.call(rbind, strsplit(vcf$rs28371725, '\\|'))[,2] =='1' &
do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,2] == '1' &
do.call(rbind, strsplit(vcf$rs1065852, '\\|'))[,2] == '1' &
vcf$hap_2 =='*1')] <- '*69'
# *8 rs1135840 rs5030865 (C>A) rs16947
if ('rs5030865' %in% names(vcf) && vcf$rs5030865[2]=='A') {
vcf$hap_1[which(do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,1] =='1' &
do.call(rbind, strsplit(vcf$rs5030865, '\\|'))[,1] =='1' &
do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,1] == '1' &
vcf$hap_1 =='*1')] <- '*8'
vcf$hap_2[which(do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,2] =='1' &
do.call(rbind, strsplit(vcf$rs5030865, '\\|'))[,2] =='1' &
do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,2] == '1' &
vcf$hap_2 =='*1')] <- '*8'
}
# *114 rs1135840 rs5030865 (C>T) rs16947 rs1065852
if ('rs5030865' %in% names(vcf) && vcf$rs5030865[2]=='T') {
vcf$hap_1[which(do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,1] =='1' &
do.call(rbind, strsplit(vcf$rs5030865, '\\|'))[,1] =='1' &
do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,1] == '1' &
do.call(rbind, strsplit(vcf$rs1065852, '\\|'))[,1] == '1' &
vcf$hap_1 =='*1')] <- '*114'
vcf$hap_2[which(do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,2] =='1' &
do.call(rbind, strsplit(vcf$rs5030865, '\\|'))[,2] =='1' &
do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,2] == '1' &
do.call(rbind, strsplit(vcf$rs1065852, '\\|'))[,2] == '1' &
vcf$hap_2 =='*1')] <- '*114'
}
# *10 rs1135840 rs1065852
vcf$hap_1[which(do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,1] =='1' &
do.call(rbind, strsplit(vcf$rs1065852, '\\|'))[,1] =='1' &
vcf$hap_1 =='*1')] <- '*10'
vcf$hap_2[which(do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,2] =='1' &
do.call(rbind, strsplit(vcf$rs1065852, '\\|'))[,2] =='1' &
vcf$hap_2 =='*1')] <- '*10'
# *17 rs1135840 rs28371706 (G>A) rs16947
if ('rs28371706' %in% names(vcf) && vcf$rs28371706[2]=='A') {
vcf$hap_1[which(do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,1] =='1' &
do.call(rbind, strsplit(vcf$rs28371706, '\\|'))[,1] =='1' &
do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,1] == '1' &
vcf$hap_1 =='*1')] <- '*17'
vcf$hap_2[which(do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,2] =='1' &
do.call(rbind, strsplit(vcf$rs28371706, '\\|'))[,2] =='1' &
do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,2] == '1' &
vcf$hap_2 =='*1')] <- '*17'
}
# *41 rs1135840 rs28371725 rs16947
vcf$hap_1[which(do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,1] =='1' &
do.call(rbind, strsplit(vcf$rs28371725, '\\|'))[,1] =='1' &
do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,1] == '1' &
vcf$hap_1 =='*1')] <- '*41'
vcf$hap_2[which(do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,2] =='1' &
do.call(rbind, strsplit(vcf$rs28371725, '\\|'))[,2] =='1' &
do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,2] == '1' &
vcf$hap_2 =='*1')] <- '*41'
# *14 rs1135840 rs5030865 (C>T) rs16947
if ('rs5030865' %in% names(vcf) && vcf$rs5030865[2]=='T') {
vcf$hap_1[which(do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,1] =='1' &
do.call(rbind, strsplit(vcf$rs5030865, '\\|'))[,1] =='1' &
do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,1] == '1' &
do.call(rbind, strsplit(vcf$rs1065852, '\\|'))[,1] == '1' &
vcf$hap_1 =='*1')] <- '*14'
vcf$hap_2[which(do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,2] =='1' &
do.call(rbind, strsplit(vcf$rs5030865, '\\|'))[,2] =='1' &
do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,2] == '1' &
do.call(rbind, strsplit(vcf$rs1065852, '\\|'))[,2] == '1' &
vcf$hap_2 =='*1')] <- '*14'
}
# *9 rs5030656
vcf$hap_1[which(do.call(rbind, strsplit(vcf$rs5030656, '\\|'))[,1] =='1' &
vcf$hap_1 =='*1')] <- '*9'
vcf$hap_2[which(do.call(rbind, strsplit(vcf$rs5030656, '\\|'))[,2] =='1' &
vcf$hap_2 =='*1')] <- '*9'
# *33 rs28371717
vcf$hap_1[which(do.call(rbind, strsplit(vcf$rs28371717, '\\|'))[,1] =='1' &
vcf$hap_1 =='*1')] <- '*33'
vcf$hap_2[which(do.call(rbind, strsplit(vcf$rs28371717, '\\|'))[,2] =='1' &
vcf$hap_2 =='*1')] <- '*33'
# *35 rs1135840 rs16947 rs769258
vcf$hap_1[which(do.call(rbind, strsplit(vcf$rs769258, '\\|'))[,1] =='1' &
do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,1] =='1' &
do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,1] =='1' &
vcf$hap_1 =='*1')] <- '*35'
vcf$hap_2[which(do.call(rbind, strsplit(vcf$rs769258, '\\|'))[,2] =='1' &
do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,2] =='1' &
do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,2] =='1' &
vcf$hap_2 =='*1')] <- '*35'
# *2 rs16947 rs1135840
vcf$hap_1[which(do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,1] =='1' &
do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,1] =='1' &
vcf$hap_1 =='*1')] <- '*2'
vcf$hap_2[which(do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,2] =='1' &
do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,2] =='1' &
vcf$hap_2 =='*1')] <- '*2'
# *34 rs16947
vcf$hap_1[which(do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,1] =='1' &
vcf$hap_1 =='*1')] <- '*34'
vcf$hap_2[which(do.call(rbind, strsplit(vcf$rs16947, '\\|'))[,2] =='1' &
vcf$hap_2 =='*1')] <- '*34'
# *39 rs1135840
vcf$hap_1[which(do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,1] =='1' &
vcf$hap_1 =='*1')] <- '*39'
vcf$hap_2[which(do.call(rbind, strsplit(vcf$rs1135840, '\\|'))[,2] =='1' &
vcf$hap_2 =='*1')] <- '*39'
# activity value of each allele
vcf <- vcf%>% mutate(as_hap1 = ifelse(
grepl('^\\*4$|^\\*8$|^\\*40$|^\\*69$|^\\*114$|^\\*3$|^\\*6$', hap_1), 0, ifelse(
grepl('^\\*10$', hap_1), 0.25, ifelse(
grepl('^\\*14$|^\\*17$|^\\*41$|^\\*9$', hap_1), 0.5, 1
))), as_hap2 = ifelse(
grepl('^\\*4$|^\\*8$|^\\*40$|^\\*69$|^\\*114$|^\\*3$|^\\*6$', hap_2), 0, ifelse(
grepl('^\\*10$', hap_2), 0.25, ifelse(
grepl('^\\*14$|^\\*17$|^\\*41$|^\\*9$', hap_2), 0.5, 1
))),
# activity score
as = as_hap1 + as_hap2, diplotype = paste0(hap_1, '/', hap_2
))
# phenotype
vcf <- vcf %>% mutate(phenotype = ifelse(
as>=3, 'Ultrarapid', ifelse(
as==0, 'Poor', ifelse(
as<=1 & as>0, 'Intermediate', 'Normal'
)
)))
vcf <- vcf %>% .[3:nrow(vcf),]
```