<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data-preparation" data-toc-modified-id="Data-preparation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data preparation</a></span></li><li><span><a href="#k-NN" data-toc-modified-id="k-NN-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>k-NN</a></span></li><li><span><a href="#Naive-Bayes" data-toc-modified-id="Naive-Bayes-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Naive Bayes</a></span></li><li><span><a href="#Decision-tree" data-toc-modified-id="Decision-tree-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Decision tree</a></span></li><li><span><a href="#Next" data-toc-modified-id="Next-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Next</a></span></li><li><span><a href="#ANN" data-toc-modified-id="ANN-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>ANN</a></span></li></ul></div>

In [1]:
source("../src/utils/custom_tools.R")
setup_environment("../src/utils")
source("predictors.R")


Attaching package: ‘neuralnet’

The following object is masked from ‘package:dplyr’:

    compute



In [2]:
im <- read.table("../data/cleaned_IMPACT_mutations_180508.txt",
                     sep = "\t", stringsAsFactors = FALSE, header = TRUE)
im <- add_features("../data", im, oncokb = TRUE)

In [3]:
set.seed(123)

## Data preparation

In [4]:
nrow(im)

In [5]:
n_row_impact <- 1000

In [6]:
impact <- im[1:n_row_impact,]

In [7]:
unique(lapply(impact, function(x) sum(is.na(x))) == 0)

In [8]:
impact$is_driver <- "no"
impact$is_driver[impact$oncogenic %in% c("Oncogenic", "Likely Oncogenic", "Predicted Oncogenic")] <- "yes"

In [9]:
get_table(impact$is_driver)

values,count,freq
no,628,62.8%
yes,372,37.2%
-- total --,1000,100%


In [10]:
impact <- remove_features(impact, c("Tumor_Sample_Barcode", "mut_key", "sample_mut_key", "frequency_in_normals",
                                    "is_a_hotspot", "is_a_3d_hotspot", "oncogenic"))

## k-NN

In [11]:
impact <- transform_categorical_features_to_integer(impact,
                            c("Hugo_Symbol", "Chromosome", "Consequence", "Variant_Type", "Reference_Allele",
                              "Tumor_Seq_Allele2", "cDNA_change", "HGVSp_Short", "confidence_class"))

In [12]:
impact <- min_max_normalize(impact,
             c("Start_Position", "End_Position", "t_depth", "t_vaf", "t_alt_count", "n_depth", "n_vaf", "n_alt_count",
               "t_ref_plus_count", "t_ref_neg_count", "t_alt_plus_count", "t_alt_neg_count", "sample_coverage"))

impact <- shuffle_rows(impact)

list_train_test <- split_train_test(impact, label_name = "is_driver")

impact_train       <- list_train_test[[1]]
impact_test        <- list_train_test[[2]]
impact_train_label <- list_train_test[[3]]
impact_test_label  <- list_train_test[[4]]

In [15]:
get_table(impact_train_label)
get_table(impact_test_label)

values,count,freq
no,26909,71.8%
yes,10591,28.2%
-- total --,37500,100%


values,count,freq
no,9020,72.2%
yes,3480,27.8%
-- total --,12500,100%


In [13]:
impact_pred <- model_knn(impact_train, impact_test, impact_train_label, sqrt(n_row_impact))

In [14]:
get_result_table(impact_test_label, impact_pred)
get_accuracy(impact_test_label, impact_pred)

               data_test_pred
data_test_label   no  yes
            no  8486  534
            yes 2693  787

74.18%, 50,000

## Naive Bayes

In [237]:
impact <- transform_categorical_features_to_factor(impact,
                            c("Hugo_Symbol", "Chromosome", "Consequence", "Variant_Type", "Reference_Allele",
                              "Tumor_Seq_Allele2", "cDNA_change", "HGVSp_Short", "confidence_class"))

In [238]:
impact <- remove_features(impact, c("Start_Position", "End_Position", "t_depth", "t_vaf", "t_alt_count", "n_depth", "n_vaf", "n_alt_count",
               "t_ref_plus_count", "t_ref_neg_count", "t_alt_plus_count", "t_alt_neg_count", "sample_coverage"))

In [239]:
impact <- cbind(model.matrix(~ impact$Hugo_Symbol + 0), model.matrix(~ impact$Chromosome + 0), impact$is_driver)
colnames(impact)[ncol(impact)] <- "is_driver"
head(impact)

Unnamed: 0,impact$Hugo_SymbolABL1,impact$Hugo_SymbolAKT1,impact$Hugo_SymbolALK,impact$Hugo_SymbolALOX12B,impact$Hugo_SymbolAPC,impact$Hugo_SymbolAR,impact$Hugo_SymbolARID1A,impact$Hugo_SymbolARID1B,impact$Hugo_SymbolARID2,impact$Hugo_SymbolARID5B,⋯,impact$Chromosome22,impact$Chromosome3,impact$Chromosome4,impact$Chromosome5,impact$Chromosome6,impact$Chromosome7,impact$Chromosome8,impact$Chromosome9,impact$ChromosomeX,Unnamed: 21
1,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,yes
2,0,0,0,0,0,0,1,0,0,0,⋯,0,0,0,0,0,0,0,0,0,no
3,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,no
4,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,no
5,0,0,0,0,0,0,0,0,0,0,⋯,0,1,0,0,0,0,0,0,0,no
6,0,0,0,0,0,0,0,0,0,0,⋯,0,0,1,0,0,0,0,0,0,no


In [248]:
impact <- shuffle_rows(impact)

list_train_test <- split_train_test(impact, label_name = "is_driver")

impact_train       <- list_train_test[[1]]
impact_test        <- list_train_test[[2]]
impact_train_label <- list_train_test[[3]]
impact_test_label  <- list_train_test[[4]]

In [249]:
model <- naiveBayes(impact_train, impact_train_label, laplace = 1)

In [250]:
impact_pred <- predict(model, impact_test, type = "class")
head(impact_pred)

In [251]:
get_result_table(impact_test_label, impact_pred)
get_accuracy(impact_test_label, impact_pred)

ERROR: Error in table(data_test_label, data_test_pred): all arguments must have the same length


## Decision tree

In [147]:
impact$HGVSp_Short[impact$HGVSp_Short == ""] <- "unknown"
impact$cDNA_change[impact$cDNA_change == ""] <- "unknown"

In [148]:
impact <- transform_categorical_features_to_factor(impact,
                            c("Hugo_Symbol", "Chromosome", "Consequence", "Variant_Type", "Reference_Allele",
                              "Tumor_Seq_Allele2", "cDNA_change", "HGVSp_Short", "confidence_class"))

In [149]:
impact <- transform_categorical_features_to_factor(impact,
                            c("is_driver"))

In [150]:
impact <- shuffle_rows(impact)

list_train_test <- split_train_test(impact, label_name = "is_driver")

impact_train       <- list_train_test[[1]]
impact_test        <- list_train_test[[2]]
impact_train_label <- list_train_test[[3]]
impact_test_label  <- list_train_test[[4]]

In [131]:
model <- C5.0(impact_train, impact_train_label, trials = 10, rules = TRUE)

In [132]:
model


Call:
C5.0.default(x = impact_train, y = impact_train_label, trials = 10, rules
 = TRUE)

Rule-Based Model
Number of samples: 37500 
Number of predictors: 22 

Number of boosting iterations: 10 
Average number of rules: 5.4 

Non-standard options: attempt to group attributes


In [133]:
summary(model)


Call:
C5.0.default(x = impact_train, y = impact_train_label, trials = 10, rules
 = TRUE)


C5.0 [Release 2.07 GPL Edition]  	Wed Aug 22 16:38:07 2018
-------------------------------

Class specified by attribute `outcome'

Read 37500 cases (23 attributes) from undefined.data

-----  Trial 0:  -----

Rules:

Rule 0/1: (1565/1, lift 1.4)
	Hugo_Symbol in {ABL1, ABRAXAS1, AKT1, AKT2, AKT3, ALK, ALOX12B, AR,
                        ARAF, AURKA, AURKB, AXL, BBC3, BCL2, BCL2L1, BCL6,
                        BIRC3, BRAF, BRD4, BTK, CALR, CARD11, CCNE1, CD274,
                        CD276, CD79A, CD79B, CDK4, CDK8, CDKN2Ap14ARF, CENPA,
                        CHEK1, COP1, CRLF2, CSF1R, CSF3R, CTLA4, CTNNB1, CUL3,
                        CXCR4, DDR2, DIS3, DNAJB1, DNMT1, DNMT3B, DOT1L, E2F3,
                        EGFR, EIF4A2, EPHA3, EPHA5, EPHA7, EPHB1, ERBB3, ERBB4,
                        ERCC2, ERCC5, ERG, ESR1, ETV1, ETV6, EZH2, FGF19, FGF4,
                        FGFR1, FGFR2, FGFR3, 

In [134]:
impact_pred <- predict(model, impact_test, type = "class")

In [135]:
get_result_table(impact_test_label, impact_pred)
get_accuracy(impact_test_label, impact_pred)

               data_test_pred
data_test_label   no  yes
            no  8968   44
            yes  500 2988

96.78%, 50,000 (rule: 95.65%)

## Next

In [225]:
impact <- transform_categorical_features_to_factor(impact,
                            c("Hugo_Symbol", "Chromosome", "Consequence", "Variant_Type", "Reference_Allele",
                              "Tumor_Seq_Allele2", "cDNA_change", "HGVSp_Short", "confidence_class"))

In [226]:
impact <- transform_categorical_features_to_factor(impact,
                            c("is_driver"))

In [227]:
impact <- shuffle_rows(impact)

list_train_test <- split_train_test(impact, label_name = "is_driver")

impact_train       <- list_train_test[[1]]
impact_test        <- list_train_test[[2]]
impact_train_label <- list_train_test[[3]]
impact_test_label  <- list_train_test[[4]]

In [228]:
impact_train$is_driver <- impact_train_label

model <- JRip(is_driver ~ ., impact_train)

In [206]:
impact_pred <- predict(model, impact_test, type = "class")
get_result_table(impact_test_label, impact_pred)
get_accuracy(impact_test_label, impact_pred)

               data_test_pred
data_test_label   no  yes
            no  1664   96
            yes  121  619

91.32%, 10,000

## ANN

In [11]:
impact <- remove_features(impact,
                            c("Hugo_Symbol", "Chromosome", "Consequence", "Variant_Type", "Reference_Allele",
                              "Tumor_Seq_Allele2", "cDNA_change", "HGVSp_Short", "confidence_class"))

In [12]:
impact <- min_max_normalize(impact,
             c("Start_Position", "End_Position", "t_depth", "t_vaf", "t_alt_count", "n_depth", "n_vaf", "n_alt_count",
               "t_ref_plus_count", "t_ref_neg_count", "t_alt_plus_count", "t_alt_neg_count", "sample_coverage"))

impact <- shuffle_rows(impact)

list_train_test <- split_train_test(impact, label_name = "is_driver")

impact_train       <- list_train_test[[1]]
impact_test        <- list_train_test[[2]]
impact_train_label <- list_train_test[[3]]
impact_test_label  <- list_train_test[[4]]

In [13]:
str(impact_train)

'data.frame':	750 obs. of  13 variables:
 $ Start_Position  : num  0.03026 0.66986 0.00428 0.03023 0.58559 ...
 $ End_Position    : num  0.03026 0.66986 0.00428 0.03023 0.58559 ...
 $ t_depth         : num  0.0414 0.0414 0.0154 0.0391 0.026 ...
 $ t_vaf           : num  0.545 0.163 0.142 0.184 0.125 ...
 $ t_alt_count     : num  0.1222 0.0376 0.0121 0.0399 0.0181 ...
 $ n_depth         : num  0.0769 0.1643 0.2329 0.2483 0.2322 ...
 $ n_vaf           : num  0 0 0 0 0 0 0 0 0 0 ...
 $ n_alt_count     : num  0 0 0 0 0 0 0 0 0 0 ...
 $ t_ref_plus_count: num  0.0211 0.0519 0.0165 0.0321 0.0304 ...
 $ t_ref_neg_count : num  0.0236 0.0277 0.0166 0.0402 0.0234 ...
 $ t_alt_plus_count: num  0.1336 0.0593 0.0172 0.0442 0.0216 ...
 $ t_alt_neg_count : num  0.1171 0.0252 0.0122 0.0407 0.0195 ...
 $ sample_coverage : num  0.2383 0.1383 0.1958 0.1318 0.0808 ...


In [14]:
impact_train$is_driver <- c(0, 1)[match(impact_train_label, c("yes", "no"))]

In [15]:
f <- as.formula(paste("is_driver ~", paste(colnames(impact_train)[!colnames(impact_train) %in% "is_driver"], collapse = " + ")))
print(f)

is_driver ~ Start_Position + End_Position + t_depth + t_vaf + 
    t_alt_count + n_depth + n_vaf + n_alt_count + t_ref_plus_count + 
    t_ref_neg_count + t_alt_plus_count + t_alt_neg_count + sample_coverage


In [16]:
model <- neuralnet(f, data = impact_train)

In [17]:
plot(model)

In [41]:
impact_pred <- as.data.frame(compute(model, impact_test[, colnames(impact_test) != "is_driver"])$net.result)
impact_pred$is_driver <- "no"
impact_pred$is_driver[impact_pred[,1] >= 0.5] <- "yes"
head(impact_test_label)
head(impact_pred, 10)

impact_pred <- impact_pred$is_driver

Unnamed: 0,V1,is_driver
930,0.6428905576,yes
415,0.6670971452,yes
917,0.6598925401,yes
253,0.7485162814,yes
326,0.7175339086,yes
944,0.5000297747,yes
245,0.7170209091,yes
272,0.5321999682,yes
991,0.4609769524,no
803,0.6368063849,yes


In [42]:
get_result_table(impact_test_label, impact_pred)
get_accuracy(impact_test_label, impact_pred)

               data_test_pred
data_test_label  no yes
            no   19 147
            yes  21  63