In [1]:
library(randomForest)

# Selects the frames sub-directory that the input data is loaded from.
cellNumber <- 187
cells_path <- file.path("../2_variables/frames", as.character(cellNumber),
                        "cells_frame.RData")
# load() restores the saved object(s); cells_frame is used below.
load(cells_path)
# Fix the RNG seed so the random splits and forests are reproducible.
set.seed(1)
# Working copy whose `type` column is regrouped in the next cell.
drosophila.df <- cells_frame

randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.


We regroup all natural drosophila tissues as one 'drosophila' type

In [2]:
# Collapse the three natural Drosophila tissue labels (dNP, dWL, dWP) into a
# single 'drosophila' class. The replacement is vectorised, so an empty frame
# or a missing label no longer aborts the cell as the element-wise `if` did.
type <- as.vector(cells_frame[, "type"])
type[type %in% c("dNP", "dWL", "dWP")] <- "drosophila"
drosophila.df$type <- as.factor(type)

## TDA variables

We then check whether the tissue types can be classified using all of the TDA variables:

In [3]:
# Keep only the three classes being compared. %in% returns FALSE (never NA)
# for a missing label, so subsetting cannot insert all-NA rows the way a
# chain of `==` comparisons would.
cond <- drosophila.df$type %in% c("cNT", "cEE", "drosophila")
# droplevels() removes factor levels that no longer occur after filtering.
data.class <- droplevels(drosophila.df[cond, ])

In [4]:
# Repeated random 3/4 - 1/4 hold-out evaluation of a random forest trained on
# every variable in data.class. For each iteration we store the class error
# rates reported in rf$confusion (training.*) and the error rates measured on
# the held-out rows (validation.*).
n_iter <- 10000
n_obs <- nrow(data.class)
# The split size is the same for every iteration, so compute it once.
data_set_size <- floor(n_obs / 4 * 3)

# Preallocate the result vectors instead of growing them with c().
training.cEE <- numeric(n_iter)
training.cNT <- numeric(n_iter)
training.drosophila <- numeric(n_iter)
training.all <- numeric(n_iter)

validation.cEE <- numeric(n_iter)
validation.cNT <- numeric(n_iter)
validation.drosophila <- numeric(n_iter)
validation.all <- numeric(n_iter)

for (i in seq_len(n_iter)) {
    indexes <- sample.int(n_obs, size = data_set_size)

    training <- data.class[indexes, ]
    validation1 <- data.class[-indexes, ]

    rf <- randomForest(formula = type ~ ., data = training, ntree = 200)

    # rf$confusion holds the class counts plus a trailing class.error column.
    oob <- rf$confusion
    training.cEE[i] <- oob["cEE", "class.error"]
    training.cNT[i] <- oob["cNT", "class.error"]
    training.drosophila[i] <- oob["drosophila", "class.error"]
    # Drop the error column before summing; otherwise the error rates are
    # added to the observation count in the denominator.
    oob_counts <- oob[, colnames(oob) != "class.error", drop = FALSE]
    training.all[i] <- 1 - sum(diag(oob_counts)) / sum(oob_counts)

    # validation1 is already restricted to the classes of interest. `cond`
    # indexes the unsplit frame and must not be applied to this subset.
    prediction_for_table <- predict(rf, validation1)
    pm <- table(observed = validation1[, "type"],
                predicted = prediction_for_table)
    # A class absent from the hold-out gives 0/0 = NaN here; the reporting
    # cell deals with those entries.
    validation.cEE[i] <- 1 - pm["cEE", "cEE"] / sum(pm["cEE", ])
    validation.cNT[i] <- 1 - pm["cNT", "cNT"] / sum(pm["cNT", ])
    validation.drosophila[i] <-
        1 - pm["drosophila", "drosophila"] / sum(pm["drosophila", ])
    validation.all[i] <- 1 - sum(diag(pm)) / sum(pm)
}

In [5]:
# training.* hold per-iteration class error rates taken from rf$confusion, so
# 1 - mean(...) is the mean accuracy. The labels therefore say "acc".
print("cEE acc")
1 - mean(training.cEE)
print("cNT acc")
1 - mean(training.cNT)
print("drosophila acc")
1 - mean(training.drosophila)
print("global acc")
1 - mean(training.all)

[1] "cEE error"


[1] "cNT error"


[1] "drosophila error"


[1] "total error"


In [6]:
# In rare cases a validation split contains no sample of a given tissue, so
# its per-class error is 0/0 = NaN. No misclassification is possible in that
# case, so those iterations are counted as zero error.
validation.cEE[is.nan(validation.cEE)] <- 0
validation.cNT[is.nan(validation.cNT)] <- 0
validation.drosophila[is.nan(validation.drosophila)] <- 0
validation.all[is.nan(validation.all)] <- 0

# validation.* hold error rates, so 1 - mean(...) is the mean accuracy.
print("cEE acc")
1 - mean(validation.cEE)
print("cNT acc")
1 - mean(validation.cNT)
print("drosophila acc")
1 - mean(validation.drosophila)
print("global acc")
1 - mean(validation.all)

[1] "cEE error"


[1] "cNT error"


[1] "drosophila error"


[1] "global error"


## Network variables

In [7]:
# Read the network-variable frame from the same frames sub-directory and keep
# a working copy whose `type` column is regrouped in the next cell.
network_path <- file.path("../2_variables/frames", as.character(cellNumber),
                          "network_frame.RData")
load(network_path)
network.df <- network_frame

We regroup all natural drosophila tissues as one 'drosophila' type

In [8]:
# Collapse the three natural Drosophila tissue labels (dNP, dWL, dWP) into a
# single 'drosophila' class. The replacement is vectorised, so an empty frame
# or a missing label no longer aborts the cell as the element-wise `if` did.
type <- as.vector(network_frame[, "type"])
type[type %in% c("dNP", "dWL", "dWP")] <- "drosophila"
network.df$type <- as.factor(type)

In [9]:
# Keep only the three classes being compared. %in% returns FALSE (never NA)
# for a missing label, so subsetting cannot insert all-NA rows the way a
# chain of `==` comparisons would.
cond <- network.df$type %in% c("cNT", "cEE", "drosophila")
# droplevels() removes factor levels that no longer occur after filtering.
data.class <- droplevels(network.df[cond, ])

In [10]:
# Repeated random 3/4 - 1/4 hold-out evaluation of a random forest trained on
# every variable in data.class. For each iteration we store the class error
# rates reported in rf$confusion (training.*) and the error rates measured on
# the held-out rows (validation.*).
n_iter <- 10000
n_obs <- nrow(data.class)
# The split size is the same for every iteration, so compute it once.
data_set_size <- floor(n_obs / 4 * 3)

# Preallocate the result vectors instead of growing them with c().
training.cEE <- numeric(n_iter)
training.cNT <- numeric(n_iter)
training.drosophila <- numeric(n_iter)
training.all <- numeric(n_iter)

validation.cEE <- numeric(n_iter)
validation.cNT <- numeric(n_iter)
validation.drosophila <- numeric(n_iter)
validation.all <- numeric(n_iter)

for (i in seq_len(n_iter)) {
    indexes <- sample.int(n_obs, size = data_set_size)

    training <- data.class[indexes, ]
    validation1 <- data.class[-indexes, ]

    rf <- randomForest(formula = type ~ ., data = training, ntree = 200)

    # rf$confusion holds the class counts plus a trailing class.error column.
    oob <- rf$confusion
    training.cEE[i] <- oob["cEE", "class.error"]
    training.cNT[i] <- oob["cNT", "class.error"]
    training.drosophila[i] <- oob["drosophila", "class.error"]
    # Drop the error column before summing; otherwise the error rates are
    # added to the observation count in the denominator.
    oob_counts <- oob[, colnames(oob) != "class.error", drop = FALSE]
    training.all[i] <- 1 - sum(diag(oob_counts)) / sum(oob_counts)

    # validation1 is already restricted to the classes of interest. `cond`
    # indexes the unsplit frame and must not be applied to this subset.
    prediction_for_table <- predict(rf, validation1)
    pm <- table(observed = validation1[, "type"],
                predicted = prediction_for_table)
    # A class absent from the hold-out gives 0/0 = NaN here; the reporting
    # cell deals with those entries.
    validation.cEE[i] <- 1 - pm["cEE", "cEE"] / sum(pm["cEE", ])
    validation.cNT[i] <- 1 - pm["cNT", "cNT"] / sum(pm["cNT", ])
    validation.drosophila[i] <-
        1 - pm["drosophila", "drosophila"] / sum(pm["drosophila", ])
    validation.all[i] <- 1 - sum(diag(pm)) / sum(pm)
}

In [11]:
# training.* hold per-iteration class error rates taken from rf$confusion, so
# 1 - mean(...) is the mean accuracy. The labels therefore say "acc".
print("cEE acc")
1 - mean(training.cEE)
print("cNT acc")
1 - mean(training.cNT)
print("drosophila acc")
1 - mean(training.drosophila)
print("global acc")
1 - mean(training.all)

[1] "cEE error"


[1] "cNT error"


[1] "drosophila error"


[1] "global error"


In [12]:
# In rare cases a validation split contains no sample of a given tissue, so
# its per-class error is 0/0 = NaN. No misclassification is possible in that
# case, so those iterations are counted as zero error.
validation.cEE[is.nan(validation.cEE)] <- 0
validation.cNT[is.nan(validation.cNT)] <- 0
validation.drosophila[is.nan(validation.drosophila)] <- 0
validation.all[is.nan(validation.all)] <- 0

# validation.* hold error rates, so 1 - mean(...) is the mean accuracy.
print("cEE acc")
1 - mean(validation.cEE)
print("cNT acc")
1 - mean(validation.cNT)
print("drosophila acc")
1 - mean(validation.drosophila)
print("global acc")
1 - mean(validation.all)

[1] "cEE error"


[1] "cNT error"


[1] "drosophila error"


[1] "global error"


## Mean and Variance

In [13]:
# Repeated random 3/4 - 1/4 hold-out evaluation of a random forest that uses
# only mean_degree and var_degree as predictors. For each iteration we store
# the class error rates reported in rf$confusion (training.*) and the error
# rates measured on the held-out rows (validation.*).
n_iter <- 10000
n_obs <- nrow(data.class)
# The split size is the same for every iteration, so compute it once.
data_set_size <- floor(n_obs / 4 * 3)

# Preallocate the result vectors instead of growing them with c().
training.cEE <- numeric(n_iter)
training.cNT <- numeric(n_iter)
training.drosophila <- numeric(n_iter)
training.all <- numeric(n_iter)

validation.cEE <- numeric(n_iter)
validation.cNT <- numeric(n_iter)
validation.drosophila <- numeric(n_iter)
validation.all <- numeric(n_iter)

for (i in seq_len(n_iter)) {
    indexes <- sample.int(n_obs, size = data_set_size)

    training <- data.class[indexes, ]
    validation1 <- data.class[-indexes, ]

    rf <- randomForest(formula = type ~ mean_degree + var_degree,
                       data = training, ntree = 200)

    # rf$confusion holds the class counts plus a trailing class.error column.
    oob <- rf$confusion
    training.cEE[i] <- oob["cEE", "class.error"]
    training.cNT[i] <- oob["cNT", "class.error"]
    training.drosophila[i] <- oob["drosophila", "class.error"]
    # Drop the error column before summing; otherwise the error rates are
    # added to the observation count in the denominator.
    oob_counts <- oob[, colnames(oob) != "class.error", drop = FALSE]
    training.all[i] <- 1 - sum(diag(oob_counts)) / sum(oob_counts)

    # validation1 is already restricted to the classes of interest. `cond`
    # indexes the unsplit frame and must not be applied to this subset.
    prediction_for_table <- predict(rf, validation1)
    pm <- table(observed = validation1[, "type"],
                predicted = prediction_for_table)
    # A class absent from the hold-out gives 0/0 = NaN here; the reporting
    # cell deals with those entries.
    validation.cEE[i] <- 1 - pm["cEE", "cEE"] / sum(pm["cEE", ])
    validation.cNT[i] <- 1 - pm["cNT", "cNT"] / sum(pm["cNT", ])
    validation.drosophila[i] <-
        1 - pm["drosophila", "drosophila"] / sum(pm["drosophila", ])
    validation.all[i] <- 1 - sum(diag(pm)) / sum(pm)
}

In [14]:
# training.* hold per-iteration class error rates taken from rf$confusion, so
# 1 - mean(...) is the mean accuracy. The labels therefore say "acc".
print("cEE acc")
1 - mean(training.cEE)
print("cNT acc")
1 - mean(training.cNT)
print("drosophila acc")
1 - mean(training.drosophila)
print("global acc")
1 - mean(training.all)

[1] "cEE error"


[1] "cNT error"


[1] "drosophila error"


[1] "total error"


In [15]:
# In rare cases a validation split contains no sample of a given tissue, so
# its per-class error is 0/0 = NaN. No misclassification is possible in that
# case, so those iterations are counted as zero error.
validation.cEE[is.nan(validation.cEE)] <- 0
validation.cNT[is.nan(validation.cNT)] <- 0
validation.drosophila[is.nan(validation.drosophila)] <- 0
validation.all[is.nan(validation.all)] <- 0

# validation.* hold error rates, so 1 - mean(...) is the mean accuracy.
print("cEE acc")
1 - mean(validation.cEE)
print("cNT acc")
1 - mean(validation.cNT)
print("drosophila acc")
1 - mean(validation.drosophila)
print("global acc")
1 - mean(validation.all)

[1] "cEE error"


[1] "cNT error"


[1] "drosophila error"


[1] "global error"


## Mixed variables

In [16]:
# Combine the network variables with every cells_frame column except the
# first (presumably the first column duplicates `type` -- TODO confirm).
# Negative indexing with drop = FALSE keeps a data frame with its column names
# even when only one column remains, and avoids seq(2, 1) counting down when
# cells_frame has a single column.
# NOTE(review): cbind() pairs rows by position; this assumes both frames list
# the samples in the same order.
mix_frame <- cbind(network_frame, cells_frame[, -1, drop = FALSE])
mix.df <- mix_frame

In [17]:
# Collapse the three natural Drosophila tissue labels (dNP, dWL, dWP) into a
# single 'drosophila' class. The replacement is vectorised, so an empty frame
# or a missing label no longer aborts the cell as the element-wise `if` did.
type <- as.vector(mix_frame[, "type"])
type[type %in% c("dNP", "dWL", "dWP")] <- "drosophila"
mix.df$type <- as.factor(type)

In [18]:
# Keep only the three classes being compared. %in% returns FALSE (never NA)
# for a missing label, so subsetting cannot insert all-NA rows the way a
# chain of `==` comparisons would.
cond <- mix.df$type %in% c("cNT", "cEE", "drosophila")
# droplevels() removes factor levels that no longer occur after filtering.
data.class <- droplevels(mix.df[cond, ])

In [19]:
# Repeated random 3/4 - 1/4 hold-out evaluation of a random forest trained on
# every variable in data.class. For each iteration we store the class error
# rates reported in rf$confusion (training.*) and the error rates measured on
# the held-out rows (validation.*).
n_iter <- 10000
n_obs <- nrow(data.class)
# The split size is the same for every iteration, so compute it once.
data_set_size <- floor(n_obs / 4 * 3)

# Preallocate the result vectors instead of growing them with c().
training.cEE <- numeric(n_iter)
training.cNT <- numeric(n_iter)
training.drosophila <- numeric(n_iter)
training.all <- numeric(n_iter)

validation.cEE <- numeric(n_iter)
validation.cNT <- numeric(n_iter)
validation.drosophila <- numeric(n_iter)
validation.all <- numeric(n_iter)

for (i in seq_len(n_iter)) {
    indexes <- sample.int(n_obs, size = data_set_size)

    training <- data.class[indexes, ]
    validation1 <- data.class[-indexes, ]

    rf <- randomForest(formula = type ~ ., data = training, ntree = 200)

    # rf$confusion holds the class counts plus a trailing class.error column.
    oob <- rf$confusion
    training.cEE[i] <- oob["cEE", "class.error"]
    training.cNT[i] <- oob["cNT", "class.error"]
    training.drosophila[i] <- oob["drosophila", "class.error"]
    # Drop the error column before summing; otherwise the error rates are
    # added to the observation count in the denominator.
    oob_counts <- oob[, colnames(oob) != "class.error", drop = FALSE]
    training.all[i] <- 1 - sum(diag(oob_counts)) / sum(oob_counts)

    # validation1 is already restricted to the classes of interest. `cond`
    # indexes the unsplit frame and must not be applied to this subset.
    prediction_for_table <- predict(rf, validation1)
    pm <- table(observed = validation1[, "type"],
                predicted = prediction_for_table)
    # A class absent from the hold-out gives 0/0 = NaN here; the reporting
    # cell deals with those entries.
    validation.cEE[i] <- 1 - pm["cEE", "cEE"] / sum(pm["cEE", ])
    validation.cNT[i] <- 1 - pm["cNT", "cNT"] / sum(pm["cNT", ])
    validation.drosophila[i] <-
        1 - pm["drosophila", "drosophila"] / sum(pm["drosophila", ])
    validation.all[i] <- 1 - sum(diag(pm)) / sum(pm)
}

In [20]:
# Mean training accuracy per class and overall: the training.* vectors hold
# error rates, hence the 1 - mean(...).
print("cEE acc")
1 - mean(training.cEE)

print("cNT acc")
1 - mean(training.cNT)

print("drosophila acc")
1 - mean(training.drosophila)

print("global acc")
1 - mean(training.all)

[1] "cEE acc"


[1] "cNT acc"


[1] "drosophila acc"


[1] "global acc"


In [21]:
# Occasionally a validation split contains no sample of some tissue, which
# leaves a NaN in that class's error sequence. No error is committed in that
# situation, so each NaN is replaced by 0.
validation.cEE <- replace(validation.cEE, is.nan(validation.cEE), 0)
validation.cNT <- replace(validation.cNT, is.nan(validation.cNT), 0)
validation.drosophila <- replace(validation.drosophila,
                                 is.nan(validation.drosophila), 0)
validation.all <- replace(validation.all, is.nan(validation.all), 0)

# Mean validation accuracy per class and overall (1 - mean error rate).
print("cEE acc")
1 - mean(validation.cEE)

print("cNT acc")
1 - mean(validation.cNT)

print("drosophila acc")
1 - mean(validation.drosophila)

print("global acc")
1 - mean(validation.all)

[1] "cEE acc"


[1] "cNT acc"


[1] "drosophila acc"


[1] "global acc"


## Mean + Var + lan

In [28]:
# Combine the network variables with every cells_frame column except the
# first (presumably the first column duplicates `type` -- TODO confirm).
# Negative indexing with drop = FALSE keeps a data frame with its column names
# even when only one column remains, and avoids seq(2, 1) counting down when
# cells_frame has a single column.
# NOTE(review): cbind() pairs rows by position; this assumes both frames list
# the samples in the same order.
mix_frame <- cbind(network_frame, cells_frame[, -1, drop = FALSE])
# Disabled alternative that keeps only three selected cells_frame columns:
#mix_frame <- cbind(network_frame, cells_frame[,c('PE.0.sup', 'lan.1.05.sub', 'lan.1.02.sup')])
mix.df <- mix_frame

In [29]:
# Collapse the three natural Drosophila tissue labels (dNP, dWL, dWP) into a
# single 'drosophila' class. The replacement is vectorised, so an empty frame
# or a missing label no longer aborts the cell as the element-wise `if` did.
type <- as.vector(mix_frame[, "type"])
type[type %in% c("dNP", "dWL", "dWP")] <- "drosophila"
mix.df$type <- as.factor(type)

In [30]:
# Keep only the three classes being compared. %in% returns FALSE (never NA)
# for a missing label, so subsetting cannot insert all-NA rows the way a
# chain of `==` comparisons would.
cond <- mix.df$type %in% c("cNT", "cEE", "drosophila")
# droplevels() removes factor levels that no longer occur after filtering.
data.class <- droplevels(mix.df[cond, ])

In [31]:
# Repeated random 3/4 - 1/4 hold-out evaluation of a random forest that uses
# mean_degree, var_degree and lan.0.10.sub as predictors. For each iteration
# we store the class error rates reported in rf$confusion (training.*) and the
# error rates measured on the held-out rows (validation.*).
n_iter <- 10000
n_obs <- nrow(data.class)
# The split size is the same for every iteration, so compute it once.
data_set_size <- floor(n_obs / 4 * 3)

# Preallocate the result vectors instead of growing them with c().
training.cEE <- numeric(n_iter)
training.cNT <- numeric(n_iter)
training.drosophila <- numeric(n_iter)
training.all <- numeric(n_iter)

validation.cEE <- numeric(n_iter)
validation.cNT <- numeric(n_iter)
validation.drosophila <- numeric(n_iter)
validation.all <- numeric(n_iter)

for (i in seq_len(n_iter)) {
    indexes <- sample.int(n_obs, size = data_set_size)

    training <- data.class[indexes, ]
    validation1 <- data.class[-indexes, ]

    rf <- randomForest(formula = type ~ mean_degree + var_degree + lan.0.10.sub,
                       data = training, ntree = 200)

    # rf$confusion holds the class counts plus a trailing class.error column.
    oob <- rf$confusion
    training.cEE[i] <- oob["cEE", "class.error"]
    training.cNT[i] <- oob["cNT", "class.error"]
    training.drosophila[i] <- oob["drosophila", "class.error"]
    # Drop the error column before summing; otherwise the error rates are
    # added to the observation count in the denominator.
    oob_counts <- oob[, colnames(oob) != "class.error", drop = FALSE]
    training.all[i] <- 1 - sum(diag(oob_counts)) / sum(oob_counts)

    # validation1 is already restricted to the classes of interest. `cond`
    # indexes the unsplit frame and must not be applied to this subset.
    prediction_for_table <- predict(rf, validation1)
    pm <- table(observed = validation1[, "type"],
                predicted = prediction_for_table)
    # A class absent from the hold-out gives 0/0 = NaN here; the reporting
    # cell deals with those entries.
    validation.cEE[i] <- 1 - pm["cEE", "cEE"] / sum(pm["cEE", ])
    validation.cNT[i] <- 1 - pm["cNT", "cNT"] / sum(pm["cNT", ])
    validation.drosophila[i] <-
        1 - pm["drosophila", "drosophila"] / sum(pm["drosophila", ])
    validation.all[i] <- 1 - sum(diag(pm)) / sum(pm)
}

In [32]:
# Mean training accuracy per class and overall: the training.* vectors hold
# error rates, hence the 1 - mean(...).
print("cEE acc")
1 - mean(training.cEE)

print("cNT acc")
1 - mean(training.cNT)

print("drosophila acc")
1 - mean(training.drosophila)

print("global acc")
1 - mean(training.all)

[1] "cEE acc"


[1] "cNT acc"


[1] "drosophila acc"


[1] "global acc"


In [33]:
# Occasionally a validation split contains no sample of some tissue, which
# leaves a NaN in that class's error sequence. No error is committed in that
# situation, so each NaN is replaced by 0.
validation.cEE <- replace(validation.cEE, is.nan(validation.cEE), 0)
validation.cNT <- replace(validation.cNT, is.nan(validation.cNT), 0)
validation.drosophila <- replace(validation.drosophila,
                                 is.nan(validation.drosophila), 0)
validation.all <- replace(validation.all, is.nan(validation.all), 0)

# Mean validation accuracy per class and overall (1 - mean error rate).
print("cEE acc")
1 - mean(validation.cEE)

print("cNT acc")
1 - mean(validation.cNT)

print("drosophila acc")
1 - mean(validation.drosophila)

print("global acc")
1 - mean(validation.all)

[1] "cEE acc"


[1] "cNT acc"


[1] "drosophila acc"


[1] "global acc"
