Finding IEG expression in human datasets.
<br><br>
We will use the following definition from a bioRxiv preprint: [Hochgerner et al. (2022)](https://www.biorxiv.org/content/10.1101/2022.10.25.513733v1.full)...
<br><br>
__Cell activity score by high IEG expression (IEG score)__
<br><br>
*We defined a set of eight immediate early genes (IEGs): Arc, Bdnf, Btg2, Fos, Fosl2, Homer1, Npas4, Nr4a1, calculated the 90th percentile expression for each. A cell was called activated, or IEGhigh, if it expressed any of the genes above this 90th percentile. For every cell type, we calculated the fraction of activated cells per timepoint (8×5 matrix). We then summed the fraction of all eight genes per timepoint (1×5 vector) and defined the activity score as the maximum difference between CFC-samples (2h, 8h, 24h, recall) to HC control.*

In [17]:
library(randomForest)
library(rfUtilities)
library(Seurat) # not really needed for this analysis and may interfere with Summarized experiment which we need 
library(stringr)
library(sampler)
library(caTools)
library(pROC)
library(ggplot2)
library(stats)
library(Dict)
library(pheatmap)
library(caret)
library(data.table)
library(dplyr)
library(groupdata2)
#library(SCnorm) # install this one from github
#may require restarting session to run
#library(devtools)
#devtools::install_github("rhondabacher/SCnorm")
library(janitor)
#library(SummarizedExperiment) # may interfere with seurat
library(tibble)

In [2]:
#df <- as.data.frame(as.matrix(GetAssayData(seu, assay="integrated", slot="...")))

ERROR: Error in GetAssayData(seu, assay = "integrated", slot = "..."): object 'seu' not found


In [30]:
# Library-size normalise the counts of one cell and log-transform them,
# following Seurat's "LogNormalize" recipe:
#   log1p(count / total_counts_of_cell * scale_factor)
#
# x            numeric vector of raw counts for a single cell (one column of a
#              genes x cells table).
# scale_factor library size every cell is rescaled to before the log
#              (default 10000).
#
# Returns a numeric vector of the same length as x. A cell whose counts sum to
# zero is returned as all zeros instead of NaN (0 / 0).
seurat_log1p_transform <- function(x, scale_factor = 10000){
    counts <- sum(x)
    if (isTRUE(counts == 0)) {
        # nothing detected in this cell; avoid dividing by zero
        return(x * 0)
    }
    # the scale factor belongs inside the log; multiplying the logged value
    # afterwards does not give the LogNormalize scale
    return(log1p(x / counts * scale_factor))
}

# Runs seurat_log1p_transform over every column of df and hands the result
# back as a data.frame that carries the row and column names of the input.
#
# df  table with one column per cell and one row per gene.
seurat.normalize <- function(df){
    normalised <- apply(df, MARGIN = 2, seurat_log1p_transform)
    normalised <- data.frame(normalised)
    # data.frame() may rewrite the names, so restore the originals
    names(normalised) <- colnames(df)
    row.names(normalised) <- rownames(df)
    return(normalised)
}

In [3]:
#functions
# log(x + 1) of a numeric vector. A vector whose entries sum to exactly zero
# is handed back untouched.
logplusone <- function(x){
  if (sum(x) != 0) {
    return( log(x + 1) )
  }
  x
}

In [4]:
# Replaces NA's with 0, applies logplusone to every row of df.in, standardises
# the result with scale() and returns it transposed: one row per column of
# df.in, one column per row of df.in (columns are named after rownames(df.in)).
#
# NOTE(review): apply(..., MARGIN = 1) already returns one column per input
# row, so the matrix handed to scale() has the input's columns as its columns
# and scale() centres/scales each of those. An earlier comment here described
# the scaling as per gene -- confirm which one is intended.
log.norm <- function(df.in){
  missing.mask <- is.na(df.in)
  if (any(missing.mask)) {
    df.in[missing.mask] <- 0
  }

  logged <- apply(df.in, MARGIN = 1, FUN = logplusone)
  standardised <- scale( t(logged) )

  df.out <- data.frame( t(standardised) )
  colnames(df.out) <- rownames(df.in)
  return(df.out)
}

In [5]:
#modified original is in ChenClassifier.R in the EngramCellClassifier folder
# Log-normalises a count table separately within each cell type.
#
# countsdata       table with one row per gene and one column per cell.
# celltype.labels  one label per column of countsdata, in the same order.
#
# Every cell type is passed through log.norm on its own and the pieces are
# joined back together by gene. Cells (columns) that contain any NA after
# normalisation are dropped. The remaining columns keep the order they have in
# countsdata and the genes become the row names.
#
# Returns a data.frame with genes as rows and the retained cells as columns.
celltype.lognorm <-function(countsdata, celltype.labels){
  # a shorter label vector would be recycled silently and mix up cell types
  if (length(celltype.labels) != ncol(countsdata)) {
    stop("celltype.labels must have one entry per column of countsdata.")
  }

  print("Normalizing cell type...")

  celltypes <- unique(celltype.labels)
  df.out <- data.frame(gene = rownames(countsdata))

  cell_names <- c('gene',colnames(countsdata)) # keep this for reorganizing later
  for(type in celltypes){
    print(type[1])
    normalized.within.type <- log.norm(countsdata[,celltype.labels==type])
    normalized.within.type <- t(normalized.within.type) # lognorm flips its data
    # check.names = FALSE keeps the cell names exactly as they are in
    # countsdata; otherwise names such as "AAAC-1" are rewritten and the
    # lookup by cell_names below fails
    normalized.within.type <- data.frame(normalized.within.type, check.names = FALSE)
    normalized.within.type <- rownames_to_column(normalized.within.type, var ="gene")
    df.out <- left_join( df.out, normalized.within.type, by = 'gene' )
  }

  # restore the original column order, drop cells with NA's, genes to row names
  df.out <- df.out[cell_names] %>%
    select_if(~ !any(is.na(.))) %>%
    column_to_rownames(var = 'gene')

  return( data.frame(df.out, check.names = FALSE) )
}

In [6]:
# Trains a random forest on repeated class-balanced resamples and combines the
# per-batch forests into one model.
#
# df.in                    data.frame of predictors plus a label column named
#                          Engramcell.
# under_represented_class  label of the smaller class.
# over_represented_class   label of the larger class.
# proportion               fraction of the under-represented class drawn in
#                          every batch; the same number of cells is drawn from
#                          the over-represented class.
# batches                  number of resamples / forests to train.
# trees                    total number of trees; each batch gets
#                          as.integer(trees / batches) of them.
#
# Returns the combined randomForest object.
# NOTE(review): presumably Engramcell is a factor so that randomForest runs in
# classification mode -- confirm against the caller.
resample.randomForest <-function( df.in,
                                  under_represented_class,
                                  over_represented_class,
                                  proportion,
                                  batches, 
                                  trees){
  if (batches < 1) {
    stop("batches must be at least 1.")
  }
  trees.per.batch <- as.integer(trees/batches)
  n.cells <- trunc( sum(df.in$Engramcell==under_represented_class)*proportion)

  under.idx <- which(df.in$Engramcell==under_represented_class)
  over.idx <- which(df.in$Engramcell==over_represented_class)
  # pick predictors by name so the label column does not have to be last
  predictors <- setdiff(colnames(df.in), "Engramcell")

  rf.model <- NULL
  for( batch in seq_len(batches) ){
    # index through sample.int(): sample(x, n) would draw from 1:x when x
    # happens to be a single row number
    resample.rows <- c(under.idx[sample.int(length(under.idx), size = n.cells)],
                       over.idx[sample.int(length(over.idx), size = n.cells)])
    resample.set <- df.in[resample.rows,]

    rf.fit <- randomForest(x = resample.set[, predictors, drop = FALSE],
                           y = resample.set$Engramcell,
                           ntree = trees.per.batch)

    # the first batch starts the model, later batches are merged into it
    if (is.null(rf.model)) {
      rf.model <- rf.fit
    } else {
      rf.model <- randomForest::combine(rf.fit, rf.model)
    }
  }#end of for loop over batches

  return(rf.model)
}


# Builds the per-cell prediction table used to summarise a classifier.
#
# classifier.object       fitted model; predict(..., type = "prob") is called
#                         on it.
# test_df                 predictor columns of the held-out cells.
# meta.data.label.column  observed labels of those cells, in row order.
# label                   the two class labels; label[1] feeds engramobserved,
#                         label[2] feeds inactiveobserved.
#
# Returns a data.frame whose first two columns are the class probabilities
# (renamed label_pos / label_neg by position), followed by predict (name of the
# more probable of the two), observed, engramobserved and inactiveobserved
# (1/0 indicators).
make.predictions.df <- function(classifier.object, 
                                test_df,
                                meta.data.label.column,
                                label = c("Active","Inactive")
                                ){
  all.columns <- 1:length(test_df)
  class.probs <- predict(classifier.object, test_df[, all.columns], type = "prob")
  predictions <- as.data.frame(class.probs)

  # only the first two probability columns are considered (two classes)
  class.names <- names(predictions)[1:2]
  most.probable <- apply(predictions[, 1:2], 1, which.max)
  predictions$predict <- class.names[most.probable]

  predictions$observed <- meta.data.label.column
  colnames(predictions)[1:2] <- c("label_pos","label_neg")

  is.first.label <- predictions$observed == label[1]
  is.second.label <- predictions$observed == label[2]
  predictions$engramobserved <- ifelse(is.first.label, 1, 0)
  predictions$inactiveobserved <- ifelse(is.second.label, 1, 0)
  return(predictions)
}


# assess a single run of resampled.randomforest
# Computes the performance metrics of one set of predictions.
#
# predictions.df  table as produced by make.predictions.df; the columns
#                 predict, observed, engramobserved and label_pos are read.
# label           the two class labels; label[1] is counted as the positive
#                 class, label[2] as the negative class.
#
# Returns a numeric vector in this order:
#   F1, AUC, precision, recall, FPR, FNR, TP, FN, TN, FP
# Ratios whose denominator is zero come out as NaN.
assessment <- function(predictions.df, 
                       label = c("Active","Inactive") 
                       ){
  predicted <- predictions.df$predict
  observed <- predictions.df$observed

  # confusion-matrix counts
  TP <- sum((predicted == label[1])&(observed == label[1]))
  TN <- sum((predicted == label[2])&(observed == label[2]))
  FN <- sum((predicted == label[2])&(observed == label[1]))
  FP <- sum((predicted == label[1])&(observed == label[2]))

  #precision and recall as well as summary stats F1
  precision <- TP/(TP+FP)
  recall <- TP/(TP+FN)
  F1.score <- 2 * (precision * recall) / (precision + recall)
  FPR <- FP/(TN+FP)
  FNR <- FN/(TP+FN)

  #getting auc
  # levels and direction are fixed (0 = control, 1 = case, cases score
  # higher). Left on "auto", roc() picks whichever direction gives the larger
  # AUC, which inflates the estimate when this is run fold after fold.
  roc.engramcell <- roc(response = predictions.df$engramobserved,
                        predictor = as.numeric(predictions.df$label_pos),
                        levels = c(0, 1),
                        direction = "<")
  AUC <- as.numeric(auc(roc.engramcell))

  return( c(F1.score, AUC, precision, recall, FPR, FNR,
            TP, FN, TN, FP) )
}



# k-fold cross-validation of the resampled random forest.
#
# data                     data.frame of predictors plus a label column named
#                          Engramcell.
# under.represented.class  label of the smaller class.
# over.represented.class   label of the larger class.
# folds                    number of cross-validation folds.
# trees.total              total number of trees over all folds; must divide
#                          evenly by batches.per.fold * folds.
# proportion.each.batch    passed to resample.randomForest as proportion.
# batches.per.fold         passed to resample.randomForest as batches.
#
# Returns the forests of all folds combined into one randomForest object with
# two extra elements:
#   $Assessment  data.frame of the metrics from assessment(): one column per
#                fold plus Mean and SigDiff (standard deviation over folds)
#   $votes       un-normalised votes of the combined forest for every row of
#                data
#
# NOTE(review): make.predictions.df and assessment are called with their
# default labels ("Active", "Inactive"), not with the class arguments given
# here -- confirm that this matches the labels in data$Engramcell.
#NOTE: ROC curve needs to be implemented
resampled.randomForest.crossvalidated <-function(data,
                                                 under.represented.class,
                                                 over.represented.class,
                                                 folds,
                                                 trees.total,
                                                 proportion.each.batch=0.8,
                                                 batches.per.fold=20){
  # the tree budget does not depend on the fold, so check it once up front
  if ( ( trees.total%%(batches.per.fold*folds) )>0  ){ 
    stop("Number of trees does not devide evenly by batches and folds.")
  }
  trees.in.the.fold <- as.integer(trees.total/folds)

  metric.names <- c("F1 Score", "AUC", "Precision", "Recall",
                    "FPR", "FNR", "True Positives", "False Negatives", 
                    "True Negatives", "False Positives")

  folds.obj <- createFolds(data$Engramcell, k = folds)
  all.rows <- seq_len(nrow(data))
  feature.cols <- setdiff(colnames(data), "Engramcell")

  rf.out <- NULL
  fold.assessments <- vector("list", folds)
  for( i in seq_len(folds) ){
    # createFolds returns row positions, so the training rows are the
    # remaining positions; comparing against rownames(data) only works when
    # the row names happen to be "1".."n" and otherwise leaks the test rows
    # into the training set
    test.idx <- folds.obj[[i]]
    train.idx <- setdiff(all.rows, test.idx)

    #split data for this fold
    training_set <- data[train.idx,]
    testing_set <- data[test.idx,]

    rf.this_fold <- resample.randomForest(df.in = training_set,
                                          under_represented_class = under.represented.class,
                                          over_represented_class = over.represented.class,
                                          proportion= proportion.each.batch,
                                          batches = batches.per.fold, 
                                          trees = trees.in.the.fold)

    # score this fold's forest on its held-out rows
    pred <- make.predictions.df(rf.this_fold, testing_set[feature.cols], testing_set$Engramcell)
    fold.assessments[[i]] <- assessment( pred )

    if (is.null(rf.out)) {
      rf.out <- rf.this_fold
    } else {
      rf.out <- randomForest::combine(rf.out, rf.this_fold)
    }
  }# end of for loop

  fold.performance <- data.frame(do.call(cbind, fold.assessments))
  rownames(fold.performance) <- metric.names
  colnames(fold.performance) <- names(folds.obj)

  # summarise over the fold columns only, so the SD is not computed over a
  # table that already contains the Mean column
  fold.values <- as.matrix(fold.performance)
  fold.performance$Mean <- apply(fold.values, MARGIN=1,  FUN = mean)
  fold.performance$SigDiff <- apply(fold.values, MARGIN=1,  FUN = sd)
  rf.out$Assessment <- fold.performance

  #votes needs to be updated to make roc curve
  rf.out$votes <- predict(object = rf.out, newdata = data, type = 'vote', norm.votes = FALSE)
  return(rf.out)
}

# Trains a regularized random forest (RRF) on repeated class-balanced
# resamples and combines the per-batch forests into one model.
#
# df.in                    data.frame of predictors plus a label column named
#                          Engramcell.
# under_represented_class  label of the smaller class.
# over_represented_class   label of the larger class.
# proportion               fraction of the under-represented class drawn in
#                          every batch; the same number of cells is drawn from
#                          the over-represented class.
# batches                  number of resamples / forests to train.
# trees                    total number of trees; each batch gets
#                          as.integer(trees / batches) of them.
#
# Returns the combined RRF object.
resample.regularizedRF <- function( df.in,
                                    under_represented_class,
                                    over_represented_class,
                                    proportion,
                                    batches, 
                                    trees){
  if (batches < 1) {
    stop("batches must be at least 1.")
  }
  trees.per.batch <- as.integer(trees/batches)
  n.cells <- trunc( sum(df.in$Engramcell==under_represented_class)*proportion)

  under.idx <- which(df.in$Engramcell==under_represented_class)
  over.idx <- which(df.in$Engramcell==over_represented_class)
  # pick predictors by name so the label column does not have to be last
  predictors <- setdiff(colnames(df.in), "Engramcell")

  rf.model <- NULL
  for( batch in seq_len(batches) ){
    # index through sample.int(): sample(x, n) would draw from 1:x when x
    # happens to be a single row number
    resample.rows <- c(under.idx[sample.int(length(under.idx), size = n.cells)],
                       over.idx[sample.int(length(over.idx), size = n.cells)])
    resample.set <- df.in[resample.rows,]

    # RRF is never attached with library(), so call it through its namespace
    # (combine below is already called that way)
    rf.fit <- RRF::RRF(x = resample.set[, predictors, drop = FALSE],
                       y = resample.set$Engramcell,
                       ntree = trees.per.batch)

    # the first batch starts the model, later batches are merged into it
    if (is.null(rf.model)) {
      rf.model <- rf.fit
    } else {
      rf.model <- RRF::combine(rf.fit, rf.model)
    }
  }#end of for loop over batches

  return(rf.model)
}

#
# k-fold cross-validation of the resampled regularized random forest.
#
# data                     data.frame of predictors plus a label column named
#                          Engramcell.
# under.represented.class  label of the smaller class.
# over.represented.class   label of the larger class.
# folds                    number of cross-validation folds.
# trees.total              total number of trees over all folds; must divide
#                          evenly by batches.per.fold * folds.
# proportion.each.batch    passed to resample.regularizedRF as proportion.
# batches.per.fold         passed to resample.regularizedRF as batches.
#
# Returns the forests of all folds combined into one RRF object with two extra
# elements:
#   $Assessment  data.frame of the metrics from assessment(): one column per
#                fold plus Mean and SigDiff (standard deviation over folds)
#   $votes       un-normalised votes of the combined forest for every row of
#                data
#
# NOTE(review): make.predictions.df and assessment are called with their
# default labels ("Active", "Inactive"), not with the class arguments given
# here -- confirm that this matches the labels in data$Engramcell.
#NOTE: ROC curve needs to be implemented
resampled.regularizedRF.crossvalidated <-function(data,
                                                  under.represented.class,
                                                  over.represented.class,
                                                  folds,
                                                  trees.total,
                                                  proportion.each.batch=0.8,
                                                  batches.per.fold=20){
  # the tree budget does not depend on the fold, so check it once up front
  if ( ( trees.total%%(batches.per.fold*folds) )>0  ){ 
    stop("Number of trees does not devide evenly by batches and folds.")
  }
  trees.in.the.fold <- as.integer(trees.total/folds)

  metric.names <- c("F1 Score", "AUC", "Precision", "Recall",
                    "FPR", "FNR", "True Positives", "False Negatives", 
                    "True Negatives", "False Positives")

  folds.obj <- createFolds(data$Engramcell, k = folds)
  all.rows <- seq_len(nrow(data))
  feature.cols <- setdiff(colnames(data), "Engramcell")

  rf.out <- NULL
  fold.assessments <- vector("list", folds)
  for( i in seq_len(folds) ){
    # createFolds returns row positions, so the training rows are the
    # remaining positions; comparing against rownames(data) only works when
    # the row names happen to be "1".."n" and otherwise leaks the test rows
    # into the training set
    test.idx <- folds.obj[[i]]
    train.idx <- setdiff(all.rows, test.idx)

    #split data for this fold
    training_set <- data[train.idx,]
    testing_set <- data[test.idx,]

    rf.this_fold <- resample.regularizedRF(df.in = training_set,
                                           under_represented_class = under.represented.class,
                                           over_represented_class = over.represented.class,
                                           proportion= proportion.each.batch,
                                           batches = batches.per.fold, 
                                           trees = trees.in.the.fold)

    # score this fold's forest on its held-out rows
    pred <- make.predictions.df(rf.this_fold, testing_set[feature.cols], testing_set$Engramcell)
    fold.assessments[[i]] <- assessment( pred )

    if (is.null(rf.out)) {
      rf.out <- rf.this_fold
    } else {
      rf.out <- RRF::combine(rf.out, rf.this_fold)
    }
  }# end of for loop

  fold.performance <- data.frame(do.call(cbind, fold.assessments))
  rownames(fold.performance) <- metric.names
  colnames(fold.performance) <- names(folds.obj)

  # summarise over the fold columns only, so the SD is not computed over a
  # table that already contains the Mean column
  fold.values <- as.matrix(fold.performance)
  fold.performance$Mean <- apply(fold.values, MARGIN=1,  FUN = mean)
  fold.performance$SigDiff <- apply(fold.values, MARGIN=1,  FUN = sd)
  rf.out$Assessment <- fold.performance

  #votes needs to be updated to make roc curve
  rf.out$votes <- predict(object = rf.out, newdata = data, type = 'vote', norm.votes = FALSE)
  return(rf.out)
}

In [7]:
# we're just gonna use the genes from Neuroestimator first 
# they did not provide a list so I am just reading off of figure 1 and not all the genes were legible

# non target genes that were highly influential Homer1, Bdnf, Ntrk2, Jun, Cyr61,
# I added Rgs4 because I know it's important "Rgs4",

# Misread symbols corrected so they can match the ortholog table:
# "Erg2" -> "Egr2", "Erg3" -> "Egr3", "Fbxso33" -> "Fbxo33".
# NOTE(review): Cyr61 is listed as Ccn1 in recent annotations -- check which
# symbol the ortholog table uses.
bahl2022_IEGs <- list("Per1", "Dusp1","Fosb", "Btg2", "Egr2", "Npas4", "Grasp", "Tiparp", "Nr4a3", "Rgs2", "Crem",
                      "Arc", "Fos", "Fbxo33", "Nr4a2", "Junb","Egr3", "Fosl2", "Egr1","Nr4a1")
# "Bdnf" was listed twice; once is enough
other_important_IEGs <- list("Homer1","Bdnf","Ntrk2","Jun","Cyr61","Rgs4")

target_IEGs <- c(bahl2022_IEGs, other_important_IEGs)

In [8]:
# human mouse gene orthologs
# Tab-separated table read with a header row; its Symbol_hg and Symbol_mm
# columns are used below to translate mouse IEG symbols into human symbols.
hg_to_mm <- read.table("/home/acampbell/PavLabEngrams/EngramCellClassifier/hg_mm_1to1_ortho_genes_DIOPT-v8.tsv", sep = '\t', header = TRUE)

#

In [9]:
# Human symbols of the ortholog pairs whose mouse symbol is one of the target IEGs.
iegs_hg <- hg_to_mm$Symbol_hg[hg_to_mm$Symbol_mm %in% target_IEGs]

No studies use completely fresh human brain tissue; all of it was fresh-frozen and then shipped.  We also have several post-mortem brain datasets.  There is one fresh human brain dataset.

#dont need it for now, markdown
abi_multipleareas_counts <- read.csv('/home/acampbell/test_datasets/ABI_Human_FreshBrain/multiple_areas_smartseq', 
                                    header =TRUE)

#dont need it for now, markdown
abi_multipleareas_meta <- read.csv('/home/acampbell/test_datasets/ABI_Human_FreshBrain/multiple_areas_metadata',
                                  header =  TRUE)

In [7]:
#keep.the.neurons.idx = abi_multipleareas_meta$class_label=="GABAergic"|abi_multipleareas_meta$class_label=="Glutamatergic"

In [None]:
# Keeps the selected rows of the ABI counts table, transposes them and promotes
# the first row of the transposed table to column names (row_to_names).
# NOTE(review): keep.the.neurons.idx is only assigned in a commented-out line
# above, so this cell fails unless that line is re-enabled.
abi_multipleareas_neurons <- data.frame(t(abi_multipleareas_counts[keep.the.neurons.idx,])) %>% 
  row_to_names(row_number = 1) 

#%>%
#  mutate_if(is.character, as.numeric)

In [None]:
write.csv(abi_multipleareas_neurons, '/home/acampbell/test_datasets/ABI_Human_FreshBrain/abi_multipleareas_neurons.csv')

In [10]:
# Comma-separated count table of the Ayhan 2021 dataset, read with a header
# row; a later cell moves its 'gene' column into the row names.
ayhan_counts <- read.table('/home/acampbell/test_datasets/Ayhan2021_GSE160189/GSE160189_Hippo_Counts.csv.gz',
                     sep = ",", header = TRUE)

In [18]:
ayhan_counts <- data.frame(ayhan_counts) %>% column_to_rownames(var = 'gene')

In [20]:
print(dim(ayhan_counts))
#head(ayhan_counts,5)
ayhan_counts[1:5,1:10]

[1]  17180 131325


Unnamed: 0_level_0,P57_AAAGTAGGTCCAGTAT,P57_AACCATGGTAAACACA,P57_AACGTTGCACTTAACG,P57_AACTCTTCATACTCTT,P57_AACTTTCAGAGACTTA,P57_AAGACCTGTCCAGTTA,P57_AAGACCTGTCTGATCA,P57_AAGCCGCAGATCCGAG,P57_AATCCAGCAGCTATTG,P57_ACACCAAGTAAGTAGT
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
A1BG,0,0,0,0,0,0,0,0,0,0
A1CF,0,0,0,0,0,0,0,0,0,0
A2M,0,0,0,0,0,0,0,0,0,0
A2ML1,0,0,0,0,0,0,0,0,0,0
A3GALT2,0,0,0,0,0,0,0,0,0,0


In [21]:
ayhan_meta <- read.table('/home/acampbell/test_datasets/Ayhan2021_GSE160189/meta.tsv',
                     sep = '\t' , header = TRUE)

In [23]:
ayhan_meta <- data.frame(ayhan_meta)

In [25]:
# Apply the per-cell normalisation to every column of df.
# NOTE(review): the original line had no assignment operator and referred to
# seurat_norm, which is not defined in this notebook; seurat_log1p_transform
# is the per-column function defined above. df itself is only created in a
# commented-out cell (GetAssayData), so it has to be defined before this runs.
df <- apply(df,  MARGIN = 2, FUN = seurat_log1p_transform)

In [25]:
head(ayhan_meta)

Unnamed: 0_level_0,Cell,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,group,age,sex,epilepsy_duration,batch,version,donor,seurat_clusters,epilepsy_frequency,Cluster
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<dbl>,<chr>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<int>,<dbl>,<chr>
1,P57_AAACCTGAGAGTAAGG,P57_woXYMT,1300,908,0.5120702,posterior,60,F,37,Batch3,v2,Case57,11,3,Olig4
2,P57_AAACCTGAGTTACCCA,P57_woXYMT,1390,958,0.0,posterior,60,F,37,Batch3,v2,Case57,7,3,Olig2
3,P57_AAACCTGAGTTTAGGA,P57_woXYMT,2604,1598,0.4533434,posterior,60,F,37,Batch3,v2,Case57,14,3,Pyr1
4,P57_AAACCTGCAGCAGTTT,P57_woXYMT,2077,1253,0.140779,posterior,60,F,37,Batch3,v2,Case57,10,3,Den.Gyr1
5,P57_AAACCTGCAGTAGAGC,P57_woXYMT,651,500,0.3025719,posterior,60,F,37,Batch3,v2,Case57,7,3,Olig2
6,P57_AAACCTGGTAAATGAC,P57_woXYMT,584,468,0.1658375,posterior,60,F,37,Batch3,v2,Case57,4,3,Olig2


In [27]:
length(match(colnames(ayhan_counts),ayhan_meta$Cell))

In [112]:
# Put the count columns (cells) in the same order as the rows of ayhan_meta.
# Cells are the columns of ayhan_counts (its row names are genes), so the
# lookup has to go through colnames() and select columns, not rows.
cell.order <- match(ayhan_meta$Cell, colnames(ayhan_counts))
# cells listed in the metadata but absent from the counts are skipped
ayhan_counts_sorted <- ayhan_counts[, cell.order[!is.na(cell.order)], drop = FALSE]

In [106]:
# Reorder the metadata so that row i describes column i of ayhan_counts.
# Transposing the data.frame and indexing it by cell name does not do this:
# t() yields a character matrix and single-bracket name indexing on it
# returns NA's. match() on the Cell column reorders the rows directly; cells
# without metadata become all-NA rows, which keeps the alignment intact.
ayhan_meta <- data.frame(ayhan_meta)
ayhan_meta <- ayhan_meta[match(colnames(ayhan_counts), ayhan_meta$Cell), ]
rownames(ayhan_meta) <- NULL

In [31]:
test <- seurat.normalize(ayhan_counts[,1:3])

In [32]:
test

Unnamed: 0_level_0,P57_AAAGTAGGTCCAGTAT,P57_AACCATGGTAAACACA,P57_AACGTTGCACTTAACG
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
A1BG,0.00000,0.00000,0.00000
A1CF,0.00000,0.00000,0.00000
A2M,0.00000,0.00000,0.00000
A2ML1,0.00000,0.00000,0.00000
A3GALT2,0.00000,0.00000,0.00000
A4GALT,0.00000,0.00000,0.00000
AAAS,0.00000,0.00000,0.00000
AACS,0.00000,0.00000,0.00000
AADAT,0.00000,0.00000,0.00000
AAED1,0.00000,0.00000,0.00000


In [21]:
head(ayhan_meta,5)

Unnamed: 0_level_0,Cell,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,group,age,sex,epilepsy_duration,batch,version,donor,seurat_clusters,epilepsy_frequency,Cluster
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<dbl>,<chr>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<int>,<dbl>,<chr>
1,P57_AAACCTGAGAGTAAGG,P57_woXYMT,1300,908,0.5120702,posterior,60,F,37,Batch3,v2,Case57,11,3,Olig4
2,P57_AAACCTGAGTTACCCA,P57_woXYMT,1390,958,0.0,posterior,60,F,37,Batch3,v2,Case57,7,3,Olig2
3,P57_AAACCTGAGTTTAGGA,P57_woXYMT,2604,1598,0.4533434,posterior,60,F,37,Batch3,v2,Case57,14,3,Pyr1
4,P57_AAACCTGCAGCAGTTT,P57_woXYMT,2077,1253,0.140779,posterior,60,F,37,Batch3,v2,Case57,10,3,Den.Gyr1
5,P57_AAACCTGCAGTAGAGC,P57_woXYMT,651,500,0.3025719,posterior,60,F,37,Batch3,v2,Case57,7,3,Olig2


In [16]:
ayhan_counts <- data.frame(ayhan_counts) #%>% column_to_rowname(var = 'gene')
ayhan_counts <- column_to_rownames(ayhan_counts, var = 'gene')

ERROR: Error in column_to_rownames(ayhan_counts, var = "gene"): could not find function "column_to_rownames"


In [86]:
ayhan_counts <- ayhan_counts %>% column_to_rownames(var = 'gene')

In [87]:
# Row positions in ayhan_meta of the cells assigned to the listed clusters.
# NOTE(review): these positions are later used to select columns of
# ayhan_counts, which assumes ayhan_meta rows and ayhan_counts columns are in
# the same order -- confirm.
ayhan_neurons_idx <- which(ayhan_meta$Cluster %in% c('Den.Gyr1', 'Den.Gyr2', 'In1', 'In2', 'In3') )

In [100]:
#ayhan_meta$Cluster %in% c('Den.Gyr1', 'Den.Gyr2', 'In1', 'In2', 'In3')
#ayhan_neurons <- ayhan_counts[,c(1,which(ayhan_meta$Cluster %in% c('Den.Gyr1', 'Den.Gyr2', 'In1', 'In2', 'In3') )+1)]
ayhan_neurons <- ayhan_counts[,ayhan_neurons_idx]

In [101]:
#modified original is in ChenClassifier.R in the EngramCellClassifier folder
# Log-normalises a count table separately within each cell type.
#
# countsdata       table with one row per gene and one column per cell.
# celltype.labels  one label per column of countsdata, in the same order.
#
# Every cell type is passed through log.norm on its own and the pieces are
# joined back together by gene. Cells (columns) that contain any NA after
# normalisation are dropped. The remaining columns keep the order they have in
# countsdata and the genes become the row names.
#
# Returns a data.frame with genes as rows and the retained cells as columns.
celltype.lognorm <-function(countsdata, celltype.labels){
  # a shorter label vector would be recycled silently and mix up cell types
  if (length(celltype.labels) != ncol(countsdata)) {
    stop("celltype.labels must have one entry per column of countsdata.")
  }

  print("Normalizing cell type...")

  celltypes <- unique(celltype.labels)
  df.out <- data.frame(gene = rownames(countsdata))

  cell_names <- c('gene',colnames(countsdata)) # keep this for reorganizing later
  for(type in celltypes){
    print(type[1])
    normalized.within.type <- log.norm(countsdata[,celltype.labels==type])
    normalized.within.type <- t(normalized.within.type) # lognorm flips its data
    # check.names = FALSE keeps the cell names exactly as they are in
    # countsdata; otherwise names such as "AAAC-1" are rewritten and the
    # lookup by cell_names below fails
    normalized.within.type <- data.frame(normalized.within.type, check.names = FALSE)
    normalized.within.type <- rownames_to_column(normalized.within.type, var ="gene")
    df.out <- left_join( df.out, normalized.within.type, by = 'gene' )
  }

  # restore the original column order, drop cells with NA's, genes to row names
  df.out <- df.out[cell_names] %>%
    select_if(~ !any(is.na(.))) %>%
    column_to_rownames(var = 'gene')

  return( data.frame(df.out, check.names = FALSE) )
}

In [114]:
#normalize within cell types
# Cluster label of every cell in the listed neuronal clusters, in metadata
# row order, and the row names of ayhan_neurons that are human IEG symbols.
# NOTE(review): neurons_type is later paired with the columns of ayhan_neurons,
# which assumes both were selected from identically ordered tables -- confirm.
neurons_type <- ayhan_meta$Cluster[ayhan_meta$Cluster %in% c('Den.Gyr1', 'Den.Gyr2', 'In1', 'In2', 'In3')]
ayhan.ieg.idx <- rownames(ayhan_neurons)[rownames(ayhan_neurons) %in% iegs_hg]

In [102]:
ayhan_neurons_wthintypelognorm <- ayhan_neurons[ayhan.ieg.idx, ]

In [103]:
ayhan_neurons_wthintypelognorm <- celltype.lognorm(ayhan_neurons_wthintypelognorm,
                                                   neurons_type)
dim(ayhan_neurons_wthintypelognorm)

[1] "Normalizing cell type..."
[1] "Den.Gyr1"
[1] "In1"
[1] "Den.Gyr2"
[1] "In2"
[1] "In3"


In [97]:
ayhan_counts_wthintypelognorm <-  celltype.lognorm(ayhan_counts[ayhan.ieg.idx,],
                 ayhan_meta$Cluster)

In [113]:
#colnames(ayhan_neurons_wthintypelognorm)==ayhan_meta$Cell[ayhan_meta$Cell %in% colnames(ayhan_neurons_wthintypelognorm)]
#ayhan_meta$Cell[ayhan_meta$Cell %in% colnames(ayhan_neurons_wthintypelognorm)]
sum(ayhan_meta$Cell==colnames(ayhan_counts_sorted) )

"longer object length is not a multiple of shorter object length"


In [96]:
table(ayhan_meta$Cluster[ayhan_meta$Cell %in% colnames(ayhan_neurons_wthintypelognorm)])


  Astro1   Astro2   Astro3 Den.Gyr1 Den.Gyr2 Den.Gyr3     Endo      In1 
    1403      266       52      764       95       46       61      249 
     In2      In3   Micro1   Micro2   Micro3     OPC1     OPC2     OPC3 
     199       54     1267      635       63     1499       28       16 
    OPC4    Olig1    Olig2    Olig3    Olig4    Olig5     Pyr1     Pyr2 
      19     3950     1377      404      450      189      411      175 

In [81]:
colnames(ayhan_meta)

In [79]:
ayhan_meta$Cell[1:10]

In [83]:
# we need to modify the cell type labels as some neurons have been dropped
# Look up the cluster of every retained cell with one vectorised match()
# instead of growing a vector inside a loop (one full scan of the metadata per
# cell). Cells without a metadata row are skipped, as before.
# NOTE(review): match() uses the first metadata row of a cell ID; the loop
# appended every matching row, so results differ only if Cell IDs repeat.
meta.rows <- match(colnames(ayhan_neurons_wthintypelognorm), ayhan_meta$Cell)
test <- ayhan_meta$Cluster[meta.rows[!is.na(meta.rows)]]

print(test[1:10])
print(table(test) )
#neurons_type <- neurons_type[which(colnames(ayhan_neurons_wthintypelognorm) %in% colnames(ayhan_neurons)) ]

 [1] "Olig1" "Olig1" "Olig1" "Olig2" "Olig1" "Olig2" "Olig1" "Olig1" "Olig1"
[10] "Olig1"
test
  Astro1   Astro2   Astro3 Den.Gyr1 Den.Gyr2 Den.Gyr3     Endo      In1 
    1403      266       52      764       95       46       61      249 
     In2      In3   Micro1   Micro2   Micro3     OPC1     OPC2     OPC3 
     199       54     1267      635       63     1499       28       16 
    OPC4    Olig1    Olig2    Olig3    Olig4    Olig5     Pyr1     Pyr2 
      19     3950     1377      404      450      189      411      175 


In [45]:
sum(is.na(ayhan_neurons_wthintypelognorm))

In [30]:
dim(ayhan_neurons_idx[,c(1, ayhan_neurons_idx + 1) ])

ERROR: Error in ayhan_neurons_idx[, c(1, ayhan_neurons_idx + 1)]: incorrect number of dimensions


In [13]:
abi_multipleareas <- SummarizedExperiment::SummarizedExperiment(assays =
                                       list("Counts"=abi_multipleareas_counts))

In [None]:
# Barcodes file of the first Marsh2022 (GSE157760) sample.
# The file name has to be a quoted string and the call needs its closing
# parenthesis; paste0() joins directory and file name without a separator.
# NOTE(review): marsh_path is assigned in the next cell; run that one first.
marsh <- read.table(paste0(marsh_path, 'GSM4774667_pEXP27sHSrCTXPMiDONORAd20191001dapi_barcodes.tsv.gz'))

In [7]:
# Directory holding the Marsh2022 (GSE157760) files; the trailing slash lets
# file names be appended directly.
marsh_path <- '/home/acampbell/test_datasets/Marsh2022_GSE157760/'

# Read the matrix / features / barcodes files of sample GSM4774667.
expression_matrix <- ReadMtx(
  mtx = paste0(marsh_path, 'GSM4774667_pEXP27sHSrCTXPMiDONORAd20191001dapi_matrix.mtx.gz'),
  features = paste0(marsh_path, 'GSM4774667_pEXP27sHSrCTXPMiDONORAd20191001dapi_features.tsv.gz'),
  cells = paste0(marsh_path, 'GSM4774667_pEXP27sHSrCTXPMiDONORAd20191001dapi_barcodes.tsv.gz')
)

as(<dgTMatrix>, "dgCMatrix") is deprecated since Matrix 1.5-0; do as(., "CsparseMatrix") instead



In [None]:
# Remaining Marsh2022 (GSE157760) samples. Every sample consists of three files
# under marsh_path: <prefix>_matrix.mtx.gz, <prefix>_features.tsv.gz and
# <prefix>_barcodes.tsv.gz.
# The original cell could not be parsed (unquoted file names, empty ReadMtx
# arguments, lines starting with "<-"), so the samples are listed once here
# and read in a loop.
marsh_sample_prefixes <- c(
  'GSM4774668_pEXP27sHSrCTXPMiDONORBd20191001dapi',
  'GSM4774669_pEXP27sHSrCTXPMiDONORCd20191001dapi',
  'GSM4774670_pEXP35sHSrCTXPMiDONORAd20191001dapi2',
  'GSM4774671_pEXP35sHSrCTXPMiDONORBd20191001dapi2',
  'GSM4774672_pEXP35sHSrCTXPMiDONORCd20191001dapi2'
)

# One matrix per sample, kept in a list named by sample prefix so that no
# sample overwrites another; expression_matrix is left untouched.
marsh_matrices <- lapply(marsh_sample_prefixes, function(prefix) {
  ReadMtx(
    mtx = paste0(marsh_path, prefix, '_matrix.mtx.gz'),
    features = paste0(marsh_path, prefix, '_features.tsv.gz'),
    cells = paste0(marsh_path, prefix, '_barcodes.tsv.gz')
  )
})
names(marsh_matrices) <- marsh_sample_prefixes

# sample name noted for later: pEXP20sHSrCTXt0iSAMPLEAd20190601L1
    

In [None]:
pEXP20sHSrCTXt0iSAMPLEAd20190601L1

In [None]:
#this will come later after we get all the shared genes across the human experiments
# NOTE(review): hochgernerDGC_counts, hoch_target.idx and thresh_ninetyth are
# not assigned in any of the cells shown here -- they must exist before this runs.
iegs_df <- data.frame('Symbol_mm'= rownames(hochgernerDGC_counts)[hoch_target.idx],
                     'HC_ninetyth_percentile' =thresh_ninetyth)