# Code to cluster cells by GLIF parameters using iterative binary clustering and generate confusion matrices
### Teeter et al. 2018
#### This notebook runs the iterative binary clustering approach to generate clusters from the GLIF parameters and from the electrophysiological features. It then outputs the confusion matrices between each GLIF parameter clustering and the electrophysiological feature clustering, corresponding to Supplemental Figure 13 in the paper.

### 1) Install required packages

In [11]:
if (!require(ape)) {install.packages("ape", repos = "http://cran.us.r-project.org")}
if (!require(e1071)) {install.packages("e1071", repos = "http://cran.us.r-project.org")}
if (!require(gplots)) {install.packages("gplots", repos = "http://cran.us.r-project.org")}
if (!require(mclust)) {install.packages("mclust", repos = "http://cran.us.r-project.org")}
if (!require(gtools)) {install.packages("gtools", repos = "http://cran.us.r-project.org")}
require(ape)
require(e1071)
require(gplots)
require(mclust)
require(gtools)

Loading required package: gtools

Attaching package: 'gtools'

The following object is masked from 'package:e1071':

    permutations



### 2) Load data and metadata

In [2]:
### Model parameters: tab-separated table, first column used as row names.
### Columns 1-2 are kept as metadata; every remaining column goes into fulldat.
dat <- read.table(
  "GLIF_param_plus_spike_features_7_27_17.csv",
  header = TRUE,
  sep = "\t",
  row.names = 1,
  as.is = TRUE,
  check.names = FALSE
)
metadata <- dat[, 1:2]
fulldat <- dat[, -(1:2)]

### Cre line metadata: columns 2-4 of cre_colors.csv are passed to rgb() on a
### 0-255 scale, column 5 supplies the names of the resulting colors, and
### column 1 is matched against metadata$cre to get one color per cell.
crecols <- read.csv("cre_colors.csv", header = FALSE, as.is = TRUE)
newcols <- setNames(rgb(crecols[, 2:4], maxColorValue = 255), crecols[, 5])
colvec <- newcols[match(metadata$cre, crecols[, 1])]
cre_order <- c(
  "Htr3a", "Ndnf", "Vip", "Sst", "Pvalb", "Nkx2-1", "Chat", "Chrna2",
  "Cux2", "Nr5a1", "Scnn1a-Tg2", "Scnn1a-Tg3", "Rorb", "Rbp4", "Ntsr1", "Ctgf"
)

### Features: comma-separated table, first column used as row names.
### Columns 1-2 are kept as metadata; only the features listed below are used.
featdat <- read.table(
  "features_7_27_17.csv",
  header = TRUE,
  sep = ",",
  row.names = 1,
  as.is = TRUE,
  check.names = FALSE
)
featmetadata <- featdat[, 1:2]
featfulldat <- featdat[, c(
  "tau",
  "ri",
  "vrest",
  "threshold_i_long_square",
  "threshold_v_long_square",
  "peak_v_long_square",
  "fast_trough_v_long_square",
  "trough_v_long_square",
  "upstroke_downstroke_ratio_long_square",
  "upstroke_downstroke_ratio_short_square",
  "sag",
  "f_i_curve_slope",
  "latency",
  "max_burstiness_across_sweeps"
)]





### 3) Apply log transform to skewed parameters/features

In [3]:
### Log-transform the columns of a table whose skewness is reduced by doing so.
###
### A column is only considered when all of its values share one sign and none
### is zero (min * max > 0). Strictly positive columns are compared against
### log10(x); strictly negative columns are sign-flipped first and compared
### against log10(-x). The column is replaced by the log10 values only when
### that lowers the skewness reported by skewness(); otherwise it is left as is.
###
### Arguments:
###   tab - data frame or matrix of numeric columns
### Returns:
###   tab with the qualifying columns replaced by their log10 transform
log_transform_skewed_columns <- function(tab) {
  ### seq_len() keeps the loop from running on a table with zero columns
  for (ii in seq_len(ncol(tab))) {
    vals <- tab[, ii]
    lowest <- min(vals)
    if (lowest * max(vals) > 0) {
      magnitude <- if (lowest > 0) vals else -vals
      logged <- log10(magnitude)
      if (skewness(magnitude) > skewness(logged)) {
        tab[, ii] <- logged
      }
    }
  }
  tab
}

###model parameters
fulldat <- log_transform_skewed_columns(fulldat)
fulldat_all <- fulldat

###features
featfulldat <- log_transform_skewed_columns(featfulldat)
featfulldat_all <- featfulldat

### 4) Load clustering functions

In [5]:
###function to separate data into two clusters and check for cluster separation using SVM-based prediction
###
### Arguments:
###   fulldat   - numeric matrix or data frame, one row per item to cluster;
###               zero-variance columns are removed and the rest are scaled
###   startseed - numeric offset; repetition tt calls set.seed(tt + startseed)
###   meth      - agglomeration method handed to hclust()
### Returns a list with:
###   fraction_incorrect - one value per repetition (100 of them): the fraction
###                        of held-out rows whose SVM-predicted cluster differs
###                        from the hierarchical assignment; c(1, 1) when either
###                        cluster has 5 or fewer members
###   clustids           - the cutree() assignment (1 or 2) for every row
### Note: set.seed() is called inside the loop, so the global RNG state changes.
cluster_into_two <- function(fulldat, startseed, meth = 'ward.D') {
  n_reps <- 100
  fulldat <- scale(fulldat[, apply(fulldat, 2, var) > 0, drop = FALSE])
  ### rows are clustered on correlation distance (1 - Pearson r)
  hc <- hclust(as.dist(1 - cor(t(fulldat), method = "pearson")), method = meth)
  clustids <- cutree(hc, 2)

  ###assess predictability using SVM prediction###
  inds1 <- which(clustids == 1)
  inds2 <- which(clustids == 2)
  if (length(inds1) > 5 && length(inds2) > 5) {
    ### half of each cluster trains the SVM, the other half is held out
    sampfrac1 <- round(0.5 * length(inds1))
    sampfrac2 <- round(0.5 * length(inds2))
    fraction_incorrect <- numeric(n_reps)
    for (tt in seq_len(n_reps)) {
      set.seed(tt + startseed)
      sampvec <- c(sample(inds1, sampfrac1), sample(inds2, sampfrac2))
      ### columns that are constant within the training half are left out
      setcols <- which(apply(fulldat[sampvec, , drop = FALSE], 2, var) > 0)
      fit <- svm(
        x = fulldat[sampvec, setcols, drop = FALSE],
        y = clustids[sampvec],
        type = "C-classification"
      )
      svmpred <- predict(fit, fulldat[-sampvec, setcols, drop = FALSE])
      ### count mismatches directly instead of indexing fixed cells of a 2x2 table
      wrong <- as.character(svmpred) != as.character(clustids[-sampvec])
      fraction_incorrect[tt] <- sum(wrong) / length(wrong)
    }
  } else {
    ### a cluster is too small to validate: report the worst possible error
    fraction_incorrect <- c(1, 1)
  }
  list(fraction_incorrect = fraction_incorrect, clustids = clustids)
}

###function to cluster iteratively using binary splits
###
### Arguments:
###   keepcols    - columns of fulldat_all to cluster on
###   fulldat_all - table with one named row per item; rows are subset by name
###                 on every recursive call
###   fraclim     - a split is accepted only when the largest value of
###                 fraction_incorrect returned by cluster_into_two() is <= fraclim
###   splitlim    - accepted and forwarded for backward compatibility, but not
###                 used by this function
###   startseed   - seed offset handed to cluster_into_two(); child ii of a
###                 split is called with startseed + ii
###   outlist     - accumulator list: $clustnames is a named character vector of
###                 cluster labels, $fracmat collects one row of
###                 fraction_incorrect values per accepted split
###   methall     - agglomeration method handed to cluster_into_two()
### Returns:
###   outlist, with "_1" / "_2" appended to the label of every row involved in
###   an accepted split and the matching rows added to $fracmat
recursive_clustering <- function(keepcols, fulldat_all, fraclim = 0.2, splitlim = 50, startseed, outlist, methall = "ward.D") {
  clustmat <- fulldat_all[, keepcols]
  tempout <- cluster_into_two(clustmat, startseed = startseed, meth = methall)

  ### stop here when the split could not be assessed or is not separable enough
  if (is.na(tempout$fraction_incorrect[1]) ||
      max(tempout$fraction_incorrect, na.rm = TRUE) > fraclim) {
    return(outlist)
  }

  ids <- tempout$clustids
  cells <- names(ids)
  outlist$clustnames[cells] <- paste(outlist$clustnames[cells], ids, sep = "_")
  outlist$fracmat <- rbind(outlist$fracmat, tempout$fraction_incorrect)

  for (ii in 1:2) {
    members <- cells[ids == ii]
    ### only children with at least 10 members are split further
    if (length(members) >= 10) {
      ### methall is forwarded so the whole tree uses the requested linkage
      ### (it used to fall back to the default below the first split)
      outlist <- recursive_clustering(
        keepcols,
        fulldat_all[members, , drop = FALSE],
        fraclim = fraclim,
        splitlim = splitlim,
        startseed = startseed + ii,
        outlist = outlist,
        methall = methall
      )
    }
  }
  outlist
}

### 5) Run clustering on GLIF parameters and electrophysiological features
#### This generates the comparison data for the confusion matrices in the next cell.

In [6]:
###specify prefix for output file names###
pref <- "iterative_binary_clustering_2018"
parametersets <- c("Features", "GLIF1", "GLIF2", "GLIF3", "GLIF4")
fraclimval <- 0.2  ###maximum fraction of incorrectly classified cells in test set (see recursive_clustering function in section 4)
###splitlimval was never defined; the call below only worked because the
###argument is never evaluated. It is set to the function's default here.
splitlimval <- 50
methall <- 'ward.D'

for (nameval in parametersets) {
  ###column indices of the parameters used by each model
  keepcols <- switch(nameval,
    GLIF1 = c(1, 3, 4, 5, 8),
    GLIF2 = c(1, 3, 4, 5, 8, 9, 10),
    GLIF3 = c(2, 3, 4, 5, 6, 7, 8),
    GLIF4 = c(2, 3, 4, 5, 6, 7, 8, 9, 10),
    Features = seq_len(ncol(featfulldat_all))
  )

  ###"Features" clusters the electrophysiology table, all others the GLIF table
  if (nameval %in% c("Features")) {
    startmat <- featfulldat_all
  } else {
    startmat <- fulldat_all
  }

  print(paste0("clustering ", nameval, " model, using the following parameters: ", paste(colnames(startmat)[keepcols], collapse = ",")))

  ###every row starts in cluster "1"; accepted splits append "_1" / "_2"
  startnames <- rep("1", nrow(startmat))
  names(startnames) <- rownames(startmat)
  outlist <- list()
  outlist$clustnames <- startnames
  outlist$fracmat <- c()
  allclusts <- recursive_clustering(keepcols, startmat, fraclim = fraclimval, splitlim = splitlimval, startseed = 1, outlist = outlist, methall = methall)

  ###cluster x Cre line composition table, restricted to rows present in metadata
  shared <- intersect(names(allclusts$clustnames), rownames(metadata))
  temptab <- table(allclusts$clustnames[shared], metadata$cre[match(shared, rownames(metadata))])
  ###keep only the part of each Cre line name before the first "-"
  colnames(temptab) <- vapply(strsplit(colnames(temptab), "-"), `[`, character(1), 1)
  ###the label column is passed as an expression so that it stays unnamed in the CSV
  temptab <- cbind(temptab, paste0("Cluster ", rev(seq_len(nrow(temptab)))))
  write.csv(temptab, file = paste0("composition_", pref, "_", nameval, ".csv"))

  ###per-row cluster labels
  temptab <- allclusts$clustnames
  write.csv(temptab, file = paste0("cluster_ids_", pref, "_", nameval, ".csv"))
}

[1] "clustering Features model, using the following parameters: tau,ri,vrest,threshold_i_long_square,threshold_v_long_square,peak_v_long_square,fast_trough_v_long_square,trough_v_long_square,upstroke_downstroke_ratio_long_square,upstroke_downstroke_ratio_short_square,sag,f_i_curve_slope,latency,max_burstiness_across_sweeps"
[1] "clustering GLIF1 model, using the following parameters: R_input,C,El,th_inf,spike_cut_length"
[1] "clustering GLIF2 model, using the following parameters: R_input,C,El,th_inf,spike_cut_length,reset_slope,reset_intercept"
[1] "clustering GLIF3 model, using the following parameters: R_ASC,C,El,th_inf,total charge 1/300+1/100,total charge 1/3+1/10+1/100,spike_cut_length"
[1] "clustering GLIF4 model, using the following parameters: R_ASC,C,El,th_inf,total charge 1/300+1/100,total charge 1/3+1/10+1/100,spike_cut_length,reset_slope,reset_intercept"


### 6) Generate confusion matrices between each of the GLIF clusterings and the electrophysiology feature clustering
#### This generates the four panels in Supplemental Figure 13

In [13]:
###Reads the cluster_ids_* and composition_* CSV files of the feature clustering
###and of each GLIF clustering, writes one confusion matrix CSV per GLIF model
###and draws each matrix as a heatmap page in a single PDF.
pref <- "iterative_binary_clustering_2018"
parametersets <- c("GLIF1", "GLIF2", "GLIF3", "GLIF4")
cre_voi <- c()
cre_ari <- c()
cre_mean_voi <- c()
cre_mean_ari <- c()
featclust <- read.csv(paste0("cluster_ids_", pref, "_Features.csv"), as.is = TRUE, row.names = 1)
featcomp <- read.csv(paste0("composition_", pref, "_Features.csv"), as.is = TRUE)
pdf(paste0("Fig_Supp13_confusion_matrices_", pref, ".pdf"), useDingbats = FALSE)
###the device is closed in `finally` so a failing iteration does not leave it open
tryCatch({
  for (nameval in parametersets) {
    glifclust <- read.csv(paste0("cluster_ids_", pref, "_", nameval, ".csv"), as.is = TRUE)
    ###put the GLIF assignments in the same row order as the feature assignments
    glifclust <- glifclust[match(rownames(featclust), glifclust[, 1]), ]
    temptab <- table(glifclust[, 2], featclust[, 1])
    glifcomp <- read.csv(paste0("composition_", pref, "_", nameval, ".csv"), as.is = TRUE)
    ###the last column of each composition file holds the cluster label
    rowvec <- glifcomp[match(rownames(temptab), glifcomp[, 1]), ncol(glifcomp)]
    ###named feat_labels (not colvec) so the global colvec is not overwritten
    feat_labels <- featcomp[match(colnames(temptab), featcomp[, 1]), ncol(featcomp)]
    ###labels are appended as an extra column (named "rowvec") and an extra row
    temptab <- cbind(temptab, rowvec)
    temptab <- rbind(temptab, c(feat_labels, ''))
    write.csv(temptab, file = paste0("confusion_matrix_", pref, "_features_", nameval, ".csv"))

    ###make plot###
    n_glif <- nrow(temptab) - 1
    n_feat <- ncol(temptab) - 1
    ###numeric counts only, i.e. without the appended label row and column
    plottab <- matrix(as.numeric(temptab[seq_len(n_glif), seq_len(n_feat)]), nrow = n_glif)
    rownames(plottab) <- gsub("Cluster ", "C", paste0("GLIF ", temptab[seq_len(n_glif), ncol(temptab)]))
    colnames(plottab) <- gsub("Cluster ", "C", paste0("Feature ", temptab[nrow(temptab), seq_len(n_feat)]))
    plottab <- plottab[mixedorder(rownames(plottab)), mixedorder(colnames(plottab))]
    ###cell annotations: show the count, leave empty cells blank
    texttab <- plottab
    texttab[texttab == 0] <- ''
    ###dendrogram='none' matches Rowv/Colv=FALSE and avoids the "Discrepancy" warning
    heatmap.2(plottab, col = colorRampPalette(c("white", "orange", "red")), scale = 'none', trace = 'none', dendrogram = 'none', cellnote = texttab, notecol = "black", Rowv = FALSE, Colv = FALSE, cexRow = 0.9, cexCol = 0.9, key = FALSE)
  }
}, finally = dev.off())

In heatmap.2(plottab, col = colorRampPalette(c("white", "orange", : Discrepancy: Colv is FALSE, while dendrogram is `column'. Omitting column dendogram.