# Code to cluster cells by GLIF and spike shape parameters using affinity propagation
### Teeter et al. 2018
#### This notebook runs affinity propagation to generate clusters from the GLIF and spike-shape parameters. It outputs the GLIF and feature cluster vs. Cre line composition figures, corresponding to Supplemental Figure 14 in the paper.

### 1) Install required packages

In [1]:
if (!require(ape)) {install.packages("ape", repos = "http://cran.us.r-project.org")}
if (!require(e1071)) {install.packages("e1071", repos = "http://cran.us.r-project.org")}
if (!require(gplots)) {install.packages("gplots", repos = "http://cran.us.r-project.org")}
if (!require(mclust)) {install.packages("mclust", repos = "http://cran.us.r-project.org")}
if (!require(apcluster)) {install.packages("apcluster", repos = "http://cran.us.r-project.org")}
require(ape)
require(e1071)
require(gplots)
require(mclust)
require(apcluster)

Loading required package: ape
Loading required package: e1071
Loading required package: gplots

Attaching package: 'gplots'

The following object is masked from 'package:stats':

    lowess

Loading required package: mclust
Package 'mclust' version 5.2
Type 'citation("mclust")' for citing this R package in publications.
Loading required package: apcluster

Attaching package: 'apcluster'

The following object is masked from 'package:stats':

    heatmap



### 2) Load data and metadata

In [2]:
### Model parameters
# The file is read as tab-separated despite its .csv extension.
# The first column supplies the row names; of the remaining columns, the
# first two are kept as metadata and the rest as the parameter matrix.
dat <- read.table(
  "GLIF_param_plus_spike_features_7_27_17.csv",
  header = TRUE,
  sep = "\t",
  row.names = 1,
  as.is = TRUE,
  check.names = FALSE
)
metadata <- dat[, 1:2]
fulldat <- dat[, -(1:2)]

### Cre line metadata
# Columns 2-4 of cre_colors.csv are combined into colours on a 0-255 scale,
# column 5 names the colours, and column 1 is matched against metadata$cre.
crecols <- read.csv("cre_colors.csv", header = FALSE, as.is = TRUE)
newcols <- rgb(crecols[, 2:4], maxColorValue = 255)
names(newcols) <- crecols[, 5]
colvec <- newcols[match(metadata$cre, crecols[, 1])]
# Left-to-right order of the Cre lines in the composition plots.
cre_order <- c(
  "Htr3a", "Ndnf", "Vip", "Sst", "Pvalb", "Nkx2-1", "Chat", "Chrna2",
  "Cux2", "Nr5a1", "Scnn1a-Tg2", "Scnn1a-Tg3", "Rorb", "Rbp4", "Ntsr1", "Ctgf"
)

### Features
# Comma-separated; the first two data columns are kept as metadata and the
# named electrophysiological features below form the feature matrix.
featdat <- read.table(
  "features_7_27_17.csv",
  header = TRUE,
  sep = ",",
  row.names = 1,
  as.is = TRUE,
  check.names = FALSE
)
featmetadata <- featdat[, 1:2]
featfulldat <- featdat[, c(
  "tau",
  "ri",
  "vrest",
  "threshold_i_long_square",
  "threshold_v_long_square",
  "peak_v_long_square",
  "fast_trough_v_long_square",
  "trough_v_long_square",
  "upstroke_downstroke_ratio_long_square",
  "upstroke_downstroke_ratio_short_square",
  "sag",
  "f_i_curve_slope",
  "latency",
  "max_burstiness_across_sweeps"
)]





### 3) Apply log transform to skewed parameters/features

In [3]:
### Helper shared by the parameter and feature matrices.
# Returns log10 of the column's magnitudes when (a) every value has the same
# sign and (b) the log-transformed magnitudes have a smaller signed skewness
# than the untransformed magnitudes; otherwise returns the column unchanged.
# All-negative columns are sign-flipped before the log, so the transformed
# column holds log10(-x).
log10_if_less_skewed <- function(values) {
  lowest <- min(values)
  highest <- max(values)
  # Mixed signs (or a zero at either extreme): leave untouched.
  if (!(lowest * highest > 0)) {
    return(values)
  }
  magnitudes <- if (lowest > 0) values else -values
  logged <- log10(magnitudes)
  if (skewness(magnitudes) > skewness(logged)) logged else values
}

### Model parameters
for (col_idx in seq_len(ncol(fulldat))) {
  fulldat[, col_idx] <- log10_if_less_skewed(fulldat[, col_idx])
}
fulldat_all <- fulldat

### Features
for (col_idx in seq_len(ncol(featfulldat))) {
  featfulldat[, col_idx] <- log10_if_less_skewed(featfulldat[, col_idx])
}
featfulldat_all <- featfulldat

### 4) Load clustering and clustering overlap functions

In [4]:
### Function to run affinity propagation clustering for a fixed number of clusters.
###
### Arguments:
###   dat  numeric matrix or data frame; rows are the items to cluster.
###   k    desired number of clusters, forwarded to apclusterK() as K.
### Returns:
###   a list with a single element `cluster`: a vector with one entry per row
###   of `dat` holding the index of the cluster that row was assigned to
###   (0 for rows that appear in no cluster). This list-with-`cluster` shape is
###   what the commented-out clusGap() call in the clustering cell passes in
###   as its clustering function.
runaffprop <- function(dat, k) {
  # Similarity handed to apclusterK: 1 - Pearson correlation between rows.
  # NOTE(review): 1 - cor is largest for anti-correlated rows; confirm this is
  # the intended similarity before reusing the function elsewhere.
  row_similarity <- function(x) {
    1 - cor(t(x), method = "pearson")
  }
  ap_result <- apclusterK(row_similarity, dat, K = k, seed = 1)

  outvec <- rep(0, nrow(dat))
  # seq_len(length(.)) instead of 1:length(.) so an empty result is a no-op.
  for (ii in seq_len(length(ap_result))) {
    outvec[ap_result[[ii]]] <- ii
  }
  list(cluster = outvec)
}

### Function to compare two labelings of the same items.
###
### Arguments:
###   xvec, yvec    label vectors of equal length (one entry per item).
###   functype      1 -> variation-of-information style score computed from
###                 the contingency table; any other value -> adjusted Rand
###                 index via adjustedRandIndex().
###   credistmat, clustdistmat   accepted but not used by this function.
### Returns:
###   a single numeric score.
calc_cluster_diff=function(xvec,yvec,functype=1,credistmat=c(),clustdistmat=c()) {
  if (functype != 1) {
    return(adjustedRandIndex(xvec, yvec))
  }

  counts <- table(xvec, yvec)
  occupied <- counts > 0
  # Each count as a fraction of its row total and of its column total.
  share_of_row <- prop.table(counts, 1)
  share_of_col <- prop.table(counts, 2)
  weighted_logs <- counts * (log(share_of_row) + log(share_of_col))
  # Empty cells give 0 * log(0) = NaN, so only occupied cells are summed.
  -(sum(weighted_logs[occupied]) / length(xvec))
}

### Function to build a chance-level reference for calc_cluster_diff().
###
### yvec is shuffled 100 times (the RNG is reseeded with 1, 2, ..., 100, one
### seed per shuffle) and each shuffled copy is scored against xvec.
### Arguments are forwarded unchanged to calc_cluster_diff().
### Returns a numeric vector of the 100 scores, in seed order.
### Side effect: the global RNG state is left as set by the last seed.
rand_cluster_diff=function(xvec,yvec,functype=1,credistmat=c(),clustdistmat=c()) {
  score_with_seed <- function(seed_value) {
    set.seed(seed_value)
    calc_cluster_diff(xvec, sample(yvec), functype, credistmat, clustdistmat)
  }
  vapply(1:100, score_with_seed, numeric(1))
}

### 5) Run clustering on GLIF parameters, GLIF parameters+spike shape features, and electrophysiological features
#### This generates the first four panels of Supplemental Figure 14.

In [5]:
### Specify prefix for output file names
pref <- "affinity_propagation_clustering_2018"
parametersets <- c(
  "Features", "Featuresnospike",
  "GLIF1", "GLIF2", "GLIF3", "GLIF4",
  "GLIF1_spike_shape", "GLIF2_spike_shape",
  "GLIF3_spike_shape", "GLIF4_spike_shape"
)

### Column indices clustered for each parameter set. The indices are applied
### to the scaled matrix built inside the loop (after zero-variance columns
### have been removed).
keepcols_by_set <- list(
  GLIF1 = c(1, 3, 4, 5, 8),
  GLIF2 = c(1, 3, 4, 5, 8, 9, 10),
  GLIF3 = c(2, 3, 4, 5, 6, 7, 8),
  GLIF4 = c(2, 3, 4, 5, 6, 7, 8, 9, 10),
  GLIF1_spike_shape = c(1, 3, 4, 5, 8, 13:16),
  GLIF2_spike_shape = c(1, 3, 4, 5, 8, 9, 10, 13:16),
  GLIF3_spike_shape = c(2, 3, 4, 5, 6, 7, 8, 13:16),
  GLIF4_spike_shape = c(2, 3, 4, 5, 6, 7, 8, 9, 10, 13:16),
  Features = 1:ncol(featfulldat_all),
  Featuresnospike = c(1, 2, 3, 4, 5, 8, 11, 12, 13, 14)
)

### Hard-coded number of clusters requested for each parameter set.
numclust_by_set <- c(
  GLIF1 = 7,
  GLIF2 = 5,
  GLIF3 = 13,
  GLIF4 = 11,
  Features = 18,
  Featuresnospike = 19,
  GLIF1_spike_shape = 10,
  GLIF2_spike_shape = 10,
  GLIF3_spike_shape = 16,
  GLIF4_spike_shape = 15
)

### Similarity handed to apclusterK: 1 - Pearson correlation between rows.
ap_similarity <- function(x) {
  1 - cor(t(x), method = "pearson")
}

pdf(paste0("Fig_Supp14_cluster_diagram_", pref, ".pdf"), useDingbats = FALSE, width = 12, height = 10)
for (nameval in parametersets) {
  keepcols <- keepcols_by_set[[nameval]]
  numclust <- numclust_by_set[[nameval]]

  # The two feature sets cluster the electrophysiological feature matrix;
  # every other set clusters the GLIF parameter matrix.
  if (nameval %in% c("Features", "Featuresnospike")) {
    startmat <- featfulldat_all
  } else {
    startmat <- fulldat_all
  }

  # Drop zero-variance columns, then centre and scale the remaining ones.
  newstart <- scale(startmat[, apply(startmat, 2, var) > 0])

  # Report the columns of the matrix that is actually clustered (newstart),
  # so the message stays correct even if a constant column was dropped above.
  print(paste0("clustering ", nameval, " model, using the following parameters: ", paste(colnames(newstart)[keepcols], collapse = ",")))

  # Not used below in this cell.
  testnumclusts <- 1:25

  ## Uncomment to run the full range of affinity propagation parameters to identify the optimal number of clusters.
  ## This takes a long time to run, so to simply reproduce the paper figures, use the hard-coded cluster numbers above.
  ## Before uncommenting: create `allclustgap <- list()` ahead of the loop and load the `cluster` package
  ## (it provides clusGap); neither is done elsewhere in this notebook.
  #allclustgap[[nameval]]=clusGap(newstart[,keepcols],runaffprop,K.max=25)

  clustout <- apclusterK(ap_similarity, newstart[, keepcols], K = numclust, seed = 0, prc = 0, verbose = TRUE)

  # Cluster index per cell, named by cell id; 0 means "in no cluster".
  allclusts <- rep(0, nrow(startmat))
  names(allclusts) <- rownames(startmat)
  print(c(nameval, numclust, length(clustout)))
  for (ii in seq_len(length(clustout))) {
    allclusts[clustout[[ii]]] <- ii
  }

  # Cluster x Cre-line contingency table over the cells present in both the
  # clustered matrix and the metadata table.
  shared_cells <- intersect(names(allclusts), rownames(metadata))
  temptab <- table(allclusts[shared_cells], metadata$cre[match(shared_cells, rownames(metadata))])
  # Keep only the part of each Cre-line name before the first "-".
  colnames(temptab) <- vapply(strsplit(colnames(temptab), "-"), `[`, character(1), 1)
  # Append a label column; labels run in reverse row order, matching the
  # y-axis labels drawn below.
  temptab <- cbind(temptab, paste0("Cluster ", rev(seq_len(nrow(temptab)))))
  write.csv(temptab, file = paste0("composition_", pref, "_", nameval, ".csv"))
  temptab <- allclusts
  write.csv(temptab, file = paste0("cluster_ids_", pref, "_", nameval, ".csv"))

  # Only the GLIF + spike-shape sets are drawn as composition bubble plots.
  if (grepl("spike_shape", nameval)) {
    # Re-read the composition table so counts come back numeric, then drop
    # the trailing label column.
    outtab2 <- read.csv(paste0("composition_", pref, "_", nameval, ".csv"), as.is = TRUE, row.names = 1, check.names = FALSE)
    outtab2 <- outtab2[, -ncol(outtab2)]
    # Restore the names that were truncated at the first "-" above. The two
    # "Scnn1a" columns are renamed in the order in which they appear.
    colnames(outtab2)[grep("Scnn1a", colnames(outtab2))] <- c("Scnn1a-Tg2", "Scnn1a-Tg3")
    colnames(outtab2)[grep("Nkx2", colnames(outtab2))] <- "Nkx2-1"
    outtab2 <- outtab2[, cre_order]

    # Grid coordinates and fill colour for every (cluster, Cre line) cell.
    xvals <- matrix(rep(1:ncol(outtab2), each = nrow(outtab2)), nrow = nrow(outtab2))
    yvals <- matrix(rep(1:nrow(outtab2), ncol(outtab2)), nrow = nrow(outtab2))
    basecols <- newcols[colnames(outtab2)]
    colvals <- matrix(basecols[rep(1:ncol(outtab2), each = nrow(outtab2))], nrow = nrow(outtab2))

    # Convert counts to the percentage of each Cre line falling in each
    # cluster; the circle radius is the square root of that percentage.
    outtab2 <- 100 * sweep(as.matrix(outtab2), 2, colSums(outtab2), "/")
    dfx <- data.frame(x = c(xvals), y = c(yvals), sizeval = sqrt(c(as.matrix(outtab2))), colsplot = c(colvals))
    dfx <- dfx[dfx$sizeval > 0, ]

    par(fig = c(0.3, 1, 0, 1), new = FALSE)
    plot(c(1, ncol(outtab2)), c(1, nrow(outtab2)), pch = '', xlab = '', ylab = '', xaxt = 'n', yaxt = 'n', main = nameval)
    abline(h = 1:nrow(outtab2), v = 1:ncol(outtab2), col = 'grey')
    # No xlab is passed here: the previous call referenced `outtab`, which is
    # not defined anywhere in this notebook. Tick labels are drawn by the
    # axis() calls that follow.
    with(dfx, symbols(x = x, y = y, circles = sizeval, inches = 1/4, ann = FALSE, bg = as.character(colsplot), fg = "black", add = TRUE, xlim = c(1, ncol(outtab2)), ylim = c(1, nrow(outtab2)), xaxt = 'n', yaxt = 'n'))
    axis(1, at = 1:ncol(outtab2), labels = colnames(outtab2), las = 2, cex.axis = 0.9)
    axis(2, at = 1:nrow(outtab2), labels = paste0("Cluster ", rev(1:nrow(outtab2))), las = 2)
  }
}
dev.off()

[1] "clustering Features model, using the following parameters: tau,ri,vrest,threshold_i_long_square,threshold_v_long_square,peak_v_long_square,fast_trough_v_long_square,trough_v_long_square,upstroke_downstroke_ratio_long_square,upstroke_downstroke_ratio_short_square,sag,f_i_curve_slope,latency,max_burstiness_across_sweeps"
Trying p = 1.498409 
   Number of clusters: 72 
Trying p = -2.484667 
   Number of clusters: 11 
Trying p = -0.271847 (bisection step no. 1 )
   Number of clusters: 18 

Number of clusters: 18 for p = -0.271847 
[1] "Features" "18"       "18"      
[1] "clustering Featuresnospike model, using the following parameters: tau,ri,vrest,threshold_i_long_square,threshold_v_long_square,trough_v_long_square,sag,f_i_curve_slope,latency,max_burstiness_across_sweeps"
Trying p = 1.484039 
   Number of clusters: 70 
Trying p = -2.928119 
   Number of clusters: 13 
Trying p = -0.4769202 (bisection step no. 1 )
   Number of clusters: 21 
Trying p = -1.702519 (bisection step no. 2 )

### 6) Calculate Adjusted Rand and Adjusted Variation of Information Indices between all clusterings and Cre line segregation
#### This generates the lower left panel of Supplemental Figure 14

In [6]:
# Compare every clustering written by the clustering cell with the labels in
# the first metadata column of the feature table (presumably the Cre line,
# given the figure title; confirm against the input file).
pref <- "affinity_propagation_clustering_2018"
parametersets <- c(
  "GLIF1", "GLIF2", "GLIF3", "GLIF4",
  "Featuresnospike", "Features",
  "GLIF1_spike_shape", "GLIF2_spike_shape",
  "GLIF3_spike_shape", "GLIF4_spike_shape"
)
featclust <- featmetadata

# One observed score and one mean shuffled score per parameter set.
n_sets <- length(parametersets)
cre_voi <- numeric(n_sets)
cre_ari <- numeric(n_sets)
cre_mean_voi <- numeric(n_sets)
cre_mean_ari <- numeric(n_sets)

for (set_idx in seq_len(n_sets)) {
  nameval <- parametersets[set_idx]
  # Column 1 of the cluster file holds cell ids, column 2 the cluster index;
  # rows are reordered to follow the feature metadata.
  glifclust <- read.csv(paste0("cluster_ids_", pref, "_", nameval, ".csv"), as.is = TRUE)
  glifclust <- glifclust[match(rownames(featclust), glifclust[, 1]), ]

  reference_labels <- featclust[, 1]
  cluster_labels <- glifclust[, 2]

  cre_voi[set_idx] <- calc_cluster_diff(reference_labels, cluster_labels, 1)
  cre_ari[set_idx] <- calc_cluster_diff(reference_labels, cluster_labels, 2)
  rand_voi <- rand_cluster_diff(reference_labels, cluster_labels, 1)
  rand_ari <- rand_cluster_diff(reference_labels, cluster_labels, 2)
  cre_mean_voi[set_idx] <- mean(rand_voi)
  cre_mean_ari[set_idx] <- mean(rand_ari)
}

# Adjusted scores: distance of the observed score from the shuffled mean.
plotted <- 1:10
voi_adjusted <- cre_mean_voi[plotted] - cre_voi[plotted]
ari_adjusted <- cre_ari[plotted] - cre_mean_ari[plotted]

pdf(paste0("Fig_Supp14_comparison_to_Cre_lines_", pref, ".pdf"), useDingbats = FALSE)
par(mar = c(5, 5, 2, 5))
# Left axis: adjusted VOI (black).
plot(
  1:10, voi_adjusted,
  type = "l", col = "black",
  ylab = "Adjusted VOI score", xaxt = 'n', xlab = '',
  main = "Comparison between model/feature clusters and Cre line partitioning",
  ylim = c(0, max(voi_adjusted))
)
axis(
  side = 1, at = 1:10,
  labels = c("GLIF1","GLIF2","GLIF3","GLIF4","Features, no\nspike-shape", "Features","GLIF1+spike shape","GLIF2+spike shape","GLIF3+spike shape","GLIF4+spike shape"),
  las = 2
)
# Right axis: adjusted Rand index (red), overlaid on the same panel.
par(new = TRUE)
plot(
  1:10, ari_adjusted,
  type = "l", col = "red", axes = FALSE, xlab = NA, ylab = NA,
  ylim = c(0, max(ari_adjusted))
)
axis(side = 4, labels = FALSE)
at <- axTicks(4)
mtext(side = 4, text = at, at = at, col = "red", line = 1)
mtext(side = 4, line = 3, 'Adjusted Rand Index', col = 'red')
legend("topleft", c("Adjusted VOI","Adjusted Rand Index"), fill = c("black","red"))
dev.off()

### 7) Calculate Adjusted Rand and Adjusted Variation of Information Indices between GLIF clusterings and electrophysiological feature clustering
#### This generates the lower right panel of Supplemental Figure 14

In [7]:
# Compare each clustering written by the clustering cell with the clustering
# of the full electrophysiological feature set (cluster_ids_<pref>_Features.csv).
pref <- "affinity_propagation_clustering_2018"
parametersets <- c(
  "GLIF1", "GLIF2", "GLIF3", "GLIF4",
  "Featuresnospike",
  "GLIF1_spike_shape", "GLIF2_spike_shape",
  "GLIF3_spike_shape", "GLIF4_spike_shape"
)

# Reference clustering: cell ids become row names, so column 1 is the
# cluster index.
featclust <- read.csv(paste0("cluster_ids_", pref, "_Features.csv"), as.is = TRUE, row.names = 1)

# One observed score and one mean shuffled score per parameter set.
n_sets <- length(parametersets)
all_voi <- numeric(n_sets)
all_ari <- numeric(n_sets)
mean_voi <- numeric(n_sets)
mean_ari <- numeric(n_sets)

for (set_idx in seq_len(n_sets)) {
  nameval <- parametersets[set_idx]
  # Here the file is read without row names: column 1 holds cell ids and
  # column 2 the cluster index. Rows are reordered to follow featclust.
  glifclust <- read.csv(paste0("cluster_ids_", pref, "_", nameval, ".csv"), as.is = TRUE)
  glifclust <- glifclust[match(rownames(featclust), glifclust[, 1]), ]

  all_voi[set_idx] <- calc_cluster_diff(featclust[, 1], glifclust[, 2], 1)
  all_ari[set_idx] <- calc_cluster_diff(featclust[, 1], glifclust[, 2], 2)
  rand_voi <- rand_cluster_diff(featclust[, 1], glifclust[, 2], 1)
  rand_ari <- rand_cluster_diff(featclust[, 1], glifclust[, 2], 2)
  mean_voi[set_idx] <- mean(rand_voi)
  mean_ari[set_idx] <- mean(rand_ari)
}

# Adjusted scores: distance of the observed score from the shuffled mean.
voi_adjusted <- mean_voi[1:9] - all_voi[1:9]
ari_adjusted <- all_ari[1:9] - mean_ari[1:9]

pdf(paste0("Fig_Supp14_comparison_to_feature_clustering_", pref, ".pdf"), useDingbats = FALSE)
par(mar = c(5, 5, 2, 5))
# Left axis: adjusted VOI (black). The title now names the feature
# clustering as the reference; it previously repeated the Cre-line title.
plot(
  1:9, voi_adjusted,
  type = "l", col = "black",
  ylab = "Adjusted VOI score", xaxt = 'n', xlab = '',
  main = "Comparison between model/feature clusters and electrophysiological feature clustering",
  ylim = c(0, max(voi_adjusted))
)
axis(
  side = 1, at = 1:9,
  labels = c("GLIF1","GLIF2","GLIF3","GLIF4","Features, no\nspike-shape","GLIF1+spike shape","GLIF2+spike shape","GLIF3+spike shape","GLIF4+spike shape"),
  las = 2
)
# Right axis: adjusted Rand index (red), overlaid on the same panel.
par(new = TRUE)
plot(
  1:9, ari_adjusted,
  type = "l", col = "red", axes = FALSE, xlab = NA, ylab = NA,
  ylim = c(0, max(ari_adjusted))
)
axis(side = 4, labels = FALSE)
at <- axTicks(4)
mtext(side = 4, text = at, at = at, col = "red", line = 1)
mtext(side = 4, line = 3, 'Adjusted Rand Index', col = 'red')
legend("topleft", c("Adjusted VOI","Adjusted Rand Index"), fill = c("black","red"))
dev.off()