In [63]:
import sys
import pandas as pd
import numpy as np
import hdbscan

In [64]:
# Make the ICIM code importable, then (re)load it.
# Available from: https://github.com/felixhorns/FlyPN
sys.path.append("../code/ICIM")
import sct
reload(sct)
# Turn off pandas' chained-assignment warning (pandas default is 'warn');
# used because of the nature of the ICIM library.
pd.set_option("mode.chained_assignment", None)

In [5]:
# In practice ICIM is fed log2-transformed data
# (see the manuscript and the GitHub repository).
df = pd.read_csv(
    "../data/02_filtered_kenyon_cells/CLEAN_LOG2TRANSFORM_kenyon_cells.csv"
).set_index("symbol")

In [8]:
# Metadata associated with these cells, indexed by CellID
metadata = pd.read_csv(
    "../data/02_filtered_kenyon_cells/metadata_kenyon_cells.csv"
).set_index("CellID")

In [47]:
# ICIM takes a filtered dataset as one of its arguments.
# The filtering requirements are taken from an up-to-date Seurat tutorial
# by the Hemberg lab, to mimic the level of filtering presumably used in
# the original Seurat analysis.
# Source: https://hemberg-lab.github.io/scRNA.seq.course/seurat-chapter.html#normalization

# f_df = "filtered df"; DataFrame.copy() is a deep copy by default
f_df = df.copy()

# Seurat call whose thresholds are mimicked:
# seuset <- CreateSeuratObject(
#     raw.data = counts(deng),
#     min.cells = 3,
#     min.genes = 200
# )

# Check to make sure each gene is present in at least `min_cells` cells
def check_min_num_cells(row, min_cells=3):
    """Return True if ``row`` holds at least ``min_cells`` strictly positive values.

    Parameters
    ----------
    row : iterable of numbers
        Values to inspect; when used with ``DataFrame.apply(..., axis=1)``
        this is one row of the frame.
    min_cells : int, optional
        Number of positive entries required. Defaults to 3, the threshold
        that was previously hard-coded.

    Returns
    -------
    bool
        True as soon as ``min_cells`` positive entries have been seen,
        False if the row is exhausted first. NaN never counts as positive
        because ``nan > 0`` is False.
    """
    # A non-positive requirement is satisfied by any row, including an empty one.
    if min_cells <= 0:
        return True
    num_pos = 0
    for gene_count in row:
        if gene_count > 0:
            num_pos += 1
            # Stop scanning once the requirement is met.
            if num_pos >= min_cells:
                return True
    return False
   

# Boolean mask over rows, then keep only the rows that pass the check
min_num_cells = f_df.apply(check_min_num_cells, axis=1)
f_df = f_df.loc[min_num_cells]

In [50]:
# Check to make sure each cell has at least `min_genes` genes
def check_min_num_genes(col, min_genes=200):
    """Return True if ``col`` holds at least ``min_genes`` strictly positive values.

    Parameters
    ----------
    col : iterable of numbers
        Values to inspect; when used with ``DataFrame.apply(..., axis=0)``
        this is one column of the frame.
    min_genes : int, optional
        Number of positive entries required. Defaults to 200, the threshold
        that was previously hard-coded.

    Returns
    -------
    bool
        True as soon as ``min_genes`` positive entries have been seen,
        False if the column is exhausted first. NaN never counts as
        positive because ``nan > 0`` is False.
    """
    # A non-positive requirement is satisfied by any column, including an empty one.
    if min_genes <= 0:
        return True
    num_pos = 0
    for gene_count in col:
        if gene_count > 0:
            num_pos += 1
            # Stop scanning once the requirement is met.
            if num_pos >= min_genes:
                return True
    return False

# Boolean mask over columns (apply's default axis=0 passes one column per call),
# then keep only the columns that pass the check
min_num_genes = f_df.apply(check_min_num_genes)
f_df = f_df.loc[:, min_num_genes]

In [52]:
# Shape after both filters: (rows = genes, columns = cells)
f_df.shape

(8255, 2848)

In [56]:
# Build the ICIM analysis object from the filtered frame (f_df) and the
# unfiltered frame (df). For a full-featured example see
# https://github.com/felixhorns/FlyPN/blob/master/analysis/GH146_Fig2.ipynb

reload(sct)
from scipy.cluster import hierarchy
# All options are passed by keyword, so their order does not affect the call.
myICIM = sct.ICIM(
    f_df,
    df,
    TFs=[],
    CSMs=[],
    exclude=[],
    exclude_max=2,
    N=100,
    N_stop=50,
    min_hits=3,
    correlation_cutoff=0.5,
    dropout_rate_low=0.3,
    dropout_rate_high=1.0,
    metric="correlation",
    stop_condition="linkage_dist",
    linkage_dist_stop=0.2,
)

In [57]:
# Run a single ICIM step on population "0" (the first iteration of the
# algorithm) as a QC / status check before running the full pipeline.
myICIM.step("0", verbose=True)

Found 23 genes
Child populations 2845 3


['00']

In [58]:
# Run the full ICIM pipeline; verbose=True prints progress for each
# population visited (see the log below this cell).
myICIM.calc(verbose=True)

Initial step
Found 23 genes
Child populations 2845 3

00
Found 25 genes
Child populations 2251 594

000
Found 21 genes
Child populations 1845 406

0001
Found 32 genes
Child populations 405 1

00010
Found 32 genes
Child populations 378 27

000100
Found 34 genes
Child populations 367 11

0001000
Found 33 genes
Child populations 366 1

00010000
Found 33 genes
Child populations 333 33

000100001
Found 245 genes
Failed linkage distance condition. Stopping.

000100000
Found 39 genes
Child populations 328 5

0001000000
Found 33 genes
Child populations 325 3

00010000000
Found 26 genes
Child populations 317 8

000100000000
Found 23 genes
Child populations 311 6

0001000000000
Found 24 genes
Child populations 309 2

00010000000000
Found 21 genes
Child populations 308 1

000100000000000
Found 21 genes
Child populations 303 5

0001000000000000
Found 22 genes
Child populations 302 1

00010000000000000
Found 22 genes
Child populations 294 8

000100000000000000
Found 17 genes
Child populations 288 6

Found 8 genes
Child populations 195 2

000000100000
Found 8 genes
Child populations 171 24

0000001000001
Found 241 genes
Failed linkage distance condition. Stopping.

0000001000000
Found 5 genes

00001
Found 29 genes
Child populations 580 1

000010
Found 29 genes
Child populations 557 23

0000101
Found 320 genes
Failed linkage distance condition. Stopping.

0000100
Found 25 genes
Child populations 533 24

00001000
Found 22 genes
Child populations 530 3

000010000
Found 22 genes
Child populations 528 2

0000100000
Found 21 genes
Child populations 526 2

00001000000
Found 21 genes
Child populations 525 1

000010000000
Found 20 genes
Child populations 516 9

0000100000000
Found 20 genes
Child populations 513 3

00001000000000
Found 20 genes
Child populations 505 8

000010000000000
Found 19 genes
Child populations 477 28

0000100000000000
Found 24 genes
Child populations 476 1

00001000000000000
Found 24 genes
Child populations 475 1

000010000000000000
Found 24 genes
Child populations 44

In [59]:
# Markers reported by the ICIM run
genes_KC_ICIM = myICIM.get_all_markers()
# A single pre-formatted string prints the same text under the Python 2
# print statement and the Python 3 print() function.
print("Genes found by ICIM %d" % len(genes_KC_ICIM))

Genes found by ICIM 267


In [60]:
# Write the ICIM gene list to disk, one gene per line
with open("../data/03_ICIM_analysis/KC_genes_ICIM.txt", 'w') as out:
    out.writelines(gene + "\n" for gene in genes_KC_ICIM)

In [61]:
# Restrict the filtered frame to the ICIM genes
# (f2_df = "twice filtered" df)
f2_df = f_df.loc[genes_KC_ICIM]

# Calculate TSNE
reload(sct)
from sklearn.manifold import TSNE
myTSNE = sct.TSNE(f2_df, df, metadata)
myTSNE.calc_TSNE(
    method="exact",
    perplexity=10,
    learning_rate=250,
    early_exaggeration=4.0,
    random_state=1,
)

[t-SNE] Computed conditional probabilities for sample 1000 / 2848
[t-SNE] Computed conditional probabilities for sample 2000 / 2848
[t-SNE] Computed conditional probabilities for sample 2848 / 2848
[t-SNE] Mean sigma: 0.100808
[t-SNE] Iteration 50: error = 22.0726243, gradient norm = 0.0517549 (50 iterations in 17.880s)
[t-SNE] Iteration 100: error = 20.6186237, gradient norm = 0.0479926 (50 iterations in 15.477s)
[t-SNE] Iteration 150: error = 20.5975731, gradient norm = 0.0359180 (50 iterations in 15.572s)
[t-SNE] Iteration 200: error = 20.4600786, gradient norm = 0.0500425 (50 iterations in 14.969s)
[t-SNE] Iteration 250: error = 20.4023608, gradient norm = 0.0406528 (50 iterations in 15.267s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 20.402361
[t-SNE] Iteration 300: error = 2.6382573, gradient norm = 0.0023594 (50 iterations in 16.429s)
[t-SNE] Iteration 350: error = 2.4133272, gradient norm = 0.0005949 (50 iterations in 16.707s)
[t-SNE] Iteration 400: err

In [77]:
# Call clusters automatically with HDBSCAN on the t-SNE coordinates;
# the resulting labels are used as metadata for the cells.
clusterer = hdbscan.HDBSCAN(min_cluster_size=30, min_samples=3)
clusterer.fit(myTSNE.X_tsne)
labels_HDBSCAN = clusterer.labels_

In [78]:
# Highest label + 1 is reported as the cluster count.
# A single pre-formatted string prints the same text under the Python 2
# print statement and the Python 3 print() function.
print("Clusters %d" % (max(labels_HDBSCAN) + 1))

Clusters 7


In [79]:
# Columns 0 and 1 of the t-SNE embedding
zero_axis, one_axis = myTSNE.X_tsne[:, 0], myTSNE.X_tsne[:, 1]

# Attach coordinates and HDBSCAN labels to the per-library table, then save it
myTSNE.df_libs['x'] = zero_axis
myTSNE.df_libs['y'] = one_axis
myTSNE.df_libs['hdb_clust'] = labels_HDBSCAN
myTSNE.df_libs.to_csv("../data/03_ICIM_analysis/KC_ICIM_TSNE_data.csv")

In [None]:
from __future__ import division
import sys
import random
import copy
import math
import json

import numpy as np
import pandas as pd
import scipy

%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib as mpl

import seaborn as sns

sys.path.append("../code/ICIM")
import sct
reload(sct)

# Seaborn plot styling
sns.set_style("ticks")
sns.set_context("talk")

# Settings read by save_figure(): target directory, a suffix appended to
# every figure name, and the list of file extensions to write.
output_dir = "out/"
output_suffix = ""
output_formats = [".png", ".pdf"]

def save_figure(fig, name):
    """Save ``fig`` once per extension listed in ``output_formats``.

    Each file is written to ``<output_dir>/<name><output_suffix><ext>``,
    where ``output_dir``, ``output_suffix`` and ``output_formats`` are the
    module-level settings. The path is built with ``os.path.join`` so a
    trailing separator on ``output_dir`` does not produce ``out//name``
    and an empty ``output_dir`` means the current directory rather than
    the filesystem root. The output directory is created when missing.

    Parameters
    ----------
    fig : object with a ``savefig(path)`` method
        The figure to save (e.g. a matplotlib Figure).
    name : str
        Base file name without suffix or extension.

    Returns
    -------
    None
    """
    import os  # local import: keeps this cell independent of other cells' imports

    # savefig fails if the target directory does not exist yet.
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    for output_format in output_formats:
        fig.savefig(os.path.join(output_dir, name + output_suffix + output_format))
    return None

# Set matplotlib's savefig dpi to 300
mpl.rc('savefig', dpi=300)



In [None]:
# Expression table, indexed by gene symbol
df = pd.read_csv("../data/STAWP_Fly_AdultBrain_KC.csv").set_index('symbol')

In [None]:
# Cell metadata, indexed by CellID
meta = pd.read_csv("../data/STAWP_Fly_AdultBrain_KC_metadata.csv").set_index("CellID")

In [None]:
# Preview the first rows of the metadata table
meta.head()

In [None]:
# Generate a list of genes which have no counts across all Kenyon Cells.
# The row totals use pandas' vectorised sum instead of calling Python's
# builtin sum once per row; skipna=False keeps the builtin's behaviour of
# propagating NaN (a row containing NaN never compares equal to 0).
gene_totals = df.sum(axis=1, skipna=False)
genes_to_remove = list(df.index[gene_totals == 0])
# drop() returns a new frame, so df itself is left unchanged
X = df.drop(genes_to_remove)

In [None]:
# Pick overdispersed genes with the sct dispersion helper and subset X to them
myDispersion = sct.dispersion(X)
myDispersion.calc_dispersion() # calculate overdispersion
# NOTE(review): N=500 presumably requests the top 500 hits — confirm in sct
genes_overdispersed = myDispersion.get_hits(N=500)
Y = X.loc[genes_overdispersed.index]

In [None]:
# Calculate PCA
from sklearn.decomposition import PCA
myPCA = sct.PCA(Y, df, 10) # number of significant PCs was determined using shuffling performed elsewhere in parallel fashion
myPCA.pca()
# Transposed PCA result wrapped in a DataFrame; this is the input to the
# PCA-based t-SNE computed in a later cell
Y_pca = pd.DataFrame(myPCA.X_pca.T)

In [None]:
# Inspect the transposed PCA result
myPCA.X_pca.T

In [None]:
# Calculate TSNE on the PCA-reduced data
reload(sct)
from sklearn.manifold import TSNE
myTSNE_PCA = sct.TSNE(Y_pca, df, meta)
# NOTE(review): no random_state is passed here, unlike the ICIM-gene t-SNE
# call later in this notebook — the layout may differ between runs unless
# sct supplies a fixed default; confirm.
myTSNE_PCA.calc_TSNE(perplexity=30, learning_rate=500)

In [None]:
# Plot TSNE colored by age (colorBy and colorMode are both "Age")
fig, ax = plt.subplots(1, 1, figsize=(6,4))
sc = myTSNE_PCA.plot(fig, ax, colorBy="Age", colorMode="Age", cmap="Reds")
ax.set_aspect("equal")
# save_figure(fig, "GH146TSNE_acj6")

In [None]:
reload(sct)
from scipy.cluster import hierarchy
# Build the ICIM analysis object from X and the unfiltered frame df.
# All options are passed by keyword, so their order does not affect the call.
myICIM = sct.ICIM(
    X,
    df,
    TFs=[],
    CSMs=[],
    exclude=[],
    exclude_max=2,
    N=100,
    N_stop=50,
    min_hits=3,
    correlation_cutoff=0.5,
    dropout_rate_low=0.3,
    dropout_rate_high=1.0,
    metric="correlation",
    stop_condition="linkage_dist",
    linkage_dist_stop=0.2,
)

In [None]:
# Run the full ICIM pipeline with progress output enabled
myICIM.calc(verbose=True)

In [None]:
# Markers reported by the ICIM run
genes_KC_ICIM = myICIM.get_all_markers()
# A single pre-formatted string prints the same text under the Python 2
# print statement and the Python 3 print() function.
print("Genes found by ICIM %d" % len(genes_KC_ICIM))

In [None]:
# Write the ICIM gene list to disk, one gene per line
with open("../data/genes_KC_ICIM.txt", 'w') as out:
    out.writelines(gene + "\n" for gene in genes_KC_ICIM)

In [None]:
# Restrict X to the ICIM genes
Y = X.loc[genes_KC_ICIM]

# Calculate TSNE
reload(sct)
from sklearn.manifold import TSNE
myTSNE = sct.TSNE(Y, df, meta)
myTSNE.calc_TSNE(
    method="exact",
    perplexity=10,
    learning_rate=250,
    early_exaggeration=4.0,
    random_state=1,
)

In [None]:
# Plot TSNE colored by age (colorMode="Age")
# Note that some of these plots are shown in Figure 5
fig, ax = plt.subplots(1, 1, figsize=(6,4))
myTSNE.plot(fig, ax, colorMode="Age")

In [None]:
# Columns 0 and 1 of the t-SNE embedding
zero_axis, one_axis = myTSNE.X_tsne[:, 0], myTSNE.X_tsne[:, 1]

In [None]:
# Store the two t-SNE coordinates as columns of the per-library table
myTSNE.df_libs['x_axis'] = zero_axis
myTSNE.df_libs['y_axis'] = one_axis

In [None]:
# Save the per-library table, including the t-SNE coordinates.
# NOTE(review): the file name says "tnse", probably a typo for "tsne"; left
# unchanged because other code may read this exact path — confirm before renaming.
myTSNE.df_libs.to_csv("../data/STAWP_tnse.csv")