In [239]:
import sys
import pandas as pd
import numpy as np
import hdbscan

In [240]:
# Load up code to run ICIM
# Available from : https://github.com/felixhorns/FlyPN
# Add the local ICIM checkout to the import path so that `sct` can be imported.
sys.path.append("../code/ICIM")
import sct
# Re-import sct so that edits to the library are picked up without restarting
# the kernel (`reload` is a builtin on Python 2 only).
reload(sct)
# Turn off pandas' chained-assignment warning (normally 'warn');
# used because of the nature of the ICIM library.
pd.options.mode.chained_assignment = None  # default='warn'

In [241]:
%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib as mpl

import seaborn as sns

In [53]:
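# One-time preprocessing of the original, untransformed data (kept commented out for provenance).
# NOTE(review): the result is written to a file named log2_*, but no log2 transform appears in these lines -- confirm where it was applied.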
#df = pd.read_csv("../data/08_tss_expression_matrix_ICIM/exon_expression_matrix.csv")
#df['symbol'] = df['Unnamed: 0']
#df = df.set_index('symbol')
#df = df.drop('Unnamed: 0', axis=1)
#df.to_csv("../data/08_tss_expression_matrix_ICIM/log2_exon_expression_matrix.csv")

In [242]:
df = pd.read_csv("../data/08_tss_expression_matrix_ICIM/log2_exon_expression_matrix.csv", index_col='symbol')

In [243]:
df.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,2823,2824,2825,2826,2827,2828,2829,2843,2844,2845
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FBgn0000008_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FBgn0000008_11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FBgn0000008_12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.584963,0.0
FBgn0000008_13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,4.523562,0.0
FBgn0000008_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [244]:
# # Set up metadata
# cell_number_info = pd.read_csv("../data/06_tss_data/indexed_barcodes.tsv", sep='\t', header=None)
# cell_number_info.columns = ['cell_number', 'dna_index']
# cell_number_info = cell_number_info.set_index('dna_index')
# # Load up metadata associated with these cells
# metadata = pd.read_csv("../data/02_filtered_kenyon_cells/metadata_kenyon_cells.csv")
# metadata = metadata.set_index("CellID")
# metadata['dna_index'] = metadata.index.str.split('-').str[0]
# metadata = metadata.reset_index().set_index('dna_index')
# metadata = metadata.join(cell_number_info)
# metadata = metadata.reset_index().set_index("cell_number")
# # make csv
# metadata.to_csv("../data/08_tss_expression_matrix_ICIM/tss_metadata.csv")

In [220]:
df.index.unique()

Index([u'FBgn0000008_10', u'FBgn0000008_11', u'FBgn0000008_12',
       u'FBgn0000008_13', u'FBgn0000008_2', u'FBgn0000008_3', u'FBgn0000008_5',
       u'FBgn0000008_6', u'FBgn0000008_7', u'FBgn0000008_8',
       ...
       u'FBgn0285963_15', u'FBgn0285963_2', u'FBgn0285963_3', u'FBgn0285963_4',
       u'FBgn0285963_7', u'FBgn0285963_8', u'FBgn0285963_9', u'FBgn0286028_1',
       u'FBgn0286029_1', u'FBgn0286031_1'],
      dtype='object', name=u'symbol', length=26727)

In [246]:
df.shape

(26727, 2832)

In [247]:
# ICIM takes a filtered dataset as one of its arguments.
# The filtering requirements are taken from an up-to-date tutorial,
# the Hemberg lab's course chapter on Seurat, to mimic the presumed
# level of filtering used for the original Seurat analysis.
# Source https://hemberg-lab.github.io/scRNA.seq.course/seurat-chapter.html#normalization

# f_df ("filtered df"): a deep copy of df that the gene/cell filters are applied to.
f_df = df.copy(deep=True)

In [248]:
# Pull out G-KC cells to cluster independently.
# The cell annotations are read here so that this cell does not depend on a
# `metadata` variable that is only created further down the notebook (on a
# fresh kernel run top-to-bottom that dependency raises NameError).
cell_annotations = pd.read_csv("../data/08_tss_expression_matrix_ICIM/tss_metadata.csv", index_col="cell_number")
# Cell numbers annotated with cell_type_id 8 (the G-KC population selected here).
gkc = list(cell_annotations[cell_annotations.cell_type_id == 8].index)
# Column labels are cell numbers stored as strings, hence the int cast before matching.
f_df = f_df.loc[:,f_df.columns.astype(int).isin(gkc)]

df = df.loc[:, df.columns.astype(int).isin(gkc)]

In [249]:
f_df.shape

(26727, 1633)

In [250]:
df.shape

(26727, 1633)

In [251]:
# Check to make sure each gene is present in at least 3 cells
def check_min_num_cells(row, min_cells=3):
    """Return True if a gene is detected in at least ``min_cells`` cells.

    Parameters
    ----------
    row : iterable of numbers
        Expression values of one gene across cells (one row of the matrix).
    min_cells : int, optional
        Number of strictly positive entries required. Defaults to 3, the
        threshold that was previously hard-coded.

    Returns
    -------
    bool
        True as soon as ``min_cells`` positive entries have been seen,
        False if the row is exhausted first.
    """
    # A non-positive threshold is satisfied by any row, including an empty one.
    if min_cells <= 0:
        return True
    num_pos = 0
    for gene_count in row:
        if gene_count > 0:
            num_pos += 1
            # Stop scanning once the threshold is reached.
            if num_pos >= min_cells:
                return True
    return False
   

# Boolean mask over genes (rows): True where check_min_num_cells accepts the gene.
min_num_cells = f_df.apply(check_min_num_cells, axis='columns')
# Keep only the genes that passed.
f_df = f_df.loc[min_num_cells, :]

In [252]:
# Check to make sure each cell has at least 200 genes
def check_min_num_genes(col, min_genes=200):
    """Return True if a cell has at least ``min_genes`` detected genes.

    Parameters
    ----------
    col : iterable of numbers
        Expression values of one cell across genes (one column of the matrix).
    min_genes : int, optional
        Number of strictly positive entries required. Defaults to 200, the
        threshold that was previously hard-coded.

    Returns
    -------
    bool
        True as soon as ``min_genes`` positive entries have been seen,
        False if the column is exhausted first.
    """
    # A non-positive threshold is satisfied by any column, including an empty one.
    if min_genes <= 0:
        return True
    num_pos = 0
    for gene_count in col:
        if gene_count > 0:
            num_pos += 1
            # Stop scanning once the threshold is reached.
            if num_pos >= min_genes:
                return True
    return False

# Boolean mask over cells (columns): True where check_min_num_genes accepts the cell.
min_num_genes = f_df.apply(check_min_num_genes, axis='index')
# Keep only the cells that passed.
f_df = f_df.loc[:, min_num_genes]

In [253]:
# Drop genes (rows) whose summed expression across cells is not positive.
# A vectorised row sum replaces the per-row Python `sum` inside `apply`.
# skipna=False keeps the earlier treatment of missing values: a row that
# contains NaN sums to NaN, fails the `> 0` test and is dropped.
f_df = f_df.loc[f_df.sum(axis=1, skipna=False) > 0]

In [296]:
f_df.shape

(15588, 1633)

In [304]:
# Load the per-cell annotation table, indexed by cell number.
metadata = pd.read_csv("../data/08_tss_expression_matrix_ICIM/tss_metadata.csv", index_col="cell_number")
# Where a cell number occurs more than once, keep its first row only.
is_first_occurrence = ~metadata.index.duplicated(keep='first')
metadata = metadata.loc[is_first_occurrence]
# Restrict to cells that are columns of df (column labels are cast to int to match).
in_expression_matrix = metadata.index.isin(df.columns.astype(int))
metadata = metadata.loc[in_expression_matrix]
# Select rows by, and in the order of, the columns of the filtered matrix.
metadata = metadata.loc[f_df.columns.astype(int), :]
metadata.index.name = 'cell_number'

In [315]:
metadata.head()

Unnamed: 0_level_0,dna_index,CellID,Age,Gender,Genotype,Replicate,nGene,nUMI,cell_type_id,is_kc
cell_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,ACATACGAGGGCTTCC,ACATACGAGGGCTTCC-DGRP-551_0d_r1,0,Female,DGRP-551,DGRP-551_0d_Rep1,1328,3340.0,8.0,1
2,ACCCACTTCACTCTTA,ACCCACTTCACTCTTA-DGRP-551_0d_r1,0,Female,DGRP-551,DGRP-551_0d_Rep1,1613,4580.0,8.0,1
4,ACTTACTAGTGGTAAT,ACTTACTAGTGGTAAT-DGRP-551_0d_r1,0,Male,DGRP-551,DGRP-551_0d_Rep1,1174,2942.0,8.0,1
5,ACTTGTTCATGGTTGT,ACTTGTTCATGGTTGT-DGRP-551_0d_r1,0,Male,DGRP-551,DGRP-551_0d_Rep1,1410,3620.0,8.0,1
6,ACTTTCATCAATAAGG,ACTTTCATCAATAAGG-DGRP-551_0d_r1,0,Male,DGRP-551,DGRP-551_0d_Rep1,1378,3995.0,8.0,1


In [310]:
# Select overdispersed genes from the filtered matrix using the sct (ICIM) helper.
myDispersion = sct.dispersion(f_df)
myDispersion.calc_dispersion() # calculate overdispersion
# presumably the 500 top-ranked overdispersed genes -- confirm against sct.dispersion.get_hits
genes_overdispersed = myDispersion.get_hits(N=500)
# Y: rows of the filtered matrix for the selected genes.
Y = f_df.loc[genes_overdispersed.index]

In [311]:
# Calculate PCA
# NOTE(review): sklearn's PCA is imported here but not referenced in this cell;
# the decomposition below goes through sct.PCA.
from sklearn.decomposition import PCA
# The third argument (10) is the number of significant PCs, which was
# determined using shuffling performed elsewhere in parallel fashion.
myPCA = sct.PCA(Y, df, 10)
myPCA.pca()
# Transposed copy of the PCA result as a DataFrame
# (presumably components as rows and cells as columns -- confirm against sct.PCA).
Y_pca = pd.DataFrame(myPCA.X_pca.T)

In [314]:
Y_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1623,1624,1625,1626,1627,1628,1629,1630,1631,1632
0,-2.721786,-1.484415,-3.117884,-1.41444,-3.643357,-0.298351,-2.865528,-1.794715,-1.025318,-2.675897,...,3.126858,-1.831003,5.304234,5.052587,12.025104,-0.022078,-0.185952,1.104435,-1.265768,1.612173
1,-1.409384,0.172254,0.564134,-0.785802,-0.013567,0.609561,-2.190181,0.438058,-1.364876,-0.324162,...,0.421149,-0.129703,0.33088,-2.187068,1.058413,1.20479,-1.61158,1.489644,-1.693053,6.11011
2,1.124409,-4.453701,-0.640346,-2.314901,-0.314516,-1.51911,-1.245312,-0.98245,-2.305887,1.678134,...,0.58617,4.444507,1.613338,2.316761,1.50578,3.084725,2.61831,1.486096,0.416008,1.712449
3,0.247597,-1.916844,0.518319,-2.770106,0.471145,-0.307356,0.913596,-3.72289,0.797924,0.073781,...,-0.787814,2.02441,2.780921,2.351031,3.881788,2.486067,-3.556465,1.269041,1.219304,-3.954776
4,1.664582,-1.94979,-2.165724,-1.742308,-1.98972,-1.150563,-0.357715,-0.172018,-0.621676,-2.099463,...,0.992937,-0.211066,0.924185,-0.076872,2.18325,0.959127,1.488387,2.710169,2.262502,-2.387486


In [313]:
# Calculate TSNE
# Re-import sct so that edits to the library are picked up (Python 2 builtin `reload`).
reload(sct)
# NOTE(review): sklearn's TSNE is imported here but not referenced in this cell;
# the embedding below goes through sct.TSNE.
from sklearn.manifold import TSNE
myTSNE_PCA = sct.TSNE(Y_pca, df, metadata)
# NOTE(review): no random seed is visible in this cell -- confirm whether
# sct.TSNE fixes one; otherwise the embedding may differ between runs.
myTSNE_PCA.calc_TSNE(perplexity=30, learning_rate=500)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1633 samples in 0.001s...
[t-SNE] Computed neighbors for 1633 samples in 0.048s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1633
[t-SNE] Computed conditional probabilities for sample 1633 / 1633
[t-SNE] Mean sigma: 0.225017
[t-SNE] Computed conditional probabilities in 0.050s
[t-SNE] Iteration 50: error = 18.0440941, gradient norm = 0.0042151 (50 iterations in 2.184s)
[t-SNE] Iteration 100: error = 18.0412197, gradient norm = 0.0003722 (50 iterations in 1.707s)
[t-SNE] Iteration 150: error = 18.0416584, gradient norm = 0.0003513 (50 iterations in 1.693s)
[t-SNE] Iteration 200: error = 18.0417385, gradient norm = 0.0002401 (50 iterations in 1.669s)
[t-SNE] Iteration 250: error = 18.0416260, gradient norm = 0.0003136 (50 iterations in 1.662s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 18.041626
[t-SNE] Iteration 300: error = 1.6185501, gradient norm = 0.0006149 (50 iterations in 1.685s)

In [316]:
# Call clusters automatically with HDBSCAN on the t-SNE coordinates;
# the resulting per-cell labels are used as cluster metadata.
clusterer = hdbscan.HDBSCAN(min_cluster_size=30, min_samples=3)
clusterer.fit(myTSNE_PCA.X_tsne)
labels_HDBSCAN = clusterer.labels_

In [317]:
# Largest label + 1 is reported as the number of clusters (labels assumed 0-based).
# The call form prints the same text on Python 2 and Python 3.
print("Clusters %d" % (max(labels_HDBSCAN)+1))

Clusters 11


In [318]:
myTSNE_PCA.df_libs.shape

(1633, 10)

In [319]:
myTSNE_PCA.X_tsne.shape

(1633, 2)

In [320]:
# Split the t-SNE embedding into its two coordinate columns.
tsne_xy = myTSNE_PCA.X_tsne
zero_axis, one_axis = tsne_xy[:, 0], tsne_xy[:, 1]
# Attach the coordinates and the HDBSCAN labels to the per-cell table, in this order.
for column_name, column_values in (('x', zero_axis), ('y', one_axis), ('hdb_clust', labels_HDBSCAN)):
    myTSNE_PCA.df_libs[column_name] = column_values
# Persist the annotated table.
myTSNE_PCA.df_libs.to_csv("../data/08_tss_expression_matrix_ICIM/G_KC_PCA_TSNE_data.csv")

In [321]:
# Load the ICIM analysis object with associated data
# See https://github.com/felixhorns/FlyPN/blob/master/analysis/GH146_Fig2.ipynb 
# for full featured example

# Re-import sct so that edits to the library are picked up (Python 2 builtin `reload`).
reload(sct)
# NOTE(review): `hierarchy` is imported here but not referenced in this cell.
from scipy.cluster import hierarchy
# The filtered matrix (f_df) and the G-KC column subset (df) are passed together
# with empty TFs / CSMs / exclude lists. The meaning of the remaining keyword
# settings is defined by sct.ICIM and is not visible here -- see the linked example.
myICIM = sct.ICIM(f_df, df, TFs=[], CSMs=[], exclude=[], N=100,
                  correlation_cutoff=0.5,
                  min_hits=3,
                  exclude_max=2,
                  dropout_rate_low=0.3,
                  dropout_rate_high=1.0,
                  metric="correlation",
                  stop_condition="linkage_dist",
                  N_stop=50,
                  linkage_dist_stop=0.2)

In [322]:
# Run a single ICIM step (argument "0") before launching the full pipeline,
# as a QC / status check of the first iteration.
myICIM.step("0", verbose=True)

Found 4 genes


[]

In [64]:
# Run full ICIM pipeline
myICIM.calc(verbose=True)

Initial step
Found 4 genes



In [65]:
# Collect every marker gene reported by the ICIM run and report how many there are.
genes_KC_ICIM = myICIM.get_all_markers()
# The call form prints the same text on Python 2 and Python 3.
print("Genes found by ICIM %d" % len(genes_KC_ICIM))

Genes found by ICIM 0


In [None]:
df.shape