In [2]:
import pandas as pd
import numpy as np
import sys
import hdbscan

In [212]:
# Load up code to run ICIM
# Available from : https://github.com/felixhorns/FlyPN
# Make the local ICIM checkout importable; the path is relative to the notebook's working directory
sys.path.append("../code/ICIM")
import sct
# Re-execute the already-imported module so edits made to sct on disk are picked up without a kernel restart
reload(sct)
# Used because of nature of ICIM library
# Setting this option to None turns off pandas' chained-assignment warning for the whole session
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# Read the pruned expression table (path below) with the 'symbol' column as the row index,
# so the gene lists can be pulled out of df.index
expression_csv = "../data/02_filtered_kenyon_cells/CLEAN_LOG2TRANSFORM_kenyon_cells.csv"
df = pd.read_csv(expression_csv, index_col='symbol')

In [214]:
# ended up with a random row nan whose entire row is 0 counts - FAKE GENE
# Select the NaN-indexed rows once and sum over whatever is there, so this cell
# also runs cleanly when no such row exists (e.g. when re-run after the drop below)
nan_rows = df[df.index.isna()]
print('Shape of df with nan index : ' + str(nan_rows.shape))
print('total sum of expression values : ' + str(nan_rows.values.sum()))
print('drop row')
# Keep every row whose gene symbol is not NaN
df = df[~df.index.isna()]

Shape of df with nan index : (1, 2848)
total sum of expression values : 0.0
drop row


In [216]:
df.shape

(17472, 2848)

In [224]:
# Will generate each list of genes separately in case it is necessary to analyze them separately.
# After creation will compile them all into a separate list

gene_list = []

In [4]:
# Nicotinic receptor component subtype genes
# Collect the gene symbols (index labels) that contain 'nAChR'; missing symbols count as non-matches
# These genes are discussed at http://flybase.org/reports/FBgg0000202.html
nAChR = df.index[df.index.str.contains('nAChR', na=False)].tolist()

In [226]:
# Muscarinic receptor component subtype genes
# Collect the gene symbols (index labels) that contain 'mAChR'; missing symbols count as non-matches
# These genes are discussed at http://flybase.org/reports/FBgg0000187.html
mAChR = df.index[df.index.str.contains('mAChR', na=False)].tolist()

In [227]:
# Defective proboscis extension response genes
# Collect the gene symbols (index labels) that contain 'dpr'; missing symbols count as non-matches
# These genes are discussed at http://flybase.org/reports/FBgg0000529.html
dpr = df.index[df.index.str.contains('dpr', na=False)].tolist()

In [228]:
# Dpr interacting proteins
# Two batches: symbols containing 'DIP-', followed by the two unnamed genes ['CG31814', 'CG45781']
# These genes are discussed at http://flybase.org/reports/FBgg0000530
dip_named = df.index[df.index.str.contains('DIP-', na=False)].tolist()
dip_unnamed = df.index[df.index.isin(['CG31814', 'CG45781'])].tolist()
dip = dip_named + dip_unnamed

In [229]:
# Collect the gene symbols (index labels) that contain 'Dscam'; missing symbols count as non-matches
dscam = df.index[df.index.str.contains('Dscam', na=False)].tolist()

In [230]:
# Build the combined gene list in one assignment (same order as before: nAChR, mAChR, dpr, dip, dscam).
# Rebinding instead of extending in place keeps the cell idempotent: re-running it
# no longer appends a second copy of every gene to gene_list.
gene_list = nAChR + mAChR + dpr + dip + dscam

In [231]:
# TSNE Projection will be completed with # genes:
len(gene_list)

49

In [266]:
# Generate a filtered df based on these genes
# Select the gene rows first and copy only that subset, instead of deep-copying
# the whole expression table and then discarding most of it
f_df = df.loc[gene_list].copy(deep=True)

In [267]:
f_df.shape

(49, 2848)

In [268]:
# Check to make sure each gene is present in at least 3 cells
def check_min_num_cells(row, min_cells=3):
    """Return True if at least ``min_cells`` values in ``row`` are > 0.

    Intended for ``DataFrame.apply(..., axis=1)``, where ``row`` holds one
    gene's expression values across cells.

    Parameters
    ----------
    row : iterable of numbers
        Expression values for one gene. Values that do not compare greater
        than zero (including NaN) are not counted.
    min_cells : int, optional
        Minimum number of positive values required. Defaults to 3, the
        threshold that was previously hard-coded.

    Returns
    -------
    bool
        True once ``min_cells`` positive values have been seen, else False.
    """
    # "At least zero cells" is trivially satisfied, even for an empty row
    if min_cells <= 0:
        return True
    num_pos = 0
    for gene_count in row:
        if gene_count > 0:
            num_pos += 1
            # Stop scanning as soon as the threshold is reached
            if num_pos >= min_cells:
                return True
    return False
   

# Boolean mask over genes (rows): True where check_min_num_cells accepts the row
min_num_cells = f_df.apply(check_min_num_cells, axis=1)
# Keep only the rows selected by the mask
f_df = f_df.loc[min_num_cells]

In [270]:
f_df.shape

(47, 2848)

In [274]:
# This is a cell which has no expression for the associated synaptic organizing genes
# It has to be removed from both the unfiltered/filtered df AND the metadata df
# Vectorized per-cell (column) totals; skipna=False keeps NaN handling the same as
# the builtin sum (a column containing NaN does not count as zero)
cell_totals = f_df.sum(axis=0, skipna=False)
cell_totals[cell_totals == 0].index

Index([u'CTACACCTCAGAGGTG-DGRP-551_9d_r1'], dtype='object')

In [None]:
# remove any cells which have zero counts for all genes in gene_list
# Derive the cells to drop from the data rather than hard-coding a cell ID:
# the drop then matches its description and is a no-op when the cell is re-run.
cell_totals = f_df.sum(axis=0, skipna=False)
empty_cells = cell_totals[cell_totals == 0].index
df = df.drop(empty_cells, axis=1)
f_df = f_df.drop(empty_cells, axis=1)

In [284]:
df.shape

(17472, 2847)

In [256]:
f_df.shape

(47, 2847)

In [285]:
# load up metadata that corresponds to full df, indexed directly by the "CellID" column
metadata_csv = "../data/02_filtered_kenyon_cells/metadata_kenyon_cells.csv"
metadata = pd.read_csv(metadata_csv, index_col="CellID")

In [None]:
# Keep metadata in step with the expression table: retain only the cells that are
# still present as columns of df. Unlike dropping a hard-coded cell ID, this is
# safe to re-run and preserves the existing row order of metadata.
metadata = metadata[metadata.index.isin(df.columns)]

In [289]:
metadata.shape

(2847, 8)

In [290]:
# Calculate TSNE
# Reload sct so any edits to the module are picked up before building the embedding
reload(sct)
# NOTE(review): this sklearn TSNE name is not referenced in this cell (sct.TSNE is used below) — confirm whether it is still needed
from sklearn.manifold import TSNE
# Build the sct TSNE helper from the filtered gene table, the full expression table and the per-cell metadata
# (presumably f_df supplies the features for the embedding — verify against sct)
myTSNE = sct.TSNE(f_df, df, metadata)
# random_state is pinned so the run can be repeated; the other arguments are tuning values passed through to sct
myTSNE.calc_TSNE(perplexity=10, learning_rate=250, early_exaggeration=4.0, method="exact", random_state=1)

[t-SNE] Computed conditional probabilities for sample 1000 / 2847
[t-SNE] Computed conditional probabilities for sample 2000 / 2847
[t-SNE] Computed conditional probabilities for sample 2847 / 2847
[t-SNE] Mean sigma: 0.116605
[t-SNE] Iteration 50: error = 25.9108090, gradient norm = 0.0070685 (50 iterations in 14.823s)
[t-SNE] Iteration 100: error = 25.7676472, gradient norm = 0.0004148 (50 iterations in 14.791s)
[t-SNE] Iteration 150: error = 25.7689364, gradient norm = 0.0001069 (50 iterations in 14.654s)
[t-SNE] Iteration 200: error = 25.7696320, gradient norm = 0.0000370 (50 iterations in 14.673s)
[t-SNE] Iteration 250: error = 25.7699342, gradient norm = 0.0000196 (50 iterations in 14.973s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 25.769934
[t-SNE] Iteration 300: error = 3.3168265, gradient norm = 0.0016014 (50 iterations in 14.786s)
[t-SNE] Iteration 350: error = 3.0080594, gradient norm = 0.0006648 (50 iterations in 14.660s)
[t-SNE] Iteration 400: err

In [291]:
# Using HDBSCAN to call clusters automatically, generate metadata for them
# Configure the clusterer, fit it on the t-SNE coordinates, then read back one label per cell
clusterer = hdbscan.HDBSCAN(min_cluster_size=30, min_samples=3)
clusterer.fit(myTSNE.X_tsne)
labels_HDBSCAN = clusterer.labels_

In [292]:
# Cluster labels start at 0, so the largest label + 1 is the number of clusters.
# A single-argument, parenthesized print gives the same text ("Clusters N") under
# Python 2 and Python 3, matching the print(...) calls used earlier in the notebook.
print("Clusters %d" % (max(labels_HDBSCAN) + 1))

Clusters 32


In [293]:
# Split the embedding into its first and second columns
zero_axis, one_axis = myTSNE.X_tsne[:, 0], myTSNE.X_tsne[:, 1]
# Attach the coordinates and the HDBSCAN cluster label to the per-cell table, in this column order
for column_name, column_values in (('x', zero_axis), ('y', one_axis), ('hdb_clust', labels_HDBSCAN)):
    myTSNE.df_libs[column_name] = column_values


In [298]:
def gen_kc_type(row):
    """Translate a row's ``cell_type_id`` into its KC subtype label.

    The id is converted with ``int()`` before lookup. Returns 'G-KC' for 8,
    'a/b-KC' for 22 and "a'/b'-KC" for 28; any other id yields None.
    """
    subtype_by_id = {8: 'G-KC', 22: 'a/b-KC', 28: "a'/b'-KC"}
    return subtype_by_id.get(int(row.cell_type_id))
    
# Label every row of the per-cell table with its subtype via gen_kc_type (None where the id is not one of the mapped ones)
myTSNE.df_libs['subtype'] = myTSNE.df_libs.apply(gen_kc_type, axis=1)
# Write the per-cell table, now including x, y, hdb_clust and subtype, to CSV for downstream use
myTSNE.df_libs.to_csv("../data/05_synapse_organizing_gene_TSNE_gen/KC_synapse_organizing_gene_TSNE_data.csv")

In [299]:
myTSNE.df_libs.columns

Index([u'Age', u'Gender', u'Genotype', u'Replicate', u'nGene', u'nUMI',
       u'cell_type_id', u'is_kc', u'x', u'y', u'hdb_clust', u'subtype'],
      dtype='object')