In [4]:
import loompy
import numpy as np
import pandas as pd
import os
import shutil

In [9]:
# Data can be downloaded from http://scope.aertslab.org
# Data is listed under Publicly Available -> Drosophila -> Brain
# -> Aerts_Fly_AdultBrain_Filtered_57k.loom


# See http://linnarssonlab.org/loompy/apiwalkthrough/index.html
# for an introduction to working with loom files

# Because .loom data is written to disk as it's altered (e.g. during
# exploratory analysis), a fresh copy of the original data is made
# before each run so the analysis can be repeated from a clean state.

og_data = "../data/00_original_data/ORIGINAL_Aerts_Fly_AdultBrain_Filtered_57k.loom"
wrk_data = "../data/01_working_data/WORKING_Aerts_Fly_AdultBrain_Filtered_57k.loom"
if os.path.isfile(og_data):
    # Make sure the destination folder exists; copyfile does not create it.
    os.makedirs(os.path.dirname(wrk_data), exist_ok=True)
    shutil.copyfile(og_data, wrk_data)
elif os.path.isfile(wrk_data):
    # No pristine copy to start from: carry on with the existing working
    # file, but say so, because it may hold edits from an earlier run.
    print(f"WARNING: {og_data} not found; reusing existing working copy {wrk_data}")
else:
    # Neither file exists: stop here with a clear message rather than
    # letting a later cell fail while trying to open the working file.
    raise FileNotFoundError(
        f"Original data not found at {og_data} and no working copy at {wrk_data}"
    )

In [10]:
# Open a connection to the working copy of the data.
# **** Validation is set to False
# Presumably the loom file format has changed since this dataset was generated,
# so validation is turned off in order to open the file.
# The Kenyon-cell subset is written to a new loom file in the filtering step below.
ds = loompy.connect(wrk_data, validate=False)

In [14]:
# Get a sense of the data: look at the top-left 10 x 10 corner of the matrix
ds[0:10, 0:10] 

array([[ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  1.,  0.,  0.,  3.,  1.,  1.,  0.,  0.],
       [25.,  2.,  4.,  7.,  0., 17., 17.,  8.,  2.,  4.],
       [39.,  8., 11.,  4.,  1., 64., 47., 13.,  3.,  2.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [27]:
# Dimensions of the matrix: (rows/GENES, columns/CELLS)
ds.shape

(17473, 56902)

In [15]:
# List the names of the attributes attached to each column/CELL
ds.ca.keys()

['Age',
 'CellID',
 'ClusterID',
 'Clusterings',
 'Embedding',
 'Embeddings_X',
 'Embeddings_Y',
 'Gender',
 'Genotype',
 'RegulonsAUC',
 'Replicate',
 'nGene',
 'nUMI']

In [16]:
# List the names of the attributes attached to each row/GENE
ds.ra.keys()

['ClusterMarkers_0',
 'ClusterMarkers_0_sub_0',
 'ClusterMarkers_0_sub_1',
 'ClusterMarkers_0_sub_10',
 'ClusterMarkers_0_sub_11',
 'ClusterMarkers_0_sub_12',
 'ClusterMarkers_0_sub_13',
 'ClusterMarkers_0_sub_14',
 'ClusterMarkers_0_sub_15',
 'ClusterMarkers_0_sub_16',
 'ClusterMarkers_0_sub_17',
 'ClusterMarkers_0_sub_18',
 'ClusterMarkers_0_sub_19',
 'ClusterMarkers_0_sub_2',
 'ClusterMarkers_0_sub_20',
 'ClusterMarkers_0_sub_21',
 'ClusterMarkers_0_sub_22',
 'ClusterMarkers_0_sub_23',
 'ClusterMarkers_0_sub_24',
 'ClusterMarkers_0_sub_25',
 'ClusterMarkers_0_sub_26',
 'ClusterMarkers_0_sub_27',
 'ClusterMarkers_0_sub_28',
 'ClusterMarkers_0_sub_29',
 'ClusterMarkers_0_sub_3',
 'ClusterMarkers_0_sub_30',
 'ClusterMarkers_0_sub_31',
 'ClusterMarkers_0_sub_32',
 'ClusterMarkers_0_sub_33',
 'ClusterMarkers_0_sub_34',
 'ClusterMarkers_0_sub_35',
 'ClusterMarkers_0_sub_36',
 'ClusterMarkers_0_sub_37',
 'ClusterMarkers_0_sub_38',
 'ClusterMarkers_0_sub_39',
 'ClusterMarkers_0_sub_4',
 'Cl

In [24]:
# Exploring the column/CELL attribute ClusterID,
# which contains the code for the cell type of
# each individual cell. See the csv
# ../data/02_filtered_kenyon_cells/cluster_id_cell_type_definition_at_resolution2.csv

# Cluster ID - Annotation
#  8 - G-KC
# 22 - a/b-KC
# 28 - a'/b'-KC

ds.ca.ClusterID

array([[ 19.,  18.,  19., ...,  24.,  31.,   5.],
       [ 21.,  21.,  21., ...,  12.,  20.,  10.],
       [ 12.,  12.,  13., ...,  20.,  22.,   7.],
       ...,
       [ 79.,  81.,  85., ..., 107., 111.,  -1.],
       [ 53.,  54.,  54., ...,  83.,  90.,   7.],
       [  1.,   1.,   5., ...,  21.,  14.,   2.]])

In [26]:
# Shape of the ClusterID attribute: one row per cell, one column per clustering
ds.ca.ClusterID.shape

(56902, 9)

In [29]:
# The loom file contains 56902 cells.
# Every cell carries a ClusterID / cell type for each of the
# 9 separate clusterings (cell-type callings) that were performed.
# We work with the first clustering only, i.e. entry 0 of each row.

cell_type_id = [clusterings[0] for clusterings in ds.ca.ClusterID]

# Store the result as a new column/CELL attribute
ds.ca.cell_type_id = cell_type_id

In [33]:
# Create a column/CELL attribute that flags Kenyon cells,
# stored as 1 (Kenyon cell) / 0 (any other cell type), so that
# only the three Kenyon cell subtypes can be pulled out later.

# Cluster IDs of the Kenyon cell subtypes: 8 = G-KC, 22 = a/b-KC, 28 = a'/b'-KC
kc_cluster_ids = [8, 22, 28]

# Membership is tested for all cells in one vectorised call. This also avoids
# a loop variable named cell_type_id, which would overwrite the list of the
# same name built in the previous cell.
is_kc = np.isin(ds.ca.cell_type_id, kc_cluster_ids).astype(int)

ds.ca.is_kc = is_kc

In [42]:
# Generating the metadata df
# The attributes associated with each column/CELL are the metadata
# we will want to use to explore the data visually later.

# Only some of the column attributes are of interest to us
keep = ['CellID', 'Age', 'Gender', 'Genotype', 'Replicate',
        'nGene', 'nUMI', 'cell_type_id', 'is_kc']

# Build the frame from the wanted attributes only. Reading every attribute
# would also convert the multi-dimensional ones (e.g. ClusterID, which is
# cells x 9) into Python lists just to throw them away again.
metadf = pd.DataFrame({key: list(ds.ca[key]) for key in keep})

# Drop all non-Kenyon cells and index the remaining rows by cell ID
metadf = metadf[metadf.is_kc == 1].set_index("CellID")

In [43]:
# Preview the first rows of the Kenyon-cell metadata
metadf.head()

Unnamed: 0_level_0,Age,Gender,Genotype,Replicate,nGene,nUMI,cell_type_id,is_kc
CellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ACATACGAGGGCTTCC-DGRP-551_0d_r1,0,Female,DGRP-551,DGRP-551_0d_Rep1,1328,3340.0,8.0,1
ACCCACTTCACTCTTA-DGRP-551_0d_r1,0,Female,DGRP-551,DGRP-551_0d_Rep1,1613,4580.0,8.0,1
ACCGTAAAGATAGTCA-DGRP-551_0d_r1,0,Male,DGRP-551,DGRP-551_0d_Rep1,1466,4349.0,22.0,1
ACTTACTAGTGGTAAT-DGRP-551_0d_r1,0,Male,DGRP-551,DGRP-551_0d_Rep1,1174,2942.0,8.0,1
ACTTGTTCATGGTTGT-DGRP-551_0d_r1,0,Male,DGRP-551,DGRP-551_0d_Rep1,1410,3620.0,8.0,1


In [44]:
# Almost 3000 Kenyon cells in total across the three subtypes,
# consistent with the number of KCs listed in the manuscript
metadf.shape

(2848, 8)

In [45]:
# Save the Kenyon-cell metadata (indexed by CellID) as csv
metadf.to_csv("../data/02_filtered_kenyon_cells/metadata_kenyon_cells.csv")

In [46]:
# Write the Kenyon cells, and only those, to a loom file of their own.
# loompy.new creates a new, empty, loom file to receive them.
with loompy.new("../data/02_filtered_kenyon_cells/ORIGINAL_kenyon_cells.loom") as kc_loom:
    kc_mask = ds.ca.is_kc != 0  # True for every cell flagged as a Kenyon cell
    # Only the view from each scan step is needed; the other two items are ignored
    for _, _, batch in ds.scan(items=kc_mask, axis=1):
        kc_loom.add_columns(batch.layers, col_attrs=batch.ca, row_attrs=batch.ra)

In [47]:
# Close the connection to the working copy of the full dataset (ds);
# from here on only the Kenyon-cell loom file is used
ds.close()

In [59]:
# The column/CELL and row/GENE attributes still hold data that we
# either don't need or have already saved as metadata.
# These attributes need to be removed so that the data can be
# stored in a flat, tabular format (csv).

In [48]:
# Connect to the newly written Kenyon-cell loom file (validation turned off)
ds2 = loompy.connect("../data/02_filtered_kenyon_cells/ORIGINAL_kenyon_cells.loom", validate=False)

In [49]:
# Look at the top-left 10 x 10 corner of the Kenyon-cell matrix
ds2[0:10, 0:10]

array([[  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,   0.],
       [ 10.,   6.,   9.,  10.,   6.,   5.,   7.,   7.,   9.,   7.],
       [ 53., 102.,  84.,  46.,  72.,  80., 102., 121.,   8.,  59.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.]])

In [50]:
# Strip every row/GENE attribute apart from 'Gene'
row_attrs_to_drop = ds2.ra.keys()
row_attrs_to_drop.remove('Gene')  # keep the gene names

for attr_name in row_attrs_to_drop:
    del ds2.ra[attr_name]

In [52]:
# Strip every column/CELL attribute apart from 'CellID'
col_attrs_to_drop = ds2.ca.keys()
col_attrs_to_drop.remove('CellID')  # keep the cell identifiers

for attr_name in col_attrs_to_drop:
    del ds2.ca[attr_name]

In [53]:
# Display the stripped-down dataset: only 'Gene' and 'CellID' remain as attributes
ds2

0,1,2,3,4,5,6,7,8,9,10,11,12
,CellID,ACATACGAGGGCTTCC-DGRP-551_0d_r1,ACCCACTTCACTCTTA-DGRP-551_0d_r1,ACCGTAAAGATAGTCA-DGRP-551_0d_r1,ACTTACTAGTGGTAAT-DGRP-551_0d_r1,ACTTGTTCATGGTTGT-DGRP-551_0d_r1,ACTTTCATCAATAAGG-DGRP-551_0d_r1,AGATCTGCAACAACCT-DGRP-551_0d_r1,AGATCTGTCTACTCAT-DGRP-551_0d_r1,AGCGGTCCATTTCACT-DGRP-551_0d_r1,AGCTCTCGTTTGACTG-DGRP-551_0d_r1,...
Gene,,,,,,,,,,,,...
128up,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...
140up,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...
14-3-3epsilon,,10.0,6.0,9.0,10.0,6.0,5.0,7.0,7.0,9.0,7.0,...
14-3-3zeta,,53.0,102.0,84.0,46.0,72.0,80.0,102.0,121.0,8.0,59.0,...
18SrRNA:CR41548,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...
18SrRNA:CR45838,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...
18SrRNA:CR45841,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...
18SrRNA-Psi:CR41602,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...


In [54]:
# With the extra attributes removed, the data is compatible with tabular formats.
# Export it to a text file (run help(ds2.export) for details on the export).
ds2.export("../data/02_filtered_kenyon_cells/CLEAN_kenyon_cells.tab")

In [77]:
# Close the connection to the Kenyon-cell loom file
ds2.close()

In [76]:
# Read the freshly exported table back in (tab-separated)
kc_table = pd.read_csv("../data/02_filtered_kenyon_cells/CLEAN_kenyon_cells.tab", sep='\t')

# The export leaves a few artefacts behind that need cleaning up.
# 'Unnamed: 0' holds the gene names: it becomes the index under the name
# 'symbol', which is the index column name a downstream analysis package
# expects. The 'CellID' column is not needed.
kc_table = (
    kc_table
    .drop(columns="CellID")
    .rename(columns={'Unnamed: 0': 'symbol'})
    .set_index('symbol')
)

# Discard any remaining "unnamed" (empty) columns
is_unnamed = kc_table.columns.str.contains('^Unnamed')

# Finally drop the empty row labelled "Gene"
df2 = kc_table.loc[:, ~is_unnamed].drop(index="Gene")

In [78]:
# Preview the cleaned table: genes ('symbol') as rows, cells as columns
df2.head()

Unnamed: 0_level_0,ACATACGAGGGCTTCC-DGRP-551_0d_r1,ACCCACTTCACTCTTA-DGRP-551_0d_r1,ACCGTAAAGATAGTCA-DGRP-551_0d_r1,ACTTACTAGTGGTAAT-DGRP-551_0d_r1,ACTTGTTCATGGTTGT-DGRP-551_0d_r1,ACTTTCATCAATAAGG-DGRP-551_0d_r1,AGATCTGCAACAACCT-DGRP-551_0d_r1,AGATCTGTCTACTCAT-DGRP-551_0d_r1,AGCGGTCCATTTCACT-DGRP-551_0d_r1,AGCTCTCGTTTGACTG-DGRP-551_0d_r1,...,GGCAATTCATGGATGG-w1118_15d_r1,GTCCTCAGTTGCGCAC-w1118_15d_r1,GTGCAGCGTACCGTAT-w1118_15d_r1,TATGCCCTCTATGTGG-w1118_15d_r1,TCATTACAGAGGTTGC-w1118_15d_r1,TCGAGGCAGCTATGCT-w1118_15d_r1,TTCTACATCAGTGTTG-w1118_15d_r1,AAGTCTGTCTGGTTCC-w1118_30d_r1,ACATACGTCGGATGGA-w1118_30d_r1,GAAATGACAAGACACG-w1118_30d_r1
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
128up,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
140up,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
14-3-3epsilon,10.0,6.0,9.0,10.0,6.0,5.0,7.0,7.0,9.0,7.0,...,5.0,2.0,1.0,5.0,1.0,1.0,6.0,3.0,0.0,3.0
14-3-3zeta,53.0,102.0,84.0,46.0,72.0,80.0,102.0,121.0,8.0,59.0,...,22.0,4.0,8.0,27.0,10.0,7.0,22.0,9.0,6.0,19.0
18SrRNA:CR41548,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
# Save the cleaned, untransformed table as csv
df2.to_csv("../data/02_filtered_kenyon_cells/CLEAN_NOTRANSFORM_kenyon_cells.csv")

In [81]:
# Save a log2(x + 1) transformed copy of the table as csv.
# The transform is applied to the whole frame in one vectorised call instead
# of DataFrame.applymap, which is deprecated in recent pandas releases and
# calls a Python function once per element.
np.log2(df2 + 1).to_csv("../data/02_filtered_kenyon_cells/CLEAN_LOG2TRANSFORM_kenyon_cells.csv")