In [2]:
import os
import pandas as pd
from pathlib import Path
import numpy as np
import anndata
import time
import matplotlib.pyplot as plt

from abc_atlas_access.abc_atlas_cache.abc_project_cache import AbcProjectCache

In [3]:
# Release to work with; the manifest path below is derived from it.
version = '20250331'
manifest_path = f'releases/{version}/manifest.json'

# Local download location handed to the S3-backed cache.
download_base = Path('../../../data/abc_atlas')
abc_cache = AbcProjectCache.from_s3_cache(download_base)

# Add the manifest to the cache's list of known manifest names, then load it.
# NOTE(review): presumably the append is needed because this release is not
# in the listing the cache already holds -- confirm.
abc_cache.list_manifest_file_names.append(manifest_path)
abc_cache.load_manifest(manifest_path)

releases/20250131/manifest.json
which is newer than the most recent manifest file you have previously been working with
releases/20250331/manifest.json
It is possible that some data files have changed between these two data releases, which will force you to re-download those data files (currently downloaded files will not be overwritten). To continue using releases/20250331/manifest.json, run
type.load_manifest('releases/20250331/manifest.json')


Helper function to format a clickable ENSEMBL id link

In [4]:
def create_clickable_ENSEMBL_id(id):
    """Build an HTML anchor for an ENSEMBL identifier.

    The identifier is placed in an identifiers.org URL (prefixed with
    ``ENSEMBL:``) and is also used as the visible link text. The anchor
    opens in a new tab (``target="_blank"``).

    Parameters
    ----------
    id
        Identifier to embed; it is formatted into the string as-is.

    Returns
    -------
    str
        The ``<a>`` element as a string.
    """
    return f'<a href="https://identifiers.org/ENSEMBL:{id}" target="_blank">{id}</a>'

Helper function to format a clickable NCBI id link

In [5]:
def create_clickable_NCBI_id(id):
    """Build an HTML anchor for an NCBI identifier, or "" when it is missing.

    Parameters
    ----------
    id
        Identifier to embed. It is inserted into the identifiers.org URL
        without any added prefix and is also used as the link text.

    Returns
    -------
    str
        The ``<a>`` element as a string, or an empty string when ``id`` is
        a missing value according to pandas (e.g. ``None`` or ``NaN``).
    """
    # Missing identifiers produce no link at all.
    if pd.isna(id):
        return ""
    return f'<a href="https://identifiers.org/{id}" target="_blank">{id}</a>'

Helper function to write a gene list table to a styled HTML file

In [6]:
def create_output_html( df, file, title ) :
    """Write a gene table to ``file`` as a standalone HTML page.

    The ``gene_identifier`` column is rendered as ENSEMBL links, the
    ``mapped_ncbi_identifier`` column (when present) as NCBI links, and
    ``gene_symbol`` values are wrapped in ``<b>`` tags. Missing values are
    rendered as empty cells and the DataFrame index is not written.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to render. Must contain a ``gene_identifier`` column. The
        caller's frame is left unmodified.
    file : str or path-like
        Destination path; an existing file is overwritten.
    title : str
        Text placed in the page's ``<title>`` element. It is inserted
        verbatim, so it may safely contain ``{``, ``}`` or ``%``.

    Raises
    ------
    KeyError
        If ``df`` has no ``gene_identifier`` column.
    """
    # Work on a copy: replacing identifiers with anchors on the caller's
    # frame would make a second call wrap the links a second time.
    table_df = df.copy()
    table_df['gene_identifier'] = table_df['gene_identifier'].apply(create_clickable_ENSEMBL_id)
    if 'mapped_ncbi_identifier' in table_df.columns :
        table_df['mapped_ncbi_identifier'] = table_df['mapped_ncbi_identifier'].apply(create_clickable_NCBI_id)

    formatter = {'gene_symbol': lambda x: '<b>' + x + '</b>'}
    # escape=False keeps the anchors and <b> tags built above as live markup.
    output = table_df.to_html(index=False, na_rep='',
                              render_links=True,escape=False,
                              classes="mystyle",formatters=formatter)

    # Title and table are substituted in a single .format() call; substituted
    # values are not re-parsed, so braces inside the title are harmless.
    html_string = '''
    <html>
    <head>
    <meta charset="utf-8"/>
    <title>{title}</title>
    <link rel="stylesheet" type="text/css" href="../../_static/simple_style.css"/>
    </head>
    <body>
    {table}
    </body>
    </html>
    '''

    # OUTPUT AN HTML FILE
    # Explicit encoding so the result does not depend on the platform default.
    with open(file, 'w', encoding='utf-8') as f:
        f.write(html_string.format(title=title, table=output))


Create gene list html for WMB-10X

In [13]:
# The dataset name doubles as the metadata directory, the output folder
# and the page-title prefix.
output_directory = 'WMB-10X'

# Load the gene metadata table and order it by gene symbol.
gene = abc_cache.get_metadata_dataframe(directory=output_directory, file_name='gene')
gene = gene.sort_values('gene_symbol')
print(len(gene))

# Write <output_directory>/gene_list.html, creating the folder if needed.
os.makedirs(output_directory, exist_ok=True)
output_file = os.path.join(output_directory, 'gene_list.html')
create_output_html(gene, output_file, f'{output_directory}: gene list')

32285


Create gene list html for WHB-10Xv3

In [14]:
# The dataset name doubles as the metadata directory, the output folder
# and the page-title prefix.
output_directory = 'WHB-10Xv3'

# Load the gene metadata table and order it by gene symbol.
gene = abc_cache.get_metadata_dataframe(directory=output_directory, file_name='gene')
gene = gene.sort_values('gene_symbol')
print(len(gene))

# Write <output_directory>/gene_list.html, creating the folder if needed.
os.makedirs(output_directory, exist_ok=True)
output_file = os.path.join(output_directory, 'gene_list.html')
create_output_html(gene, output_file, f'{output_directory}: gene list')

gene.csv: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.23M/4.23M [00:00<00:00, 8.67MMB/s]


59357


Create gene list html for MERFISH-C57BL6J-638850

In [8]:
# The dataset name doubles as the metadata directory, the output folder
# and the page-title prefix.
output_directory = 'MERFISH-C57BL6J-638850'
gene = abc_cache.get_metadata_dataframe(directory=output_directory, file_name='gene')

# Keep only rows whose identifier does not contain 'Blank', restrict the
# table to the columns shown on the page, and order it by gene symbol.
pred = ['Blank' not in identifier for identifier in gene['gene_identifier']]
gene = gene.loc[
    pred,
    ['gene_identifier','gene_symbol','name','transcript_identifier','mapped_ncbi_identifier'],
].sort_values('gene_symbol')
print(len(gene))

# Write <output_directory>/gene_list.html, creating the folder if needed.
os.makedirs(output_directory, exist_ok=True)
output_file = os.path.join(output_directory, 'gene_list.html')
create_output_html(gene, output_file, f'{output_directory}: gene list')

500


In [18]:
# The dataset name doubles as the metadata directory, the output folder
# and the page-title prefix.
output_directory = 'MERFISH-C57BL6J-638850-imputed'
gene = abc_cache.get_metadata_dataframe(directory=output_directory, file_name='gene')

# Keep only rows whose identifier does not contain 'Blank', restrict the
# table to the columns shown on the page, and order it by gene symbol.
pred = ['Blank' not in identifier for identifier in gene['gene_identifier']]
gene = gene.loc[
    pred,
    ['gene_identifier','gene_symbol','name','mapped_ncbi_identifier'],
].sort_values('gene_symbol')
print(len(gene))

# Write <output_directory>/gene_list.html, creating the folder if needed.
os.makedirs(output_directory, exist_ok=True)
output_file = os.path.join(output_directory, 'gene_list.html')
create_output_html(gene, output_file, f'{output_directory}: gene list')

8460


Create gene list html for Zhuang-ABCA-1

In [19]:
# The dataset name doubles as the metadata directory, the output folder
# and the page-title prefix.
output_directory = 'Zhuang-ABCA-1'

# Load the gene metadata table and order it by gene symbol.
# (A bare `gene` expression used to follow the sort; because it was not the
# last statement of the cell it displayed nothing, so it has been dropped.)
gene = abc_cache.get_metadata_dataframe(directory=output_directory, file_name='gene')
gene = gene.sort_values('gene_symbol')
print(len(gene))

# Write <output_directory>/gene_list.html, creating the folder if needed.
os.makedirs(output_directory, exist_ok=True)
output_file = os.path.join(output_directory, 'gene_list.html')
create_output_html(gene, output_file, f'{output_directory}: gene list')

gene.csv: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84.7k/84.7k [00:00<00:00, 540kMB/s]


1122


In [7]:
# The dataset name doubles as the metadata directory, the output folder
# and the page-title prefix.
output_directory = 'ASAP-PMDBS-10X'

# Load the gene metadata table and order it by gene symbol.
gene = abc_cache.get_metadata_dataframe(directory=output_directory, file_name='gene')
gene = gene.sort_values('gene_symbol')
print(len(gene))

# Write <output_directory>/gene_list.html, creating the folder if needed.
os.makedirs(output_directory, exist_ok=True)
output_file = os.path.join(output_directory, 'gene_list.html')
create_output_html(gene, output_file, f'{output_directory}: gene list')

36601
