# Introduction

Fairlie asked for an AnnData matrix of the Mats Ljungman short-read experiments we submitted.

In [1]:
import pandas
from pathlib import Path
import os
import sys
from anndata import AnnData

from pipeline_common import get_gene_id_to_name

In [2]:
# Make the local long-rna-seq-condor checkout importable. The path is kept as
# a plain string because sys.path entries are strings.
_lrsc_checkout = Path('~/proj/long-rna-seq-condor')
LRSC = str(_lrsc_checkout.expanduser())
if sys.path.count(LRSC) == 0:
    sys.path.append(LRSC)
    
from woldrnaseq.madqc import load_rsem_quantifications, load_genomic_quantifications
from woldrnaseq.models import load_experiments, load_library_tables

In [3]:
# Make the local encoded_client checkout importable.
EC = Path('~/proj/encoded_client').expanduser()
# sys.path holds plain strings, so the membership test has to use the string
# form of the path. Comparing the Path object itself never matches, which
# appended a duplicate entry every time this cell was re-run.
if str(EC) not in sys.path:
    sys.path.append(str(EC))
from encoded_client import ENCODED

In [4]:
pwd

'/woldlab/loxcyc/home/diane/proj/encode-202006-jamboree-detrout-rna-sc-pipeline'

In [5]:
# Lookup from gene id to gene name for the 'GRCh38-V24-male' annotation; used
# further down to label the quantified genes with their symbols.
gene_id_map = get_gene_id_to_name('GRCh38-V24-male')

In [6]:
# Biosample accessions to export. Each one is searched on the ENCODE portal
# below to find its RNA-seq experiment and library.
biosamples = [
    'ENCBS375FZQ',
    'ENCBS555BGE',
    'ENCBS777OZT',
    'ENCBS206PUV',
    'ENCBS975OUN',
    'ENCBS365LJQ',
    'ENCBS734JMT',
    'ENCBS648NSQ',
]


In [7]:
# Client for the ENCODE portal.
# NOTE(review): load_netrc() presumably picks up login credentials from the
# user's netrc file — confirm against encoded_client.
server = ENCODED('www.encodeproject.org')
server.load_netrc()

In [8]:
# Fetch the first biosample record to explore what the portal returns.
biosample = server.get_json(biosamples[0])

In [9]:
# List the fields available on a biosample record.
biosample.keys()

dict_keys(['accession', 'aliases', 'schema_version', 'status', 'lab', 'award', 'date_created', 'submitted_by', 'documents', 'references', 'source', 'biosample_ontology', 'genetic_modifications', 'alternate_accessions', 'treatments', 'dbxrefs', 'donor', 'organism', 'internal_tags', 'part_of', 'nih_institutional_certification', '@id', '@type', 'uuid', 'sex', 'age', 'age_units', 'health_status', 'life_stage', 'applied_modifications', 'characterizations', 'parent_of', 'age_display', 'summary', 'perturbed', '@context', 'audit'])

In [10]:
# Check whether the biosample record carries any references.
biosample['references']

[]

In [11]:
# Check whether the biosample record carries any alternate accessions.
biosample['alternate_accessions']

[]

In [12]:
# The biosample record itself does not point at its experiments, so search the
# portal for the accession and look at what comes back.
graph = server.search_jsonld(searchTerm='ENCBS375FZQ')

In [13]:
# Show the @type of every search hit.
[ x['@type'] for x in graph['@graph']]

[['Biosample', 'Item'],
 ['Experiment', 'Dataset', 'Item'],
 ['Experiment', 'Dataset', 'Item']]

In [14]:
# List the fields available on the second search hit (an Experiment).
graph['@graph'][1].keys()

dict_keys(['@id', '@type', 'accession', 'aliases', 'assay_term_name', 'assay_title', 'assembly', 'audit', 'award', 'biosample_ontology', 'biosample_summary', 'date_created', 'dbxrefs', 'description', 'files', 'lab', 'references', 'related_series', 'replicates', 'status', 'submitted_by', '@context'])

In [15]:
# Assay of each hit after the first; the first hit is skipped because it is
# the biosample itself.
[ x['assay_term_name'] for x in graph['@graph'][1:]]

['RNA-seq', 'microRNA-seq']

In [16]:
# For every biosample, find its RNA-seq experiment on the portal and record
# the ENCODE accessions together with the lab's own library id. The five
# lists are filled in lockstep: index i of each list describes the same
# library.
alias_prefix = 'barbara-wold:'
biosample_ids = []
experiment_ids = []
descriptions = []
library_ids = []
cell_ids = []
for accession in biosamples:
    graph = server.search_jsonld(searchTerm=accession)
    for row in graph['@graph']:
        # The search also returns the biosample itself and experiments for
        # other assays; only RNA-seq experiments are wanted. The @type test
        # comes first so that non-experiment rows are never asked for
        # 'assay_term_name'.
        if 'Experiment' not in row['@type'] or row['assay_term_name'] != 'RNA-seq':
            continue

        experiment = server.get_json(row['accession'])
        for f in experiment['files']:
            library = server.get_json(f['library'])

            # Use the alias that actually carries the lab prefix instead of
            # blindly slicing the first alias, which would silently produce
            # a wrong id if the aliases were ordered differently.
            cell_id = next(
                (
                    alias[len(alias_prefix):]
                    for alias in library['aliases']
                    if alias.startswith(alias_prefix)
                ),
                None,
            )
            if cell_id is None:
                raise ValueError(
                    f"{library['accession']} has no alias starting with {alias_prefix!r}"
                )

            biosample_ids.append(accession)
            experiment_ids.append(experiment['accession'])
            descriptions.append(experiment['description'])
            library_ids.append(library['accession'])
            cell_ids.append(cell_id)

            print(experiment['accession'], library['accession'], cell_id)
            # HACK: only the first file is inspected, since I happen to know
            # there are 2 fastq files per experiment.
            break
            
            

ENCSR698RPL ENCLB041DJG SL428079_C1
ENCSR648KDM ENCLB468VTI SL428090_C2
ENCSR128CYL ENCLB972WVN SL428097_C3
ENCSR615EEK ENCLB067SQT SL428098_C4
ENCSR151NGC ENCLB994RBL SL428099_C5
ENCSR245ATJ ENCLB732LUH SL428100_C6
ENCSR355JZC ENCLB928KKR SL428101_C7
ENCSR797RXV ENCLB351QES SL428102_C8


In [17]:
# Analysis directory for this run; the library table is read from here.
root = Path('~/proj/analysis/HL7TLDRXX').expanduser()

In [18]:
# Inspect the experiment descriptions collected by the loop.
descriptions

['HCT116_rep2',
 'PC-3_rep2',
 'Panc1_rep2',
 'K562_rep2',
 'GM12878_rep2',
 'HepG2_rep2',
 'MCF-7_rep2',
 'IMR90_rep2']

In [19]:
# Inspect the library ids (alias with the lab prefix removed) collected by the loop.
cell_ids

['SL428079_C1',
 'SL428090_C2',
 'SL428097_C3',
 'SL428098_C4',
 'SL428099_C5',
 'SL428100_C6',
 'SL428101_C7',
 'SL428102_C8']

In [20]:
# Load the library table that lives in the analysis directory.
libraries = load_library_tables([root/'libraries-single.tsv'])

In [21]:
# Minimal stand-in for an experiment record, handed to
# load_genomic_quantifications below; presumably its 'replicates' entry names
# the libraries to load — confirm against woldrnaseq.
# NOTE: this rebinds `experiment`, which earlier held an ENCODE experiment record.
experiment = pandas.Series({
    'replicates': cell_ids,
})

In [22]:
# Load the 'TPM' quantifications for the selected libraries. In the recorded
# run this produced a 61471 x 8 table (genes x libraries).
counts = load_genomic_quantifications(experiment, libraries, 'TPM')

quantifications None (61471, 8)


In [23]:
# Transpose so that libraries become observations (rows) and genes become
# variables (columns), giving the 8 x 61471 object shown below.
adata = AnnData(counts.T)

In [24]:
# Translate each gene id in the quantification index to its gene symbol, in
# index order. Every id must be present in gene_id_map.
gene_symbols = [gene_id_map[x] for x in counts.index]

In [25]:
# Attach the symbols as a per-gene annotation column.
adata.var['gene_symbols'] = gene_symbols

In [26]:
# Annotate each observation with its ENCODE experiment, biosample and library id.
# NOTE(review): these lists are assigned by position, so this assumes the rows
# of adata.obs are in the same order as cell_ids — confirm that
# load_genomic_quantifications preserves the order of experiment['replicates'].
adata.obs['encode_experiment'] = experiment_ids
adata.obs['encode_biosample'] = biosample_ids
adata.obs['cell_ids'] = cell_ids

In [27]:
# Inspect the assembled AnnData object.
adata

AnnData object with n_obs × n_vars = 8 × 61471 
    obs: 'encode_experiment', 'encode_biosample', 'cell_ids'
    var: 'gene_symbols'

In [28]:
# Inspect the biosample accessions recorded for each library.
biosample_ids

['ENCBS375FZQ',
 'ENCBS555BGE',
 'ENCBS777OZT',
 'ENCBS206PUV',
 'ENCBS975OUN',
 'ENCBS365LJQ',
 'ENCBS734JMT',
 'ENCBS648NSQ']

In [29]:
# Save the matrix as an h5ad file; the path is relative, so it lands in the
# current working directory.
adata.write_h5ad('mats-rep2-illumina-tpm.h5ad')

... storing 'gene_symbols' as categorical
