# 0. Setup

Import packages and specify any important functions here.

In [1]:
# import standard python packages
import pandas as pd
import subprocess, os, dill, sys

# add the utils and env directories to the path
sys.path.append('../../utils/')
sys.path.append('../../env/')

# import functions from utils directory files
from string_functions import *
from biofile_handling import *

# import paths to software installs from env
from install_locs import *

In [2]:
################
# general info #
################

# Specify the name of the species folder in Amazon S3
species = 'Mus_musculus'

# Specify any particular identifying conditions, eg tissue type:
conditions = 'adolescentbrain'

################
################

# Create the docket that tracks every file produced for this species/conditions pair.
# Per the recorded output, construction reports the local output directory
# (here .../output/Mmus_adolescentbrain/) that later files are saved into.
sample_BFD = BioFileDocket(species, conditions)

/home/ec2-user/glial-origins/output/Mmus_adolescentbrain/ already exists
Files will be saved into /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/


# 1. Download and describe data

This notebook collects data from the [Han et al. 2018](https://www.sciencedirect.com/science/article/pii/S0092867418301168#sec4) mouse cell atlas.

## Dataset description

- Data can be found at the [Linnarsson Lab Website](http://mousebrain.org/adolescent/).
- This notebook collects data from all samples in LOOM format for the purposes of data analysis and exploration.

In [3]:
# Specify url and other variables
# Genome FASTA and GFF annotation both come from the same NCBI assembly (GCF_000001635.23, GRCm38.p3)
genome_fasta_url = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.23_GRCm38.p3/GCF_000001635.23_GRCm38.p3_genomic.fna.gz'
genome_version = 'GRCm38.p3'

annot_url = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.23_GRCm38.p3/GCF_000001635.23_GRCm38.p3_genomic.gff.gz'
# A single LOOM file (sample 10X05_1) hosted in the linnarsson-lab-chromium bucket
loom_url = 'https://storage.googleapis.com/linnarsson-lab-chromium/10X05_1/10X05_1.loom'

###########
# runtime #
###########

# Download protocol passed to every file object created below
protocol = 'curl'

# Per the recorded output, each constructor infers the file name from the URL,
# reuses a copy already present in the output directory, and unzips .gz files.
genome_fasta = GenomeFastaFile(
    sampledict = sample_BFD.sampledict,
    version = genome_version,
    url = genome_fasta_url,
    protocol = protocol
)

# The annotation is linked to the genome object it describes
annot = GenomeGffFile(
    sampledict = sample_BFD.sampledict,
    reference_genome = genome_fasta,
    url = annot_url,
    protocol = protocol
)

# The LOOM file is linked to both the genome and its annotation
loom = LoomFile(
    sampledict = sample_BFD.sampledict,
    url = loom_url,
    protocol = protocol,
    reference_genome = genome_fasta,
    reference_annot = annot
)

# Derive tab-separated companion files from the LOOM file; the recorded output
# shows a GxcFile (.asgxc.tsv), an IdmmFile (.idmm.tsv) and a CellAnnotFile
# (.cellannot.tsv), each reused when it already exists.
gxc = loom.to_gxc()
loom_idmm = loom.get_idmm(id_type = 'ensembl_id')
loom_cellannot = loom.get_cellannot()

# Names under which each file object is registered on the docket
keyfiles = {
    'annot': annot,
    'genome_fasta': genome_fasta,
    'gxc': gxc,
    'loom': loom,
    'loom_idmm': loom_idmm,
    'loom_cellannot': loom_cellannot
}

# After this call each key is available as an attribute (e.g. sample_BFD.gxc),
# as shown by the vars() dump in the output.
sample_BFD.add_keyfiles(keyfiles)
display(vars(sample_BFD))

inferring file name as GCF_000001635.23_GRCm38.p3_genomic.fna.gz
file GCF_000001635.23_GRCm38.p3_genomic.fna.gz already exists at /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/GCF_000001635.23_GRCm38.p3_genomic.fna.gz
file GCF_000001635.23_GRCm38.p3_genomic.fna.gz unzipped and object renamed to GCF_000001635.23_GRCm38.p3_genomic.fna
inferring file name as GCF_000001635.23_GRCm38.p3_genomic.gff.gz
file GCF_000001635.23_GRCm38.p3_genomic.gff.gz already exists at /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/GCF_000001635.23_GRCm38.p3_genomic.gff.gz
file GCF_000001635.23_GRCm38.p3_genomic.gff.gz unzipped and object renamed to GCF_000001635.23_GRCm38.p3_genomic.gff
inferring file name as 10X05_1.loom
file 10X05_1.loom already exists at /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/10X05_1.loom


gzip: /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/GCF_000001635.23_GRCm38.p3_genomic.fna: unknown suffix -- ignored
gzip: /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/GCF_000001635.23_GRCm38.p3_genomic.gff: unknown suffix -- ignored


GxcFile already exists at /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/10X05_1.loom.asgxc.tsv
IdmmFile already exists at /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/10X05_1.loom.idmm.tsv
CellAnnotFile already exists at /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/10X05_1.loom.cellannot.tsv


{'species': 'Mus_musculus',
 'conditions': 'adolescentbrain',
 'directory': '/home/ec2-user/glial-origins/output/Mmus_adolescentbrain/',
 'files': {},
 'metadata': <biofile_handling.metadata_object at 0x7fa67bdafee0>,
 'annot': <biofile_handling.GenomeGffFile at 0x7fa5ca9378b0>,
 'genome_fasta': <biofile_handling.GenomeFastaFile at 0x7fa5ca936440>,
 'gxc': <biofile_handling.GxcFile at 0x7fa5ca9348e0>,
 'loom': <biofile_handling.LoomFile at 0x7fa5ca936470>,
 'loom_idmm': <biofile_handling.IdmmFile at 0x7fa67bdaf970>,
 'loom_cellannot': <biofile_handling.CellAnnotFile at 0x7fa5c26633d0>}

# 2. Load in the gxc matrix and get gene names

In [4]:
# Preview only the first 10 rows of the gene-by-cell matrix. The header row
# still lists every cell barcode, so the column count is complete even though
# the row count is not.
genes_matrix = pd.read_csv(sample_BFD.gxc.path, sep = '\t', nrows = 10)
display(genes_matrix)

# Read just the first column (gene_name) for every row of the matrix
gxc_genes_list = pd.read_csv(sample_BFD.gxc.path, sep = '\t', usecols=[0])
display(gxc_genes_list)

# One column holds gene names; every other column is a cell
sample_BFD.metadata.add('num_cells', len(genes_matrix.columns) - 1)
# Count genes from the full gene column: genes_matrix is truncated by nrows = 10,
# so len(genes_matrix) would always record 10 rather than the real gene count.
sample_BFD.metadata.add('num_genes', len(gxc_genes_list))

Unnamed: 0,gene_name,10X05_1:AAACATACTTTACC-1,10X05_1:AAACATTGCCTCCA-1,10X05_1:AAACATTGCTTACT-1,10X05_1:AAACATTGGTACGT-1,10X05_1:AAACCGTGACTCAG-1,10X05_1:AAACCGTGCGTAGT-1,10X05_1:AAACGCACACACGT-1,10X05_1:AAACGCACGTCGAT-1,10X05_1:AAACGGCTGTGCTA-1,...,10X05_1:TTTCTACTCGCTAA-1,10X05_1:TTTCTACTCTGACA-1,10X05_1:TTTCTACTTGTGCA-1,10X05_1:TTTCTACTTTCTCA-1,10X05_1:TTTCTACTTTGGCA-1,10X05_1:TTTGACTGTTCAGG-1,10X05_1:TTTGCATGCAGAGG-1,10X05_1:TTTGCATGGCGATT-1,10X05_1:TTTGCATGTACTGG-1,10X05_1:TTTGCATGTTCCGC-1
0,Xkr4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Gm1992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Gm37381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Rp1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Rp1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Sox17,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Gm37323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Mrpl15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
8,Lypla1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
9,Gm37988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,gene_name
0,Xkr4
1,Gm1992
2,Gm37381
3,Rp1
4,Rp1
...,...
27993,AC168977.1
27994,PISD
27995,DHRSX
27996,Vmn2r122


# 3. Get mapping identifiers

In [5]:
# Zero-based position of the free-text attributes column in the GFF table
attributes_column = 8

# Read the GFF annotation as a headerless, tab-delimited table, ignoring '#' lines
models = pd.read_csv(
    sample_BFD.annot.path,
    sep = '\t',
    header = None,
    comment = '#'
)
display(models)

# Show one attribute string (row label 3) to check how its fields are laid out
display(models.loc[3, attributes_column])

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,NC_000067.6,RefSeq,region,1,195471971,.,+,.,ID=id0;Dbxref=taxon:10090;Name=1;chromosome=1;...
1,NC_000067.6,BestRefSeq%2CGnomon,gene,3199731,3671742,.,-,.,"ID=gene0;Dbxref=GeneID:497097,MGI:MGI:3528744;..."
2,NC_000067.6,Gnomon,mRNA,3199731,3671742,.,-,.,"ID=rna0;Parent=gene0;Dbxref=GeneID:497097,Genb..."
3,NC_000067.6,Gnomon,exon,3670552,3671742,.,-,.,"ID=id1;Parent=rna0;Dbxref=GeneID:497097,Genban..."
4,NC_000067.6,Gnomon,exon,3421702,3421901,.,-,.,"ID=id2;Parent=rna0;Dbxref=GeneID:497097,Genban..."
...,...,...,...,...,...,...,...,...,...
2284235,NC_005089.1,RefSeq,region,15451,15509,.,+,.,ID=id1169986;Note=ETAS1%3B extended terminatio...
2284236,NC_005089.1,RefSeq,region,15515,15558,.,+,.,ID=id1169987;Note=ETAS2%3B extended terminatio...
2284237,NC_005089.1,RefSeq,region,16035,16058,.,+,.,ID=id1169988;Note=CSB1%3B conserved sequencing...
2284238,NC_005089.1,RefSeq,region,16089,16104,.,+,.,ID=id1169989;Note=CSB2%3B conserved sequencing...


'ID=id1;Parent=rna0;Dbxref=GeneID:497097,Genbank:XM_006495550.2,MGI:MGI:3528744;gbkey=mRNA;gene=Xkr4;product=X Kell blood group precursor related family member 4%2C transcript variant X1;transcript_id=XM_006495550.2'

In [6]:
# Remove any rows with NaNs
# NOTE: inplace = True mutates `models` itself, so the frame displayed by the
# previous cell no longer matches the variable once this cell has run.
models.dropna(inplace = True)

# Extract field and database cross-ref (dbxref) information into columns
# field_dictionary: the attribute string (column 8) converted to a dict, e.g. {'ID': 'id0', ...}
models['field_dictionary'] = models[attributes_column].apply(convert_fields_to_dict_gff)
# dict.get yields None for features that carry no 'gene' / 'Dbxref' attribute
models['gene_name'] = [d.get('gene') for d in models['field_dictionary']]
models['Dbxref'] = [d.get('Dbxref') for d in models['field_dictionary']]
# dbxref_dict: the comma-separated Dbxref string split into a dict, e.g. {'GeneID': '497097', 'MGI': '3528744'}
models['dbxref_dict'] = models['Dbxref'].apply(convert_dbxref_to_dict)

display(models)

Unnamed: 0,0,1,2,3,4,5,6,7,8,field_dictionary,gene_name,Dbxref,dbxref_dict
0,NC_000067.6,RefSeq,region,1,195471971,.,+,.,ID=id0;Dbxref=taxon:10090;Name=1;chromosome=1;...,"{'ID': 'id0', 'Dbxref': 'taxon:10090', 'Name':...",,taxon:10090,{'taxon': '10090'}
1,NC_000067.6,BestRefSeq%2CGnomon,gene,3199731,3671742,.,-,.,"ID=gene0;Dbxref=GeneID:497097,MGI:MGI:3528744;...","{'ID': 'gene0', 'Dbxref': 'GeneID:497097,MGI:M...",Xkr4,"GeneID:497097,MGI:MGI:3528744","{'GeneID': '497097', 'MGI': '3528744'}"
2,NC_000067.6,Gnomon,mRNA,3199731,3671742,.,-,.,"ID=rna0;Parent=gene0;Dbxref=GeneID:497097,Genb...","{'ID': 'rna0', 'Parent': 'gene0', 'Dbxref': 'G...",Xkr4,"GeneID:497097,Genbank:XM_006495550.2,MGI:MGI:3...","{'GeneID': '497097', 'Genbank': 'XM_006495550...."
3,NC_000067.6,Gnomon,exon,3670552,3671742,.,-,.,"ID=id1;Parent=rna0;Dbxref=GeneID:497097,Genban...","{'ID': 'id1', 'Parent': 'rna0', 'Dbxref': 'Gen...",Xkr4,"GeneID:497097,Genbank:XM_006495550.2,MGI:MGI:3...","{'GeneID': '497097', 'Genbank': 'XM_006495550...."
4,NC_000067.6,Gnomon,exon,3421702,3421901,.,-,.,"ID=id2;Parent=rna0;Dbxref=GeneID:497097,Genban...","{'ID': 'id2', 'Parent': 'rna0', 'Dbxref': 'Gen...",Xkr4,"GeneID:497097,Genbank:XM_006495550.2,MGI:MGI:3...","{'GeneID': '497097', 'Genbank': 'XM_006495550...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2284235,NC_005089.1,RefSeq,region,15451,15509,.,+,.,ID=id1169986;Note=ETAS1%3B extended terminatio...,"{'ID': 'id1169986', 'Note': 'ETAS1%3B extended...",,,
2284236,NC_005089.1,RefSeq,region,15515,15558,.,+,.,ID=id1169987;Note=ETAS2%3B extended terminatio...,"{'ID': 'id1169987', 'Note': 'ETAS2%3B extended...",,,
2284237,NC_005089.1,RefSeq,region,16035,16058,.,+,.,ID=id1169988;Note=CSB1%3B conserved sequencing...,"{'ID': 'id1169988', 'Note': 'CSB1%3B conserved...",,,
2284238,NC_005089.1,RefSeq,region,16089,16104,.,+,.,ID=id1169989;Note=CSB2%3B conserved sequencing...,"{'ID': 'id1169989', 'Note': 'CSB2%3B conserved...",,,


# 4. Extract gene IDs for mapping to UniprotKB
Specify which set of identifiers will be used to query the [Uniprot ID Mapping Tool](https://www.uniprot.org/id-mapping) via API.

If using an identifier from the `dbxref_dict`, specify the name via string in the `dbxref_datafield` variable.

In [7]:
# Name of the identifier column that is sent to the UniProt ID mapping API
dbxref_datafield = 'ensembl_id'

# gene_name / ensembl_id table derived from the LOOM file
loom_idmm_df = pd.read_csv(loom_idmm.path, sep = '\t')

# Unique, non-null gene names found in the GFF annotation
annot_gene_names = (
    models[['gene_name']]
    .dropna(axis = 0, subset = ['gene_name'])
    .drop_duplicates()
)

# Left-join the identifiers onto the annotation genes, then discard genes
# that did not receive one
models_subset = pd.merge(
    annot_gene_names,
    loom_idmm_df,
    how = 'left',
    on = 'gene_name'
).dropna()

display(models_subset)

Unnamed: 0,gene_name,ensembl_id
0,Xkr4,ENSMUSG00000051951
7,Rp1,ENSMUSG00000025900
8,Rp1,ENSMUSG00000109048
9,Sox17,ENSMUSG00000025902
19,Mrpl15,ENSMUSG00000033845
...,...,...
46282,Gm21294,ENSMUSG00000102045
46283,Gm21996,ENSMUSG00000100608
46286,Gm29504,ENSMUSG00000100533
46288,Gm20837,ENSMUSG00000096178


# 5. Generate gene list file to query Uniprot ID Mapping API
Generate a text file ending in `_ids.txt` for sending to the ID mapping API.

In [8]:
# De-duplicate the identifiers: one gene id can appear on several rows of models_subset
gene_list = models_subset[dbxref_datafield].unique()

# Per the recorded output, constructing the object writes the ids to
# <species prefix>_<conditions>_<identifier>_ids.txt in the output directory.
genelist_object = GeneListFile(
    filename = '', # filename is automatically generated from sampledict
    sampledict = sample_BFD.sampledict,
    sources = [sample_BFD.annot],
    genes = gene_list,
    identifier = dbxref_datafield
    )

Wrote 23090 gene ids to /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/Mmus_adolescentbrain_ensembl_id_ids.txt


# 6. Query Uniprot ID Mapping API
Specify the `from_type` variable based on the Uniprot name of the identifier.  
The table below lists some databases and the `from_type` string that the API accepts for that datatype.  

| datatype | `from_type` string | description |
| ---: | :--- | :--- |
| Mouse Genome Informatics | `MGI` | ID starts with `MGI:` |
| Zebrafish Information Network | `ZFIN` | ID starts with `ZDB-GENE-` |
| Xenbase | `Xenbase` | ID starts with `XB-GENE-` |

__NOTE:__ You may have to run the cell below twice - UniProt sometimes throws a "Resource not found" message on the first query to the database.

In [9]:
# Source and target database names for the UniProt ID mapping request
from_type = 'Ensembl'
to_type = 'UniProtKB'

# Submit the gene list for mapping; ID_MAPPER_LOC is presumably provided by the
# install_locs star import - TODO confirm. The recorded output shows two curl
# transfers (an upload followed by a short download).
uniprot_idmm_object = genelist_object.get_uniprot_ids(ID_MAPPER_LOC, from_type, to_type)
sample_BFD.add_keyfiles({'uniprot_output_idmm': uniprot_idmm_object})

# Load the tab-separated result; 'From' holds the submitted id and 'Entry' the UniProt accession
uniprot_idmm = pd.read_csv(sample_BFD.uniprot_output_idmm.path, sep = '\t')
display(uniprot_idmm)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  428k    0    52  100  428k     23   191k  0:00:02  0:00:02 --:--:--  191k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    24    0    24    0     0     31      0 --:--:-- --:--:-- --:--:--    31


Unnamed: 0,From,Entry,Entry Name,Reviewed,Protein names,Gene Names,Organism,Length
0,ENSMUSG00000051951,Q5GH67,XKR4_MOUSE,reviewed,XK-related protein 4 [Cleaved into: XK-related...,Xkr4 Xrg4,Mus musculus (Mouse),647
1,ENSMUSG00000025900,P56716,RP1_MOUSE,reviewed,Oxygen-regulated protein 1 (Retinitis pigmento...,Rp1 Orp1 Rp1h,Mus musculus (Mouse),2095
2,ENSMUSG00000025900,A0A140LHJ6,A0A140LHJ6_MOUSE,unreviewed,Oxygen-regulated protein 1,Rp1,Mus musculus (Mouse),1371
3,ENSMUSG00000025902,Q61473,SOX17_MOUSE,reviewed,Transcription factor SOX-17,Sox17 Sox-17,Mus musculus (Mouse),419
4,ENSMUSG00000025902,A0A0A6YWS4,A0A0A6YWS4_MOUSE,unreviewed,Transcription factor SOX-17,Sox17,Mus musculus (Mouse),169
...,...,...,...,...,...,...,...,...
51510,ENSMUSG00000099856,A0A087WRK1,A0A087WRK1_MOUSE,unreviewed,"Predicted gene, 20814 (Predicted gene, 20855) ...",Gm20905 Gm20814 Gm20835 Gm20850 Gm20855 Gm2086...,Mus musculus (Mouse),222
51511,ENSMUSG00000102045,A0A087WS79,A0A087WS79_MOUSE,unreviewed,"Predicted gene, 21294",Gm21294,Mus musculus (Mouse),222
51512,ENSMUSG00000100608,A0A087WSS9,A0A087WSS9_MOUSE,unreviewed,Predicted gene 21996,Gm21996,Mus musculus (Mouse),222
51513,ENSMUSG00000096178,A0A087WP97,A0A087WP97_MOUSE,unreviewed,"Predicted gene, 20837",Gm20837,Mus musculus (Mouse),188


# 7. Extract results and generate Uniprot IDMM
Generates an idmm that links `gene_name`, the `dbxref_datafield` selected above, and the `uniprot_id` returned by the API.

In [10]:
# Keep only the submitted id ('From') and the UniProt accession ('Entry'), and
# rename them to the notebook's column names. rename() returns a new frame;
# renaming the column slice with inplace = True is what raised pandas'
# SettingWithCopyWarning.
uniprot_idpairs = uniprot_idmm[['From', 'Entry']].rename(
    columns = {'From': dbxref_datafield, 'Entry': 'uniprot_id'}
)
display(uniprot_idpairs)

# Attach gene names to each (identifier, uniprot_id) pair; this is an inner join,
# so identifiers without a UniProt hit are dropped
uniprot_output_idmm = models_subset.merge(uniprot_idpairs, on = dbxref_datafield)
display(uniprot_output_idmm)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_idpairs.rename(columns = {'From': dbxref_datafield, 'Entry': 'uniprot_id'}, inplace = True)


Unnamed: 0,ensembl_id,uniprot_id
0,ENSMUSG00000051951,Q5GH67
1,ENSMUSG00000025900,P56716
2,ENSMUSG00000025900,A0A140LHJ6
3,ENSMUSG00000025902,Q61473
4,ENSMUSG00000025902,A0A0A6YWS4
...,...,...
51510,ENSMUSG00000099856,A0A087WRK1
51511,ENSMUSG00000102045,A0A087WS79
51512,ENSMUSG00000100608,A0A087WSS9
51513,ENSMUSG00000096178,A0A087WP97


Unnamed: 0,gene_name,ensembl_id,uniprot_id
0,Xkr4,ENSMUSG00000051951,Q5GH67
1,Rp1,ENSMUSG00000025900,P56716
2,Rp1,ENSMUSG00000025900,A0A140LHJ6
3,Sox17,ENSMUSG00000025902,Q61473
4,Sox17,ENSMUSG00000025902,A0A0A6YWS4
...,...,...,...
51510,Gm20906,ENSMUSG00000099856,A0A087WRK1
51511,Gm21294,ENSMUSG00000102045,A0A087WS79
51512,Gm21996,ENSMUSG00000100608,A0A087WSS9
51513,Gm20837,ENSMUSG00000096178,A0A087WP97


In [11]:
# generate a filename and file for the idmm
# File name pattern: <species prefix>_<conditions>_uniprot-idmm.tsv
uniprot_output_idmm_filename = '_'.join([sample_BFD.species_prefix, conditions, 'uniprot-idmm.tsv'])
uniprot_output_idmm_object = IdmmFile(
    filename = uniprot_output_idmm_filename, 
    sampledict = sample_BFD.sampledict, 
    kind = 'uniprot_idmm', 
    sources = [sample_BFD.annot]
)

# save to file and add to the BioFileDocket
# NOTE: to_csv is called without index = False, so the DataFrame index is
# written as an unnamed first column of the TSV.
uniprot_output_idmm.to_csv(uniprot_output_idmm_object.path, sep = '\t')
sample_BFD.add_keyfile('uniprot_idmm', uniprot_output_idmm_object)

# 8. Convert GFF to GTF

In [12]:
# convert the GFF file to GTF using gffread
# GFFREAD_LOC is presumably the gffread install path from install_locs - TODO confirm.
# Per the recorded output, an existing converted .gtf file is reused.
models_asgtf = sample_BFD.annot.to_gtf(GFFREAD_LOC)

Converted file GCF_000001635.23_GRCm38.p3_genomic.gtf already exists at:
 /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/GCF_000001635.23_GRCm38.p3_genomic.gtf


In [13]:
# Read the converted GTF as a headerless, tab-delimited table
models_asgtf_df = pd.read_csv(
    models_asgtf.path,
    sep = '\t',
    header = None,
    skiprows = 0
)
display(models_asgtf_df)

# Show one attribute string (row label 1) to check the GTF field layout
display(models_asgtf_df.loc[1, attributes_column])

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,NC_000067.6,Gnomon,transcript,3199731,3671742,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na..."
1,NC_000067.6,Gnomon,exon,3199731,3207317,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na..."
2,NC_000067.6,Gnomon,exon,3213439,3216968,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na..."
3,NC_000067.6,Gnomon,exon,3421702,3421901,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na..."
4,NC_000067.6,Gnomon,exon,3670552,3671742,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na..."
...,...,...,...,...,...,...,...,...,...
2224794,NC_005089.1,RefSeq,CDS,14145,15288,.,+,0,"transcript_id ""gene48832""; gene_name ""CYTB"";"
2224795,NC_005089.1,RefSeq,transcript,15289,15355,.,+,.,"transcript_id ""rna111579""; gene_id ""gene48833""..."
2224796,NC_005089.1,RefSeq,exon,15289,15355,.,+,.,"transcript_id ""rna111579""; gene_id ""gene48833""..."
2224797,NC_005089.1,RefSeq,transcript,15356,15422,.,-,.,"transcript_id ""rna111580""; gene_id ""gene48834""..."


'transcript_id "rna0"; gene_id "gene0"; gene_name "Xkr4";'

In [14]:
# Parse the attribute string (column 8) of every GTF row into a dict
models_asgtf_df['field_dictionary'] = models_asgtf_df[attributes_column].apply(convert_fields_to_dict_gtf)

# Promote the identifiers needed downstream to their own columns
# (None where a row lacks the attribute)
for gtf_field in ('gene_name', 'gene_id', 'transcript_id'):
    models_asgtf_df[gtf_field] = [d.get(gtf_field) for d in models_asgtf_df['field_dictionary']]

# Drop CDS features, which interfere with TransDecoder cDNA generation
is_not_cds = models_asgtf_df[2] != 'CDS'
models_asgtf_df = models_asgtf_df[is_not_cds]
display(models_asgtf_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,field_dictionary,gene_name,gene_id,transcript_id
0,NC_000067.6,Gnomon,transcript,3199731,3671742,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na...","{'transcript_id': 'rna0', 'gene_id': 'gene0', ...",Xkr4,gene0,rna0
1,NC_000067.6,Gnomon,exon,3199731,3207317,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na...","{'transcript_id': 'rna0', 'gene_id': 'gene0', ...",Xkr4,gene0,rna0
2,NC_000067.6,Gnomon,exon,3213439,3216968,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na...","{'transcript_id': 'rna0', 'gene_id': 'gene0', ...",Xkr4,gene0,rna0
3,NC_000067.6,Gnomon,exon,3421702,3421901,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na...","{'transcript_id': 'rna0', 'gene_id': 'gene0', ...",Xkr4,gene0,rna0
4,NC_000067.6,Gnomon,exon,3670552,3671742,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na...","{'transcript_id': 'rna0', 'gene_id': 'gene0', ...",Xkr4,gene0,rna0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2224793,NC_005089.1,RefSeq,transcript,14145,15288,.,+,.,"transcript_id ""gene48832""; gene_id ""gene48832""...","{'transcript_id': 'gene48832', 'gene_id': 'gen...",CYTB,gene48832,gene48832
2224795,NC_005089.1,RefSeq,transcript,15289,15355,.,+,.,"transcript_id ""rna111579""; gene_id ""gene48833""...","{'transcript_id': 'rna111579', 'gene_id': 'gen...",TrnT,gene48833,rna111579
2224796,NC_005089.1,RefSeq,exon,15289,15355,.,+,.,"transcript_id ""rna111579""; gene_id ""gene48833""...","{'transcript_id': 'rna111579', 'gene_id': 'gen...",TrnT,gene48833,rna111579
2224797,NC_005089.1,RefSeq,transcript,15356,15422,.,-,.,"transcript_id ""rna111580""; gene_id ""gene48834""...","{'transcript_id': 'rna111580', 'gene_id': 'gen...",TrnP,gene48834,rna111580


# 9. Generate gtf-idmm
This file maps the `gene_name` to `gene_id` and `transcript_id` fields generated by the conversion from GFF to GTF, which will be needed for downstream processing.

In [15]:
# Build the ID mapping matrix (idmm): unique gene_name / gene_id / transcript_id
# triples, keeping only rows where all three are present
idmm_df = (
    models_asgtf_df[['gene_name', 'gene_id', 'transcript_id']]
    .drop_duplicates()
    .dropna()
)
display(idmm_df)

# File object for the idmm, named <species prefix>_<conditions>_gtf-idmm.tsv
idmm_filename = '_'.join([sample_BFD.species_prefix, conditions, 'gtf-idmm.tsv'])
idmm = IdmmFile(
    sampledict = sample_BFD.sampledict,
    filename = idmm_filename,
    sources = [sample_BFD.annot],
    kind = 'gtf_idmm'
)

# Write the table as TSV, then register the file on the BioFileDocket
idmm_df.to_csv(idmm.path, sep = '\t')
sample_BFD.add_keyfile('gtf_idmm', idmm)

Unnamed: 0,gene_name,gene_id,transcript_id
0,Xkr4,gene0,rna0
8,Xkr4,gene0,rna1
15,Xkr4,gene0,rna2
22,Xkr4,gene0,rna3
29,LOC105243853,gene2,rna4
...,...,...,...
2224789,ND6,gene48830,gene48830
2224791,TrnE,gene48831,rna111578
2224793,CYTB,gene48832,gene48832
2224795,TrnT,gene48833,rna111579


# 10. Generate updated gtf
Generate an updated GTF file using transcript_id as the key. For some datasets, transcripts do not consistently get gene names and gene IDs added, which causes Transdecoder to throw errors. This resolves that problem.

In [16]:
# Re-attach a consistent gene_name / gene_id to every GTF row, keyed on
# transcript_id. The inner merge keeps only rows whose transcript_id appears in
# idmm_df; the overlapping columns receive pandas' default suffixes, so the
# idmm values are `gene_name_y` / `gene_id_y`.
models_asgtf_updated_df = models_asgtf_df.merge(idmm_df, on = 'transcript_id')

# Build a fresh attribute dict for each row instead of calling dict.update()
# inside DataFrame.apply. The merged frame references the same dict objects as
# models_asgtf_df (and rows duplicated by the merge share one dict), so in-place
# updates would silently rewrite the source frame and let the last duplicate win.
models_asgtf_updated_df['field_dictionary'] = [
    {**fields, 'gene_name': gene_name, 'gene_id': gene_id}
    for fields, gene_name, gene_id in zip(
        models_asgtf_updated_df['field_dictionary'],
        models_asgtf_updated_df['gene_name_y'],
        models_asgtf_updated_df['gene_id_y']
    )
]

# Serialise the dicts back into GTF attribute strings and keep the nine GTF columns
models_asgtf_updated_df['new_fields'] = models_asgtf_updated_df['field_dictionary'].apply(convert_dict_to_fields_gtf)
models_asgtf_updated_df = models_asgtf_updated_df[[0, 1, 2, 3, 4, 5, 6, 7, 'new_fields']]

# File object for the rewritten annotation: <original name>_updated.gtf
models_asgtf_updated_filename = models_asgtf.filename.replace('.gtf', '_updated.gtf')
models_asgtf_updated = GenomeGtfFile(
    filename = models_asgtf_updated_filename, 
    sampledict = sample_BFD.sampledict, 
    reference_genome = sample_BFD.genome_fasta
)

# Write without header or index so the file has only the nine GTF columns.
# NOTE(review): to_csv defaults to csv.QUOTE_MINIMAL, which wraps any field
# containing '"' in quotes and doubles the inner quotes. If
# convert_dict_to_fields_gtf emits quoted values (key "value";), confirm the
# written attribute column is still valid GTF; quoting = csv.QUOTE_NONE may be needed.
models_asgtf_updated_df.to_csv(models_asgtf_updated.path, header = False, index = False, sep = '\t')

# 11. Generate cDNA and peptide files
Using the updated gtf file and genome file, generate cDNA sequence.

Then, using the cDNA sequence, generate peptide sequences using transdecoder.

Expect this step to take some time, probably ~20-30min.

In [17]:
# Generate cDNA sequences from the genome FASTA and the updated GTF.
# TRANSDECODER_LOC is presumably an install path from install_locs - TODO confirm.
cdna = sample_BFD.genome_fasta.get_transdecoder_cdna_gtf(models_asgtf_updated, TRANSDECODER_LOC)
sample_BFD.add_keyfile('cdna', cdna)

# Predict peptide sequences from the cDNA with TransDecoder. The recorded log
# shows completed steps being skipped via checkpoint files on re-runs; the
# seqLogo R errors come from commands invoked with `|| :`.
transdecoder_files = sample_BFD.cdna.to_pep_files(TDLONGORF_LOC, TDPREDICT_LOC)
# to_pep_files' return value is passed to add_keyfiles, which takes a dict of name -> file object elsewhere in this notebook
sample_BFD.add_keyfiles(transdecoder_files)

-- Skipping CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/compute_base_probs.pl /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/GCF_000001635.23_GRCm38.p3_genomic_cDNA.fna 0 > /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/Mus_musculus_adolescentbrain.transdecoder_dir//base_freqs.dat, checkpoint [/home/ec2-user/glial-origins/output/Mmus_adolescentbrain/Mus_musculus_adolescentbrain.transdecoder_dir/.__checkpoints_longorfs/base_freqs_file.ok] exists.
-skipping long orf extraction, already completed earlier as per checkpoint: /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/Mus_musculus_adolescentbrain.transdecoder_dir/.__checkpoints_longorfs/TD.longorfs.ok
-- Skipping CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/get_top_longest_fasta_entries.pl /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/Mus_musculus_adolescentbrain.transdecoder_dir//longest_orfs.cds 5000 5000 > /home/ec

null device 
          1 


* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/PWM/compute_AUC.pl /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/Mus_musculus_adolescentbrain.transdecoder_dir//start_refinement.feature.scores.roc


null device 
          1 


* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/PWM/make_seqLogo.Rscript /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/Mus_musculus_adolescentbrain.transdecoder_dir//start_refinement.+.pwm || :
Error in library(seqLogo) : there is no package called ‘seqLogo’
Execution halted
* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/PWM/make_seqLogo.Rscript /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/Mus_musculus_adolescentbrain.transdecoder_dir//start_refinement.-.pwm || :
Error in library(seqLogo) : there is no package called ‘seqLogo’
Execution halted
* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/PWM/deplete_feature_noise.pl  --features_plus /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/Mus_musculus_adolescentbrain.transdecoder_dir//start_refinement.+.features  --pwm_minus /home/ec2-user/glial-origins/output/Mmus_adole

null device 
          1 


* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/PWM/compute_AUC.pl /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/Mus_musculus_adolescentbrain.transdecoder_dir//start_refinement.enhanced.feature.scores.roc


null device 
          1 


* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/PWM/make_seqLogo.Rscript /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/Mus_musculus_adolescentbrain.transdecoder_dir//start_refinement.enhanced.+.pwm || :
Error in library(seqLogo) : there is no package called ‘seqLogo’
Execution halted
* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/start_codon_refinement.pl --transcripts /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/GCF_000001635.23_GRCm38.p3_genomic_cDNA.fna --gff3_file /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/Mus_musculus_adolescentbrain.transdecoder_dir//longest_orfs.cds.best_candidates.gff3 --workdir /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/Mus_musculus_adolescentbrain.transdecoder_dir/ > /home/ec2-user/glial-origins/output/Mmus_adolescentbrain/Mus_musculus_adolescentbrain.transdecoder_dir//longest_orfs.cds.best_candidates.gff3.revised_s

# 12. Push files to AWS S3

Iteratively moves through the file_set and file_dict variables and populates files into the right place in AWS.

In [18]:
# Upload the docket's files to S3. Per the recorded output, files already
# present in the bucket are skipped unless overwrite = True is set.
sample_BFD.local_to_s3()

GCF_000001635.23_GRCm38.p3_genomic.gff already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000001635.23_GRCm38.p3_genomic.fna already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
upload: ../../output/Mmus_adolescentbrain/10X05_1.loom.asgxc.tsv to s3://arcadia-reference-datasets/organisms/Mus_musculus/functional_sequencing/scRNA-Seq/10X05_1.loom.asgxc.tsv
upload: ../../output/Mmus_adolescentbrain/10X05_1.loom to s3://arcadia-reference-datasets/organisms/Mus_musculus/functional_sequencing/scRNA-Seq/10X05_1.loom
upload: ../../output/Mmus_adolescentbrain/10X05_1.loom.idmm.tsv to s3://arcadia-reference-datasets/organisms/Mus_musculus/genomics_reference/mapping_file/10X05_1.loom.idmm.tsv
upload: ../../output/Mmus_adolescentbrain/10X05_1.loom.cellannot.tsv to s3://arcadia-reference-datasets/organisms/Mus_musculus/functional_sequencing/scRNA-Seq/10X05_1.loom.cellannot.tsv
upload: ../../output/Mmus_

# 13. Pickle the `sample_BFD` variable for use by the next script

In [19]:
# Generate a .pkl file for the Docket
# Per the recorded output the file is <species prefix>_<conditions>_BioFileDocket.pkl
sample_BFD.pickle()

# Push to S3, optionally overwriting existing file
# overwrite = True replaces any docket pickle already stored for this sample
sample_BFD.push_to_s3(overwrite = True)

upload: ../../output/Mmus_adolescentbrain/Mmus_adolescentbrain_BioFileDocket.pkl to s3://arcadia-reference-datasets/glial-origins-pkl/Mmus_adolescentbrain_BioFileDocket.pkl
