# 0. Setup

Import packages and specify any important functions here.

In [1]:
# import standard python packages
import pandas as pd
import subprocess
import os
import dill

# add the utils and env directories to the path
import sys
sys.path.append('../../utils/')
sys.path.append('../../env/')

# import functions from utils directory files
from string_functions import *
from biofile_handling import *

# import paths to software installs from env
from install_locs import *

# 1. Download and describe data


This notebook collects data from the  .

Dataset description
Data can be found at the .
This notebook only collects data from one sample, "Brain1", for the purposes of data analysis and exploration.

This notebook collects data from the [Jiang et al. 2021](https://www.frontiersin.org/articles/10.3389/fcell.2021.743421/full) zebrafish cell atlas.

## Dataset description

- Data can be found at the [GEO Accession GSE130487](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE130487).
- This notebook previously collects data from one sample, ["Brain8"](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM3768152), for the purposes of data analysis and exploration.

In [2]:
################
# general info #
################

# Specify the name of the species folder in Amazon S3
species = 'Danio_rerio'

# Specify any particular identifying conditions, eg tissue type:
conditions = 'adultbrain'

# Specify url and other variables
genome_fasta_url = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/002/035/GCF_000002035.5_GRCz10/GCF_000002035.5_GRCz10_genomic.fna.gz'
genome_version = 'GRCz10'

annot_url = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/002/035/GCF_000002035.5_GRCz10/GCF_000002035.5_GRCz10_genomic.gff.gz'
gxc_url = 'https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3768nnn/GSM3768152/suppl/GSM3768152_Brain_8_dge.txt.gz'

###########
# runtime #
###########

protocol = 'curl'

species_prefix = prefixify(species)

# Specify folder as destination for file downloads
output_folder = '../../output/' + prefixify(species) + '_' + conditions + '/'

if not os.path.exists(output_folder):
    os.mkdir(output_folder)
    
species_SampleDict = SampleDict(species, conditions, output_folder)

genome_fasta = GenomeFastaFile(
    filename = '',
    sampledict = species_SampleDict,
    version = genome_version,
    url = genome_fasta_url,
    protocol = protocol
)

annot = GenomeGffFile(
    filename = '',
    sampledict = species_SampleDict,
    GenomeFastaFile = genome_fasta,
    url = annot_url,
    protocol = protocol
)

gxc = GxcFile(
    filename = '',
    sampledict = species_SampleDict,
    GenomeFastaFile = genome_fasta,
    GenomeAnnotFile = annot,
    url = gxc_url,
    protocol = protocol
)


sample_BioFileDocket = BioFileDocket(species_SampleDict)
keyfiles = {
    'annot': annot,
    'genome_fasta': genome_fasta,
    'gxc': gxc
}
sample_BioFileDocket.add_keyfiles(keyfiles)

display(vars(sample_BioFileDocket))

{'species': 'Danio_rerio',
 'conditions': 'adultbrain',
 'directory': '../../output/Drer_adultbrain/',
 'files': {},
 'annot': <biofile_handling.GenomeGffFile at 0x7f73d40ea8f0>,
 'genome_fasta': <biofile_handling.GenomeFastaFile at 0x7f73d40eb850>,
 'gxc': <biofile_handling.GxcFile at 0x7f73d40eba60>}

# 2. Load in the gxc matrix and get gene names

In [3]:
genes_matrix = pd.read_csv(sample_BioFileDocket.gxc.path, sep = '\t')
display(genes_matrix)

gxc_genes_list = pd.DataFrame({'gene_name':genes_matrix['GENE']})
display(gxc_genes_list)

Unnamed: 0,GENE,ACAATATATTGTACCTGA,ACGTTGATGGCGTAGAGA,AACCTAACCTGAATTTGC,CTCGCAGCCCTCTATGTA,ACGTTGCGTATTTAGTCG,AACCTATAGAGACCGACG,ACGAGCGCTGTGGCCTAG,GCGAATGGACATGGACAT,TCTACCGCTCAAGCTCAA,...,CGGCAGTCAAAGATCTCT,GACACTGCGAATCTGTGT,GCAGGAGGCTGCTAAGGG,TATGTATACTTCCGCACC,TGGATGTTCCGCACAATA,AACCTATGGATGGGGTTT,AAGCGGAGGACTCTCCAT,ACCTGACTCGCAAGCGAG,ACGTTGCAAAGTTTCATA,ATCAACTGCAATTTCCGC
0,ABCF3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ACOT12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ACSF3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ACTC1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ACVR1C,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20802,zwilch,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20803,zyg11,0,0,0,0,0,0,0,1,0,...,0,1,1,0,0,0,0,0,0,0
20804,zyx,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20805,zzef1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,gene_name
0,ABCF3
1,ACOT12
2,ACSF3
3,ACTC1
4,ACVR1C
...,...
20802,zwilch
20803,zyg11
20804,zyx
20805,zzef1


# 3. Get mapping identifiers

In [4]:
# load in the original GFF-based annotation
models = pd.read_csv(sample_BioFileDocket.annot.path, skiprows = 7, header = None, sep = '\t')
display(models)

attributes_column = 8

# Check the structure of fields in the GFF additional fields section
display(models[attributes_column][3])

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,NC_007112.6,RefSeq,region,1.0,58871917.0,.,+,.,ID=id0;Dbxref=taxon:7955;Name=1;chromosome=1;g...
1,NC_007112.6,BestRefSeq,gene,6642.0,11878.0,.,-,.,"ID=gene0;Dbxref=GeneID:192301,ZFIN:ZDB-GENE-02..."
2,NC_007112.6,BestRefSeq,mRNA,6642.0,11878.0,.,-,.,"ID=rna0;Parent=gene0;Dbxref=GeneID:192301,Genb..."
3,NC_007112.6,BestRefSeq,exon,11751.0,11878.0,.,-,.,"ID=id1;Parent=rna0;Dbxref=GeneID:192301,Genban..."
4,NC_007112.6,BestRefSeq,exon,11550.0,11625.0,.,-,.,"ID=id2;Parent=rna0;Dbxref=GeneID:192301,Genban..."
...,...,...,...,...,...,...,...,...,...
1362356,NC_002333.2,RefSeq,exon,16449.0,16520.0,.,+,.,ID=id634507;Parent=rna65516;Dbxref=GeneID:1405...
1362357,NC_002333.2,RefSeq,gene,16527.0,16596.0,.,-,.,"ID=gene42297;Dbxref=GeneID:140511,ZFIN:ZDB-GEN..."
1362358,NC_002333.2,RefSeq,tRNA,16527.0,16596.0,.,-,.,ID=rna65517;Parent=gene42297;Dbxref=GeneID:140...
1362359,NC_002333.2,RefSeq,exon,16527.0,16596.0,.,-,.,ID=id634508;Parent=rna65517;Dbxref=GeneID:1405...


'ID=id1;Parent=rna0;Dbxref=GeneID:192301,Genbank:NM_173235.3,ZFIN:ZDB-GENE-020419-25;gbkey=mRNA;gene=rpl24;product=ribosomal protein L24;transcript_id=NM_173235.3'

In [5]:
# Remove any rows with NaNs
models.dropna(inplace = True)

# Extract field and database cross-ref (dbxref) information into columns
models['field_dictionary'] = models[attributes_column].apply(convert_fields_to_dict_gff)
models['gene_name'] = [d.get('gene') for d in models['field_dictionary']]
models['Dbxref'] = [d.get('Dbxref') for d in models['field_dictionary']]
models['dbxref_dict'] = models['Dbxref'].apply(convert_dbxref_to_dict)

display(models)

Unnamed: 0,0,1,2,3,4,5,6,7,8,field_dictionary,gene_name,Dbxref,dbxref_dict
0,NC_007112.6,RefSeq,region,1.0,58871917.0,.,+,.,ID=id0;Dbxref=taxon:7955;Name=1;chromosome=1;g...,"{'ID': 'id0', 'Dbxref': 'taxon:7955', 'Name': ...",,taxon:7955,{'taxon': '7955'}
1,NC_007112.6,BestRefSeq,gene,6642.0,11878.0,.,-,.,"ID=gene0;Dbxref=GeneID:192301,ZFIN:ZDB-GENE-02...","{'ID': 'gene0', 'Dbxref': 'GeneID:192301,ZFIN:...",rpl24,"GeneID:192301,ZFIN:ZDB-GENE-020419-25","{'GeneID': '192301', 'ZFIN': 'ZDB-GENE-020419-..."
2,NC_007112.6,BestRefSeq,mRNA,6642.0,11878.0,.,-,.,"ID=rna0;Parent=gene0;Dbxref=GeneID:192301,Genb...","{'ID': 'rna0', 'Parent': 'gene0', 'Dbxref': 'G...",rpl24,"GeneID:192301,Genbank:NM_173235.3,ZFIN:ZDB-GEN...","{'GeneID': '192301', 'Genbank': 'NM_173235.3',..."
3,NC_007112.6,BestRefSeq,exon,11751.0,11878.0,.,-,.,"ID=id1;Parent=rna0;Dbxref=GeneID:192301,Genban...","{'ID': 'id1', 'Parent': 'rna0', 'Dbxref': 'Gen...",rpl24,"GeneID:192301,Genbank:NM_173235.3,ZFIN:ZDB-GEN...","{'GeneID': '192301', 'Genbank': 'NM_173235.3',..."
4,NC_007112.6,BestRefSeq,exon,11550.0,11625.0,.,-,.,"ID=id2;Parent=rna0;Dbxref=GeneID:192301,Genban...","{'ID': 'id2', 'Parent': 'rna0', 'Dbxref': 'Gen...",rpl24,"GeneID:192301,Genbank:NM_173235.3,ZFIN:ZDB-GEN...","{'GeneID': '192301', 'Genbank': 'NM_173235.3',..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362355,NC_002333.2,RefSeq,tRNA,16449.0,16520.0,.,+,.,ID=rna65516;Parent=gene42296;Dbxref=GeneID:140...,"{'ID': 'rna65516', 'Parent': 'gene42296', 'Dbx...",trnT,"GeneID:140518,ZFIN:ZDB-GENE-011205-39","{'GeneID': '140518', 'ZFIN': 'ZDB-GENE-011205-..."
1362356,NC_002333.2,RefSeq,exon,16449.0,16520.0,.,+,.,ID=id634507;Parent=rna65516;Dbxref=GeneID:1405...,"{'ID': 'id634507', 'Parent': 'rna65516', 'Dbxr...",trnT,"GeneID:140518,ZFIN:ZDB-GENE-011205-39","{'GeneID': '140518', 'ZFIN': 'ZDB-GENE-011205-..."
1362357,NC_002333.2,RefSeq,gene,16527.0,16596.0,.,-,.,"ID=gene42297;Dbxref=GeneID:140511,ZFIN:ZDB-GEN...","{'ID': 'gene42297', 'Dbxref': 'GeneID:140511,Z...",trnP,"GeneID:140511,ZFIN:ZDB-GENE-011205-38","{'GeneID': '140511', 'ZFIN': 'ZDB-GENE-011205-..."
1362358,NC_002333.2,RefSeq,tRNA,16527.0,16596.0,.,-,.,ID=rna65517;Parent=gene42297;Dbxref=GeneID:140...,"{'ID': 'rna65517', 'Parent': 'gene42297', 'Dbx...",trnP,"GeneID:140511,ZFIN:ZDB-GENE-011205-38","{'GeneID': '140511', 'ZFIN': 'ZDB-GENE-011205-..."


# 4. Extract gene IDs for mapping to UniprotKB
Specify which set of identifiers will be use to query the [Uniprot ID Mapping Tool](https://www.uniprot.org/id-mapping) via API.

If using an identifier from the `dbxref_dict`, specify the name via string in the `dbxref_datafield` variable.

In [6]:
dbxref_datafield = 'ZFIN'


models.dropna(axis = 0, subset = ['dbxref_dict'], inplace = True)
models[dbxref_datafield] = [d.get(dbxref_datafield) for d in models['dbxref_dict']]

models_subset = models[['gene_name', dbxref_datafield]].dropna().drop_duplicates()


display(models_subset)

Unnamed: 0,gene_name,ZFIN
1,rpl24,ZDB-GENE-020419-25
15,cep97,ZDB-GENE-031030-11
74,nfkbiz,ZDB-GENE-071024-1
102,eed,ZDB-GENE-050417-287
128,zgc:110091,ZDB-GENE-050417-34
...,...,...
1362347,ND6,ZDB-GENE-011205-13
1362349,trnE,ZDB-GENE-011205-37
1362352,CYTB,ZDB-GENE-011205-17
1362354,trnT,ZDB-GENE-011205-39


# 5. Generate gene list file to query Uniprot ID Mapping API
Generate a text file ending in `_ids.txt` for sending to the ID mapping API.

In [7]:
gene_list = models_subset[dbxref_datafield].unique()

genelist_object = GeneListFile(
    filename = '', # filename is automatically generated from sampledict
    sampledict = species_SampleDict,
    sources = [sample_BioFileDocket.annot],
    genes = gene_list,
    identifier = dbxref_datafield
    )

filename is ignored and generated by input as  Drer_adultbrain_ZFIN_ids.txt
Wrote 22031 gene ids to ../../output/Drer_adultbrain/Drer_adultbrain_ZFIN_ids.txt


# 6. Query Uniprot ID Mapping API
Specify the `from_type` variable based on the Uniprot name of the identifier.

The table below lists some databases and the `from_type` string that the API accepts for that datatype.




| datatype | `from_type` string | description |
| ---: | :--- | :--- |
| Mouse Genome Informatics | `MGI` | ID starts with `MGI:` |
| Zebrafish Information Network | `ZFIN` | ID starts with `ZDB-GENE-` |

In [8]:
from_type = 'ZFIN'
to_type = 'UniProtKB'

uniprot_idmm_object = genelist_object.get_uniprot_ids(ID_MAPPER_LOC, from_type, to_type)
sample_BioFileDocket.add_keyfiles({'uniprot_idmm': uniprot_idmm_object})

uniprot_idmm = pd.read_csv(sample_BioFileDocket.uniprot_idmm.path, sep = '\t')
display(uniprot_idmm)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  419k    0    52  100  418k     23   191k  0:00:02  0:00:02 --:--:--  191k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    24    0    24    0     0     33      0 --:--:-- --:--:-- --:--:--    33


Unnamed: 0,From,Entry,Entry Name,Reviewed,Protein names,Gene Names,Organism,Length
0,ZDB-GENE-020419-25,Q8JGR4,RL24_DANRE,reviewed,60S ribosomal protein L24,rpl24,Danio rerio (Zebrafish) (Brachydanio rerio),157
1,ZDB-GENE-020419-25,A0A0R4IMS3,A0A0R4IMS3_DANRE,unreviewed,60S ribosomal protein L24,rpl24 SO:0001217,Danio rerio (Zebrafish) (Brachydanio rerio),157
2,ZDB-GENE-031030-11,A0A0R4ICF0,A0A0R4ICF0_DANRE,unreviewed,Centrosomal protein 97,cep97 SO:0001217,Danio rerio (Zebrafish) (Brachydanio rerio),599
3,ZDB-GENE-031030-11,A0A0R4ISF5,A0A0R4ISF5_DANRE,unreviewed,Centrosomal protein 97,cep97 SO:0001217,Danio rerio (Zebrafish) (Brachydanio rerio),40
4,ZDB-GENE-031030-11,A0A0R4IX25,A0A0R4IX25_DANRE,unreviewed,Centrosomal protein 97,cep97 SO:0001217,Danio rerio (Zebrafish) (Brachydanio rerio),240
...,...,...,...,...,...,...,...,...
54705,ZDB-GENE-011205-12,Q9MIY0,NU5M_DANRE,reviewed,NADH-ubiquinone oxidoreductase chain 5 (EC 7.1...,mt-nd5 mtnd5 nd5,Danio rerio (Zebrafish) (Brachydanio rerio),606
54706,ZDB-GENE-011205-13,Q9MIX9,NU6M_DANRE,reviewed,NADH-ubiquinone oxidoreductase chain 6 (EC 7.1...,mt-nd6 mtnd6 nd6,Danio rerio (Zebrafish) (Brachydanio rerio),172
54707,ZDB-GENE-011205-13,A0A0A0VG13,A0A0A0VG13_DANRE,unreviewed,NADH-ubiquinone oxidoreductase chain 6 (EC 7.1...,ND6 mt-nd6 SO:0001217,Danio rerio (Zebrafish) (Brachydanio rerio),172
54708,ZDB-GENE-011205-17,Q9MIX8,CYB_DANRE,reviewed,Cytochrome b (Complex III subunit 3) (Complex ...,mt-cyb cob cytb mtcyb,Danio rerio (Zebrafish) (Brachydanio rerio),380


# 7. Extract results and generate Uniprot IDMM
Generates an idmm that links `gene_name`, the `dbxref_datafield` seleted above, and `uniprot_id` returned by API.

In [9]:
uniprot_idpairs = uniprot_idmm[['From', 'Entry']]
uniprot_idpairs.rename(columns = {'From': dbxref_datafield, 'Entry': 'uniprot_id'}, inplace = True)
display(uniprot_idpairs)

uniprot_output_idmm = models_subset.merge(uniprot_idpairs, on = dbxref_datafield)
display(uniprot_output_idmm)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_idpairs.rename(columns = {'From': dbxref_datafield, 'Entry': 'uniprot_id'}, inplace = True)


Unnamed: 0,ZFIN,uniprot_id
0,ZDB-GENE-020419-25,Q8JGR4
1,ZDB-GENE-020419-25,A0A0R4IMS3
2,ZDB-GENE-031030-11,A0A0R4ICF0
3,ZDB-GENE-031030-11,A0A0R4ISF5
4,ZDB-GENE-031030-11,A0A0R4IX25
...,...,...
54705,ZDB-GENE-011205-12,Q9MIY0
54706,ZDB-GENE-011205-13,Q9MIX9
54707,ZDB-GENE-011205-13,A0A0A0VG13
54708,ZDB-GENE-011205-17,Q9MIX8


Unnamed: 0,gene_name,ZFIN,uniprot_id
0,rpl24,ZDB-GENE-020419-25,Q8JGR4
1,rpl24,ZDB-GENE-020419-25,A0A0R4IMS3
2,cep97,ZDB-GENE-031030-11,A0A0R4ICF0
3,cep97,ZDB-GENE-031030-11,A0A0R4ISF5
4,cep97,ZDB-GENE-031030-11,A0A0R4IX25
...,...,...,...
54705,ND5,ZDB-GENE-011205-12,Q9MIY0
54706,ND6,ZDB-GENE-011205-13,Q9MIX9
54707,ND6,ZDB-GENE-011205-13,A0A0A0VG13
54708,CYTB,ZDB-GENE-011205-17,Q9MIX8


In [10]:
# generate a filename and file for the idmm
uniprot_output_idmm_filename = '_'.join([species_prefix, conditions, 'uniprot-idmm.tsv'])
uniprot_output_idmm_object = IdmmFile(uniprot_output_idmm_filename, species_SampleDict, kind = 'uniprot_idmm', sources = [sample_BioFileDocket.annot])

# save to file and add to the BioFileDocket
uniprot_output_idmm.to_csv(uniprot_output_idmm_object.path, sep = '\t')
sample_BioFileDocket.add_keyfile(uniprot_output_idmm_object, 'uniprot_idmm')

# 8. Convert GFF to GTF

In [11]:
# convert the GFF file to GTF using gffread
models_asgtf = sample_BioFileDocket.annot.to_gtf(GFFREAD_LOC)

Converted file GCF_000002035.5_GRCz10_genomic.gtf already exists at:
 ../../output/Drer_adultbrain/GCF_000002035.5_GRCz10_genomic.gtf


In [12]:
# load the newly-generated GTF file as a dataframe
models_asgtf_df = pd.read_csv(models_asgtf.path, skiprows = 0, header = None, sep = '\t')

display(models_asgtf_df)
display(models_asgtf_df[attributes_column][1])

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,NC_007112.6,BestRefSeq,transcript,6642,11878,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na..."
1,NC_007112.6,BestRefSeq,exon,6642,6760,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na..."
2,NC_007112.6,BestRefSeq,exon,6892,6955,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na..."
3,NC_007112.6,BestRefSeq,exon,9558,9694,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na..."
4,NC_007112.6,BestRefSeq,exon,10081,10191,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na..."
...,...,...,...,...,...,...,...,...,...
1258174,NC_002333.2,RefSeq,CDS,15308,16448,.,+,0,"transcript_id ""gene42295""; gene_name ""CYTB"";"
1258175,NC_002333.2,RefSeq,transcript,16449,16520,.,+,.,"transcript_id ""rna65516""; gene_id ""gene42296"";..."
1258176,NC_002333.2,RefSeq,exon,16449,16520,.,+,.,"transcript_id ""rna65516""; gene_id ""gene42296"";..."
1258177,NC_002333.2,RefSeq,transcript,16527,16596,.,-,.,"transcript_id ""rna65517""; gene_id ""gene42297"";..."


'transcript_id "rna0"; gene_id "gene0"; gene_name "rpl24";'

In [13]:
# Use a custom function to extract useful fields from the additional fields section (column 8)
# Pull from that dict to fill in additional useful columns
models_asgtf_df['field_dictionary'] = models_asgtf_df[attributes_column].apply(convert_fields_to_dict_gtf)
models_asgtf_df['gene_name'] = [d.get('gene_name') for d in models_asgtf_df['field_dictionary']]
models_asgtf_df['gene_id'] = [d.get('gene_id') for d in models_asgtf_df['field_dictionary']]
models_asgtf_df['transcript_id'] = [d.get('transcript_id') for d in models_asgtf_df['field_dictionary']]

# Remove CDS annotations because they interfere with TransDecoder cDNA generation
models_asgtf_df = models_asgtf_df[models_asgtf_df[2] != 'CDS']
display(models_asgtf_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,field_dictionary,gene_name,gene_id,transcript_id
0,NC_007112.6,BestRefSeq,transcript,6642,11878,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na...","{'transcript_id': 'rna0', 'gene_id': 'gene0', ...",rpl24,gene0,rna0
1,NC_007112.6,BestRefSeq,exon,6642,6760,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na...","{'transcript_id': 'rna0', 'gene_id': 'gene0', ...",rpl24,gene0,rna0
2,NC_007112.6,BestRefSeq,exon,6892,6955,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na...","{'transcript_id': 'rna0', 'gene_id': 'gene0', ...",rpl24,gene0,rna0
3,NC_007112.6,BestRefSeq,exon,9558,9694,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na...","{'transcript_id': 'rna0', 'gene_id': 'gene0', ...",rpl24,gene0,rna0
4,NC_007112.6,BestRefSeq,exon,10081,10191,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na...","{'transcript_id': 'rna0', 'gene_id': 'gene0', ...",rpl24,gene0,rna0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1258173,NC_002333.2,RefSeq,transcript,15308,16448,.,+,.,"transcript_id ""gene42295""; gene_id ""gene42295""...","{'transcript_id': 'gene42295', 'gene_id': 'gen...",CYTB,gene42295,gene42295
1258175,NC_002333.2,RefSeq,transcript,16449,16520,.,+,.,"transcript_id ""rna65516""; gene_id ""gene42296"";...","{'transcript_id': 'rna65516', 'gene_id': 'gene...",trnT,gene42296,rna65516
1258176,NC_002333.2,RefSeq,exon,16449,16520,.,+,.,"transcript_id ""rna65516""; gene_id ""gene42296"";...","{'transcript_id': 'rna65516', 'gene_id': 'gene...",trnT,gene42296,rna65516
1258177,NC_002333.2,RefSeq,transcript,16527,16596,.,-,.,"transcript_id ""rna65517""; gene_id ""gene42297"";...","{'transcript_id': 'rna65517', 'gene_id': 'gene...",trnP,gene42297,rna65517


# 9. Generate gtf-idmm
This file maps the `gene_name` to `gene_id` and `transcript_id` fields generated by the conversion from GFF to GTF, which will be needed for downstream processing.

In [14]:
# Extract gene_name, gene_id, and transcript_id fields to generate an ID mapping matrix (idmm)
idmm_df = models_asgtf_df[['gene_name', 'gene_id', 'transcript_id']].drop_duplicates()
idmm_df.dropna(inplace = True)
display(idmm_df)

# generate a filename and file for the idmm
idmm_filename = '_'.join([species_prefix, conditions, 'gtf-idmm.tsv'])
idmm = IdmmFile(idmm_filename, species_SampleDict, kind = 'gtf_idmm', sources = [sample_BioFileDocket.annot])

# save to file and add to the BioFileDocket
idmm_df.to_csv(idmm.path, sep = '\t')
sample_BioFileDocket.add_keyfile(idmm, 'gtf_idmm')

Unnamed: 0,gene_name,gene_id,transcript_id
0,rpl24,gene0,rna0
13,cep97,gene1,rna2
25,cep97,gene1,rna1
48,cep97,gene1,rna3
71,nfkbiz,gene2,rna4
...,...,...,...
1258169,ND6,gene42293,gene42293
1258171,trnE,gene42294,rna65515
1258173,CYTB,gene42295,gene42295
1258175,trnT,gene42296,rna65516


# 10. Generate updated gtf
Generated an updated GTF file using transcript_id as the key. For some datasets, transcripts do not consistently get gene names and gene IDs added, which causes Transdecoder to throw errors. This resolves that problem.

In [15]:
models_asgtf_updated_df = models_asgtf_df.merge(idmm_df, on = 'transcript_id')
models_asgtf_updated_df.apply(lambda x: x['field_dictionary'].update({'gene_name': x['gene_name_y']}), axis = 1)
models_asgtf_updated_df.apply(lambda x: x['field_dictionary'].update({'gene_id': x['gene_id_y']}), axis = 1)
models_asgtf_updated_df['new_fields'] = models_asgtf_updated_df['field_dictionary'].apply(convert_dict_to_fields_gtf)
models_asgtf_updated_df = models_asgtf_updated_df[[0, 1, 2, 3, 4, 5, 6, 7, 'new_fields']]

models_asgtf_updated_filename = models_asgtf.filename.replace('.gtf', '_updated.gtf')
models_asgtf_updated = GenomeGtfFile(models_asgtf_updated_filename, species_SampleDict, GenomeFastaFile = sample_BioFileDocket.genome_fasta)

models_asgtf_updated_df.to_csv(models_asgtf_updated.path, header = None, index = None, sep = '\t')

# 11. Generate cDNA and peptide files
Using the updated gtf file and genome file, generate cDNA sequence.

Then, using the cDNA sequence, generate peptide sequences using transdecoder.

Expect this step to take some time, probably ~20-30min.

In [16]:
cdna = sample_BioFileDocket.genome_fasta.get_transdecoder_cdna_gtf(models_asgtf_updated, TRANSDECODER_LOC)
sample_BioFileDocket.add_keyfile(cdna, 'cdna')

transdecoder_files = sample_BioFileDocket.cdna.to_pep_files(TDLONGORF_LOC, TDPREDICT_LOC)
sample_BioFileDocket.add_keyfiles(transdecoder_files)

-- Skipping CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/compute_base_probs.pl ../../output/Drer_adultbrain/GCF_000002035.5_GRCz10_genomic_cDNA.fna 0 > ../../output/Drer_adultbrain/Danio_rerio_adultbrain.transdecoder_dir//base_freqs.dat, checkpoint [/home/ec2-user/glial-origins/notebooks/Drer_adultbrain/../../output/Drer_adultbrain/Danio_rerio_adultbrain.transdecoder_dir/.__checkpoints_longorfs/base_freqs_file.ok] exists.
-skipping long orf extraction, already completed earlier as per checkpoint: ../../output/Drer_adultbrain/Danio_rerio_adultbrain.transdecoder_dir/.__checkpoints_longorfs/TD.longorfs.ok
-- Skipping CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/get_top_longest_fasta_entries.pl ../../output/Drer_adultbrain/Danio_rerio_adultbrain.transdecoder_dir//longest_orfs.cds 5000 5000 > ../../output/Drer_adultbrain/Danio_rerio_adultbrain.transdecoder_dir//longest_orfs.cds.top_longest_5000, checkpoint [/hom

# 12. Push files to AWS S3

Iteratively moves through BioFileDocket BioFile objects and pushes to the right place in AWS.

In [17]:
sample_BioFileDocket.local_to_s3()

{
    "AcceptRanges": "bytes", 
    "ContentType": "binary/octet-stream", 
    "LastModified": "Wed, 31 Aug 2022 16:53:44 GMT", 
    "ContentLength": 404109353, 
    "VersionId": "6YfNqzr1SyHj1mGlHke1D4CNrCEp8aVR", 
    "ETag": "\"97f90265ca3e1b21f7c13e4ddfb41584-49\"", 
    "ServerSideEncryption": "AES256", 
    "Metadata": {}
}
GCF_000002035.5_GRCz10_genomic.gff already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
{
    "AcceptRanges": "bytes", 
    "ContentType": "binary/octet-stream", 
    "LastModified": "Tue, 11 Oct 2022 23:09:05 GMT", 
    "ContentLength": 1388963222, 
    "VersionId": "kZEWjktEIBOm0Z3sTNGRCe_CjgcEOWud", 
    "ETag": "\"8dba2a5b1b850bfc0cd3e7b3152903c3-166\"", 
    "ServerSideEncryption": "AES256", 
    "Metadata": {}
}
GCF_000002035.5_GRCz10_genomic.fna already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
{
    "AcceptRanges": "bytes", 
    "ContentType": "text/plain", 


# 13. Pickle the `sample_BioFileDocket` variable for use by the next script

In [18]:
sample_BioFileDocket.pickle()
sample_BioFileDocket.push_to_s3()

upload: ../../output/Drer_adultbrain/Drer_adultbrain_sample_BioFileDocket.pkl to s3://arcadia-reference-datasets/glial-origins-pkl/Drer_adultbrain_sample_BioFileDocket.pkl
