# 0. Setup

Import packages and specify any important functions here.

In [1]:
# import standard python packages
import pandas as pd
import subprocess
import os
import dill

# add the utils and env directories to the path
import sys
sys.path.append('../../utils/')
sys.path.append('../../env/')

# import functions from utils directory files
from string_functions import *
from biofile_handling import *

# import paths to software installs from env
from install_locs import *

# 1. Download and describe data

This notebook collects data from the [Liao et al. 2022](https://www.sciencedirect.com/science/article/pii/S0092867418301168#sec4) Xenopus laevis adult cell atlas.

## Dataset description

- Data can be found at the [GEO Accession GSE195790](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE195790).
- This notebook previously collects data from one sample, ["Xenopus_brain_COL65"](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM6214268), for the purposes of data analysis and exploration.

In [2]:
################
# general info #
################

# Specify the name of the species folder in Amazon S3
species = 'Xenopus_laevis'

# Specify any particular identifying conditions, eg tissue type:
conditions = 'adultbrain'

# Specify url and other variables
genome_fasta_url = 'https://ftp.xenbase.org/pub/Genomics/JGI/Xenla9.2/XENLA_9.2_genome.fa.gz'
genome_version = 'XENLA_9.2'

annot_url = 'https://ftp.xenbase.org/pub/Genomics/JGI/Xenla9.2/XENLA_9.2_GCA.gff3'
gxc_url = 'https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM6214nnn/GSM6214268/suppl/GSM6214268_Xenopus_brain_COL65_dge.txt.gz'

###########
# runtime #
###########

protocol = 'curl'

species_prefix = prefixify(species)

# Specify folder as destination for file downloads
output_folder = '../../output/' + prefixify(species) + '_' + conditions + '/'

if not os.path.exists(output_folder):
    os.mkdir(output_folder)
    
species_SampleDict = SampleDict(species, conditions, output_folder)

genome_fasta = GenomeFastaFile(
    filename = '',
    sampledict = species_SampleDict,
    version = genome_version,
    url = genome_fasta_url,
    protocol = protocol
)

annot = GenomeGffFile(
    filename = '',
    sampledict = species_SampleDict,
    GenomeFastaFile = genome_fasta,
    url = annot_url,
    protocol = protocol
)

gxc = GxcFile(
    filename = '',
    sampledict = species_SampleDict,
    GenomeFastaFile = genome_fasta,
    GenomeAnnotFile = annot,
    url = gxc_url,
    protocol = protocol
)


sample_Docket = Docket(species_SampleDict)
keyfiles = {
    'annot': annot,
    'genome_fasta': genome_fasta,
    'gxc': gxc
}
sample_Docket.add_keyfiles(keyfiles)

display(vars(sample_Docket))

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  150M  100  150M    0     0  28.7M      0  0:00:05  0:00:05 --:--:-- 31.3M


{'species': 'Xenopus_laevis',
 'conditions': 'adultbrain',
 'directory': '../../output/Xlae_adultbrain/',
 'files': {},
 'annot': <biofile_handling.GenomeGffFile at 0x7f4629a2e260>,
 'genome_fasta': <biofile_handling.GenomeFastaFile at 0x7f4629a2e230>,
 'gxc': <biofile_handling.GxcFile at 0x7f46daa72530>}

# 2. Load in the gxc matrix and get gene names

In [3]:
genes_matrix = pd.read_csv(sample_Docket.gxc.path, sep = '\t')
display(genes_matrix)

gxc_genes_list = pd.DataFrame({'gene_name':genes_matrix['GENE']})
display(gxc_genes_list)

Unnamed: 0,GENE,AACCTATTCATATAAGGG,CTCGCATCAAAGTTAACT,AACCTAGTATACTTCCGC,AACCTAAAAGTTCTGAAA,CTCGCACGCACCCTCCAT,ACGTTGTATTGTAGCGAG,ACGAGCATGCTTTAGTCG,AACCTAGTCCCGCCATCT,AACCTAGCGAATTAGAGA,...,TCACTTGTTGCCATGCTT,TCGGGTTGTCACACTTAT,TCGTAATCGTAAGTTGCC,TGTCACGAATTACACAAG,TGTGCGTACTTCTAGTCG,TTAACTATACAGTGGATG,TTGGACACTTATGATCTT,AAAGTTACTTATGCCCTC,AACCTACGCACCTGCGGA,AACCTATAGTCGCTGTGT
0,3.S,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,42Sp43.L,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,42Sp50.L,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AK6.L,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AK6.S,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25907,zyg11b.L,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25908,zyg11b.S,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25909,zzef1.S,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25910,zzz3.L,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,gene_name
0,3.S
1,42Sp43.L
2,42Sp50.L
3,AK6.L
4,AK6.S
...,...
25907,zyg11b.L
25908,zyg11b.S
25909,zzef1.S
25910,zzz3.L


# 3. Get mapping identifiers

In [15]:
# load in the original GFF-based annotation
models = pd.read_csv(sample_Docket.annot.path, skiprows = 6, header = None, sep = '\t', on_bad_lines = 'skip', comment='#')
display(models)

attributes_column = 8

# Check the structure of fields in the GFF additional fields section
display(models[attributes_column][0])

# Remove any rows with NaNs
models.dropna(inplace = True)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,chr1L,Genbank,gene,17924,18399,.,-,.,"ID=gene0;Name=Xelaev18004747m;end_range=18399,..."
1,chr1L,Genbank,mRNA,17924,18399,.,-,.,ID=rna0;Parent=gene0;Note=transcript XELAEV_18...
2,chr1L,Genbank,exon,18336,18399,.,-,.,ID=id1;Parent=rna0;Note=transcript XELAEV_1800...
3,chr1L,Genbank,exon,17924,18243,.,-,.,ID=id2;Parent=rna0;Note=transcript XELAEV_1800...
4,chr1L,Genbank,CDS,18336,18399,.,-,0,ID=cds0;Parent=rna0;Dbxref=Phytozome:Xelaev180...
...,...,...,...,...,...,...,...,...,...
809901,Scaffold94051,Genbank,CDS,8,205,.,+,0,ID=cds47730;Parent=rna47730;Dbxref=Phytozome:X...
809902,Scaffold95291,Genbank,gene,9,236,.,-,.,ID=gene45941;Name=Xelaev18004691m;end_range=23...
809903,Scaffold95291,Genbank,mRNA,9,236,.,-,.,ID=rna47731;Parent=gene45941;Note=transcript X...
809904,Scaffold95291,Genbank,exon,9,236,.,-,.,ID=id452950;Parent=rna47731;Note=transcript XE...


'ID=gene0;Name=Xelaev18004747m;end_range=18399,.;gbkey=Gene;gene_biotype=protein_coding;locus_tag=XELAEV_18004747mg;partial=true;start_range=.,17924;Alias=XB-GENE-5942444'

In [18]:
# Extract field and database cross-ref (dbxref) information into columns
models['field_dictionary'] = models[attributes_column].apply(convert_fields_to_dict_gff)
models['gene_name'] = [d.get('Name') for d in models['field_dictionary']]
models['Xenbase'] = [d.get('Alias') for d in models['field_dictionary']]
models['Dbxref'] = [d.get('Dbxref') for d in models['field_dictionary']]
models['dbxref_dict'] = models['Dbxref'].apply(convert_dbxref_to_dict)

display(models)

Unnamed: 0,0,1,2,3,4,5,6,7,8,field_dictionary,gene_name,Xenbase,Dbxref,dbxref_dict,ID,Parent
0,chr1L,Genbank,gene,17924,18399,.,-,.,"ID=gene0;Name=Xelaev18004747m;end_range=18399,...","{'ID': 'gene0', 'Name': 'Xelaev18004747m', 'en...",Xelaev18004747m,XB-GENE-5942444,,,gene0,
1,chr1L,Genbank,mRNA,17924,18399,.,-,.,ID=rna0;Parent=gene0;Note=transcript XELAEV_18...,"{'ID': 'rna0', 'Parent': 'gene0', 'Note': 'tra...",,,,,rna0,gene0
2,chr1L,Genbank,exon,18336,18399,.,-,.,ID=id1;Parent=rna0;Note=transcript XELAEV_1800...,"{'ID': 'id1', 'Parent': 'rna0', 'Note': 'trans...",,,,,id1,rna0
3,chr1L,Genbank,exon,17924,18243,.,-,.,ID=id2;Parent=rna0;Note=transcript XELAEV_1800...,"{'ID': 'id2', 'Parent': 'rna0', 'Note': 'trans...",,,,,id2,rna0
4,chr1L,Genbank,CDS,18336,18399,.,-,0,ID=cds0;Parent=rna0;Dbxref=Phytozome:Xelaev180...,"{'ID': 'cds0', 'Parent': 'rna0', 'Dbxref': 'Ph...",OCT98948.1,,"Phytozome:Xelaev18004747m,NCBI_GP:OCT98948.1","{'Phytozome': 'Xelaev18004747m', 'NCBI_GP': 'O...",cds0,rna0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809901,Scaffold94051,Genbank,CDS,8,205,.,+,0,ID=cds47730;Parent=rna47730;Dbxref=Phytozome:X...,"{'ID': 'cds47730', 'Parent': 'rna47730', 'Dbxr...",OCT55143.1,,"Phytozome:Xelaev18004680m,NCBI_GP:OCT55143.1","{'Phytozome': 'Xelaev18004680m', 'NCBI_GP': 'O...",cds47730,rna47730
809902,Scaffold95291,Genbank,gene,9,236,.,-,.,ID=gene45941;Name=Xelaev18004691m;end_range=23...,"{'ID': 'gene45941', 'Name': 'Xelaev18004691m',...",Xelaev18004691m,,,,gene45941,
809903,Scaffold95291,Genbank,mRNA,9,236,.,-,.,ID=rna47731;Parent=gene45941;Note=transcript X...,"{'ID': 'rna47731', 'Parent': 'gene45941', 'Not...",,,,,rna47731,gene45941
809904,Scaffold95291,Genbank,exon,9,236,.,-,.,ID=id452950;Parent=rna47731;Note=transcript XE...,"{'ID': 'id452950', 'Parent': 'rna47731', 'Note...",,,,,id452950,rna47731


In [20]:
### Exception ###
# Generate a new .gtf file that includes a gene name in the "gene" field for gffread to use
models['ID'] = [d.get('ID') for d in models['field_dictionary']]
models['Parent'] = [d.get('Parent') for d in models['field_dictionary']]
display(models)

Unnamed: 0,0,1,2,3,4,5,6,7,8,field_dictionary,gene_name,Xenbase,Dbxref,dbxref_dict,ID,Parent
0,chr1L,Genbank,gene,17924,18399,.,-,.,"ID=gene0;Name=Xelaev18004747m;end_range=18399,...","{'ID': 'gene0', 'Name': 'Xelaev18004747m', 'en...",Xelaev18004747m,XB-GENE-5942444,,,gene0,
1,chr1L,Genbank,mRNA,17924,18399,.,-,.,ID=rna0;Parent=gene0;Note=transcript XELAEV_18...,"{'ID': 'rna0', 'Parent': 'gene0', 'Note': 'tra...",,,,,rna0,gene0
2,chr1L,Genbank,exon,18336,18399,.,-,.,ID=id1;Parent=rna0;Note=transcript XELAEV_1800...,"{'ID': 'id1', 'Parent': 'rna0', 'Note': 'trans...",,,,,id1,rna0
3,chr1L,Genbank,exon,17924,18243,.,-,.,ID=id2;Parent=rna0;Note=transcript XELAEV_1800...,"{'ID': 'id2', 'Parent': 'rna0', 'Note': 'trans...",,,,,id2,rna0
4,chr1L,Genbank,CDS,18336,18399,.,-,0,ID=cds0;Parent=rna0;Dbxref=Phytozome:Xelaev180...,"{'ID': 'cds0', 'Parent': 'rna0', 'Dbxref': 'Ph...",OCT98948.1,,"Phytozome:Xelaev18004747m,NCBI_GP:OCT98948.1","{'Phytozome': 'Xelaev18004747m', 'NCBI_GP': 'O...",cds0,rna0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809901,Scaffold94051,Genbank,CDS,8,205,.,+,0,ID=cds47730;Parent=rna47730;Dbxref=Phytozome:X...,"{'ID': 'cds47730', 'Parent': 'rna47730', 'Dbxr...",OCT55143.1,,"Phytozome:Xelaev18004680m,NCBI_GP:OCT55143.1","{'Phytozome': 'Xelaev18004680m', 'NCBI_GP': 'O...",cds47730,rna47730
809902,Scaffold95291,Genbank,gene,9,236,.,-,.,ID=gene45941;Name=Xelaev18004691m;end_range=23...,"{'ID': 'gene45941', 'Name': 'Xelaev18004691m',...",Xelaev18004691m,,,,gene45941,
809903,Scaffold95291,Genbank,mRNA,9,236,.,-,.,ID=rna47731;Parent=gene45941;Note=transcript X...,"{'ID': 'rna47731', 'Parent': 'gene45941', 'Not...",,,,,rna47731,gene45941
809904,Scaffold95291,Genbank,exon,9,236,.,-,.,ID=id452950;Parent=rna47731;Note=transcript XE...,"{'ID': 'id452950', 'Parent': 'rna47731', 'Note...",,,,,id452950,rna47731


In [37]:
gene_name_keys = models[['ID', 'gene_name']].dropna(subset = 'gene_name')
gene_name_keys = gene_name_keys[~gene_name_keys['gene_name'].str.contains('OCT*.*')]
gene_name_keys.rename(columns = {'ID': 'geneid'}, inplace = True)
display(gene_name_keys)

all_keys = models[['ID', 'Parent']]
rna_keys = all_keys[all_keys['ID'].str.contains('rna')]
rna_keys.rename(columns = {'ID': 'rnaid'}, inplace = True)
id_keys = all_keys[all_keys['ID'].str.contains('id')]
id_keys.rename(columns = {'ID': 'idid'}, inplace = True)
cds_keys = all_keys[all_keys['ID'].str.contains('cds')]
cds_keys.rename(columns = {'ID': 'cdsid'}, inplace = True)

key_aggregator = gene_name_keys.merge(rna_keys, left_on = 'geneid', right_on = 'Parent')
id_key_aggregator = key_aggregator.merge(id_keys, left_on = 'rnaid', right_on = 'Parent')
cds_key_aggregator = key_aggregator.merge(cds_keys, left_on = 'rnaid', right_on = 'Parent')

gene_keypairs = gene_name_keys[['geneid', 'gene_name']].rename(columns = {'geneid':'ID'})
rna_keypairs = key_aggregator[['rnaid', 'gene_name']].rename(columns = {'rnaid':'ID'})
id_keypairs = id_key_aggregator[['idid', 'gene_name']].rename(columns = {'idid':'ID'})
cds_keypairs = cds_key_aggregator[['cdsid', 'gene_name']].rename(columns = {'cdsid':'ID'})

keypair_collector = gene_keypairs.append(rna_keypairs).append(id_keypairs).append(cds_keypairs)
keypair_collector.rename(columns = {'gene_name': 'new_gene_name'}, inplace = True)
display(keypair_collector)

Unnamed: 0,geneid,gene_name
0,gene0,Xelaev18004747m
6,gene1,Xetrov90028798m.L
12,gene2,Xelaev18004749m
18,gene3,Xelaev18004750m
24,gene4,Xelaev18004751m
...,...,...
809886,gene45937,rpl37.S
809890,gene45938,Xelaev18004664m
809894,gene45939,Xelaev18004665m
809898,gene45940,Xelaev18004680m


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rna_keys.rename(columns = {'ID': 'rnaid'}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  id_keys.rename(columns = {'ID': 'idid'}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cds_keys.rename(columns = {'ID': 'cdsid'}, inplace = True)
  keypair_collector = gene_keypairs.append(rna_keypairs).append(id_keypairs).append(cds_keypairs)
  keypair_collector = gene_keypairs.append(rna_keypairs).append(id_keypairs).append(cds_keypairs)
  key

Unnamed: 0,ID,new_gene_name
0,gene0,Xelaev18004747m
6,gene1,Xetrov90028798m.L
12,gene2,Xelaev18004749m
18,gene3,Xelaev18004750m
24,gene4,Xelaev18004751m
...,...,...
331470,cds47727,rpl37.S
331471,cds47728,Xelaev18004664m
331472,cds47729,Xelaev18004665m
331473,cds47730,Xelaev18004680m


In [38]:
models = models.merge(keypair_collector, on = 'ID', how = 'left')

models.apply(lambda x: x['field_dictionary'].update({'gene': x['new_gene_name']}), axis = 1)
display(models)
models.dropna(subset = ['field_dictionary'], inplace = True)
models[attributes_column] = models['field_dictionary'].apply(convert_dict_to_fields_gff)

new_annot = GenomeGffFile(
    filename = sample_Docket.annot.filename.replace('.gff', '_new.gff'),
    sampledict = species_SampleDict,
    GenomeFastaFile = genome_fasta,
)

sample_Docket.add_keyfile(new_annot, 'new_annot')
new_models = models[[0, 1, 2, 3, 4, 5, 6, 7, 8]]
new_models.to_csv(sample_Docket.new_annot.path, sep = '\t', header = None, index = None)

display(new_models)
display(new_models[attributes_column][0])

Unnamed: 0,0,1,2,3,4,5,6,7,8,field_dictionary,gene_name,Xenbase,Dbxref,dbxref_dict,ID,Parent,new_gene_name
0,chr1L,Genbank,gene,17924,18399,.,-,.,"ID=gene0;Name=Xelaev18004747m;end_range=18399,...","{'ID': 'gene0', 'Name': 'Xelaev18004747m', 'en...",Xelaev18004747m,XB-GENE-5942444,,,gene0,,Xelaev18004747m
1,chr1L,Genbank,mRNA,17924,18399,.,-,.,ID=rna0;Parent=gene0;Note=transcript XELAEV_18...,"{'ID': 'rna0', 'Parent': 'gene0', 'Note': 'tra...",,,,,rna0,gene0,Xelaev18004747m
2,chr1L,Genbank,exon,18336,18399,.,-,.,ID=id1;Parent=rna0;Note=transcript XELAEV_1800...,"{'ID': 'id1', 'Parent': 'rna0', 'Note': 'trans...",,,,,id1,rna0,Xelaev18004747m
3,chr1L,Genbank,exon,17924,18243,.,-,.,ID=id2;Parent=rna0;Note=transcript XELAEV_1800...,"{'ID': 'id2', 'Parent': 'rna0', 'Note': 'trans...",,,,,id2,rna0,Xelaev18004747m
4,chr1L,Genbank,CDS,18336,18399,.,-,0,ID=cds0;Parent=rna0;Dbxref=Phytozome:Xelaev180...,"{'ID': 'cds0', 'Parent': 'rna0', 'Dbxref': 'Ph...",OCT98948.1,,"Phytozome:Xelaev18004747m,NCBI_GP:OCT98948.1","{'Phytozome': 'Xelaev18004747m', 'NCBI_GP': 'O...",cds0,rna0,Xelaev18004747m
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6375865,Scaffold94051,Genbank,CDS,8,205,.,+,0,ID=cds47730;Parent=rna47730;Dbxref=Phytozome:X...,"{'ID': 'cds47730', 'Parent': 'rna47730', 'Dbxr...",OCT55143.1,,"Phytozome:Xelaev18004680m,NCBI_GP:OCT55143.1","{'Phytozome': 'Xelaev18004680m', 'NCBI_GP': 'O...",cds47730,rna47730,Xelaev18004680m
6375866,Scaffold95291,Genbank,gene,9,236,.,-,.,ID=gene45941;Name=Xelaev18004691m;end_range=23...,"{'ID': 'gene45941', 'Name': 'Xelaev18004691m',...",Xelaev18004691m,,,,gene45941,,Xelaev18004691m
6375867,Scaffold95291,Genbank,mRNA,9,236,.,-,.,ID=rna47731;Parent=gene45941;Note=transcript X...,"{'ID': 'rna47731', 'Parent': 'gene45941', 'Not...",,,,,rna47731,gene45941,Xelaev18004691m
6375868,Scaffold95291,Genbank,exon,9,236,.,-,.,ID=id452950;Parent=rna47731;Note=transcript XE...,"{'ID': 'id452950', 'Parent': 'rna47731', 'Note...",,,,,id452950,rna47731,Xelaev18004691m


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,chr1L,Genbank,gene,17924,18399,.,-,.,"ID=gene0;Name=Xelaev18004747m;end_range=18399,..."
1,chr1L,Genbank,mRNA,17924,18399,.,-,.,ID=rna0;Parent=gene0;Note=transcript XELAEV_18...
2,chr1L,Genbank,exon,18336,18399,.,-,.,ID=id1;Parent=rna0;Note=transcript XELAEV_1800...
3,chr1L,Genbank,exon,17924,18243,.,-,.,ID=id2;Parent=rna0;Note=transcript XELAEV_1800...
4,chr1L,Genbank,CDS,18336,18399,.,-,0,ID=cds0;Parent=rna0;Dbxref=Phytozome:Xelaev180...
...,...,...,...,...,...,...,...,...,...
6375865,Scaffold94051,Genbank,CDS,8,205,.,+,0,ID=cds47730;Parent=rna47730;Dbxref=Phytozome:X...
6375866,Scaffold95291,Genbank,gene,9,236,.,-,.,ID=gene45941;Name=Xelaev18004691m;end_range=23...
6375867,Scaffold95291,Genbank,mRNA,9,236,.,-,.,ID=rna47731;Parent=gene45941;Note=transcript X...
6375868,Scaffold95291,Genbank,exon,9,236,.,-,.,ID=id452950;Parent=rna47731;Note=transcript XE...


'ID=gene0;Name=Xelaev18004747m;end_range=18399,.;gbkey=Gene;gene_biotype=protein_coding;locus_tag=XELAEV_18004747mg;partial=true;start_range=.,17924;Alias=XB-GENE-5942444;gene=Xelaev18004747m'

# 4. Extract gene IDs for mapping to UniprotKB
Specify which set of identifiers will be use to query the [Uniprot ID Mapping Tool](https://www.uniprot.org/id-mapping) via API.

If using an identifier from the `dbxref_dict`, specify the name via string in the `dbxref_datafield` variable.

In [39]:
dbxref_datafield = ''
datafield = 'Xenbase'

if dbxref_datafield != '':
    models.dropna(axis = 0, subset = ['dbxref_dict'], inplace = True)
    models[dbxref_datafield] = [d.get(dbxref_datafield) for d in models['dbxref_dict']]

    models_subset = models[['gene_name', dbxref_datafield]].dropna().drop_duplicates()
    display(models_subset)

elif datafield == 'gene_name':
    models_subset = models[['gene_name']].dropna().drop_duplicates()
    display(models_subset)

elif datafield != '':
    models_subset = models[['gene_name', datafield]].dropna().drop_duplicates()
    display(models_subset)

else:
    raise Exception('You must provide a data field for ID mapping.')

Unnamed: 0,gene_name,Xenbase
0,Xelaev18004747m,XB-GENE-5942444
24,Xelaev18004750m,XB-GENE-5877070
32,Xelaev18004751m,XB-GENE-5758410
98,loc652493.L,XB-GENE-5833449
110,loc100130100.L,XB-GENE-6050773
...,...,...
6375130,trex2.L,XB-GENE-17336984
6375530,nmral1.L,XB-GENE-17339003
6375810,rps24.L,XB-GENE-967651
6375826,rpl37a.S,XB-GENE-17331914


# 5. Generate gene list file to query Uniprot ID Mapping API
Generate a text file ending in `_ids.txt` for sending to the ID mapping API.

In [40]:
datafield = dbxref_datafield if dbxref_datafield != '' else datafield

gene_list = models_subset[datafield].unique()

genelist_object = GeneListFile(
    filename = '', # filename is automatically generated from sampledict
    sampledict = species_SampleDict,
    sources = [sample_Docket.annot],
    genes = gene_list,
    identifier = datafield
    )

Wrote 22001 gene ids to ../../output/Xlae_adultbrain/Xlae_adultbrain_Xenbase_ids.txt


# 6. Query Uniprot ID Mapping API
Specify the `from_type` variable based on the Uniprot name of the identifier.
The table below lists some databases and the `from_type` string that the API accepts for that datatype.

For some reason, the ID mapping API usually returns "resource not found" the first time you use it.
Try re-running this cell if you receive that error.


| datatype | `from_type` string | description |
| ---: | :--- | :--- |
| Mouse Genome Informatics | `MGI` | ID starts with `MGI:` |
| Zebrafish Information Network | `ZFIN` | ID starts with `ZDB-GENE-` |
| NCBI Gene ID | `GeneID` | varies, usually numeric string |

In [41]:
from_type = 'Xenbase'
to_type = 'UniProtKB'

uniprot_idmm_object = genelist_object.get_uniprot_ids(ID_MAPPER_LOC, from_type, to_type)
sample_Docket.add_keyfiles({'uniprot_idmm': uniprot_idmm_object})

uniprot_idmm = pd.read_csv(sample_Docket.uniprot_idmm.path, sep = '\t')
display(uniprot_idmm)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  349k    0    52  100  348k     27   187k  0:00:01  0:00:01 --:--:--  187k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    24    0    24    0     0     37      0 --:--:-- --:--:-- --:--:--    37


Unnamed: 0,From,Entry,Entry Name,Reviewed,Protein names,Gene Names,Organism,Length
0,XB-GENE-865154,Q63ZJ1,Q63ZJ1_XENLA,unreviewed,"LOC494796 protein (succinate-CoA ligase, alpha...",suclg1.L galpha LOC494796 mtdps9 sucla1 suclg1...,Xenopus laevis (African clawed frog),195
1,XB-GENE-17336652,A0A1L8HSE5,A0A1L8HSE5_XENLA,unreviewed,leucine-rich repeat transmembrane neuronal pro...,lrrtm1.L,Xenopus laevis (African clawed frog),521
2,XB-GENE-5955229,Q6GLP0,CTNA2_XENLA,reviewed,Catenin alpha-2 (Alpha N-catenin),ctnna2,Xenopus laevis (African clawed frog),966
3,XB-GENE-17338166,A0A1L8HSD7,A0A1L8HSD7_XENLA,unreviewed,Alpha-1D adrenergic receptor (Alpha-1D adrenor...,adra1d.L,Xenopus laevis (African clawed frog),501
4,XB-GENE-17330544,A0A1L8HSE9,A0A1L8HSE9_XENLA,unreviewed,spermine oxidase isoform X1,smox.L,Xenopus laevis (African clawed frog),538
...,...,...,...,...,...,...,...,...
18678,XB-GENE-865099,O42569,SOX2_XENLA,reviewed,Transcription factor Sox-2 (XSox2) (XlSox-2) (...,sox2,Xenopus laevis (African clawed frog),311
18679,XB-GENE-865099,A0A4P2X5R0,A0A4P2X5R0_XENLA,unreviewed,Transcription factor Sox2.L,sox2.L,Xenopus laevis (African clawed frog),311
18680,XB-GENE-977569,Q5PQ23,ODFP2_XENLA,reviewed,Outer dense fiber protein 2 (Cenexin) (Outer d...,odf2,Xenopus laevis (African clawed frog),649
18681,XB-GENE-967651,P02377,RS24_XENLA,reviewed,40S ribosomal protein S24 (S19),rps24,Xenopus laevis (African clawed frog),132


# 7. Extract results and generate Uniprot IDMM
Generates an idmm that links `gene_name`, the `dbxref_datafield` seleted above, and `uniprot_id` returned by API.

In [42]:
uniprot_idpairs = uniprot_idmm[['From', 'Entry']]
uniprot_idpairs.rename(columns = {'From': datafield, 'Entry': 'uniprot_id'}, inplace = True)
uniprot_idpairs[datafield] = uniprot_idpairs[datafield].astype(str)
display(uniprot_idpairs)

uniprot_output_idmm = models_subset.merge(uniprot_idpairs, on = datafield)
display(uniprot_output_idmm)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_idpairs.rename(columns = {'From': datafield, 'Entry': 'uniprot_id'}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_idpairs[datafield] = uniprot_idpairs[datafield].astype(str)


Unnamed: 0,Xenbase,uniprot_id
0,XB-GENE-865154,Q63ZJ1
1,XB-GENE-17336652,A0A1L8HSE5
2,XB-GENE-5955229,Q6GLP0
3,XB-GENE-17338166,A0A1L8HSD7
4,XB-GENE-17330544,A0A1L8HSE9
...,...,...
18678,XB-GENE-865099,O42569
18679,XB-GENE-865099,A0A4P2X5R0
18680,XB-GENE-977569,Q5PQ23
18681,XB-GENE-967651,P02377


Unnamed: 0,gene_name,Xenbase,uniprot_id
0,suclg1.L,XB-GENE-865154,Q63ZJ1
1,lrrtm1.L,XB-GENE-17336652,A0A1L8HSE5
2,ctnna2.L,XB-GENE-5955229,Q6GLP0
3,adra1d.L,XB-GENE-17338166,A0A1L8HSD7
4,smox.L,XB-GENE-17330544,A0A1L8HSE9
...,...,...,...
18679,sox2.S,XB-GENE-865099,O42569
18680,sox2.S,XB-GENE-865099,A0A4P2X5R0
18681,odf2.L,XB-GENE-977569,Q5PQ23
18682,rps24.L,XB-GENE-967651,P02377


In [43]:
# generate a filename and file for the idmm
uniprot_output_idmm_filename = '_'.join([species_prefix, conditions, 'uniprot-idmm.tsv'])
uniprot_output_idmm_object = IdmmFile(uniprot_output_idmm_filename, species_SampleDict, kind = 'uniprot_idmm', sources = [sample_Docket.annot])

# save to file and add to the Docket
uniprot_output_idmm.to_csv(uniprot_output_idmm_object.path, sep = '\t')
sample_Docket.add_keyfile(uniprot_output_idmm_object, 'uniprot_idmm')

# 8. Convert GFF to GTF

In [46]:
# convert the GFF file to GTF using gffread
models_asgtf = sample_Docket.new_annot.to_gtf(GFFREAD_LOC)

In [47]:
# load the newly-generated GTF file as a dataframe
models_asgtf_df = pd.read_csv(models_asgtf.path, skiprows = 0, header = None, sep = '\t')

display(models_asgtf_df)
display(models_asgtf_df[attributes_column][1])

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,chr1L,Genbank,transcript,17924,18399,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na..."
1,chr1L,Genbank,exon,17924,18243,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na..."
2,chr1L,Genbank,exon,18336,18399,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na..."
3,chr1L,Genbank,CDS,17924,18243,.,-,2,"transcript_id ""rna0""; gene_id ""gene0""; gene_na..."
4,chr1L,Genbank,CDS,18336,18399,.,-,0,"transcript_id ""rna0""; gene_id ""gene0""; gene_na..."
...,...,...,...,...,...,...,...,...,...
763959,Scaffold94051,Genbank,exon,8,205,.,+,.,"transcript_id ""rna47730""; gene_id ""gene45940"";..."
763960,Scaffold94051,Genbank,CDS,8,205,.,+,0,"transcript_id ""rna47730""; gene_id ""gene45940"";..."
763961,Scaffold95291,Genbank,transcript,9,236,.,-,.,"transcript_id ""rna47731""; gene_id ""gene45941"";..."
763962,Scaffold95291,Genbank,exon,9,236,.,-,.,"transcript_id ""rna47731""; gene_id ""gene45941"";..."


'transcript_id "rna0"; gene_id "gene0"; gene_name "Xelaev18004747m";'

In [48]:
# Use a custom function to extract useful fields from the additional fields section (column 8)
# Pull from that dict to fill in additional useful columns
models_asgtf_df['field_dictionary'] = models_asgtf_df[attributes_column].apply(convert_fields_to_dict_gtf)
models_asgtf_df['gene_name'] = [d.get('gene_name') for d in models_asgtf_df['field_dictionary']]
models_asgtf_df['gene_id'] = [d.get('gene_id') for d in models_asgtf_df['field_dictionary']]
models_asgtf_df['transcript_id'] = [d.get('transcript_id') for d in models_asgtf_df['field_dictionary']]

# Remove CDS annotations because they interfere with TransDecoder cDNA generation
models_asgtf_df = models_asgtf_df[models_asgtf_df[2] != 'CDS']
display(models_asgtf_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,field_dictionary,gene_name,gene_id,transcript_id
0,chr1L,Genbank,transcript,17924,18399,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na...","{'transcript_id': 'rna0', 'gene_id': 'gene0', ...",Xelaev18004747m,gene0,rna0
1,chr1L,Genbank,exon,17924,18243,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na...","{'transcript_id': 'rna0', 'gene_id': 'gene0', ...",Xelaev18004747m,gene0,rna0
2,chr1L,Genbank,exon,18336,18399,.,-,.,"transcript_id ""rna0""; gene_id ""gene0""; gene_na...","{'transcript_id': 'rna0', 'gene_id': 'gene0', ...",Xelaev18004747m,gene0,rna0
5,chr1L,Genbank,transcript,35021,35381,.,-,.,"transcript_id ""rna1""; gene_id ""gene1""; gene_na...","{'transcript_id': 'rna1', 'gene_id': 'gene1', ...",Xetrov90028798m.L,gene1,rna1
6,chr1L,Genbank,exon,35021,35187,.,-,.,"transcript_id ""rna1""; gene_id ""gene1""; gene_na...","{'transcript_id': 'rna1', 'gene_id': 'gene1', ...",Xetrov90028798m.L,gene1,rna1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
763956,Scaffold93892,Genbank,exon,2,241,.,+,.,"transcript_id ""rna47729""; gene_id ""gene45939"";...","{'transcript_id': 'rna47729', 'gene_id': 'gene...",Xelaev18004665m,gene45939,rna47729
763958,Scaffold94051,Genbank,transcript,8,205,.,+,.,"transcript_id ""rna47730""; gene_id ""gene45940"";...","{'transcript_id': 'rna47730', 'gene_id': 'gene...",Xelaev18004680m,gene45940,rna47730
763959,Scaffold94051,Genbank,exon,8,205,.,+,.,"transcript_id ""rna47730""; gene_id ""gene45940"";...","{'transcript_id': 'rna47730', 'gene_id': 'gene...",Xelaev18004680m,gene45940,rna47730
763961,Scaffold95291,Genbank,transcript,9,236,.,-,.,"transcript_id ""rna47731""; gene_id ""gene45941"";...","{'transcript_id': 'rna47731', 'gene_id': 'gene...",Xelaev18004691m,gene45941,rna47731


# 9. Generate gtf-idmm
This file maps the `gene_name` to `gene_id` and `transcript_id` fields generated by the conversion from GFF to GTF, which will be needed for downstream processing.

In [49]:
# Extract gene_name, gene_id, and transcript_id fields to generate an ID mapping matrix (idmm)
idmm_df = models_asgtf_df[['gene_name', 'gene_id', 'transcript_id']].drop_duplicates()
idmm_df.dropna(inplace = True)
display(idmm_df)

# generate a filename and file for the idmm
idmm_filename = '_'.join([species_prefix, conditions, 'gtf-idmm.tsv'])
idmm = IdmmFile(idmm_filename, species_SampleDict, kind = 'gtf_idmm', sources = [sample_Docket.annot])

# save to file and add to the Docket
idmm_df.to_csv(idmm.path, sep = '\t')
sample_Docket.add_keyfile(idmm, 'gtf_idmm')

Unnamed: 0,gene_name,gene_id,transcript_id
0,Xelaev18004747m,gene0,rna0
5,Xetrov90028798m.L,gene1,rna1
10,Xelaev18004749m,gene2,rna2
15,Xelaev18004750m,gene3,rna3
20,Xelaev18004751m,gene4,rna4
...,...,...,...
763949,rpl37.S,gene45937,rna47727
763952,Xelaev18004664m,gene45938,rna47728
763955,Xelaev18004665m,gene45939,rna47729
763958,Xelaev18004680m,gene45940,rna47730


# 10. Generate updated gtf
Generated an updated GTF file using transcript_id as the key. For some datasets, transcripts do not consistently get gene names and gene IDs added, which causes Transdecoder to throw errors. This resolves that problem.

In [50]:
models_asgtf_updated_df = models_asgtf_df.merge(idmm_df, on = 'transcript_id')
models_asgtf_updated_df.apply(lambda x: x['field_dictionary'].update({'gene_name': x['gene_name_y']}), axis = 1)
models_asgtf_updated_df.apply(lambda x: x['field_dictionary'].update({'gene_id': x['gene_id_y']}), axis = 1)
models_asgtf_updated_df['new_fields'] = models_asgtf_updated_df['field_dictionary'].apply(convert_dict_to_fields_gtf)
models_asgtf_updated_df = models_asgtf_updated_df[[0, 1, 2, 3, 4, 5, 6, 7, 'new_fields']]

models_asgtf_updated_filename = models_asgtf.filename.replace('.gtf', '_updated.gtf')
models_asgtf_updated = GenomeGtfFile(models_asgtf_updated_filename, species_SampleDict, GenomeFastaFile = sample_Docket.genome_fasta)

models_asgtf_updated_df.to_csv(models_asgtf_updated.path, header = None, index = None, sep = '\t')

# 11. Generate cDNA and peptide files
Using the updated gtf file and genome file, generate cDNA sequence.

Then, using the cDNA sequence, generate peptide sequences using transdecoder.

Expect this step to take some time, probably ~20-30min.

In [52]:
cdna = sample_Docket.genome_fasta.get_transdecoder_cdna_gtf(models_asgtf_updated, TRANSDECODER_LOC)
sample_Docket.add_keyfile(cdna, 'cdna')

transdecoder_files = sample_Docket.cdna.to_pep_files(TDLONGORF_LOC, TDPREDICT_LOC)
sample_Docket.add_keyfiles(transdecoder_files)

* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/compute_base_probs.pl ../../output/Xlae_adultbrain/XENLA_9.2_genome_cDNA.fa 0 > ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_dir//base_freqs.dat


-first extracting base frequencies, we'll need them later.


- extracting ORFs from transcripts.
-total transcripts to examine: 63852
[63800/63852] = 99.92% done    

CMD: touch ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_dir/.__checkpoints_longorfs/TD.longorfs.ok




#################################
### Done preparing long ORFs.  ###
##################################

	Use file: ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_dir//longest_orfs.pep  for Pfam and/or BlastP searches to enable homology-based coding region identification.

	Then, run TransDecoder.Predict for your final coding region predictions.


* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/get_top_longest_fasta_entries.pl ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_dir//longest_orfs.cds 5000 5000 > ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_dir//longest_orfs.cds.top_longest_5000
* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/exclude_similar_proteins.pl ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_dir//longest_orfs.cds.top_longest_5000 > ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.tra

null device 
          1 


* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/PWM/compute_AUC.pl ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_dir//start_refinement.feature.scores.roc


null device 
          1 


* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/PWM/make_seqLogo.Rscript ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_dir//start_refinement.+.pwm || :
Error in library(seqLogo) : there is no package called ‘seqLogo’
Execution halted
* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/PWM/make_seqLogo.Rscript ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_dir//start_refinement.-.pwm || :
Error in library(seqLogo) : there is no package called ‘seqLogo’
Execution halted
* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/PWM/deplete_feature_noise.pl  --features_plus ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_dir//start_refinement.+.features  --pwm_minus ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_dir//start_refinement.-.pwm  --out_prefix ../../output/Xlae_adultbrain/

null device 
          1 


* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/PWM/compute_AUC.pl ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_dir//start_refinement.enhanced.feature.scores.roc


null device 
          1 


* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/PWM/make_seqLogo.Rscript ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_dir//start_refinement.enhanced.+.pwm || :
Error in library(seqLogo) : there is no package called ‘seqLogo’
Execution halted
* Running CMD: /home/ec2-user/miniconda3/pkgs/transdecoder-5.5.0-pl526_1/opt/transdecoder/util/start_codon_refinement.pl --transcripts ../../output/Xlae_adultbrain/XENLA_9.2_genome_cDNA.fa --gff3_file ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_dir//longest_orfs.cds.best_candidates.gff3 --workdir ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_dir/ > ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_dir//longest_orfs.cds.best_candidates.gff3.revised_starts.gff3
Refining start codon selections.
-number of revised start positions: 3129
* Running CMD: cp ../../output/Xlae_adultbrain/Xenopus_laevis_adultbrain.transdecoder_

# 12. Push files to AWS S3

Iteratively moves through the file_set and file_dict variables and populates files into the right place in AWS.

# 13. Pickle the `sample_Docket` variable for use by the next script

In [53]:
dill_filename = output_folder + '_'.join([species_prefix, conditions, 'sample_Docket.pkl'])

with open(dill_filename, 'wb') as file:
    dill.dump(sample_Docket, file)