# 0. Setup

Import packages and specify any important functions here.

In [1]:
# import standard python packages
import pandas as pd
import subprocess, os, dill, sys
import datetime

# add the utils and env directories to the path
sys.path.append('../../utils/')
sys.path.append('../../env/')

# import functions from utils directory files
from string_functions import *
from biofile_handling import *

# import paths to software installs from env
from install_locs import *

# 1. Collect BioFileDockets for each starting dataset
Collects BioFileDockets for each of the species datasets in the `species_dict`.

This `dict` expects key:value pairs in the form of `species`:`conditions` and is passed to the MultiSpeciesBioFileDocket class, which creates a folder in `output/` for the analysis.

The folder name takes the following format:

- `species_concat` + `_` + `global_conditions` + `analysis_type`

Where `species_concat` is the string concatenation of the alphabetize list of species prefixes.

In [2]:
################
# general info #
################

species_dict = {
    'Danio_rerio': 'adultbrain',
    'Mus_musculus': 'adultbrain',
    'Xenopus_laevis': 'adultbrain'
}

global_conditions = 'adultbrain'

analysis_type = 'OrthoFinder'

################

sample_MSD = MultiSpeciesBioFileDocket(
    species_dict = species_dict,
    global_conditions = global_conditions,
    analysis_type = analysis_type
    )

sample_MSD.get_BioFileDockets()
sample_MSD.s3_to_local()

/home/ec2-user/glial-origins/output/DrerMmusXlae_adultbrain_OrthoFinder/ already exists
file GCF_000002035.5_GRCz10_genomic.gff already exists at /home/ec2-user/glial-origins/output/Drer_adultbrain/GCF_000002035.5_GRCz10_genomic.gff
file GCF_000002035.5_GRCz10_genomic.fna already exists at /home/ec2-user/glial-origins/output/Drer_adultbrain/GCF_000002035.5_GRCz10_genomic.fna
file GSM3768152_Brain_8_dge.txt already exists at /home/ec2-user/glial-origins/output/Drer_adultbrain/GSM3768152_Brain_8_dge.txt
file Drer_adultbrain_uniprot-idmm.tsv already exists at /home/ec2-user/glial-origins/output/Drer_adultbrain/Drer_adultbrain_uniprot-idmm.tsv
file Drer_adultbrain_gtf-idmm.tsv already exists at /home/ec2-user/glial-origins/output/Drer_adultbrain/Drer_adultbrain_gtf-idmm.tsv
file GCF_000002035.5_GRCz10_genomic_cDNA.fna already exists at /home/ec2-user/glial-origins/output/Drer_adultbrain/GCF_000002035.5_GRCz10_genomic_cDNA.fna
file GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecoder.bed al

# 2. Get the peptide file paths from each BioFileDocket

In [3]:
# Extract the peptide file info from the sample_MSD
peptide_files = {pre: sample_MSD.species_BioFileDockets[pre].transdecoder_pep for pre in sample_MSD.species_BioFileDockets}
display(peptide_files)

# Move the files from the starting directory to the OrthoFinder run directory
for pepfile in peptide_files.values():
    start = os.path.abspath(pepfile.path)
    end = sample_MSD.directory + pepfile.filename
    if not os.path.exists(end):
        print('copying', start, 'to', end)
        subprocess.run(['cp', start, end])
    else:
        print('file', start, 'has already been moved')

{'Drer': <biofile_handling.TransdecoderOutFile at 0x7f2904c98790>,
 'Mmus': <biofile_handling.TransdecoderOutFile at 0x7f2904c99000>,
 'Xlae': <biofile_handling.TransdecoderOutFile at 0x7f2904c99990>}

file /home/ec2-user/glial-origins/output/Drer_adultbrain/GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecoder.pep has already been moved
file /home/ec2-user/glial-origins/output/Mmus_adultbrain/GCF_000001635.23_GRCm38.p3_genomic_cDNA.fna.transdecoder.pep has already been moved
file /home/ec2-user/glial-origins/output/Xlae_adultbrain/XENLA_9.2_genome_cDNA.fa.transdecoder.pep has already been moved


# 3. Run OrthoFinder on all collected peptide files

Before running, will check if analysis has already been done - if so, continues.

In [4]:
# Generate OrthoFinder run folder information based on today's date
OF_folder = sample_MSD.directory + 'OrthoFinder/'
today = datetime.date.today()
datefolder = OF_folder + 'Results_' + today.strftime('%b%d')
print(datefolder)

# Check of an OrthoFinder folder already exists; if not, runs analysis
if not os.path.exists(OF_folder):
    subprocess.run([ORTHOFINDER_LOC, '-f', sample_MSD.directory])
else:
    print('OF folder already exists at', OF_folder)
    print('Checking for results folder')
    folders = os.listdir(OF_folder)
    datefolder = OF_folder + [i for i in folders if 'Results' in i][0]
    print('Results folder found at', datefolder)

# Generates OrthoGroups file object
orthogroups_file = OrthoFinderOutputFile(
    sampledict = sample_MSD.sampledict,
    species_dict = sample_MSD.species_dict,
    filename = '/'.join(datefolder.split('/')[-2:]) + '/Orthogroups/Orthogroups.tsv'
)

orthogroups_df = pd.read_csv(orthogroups_file.path, sep = '\t')
display(orthogroups_df)

/home/ec2-user/glial-origins/output/DrerMmusXlae_adultbrain_OrthoFinder/OrthoFinder/Results_Jan04
OF folder already exists at /home/ec2-user/glial-origins/output/DrerMmusXlae_adultbrain_OrthoFinder/OrthoFinder/
Checking for results folder
Results folder found at /home/ec2-user/glial-origins/output/DrerMmusXlae_adultbrain_OrthoFinder/OrthoFinder/Results_Sep28


Unnamed: 0,Orthogroup,GCF_000001635.23_GRCm38.p3_genomic_cDNA.fna.transdecoder,GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecoder,XENLA_9.2_genome_cDNA.fa.transdecoder
0,OG0000000,,,"gene10395_t.p1, gene12148_t.p1, gene1307_t.p1,..."
1,OG0000001,rna75711.p2,"gene10036.p1, gene10201.p2, gene10535.p1, gene...",
2,OG0000002,,,"gene2928_t.p3, gene38029_t.p2, gene41218_t.p1,..."
3,OG0000003,,,"gene264_t.p1, rna1157.p2, rna12598.p1, rna1516..."
4,OG0000004,,,"gene50425_t.p1, rna27145.p2, rna34455.p1, rna4..."
...,...,...,...,...
32348,OG0032348,,,"rna97220.p1, rna97221.p1"
32349,OG0032349,,,"rna97260.p1, rna97260.p2"
32350,OG0032350,,,"rna97447.p1, rna97448.p1"
32351,OG0032351,,,"rna97780.p1, rna97781.p2"


# 4. Generate Orthofinder_exc file from gxc file of each starting dataset

In [6]:
gxc_to_exc(sample_MSD = sample_MSD,
           embedding_df = orthogroups_df, 
           exc_file = orthogroups_file,
           overwrite = True)

Using Orthogroup embeddings as expected from OrthoFinder
Expanding column GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecoder
Extracting keys for Orthogroup
Saving keys to Drer_adultbrain_og_idmm.tsv
Done saving Drer_adultbrain_og_idmm.tsv
key "og_idmm" already exists, ignoring
key "DrerMmusXlae_OGfile" already exists, ignoring
Generating exc file at /home/ec2-user/glial-origins/output/Drer_adultbrain/GSM3768152_Brain_8_dge_asOrthogroup.txt
Preview of exc file: /home/ec2-user/glial-origins/output/Drer_adultbrain/GSM3768152_Brain_8_dge_asOrthogroup.txt


Unnamed: 0,Orthogroup,ACAATATATTGTACCTGA,ACGTTGATGGCGTAGAGA,AACCTAACCTGAATTTGC,CTCGCAGCCCTCTATGTA,ACGTTGCGTATTTAGTCG,AACCTATAGAGACCGACG,ACGAGCGCTGTGGCCTAG,GCGAATGGACATGGACAT,TCTACCGCTCAAGCTCAA,...,CGGCAGTCAAAGATCTCT,GACACTGCGAATCTGTGT,GCAGGAGGCTGCTAAGGG,TATGTATACTTCCGCACC,TGGATGTTCCGCACAATA,AACCTATGGATGGGGTTT,AAGCGGAGGACTCTCCAT,ACCTGACTCGCAAGCGAG,ACGTTGCAAAGTTTCATA,ATCAACTGCAATTTCCGC
0,OG0000001,0,0,0,0,1,0,1,1,0,...,0,0,0,0,0,1,0,1,0,0
1,OG0000005,0,1,0,0,0,0,1,0,0,...,0,2,0,0,0,0,0,0,0,0
2,OG0000009,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,OG0000013,0,1,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
4,OG0000014,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13611,OG0030137,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13612,OG0030138,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13614,OG0030140,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13615,OG0030141,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Exc file saved at /home/ec2-user/glial-origins/output/Drer_adultbrain/GSM3768152_Brain_8_dge_asOrthogroup.txt
key "Orthogroup_exc" already exists, ignoring
Expanding column GCF_000001635.23_GRCm38.p3_genomic_cDNA.fna.transdecoder
Extracting keys for Orthogroup
Saving keys to Mmus_adultbrain_og_idmm.tsv
Done saving Mmus_adultbrain_og_idmm.tsv
key "og_idmm" already exists, ignoring
key "DrerMmusXlae_OGfile" already exists, ignoring
Generating exc file at /home/ec2-user/glial-origins/output/Mmus_adultbrain/GSM2906405_Brain1_dge_coerced_asOrthogroup.txt
Preview of exc file: /home/ec2-user/glial-origins/output/Mmus_adultbrain/GSM2906405_Brain1_dge_coerced_asOrthogroup.txt


Unnamed: 0,Orthogroup,Brain_1.CCGCTAAATAAATAAGGG,Brain_1.AACGCCGATCTTGCCCTC,Brain_1.ACCTGAAGTTTATCGTAA,Brain_1.CTCGCACTGAAACCGCTA,Brain_1.ATCAACATCTCTTCGGGT,Brain_1.GCGAATAGGGTCTATGTA,Brain_1.CGAGTAAGGGTCTAGTCG,Brain_1.ATCTCTTCGTAAGTTGCC,Brain_1.AACGCCTAAGGGCTCGCA,...,Brain_1.TTGGACGCCTAGGAGATC,Brain_1.TTAACTAAAGTTTATGTA,Brain_1.GTCCCGGGACATAGGACT,Brain_1.CCGCTAGGGTTTGCTCAA,Brain_1.TGATCAGCTGTGTCAAAG,Brain_1.TCACTTGAATTATGAAGC,Brain_1.AAGTACGCTGTGTATGTA,Brain_1.CCTAGATAGAGAATTTGC,Brain_1.CATCCCATTTGCGGCTGC,Brain_1.CAAAGTGGGTTTAGCGAG
0,OG0000001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,OG0000005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,OG0000008,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,OG0000011,3,0,0,0,0,15,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,OG0000017,0,1,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18406,OG0028363,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
18408,OG0028365,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
18409,OG0028366,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18413,OG0028373,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Exc file saved at /home/ec2-user/glial-origins/output/Mmus_adultbrain/GSM2906405_Brain1_dge_coerced_asOrthogroup.txt
key "Orthogroup_exc" already exists, ignoring
Expanding column XENLA_9.2_genome_cDNA.fa.transdecoder
Extracting keys for Orthogroup
Saving keys to Xlae_adultbrain_og_idmm.tsv
Done saving Xlae_adultbrain_og_idmm.tsv
key "og_idmm" already exists, ignoring
key "DrerMmusXlae_OGfile" already exists, ignoring


KeyError: "['gene_name'] not in index"

# 5. Generate merged exc file for all samples in dataset

(TO BE WRITTEN)

In [8]:
collector_df = pd.DataFrame()

display(collector_df)
export_filename = '_'.join([sample_MSD.species_concat, sample_MSD.global_conditions, 'asOrthogroup', 'combined_exc.tsv'])

jointexc = JointExcFile(
    species_dict = sample_MSD.species_dict,
    sampledict = sample_MSD.sampledict,
    filename = export_filename,
    embedding = 'Orthogroup',
    sources = [sample_MSD.species_BioFileDockets[pre].Orthogroup_exc for pre in sample_MSD.species_BioFileDockets]
)

if not os.path.exists(jointexc.path):

    # Iterates through all of the species in the Species BioFileDocket
    for i, pre in enumerate(sample_MSD.species_BioFileDockets.keys()):
    
        df = pd.read_csv(sample_MSD.species_BioFileDockets[pre].Orthogroup_exc.path, sep = '\t')
        df = df.add_prefix(pre + '_')
        df.rename(columns = {pre + '_' + 'Orthogroup': 'Orthogroup'}, inplace = True)
    
        if i == 0:
            collector_df = df
        else:
            collector_df = collector_df.merge(df, on = 'Orthogroup', how = 'inner')

    collector_df.to_csv(jointexc.path, sep = '\t', index = None)

# Iterates through all of the species in the Species BioFileDocket
for pre in sample_MSD.species_BioFileDockets.keys():
    sample_MSD.species_BioFileDockets[pre].add_keyfile('Orthogroup_jointexc', jointexc)

# 6. Pickle the species_BioFileDocket files

In [9]:
for pre in sample_MSD.species_BioFileDockets.keys():
    sample_MSD.species_BioFileDockets[pre].local_to_s3()
    sample_MSD.species_BioFileDockets[pre].pickle()
    sample_MSD.species_BioFileDockets[pre].push_to_s3(overwrite = True)

GCF_000002035.5_GRCz10_genomic.gff already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000002035.5_GRCz10_genomic.fna already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GSM3768152_Brain_8_dge.txt already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
Drer_adultbrain_uniprot-idmm.tsv already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
Drer_adultbrain_gtf-idmm.tsv already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000002035.5_GRCz10_genomic_cDNA.fna already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecoder.bed already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecode