# 0. Setup

Import packages and specify any important functions here.

In [1]:
# import standard python packages
import pandas as pd
import subprocess
import os
import dill

# add the utils and env directories to the path
import sys
sys.path.append('../../utils/')
sys.path.append('../../env/')

# import functions from utils directory files
from string_functions import *
from biofile_handling import *

# import paths to software installs from env
from install_locs import *

In [2]:
# ---------------------------------------------------------------------
# General info: the species being compared and their condition labels.
# ---------------------------------------------------------------------

# Condition label passed to the multi-species docket as a whole.
global_conditions = 'adultbrain'

# Species name -> condition label for that species' sample.
species_dict = dict(
    Danio_rerio = 'adultbrain',
    Mus_musculus = 'adultbrain',
    Xenopus_laevis = 'adultbrain',
)

# Build the MultiSpeciesDocket from the settings above, then run its two
# setup calls: make_directory() first, get_Dockets() second.
sample_MSD = MultiSpeciesDocket(species_dict = species_dict, global_conditions = global_conditions)
sample_MSD.make_directory()
sample_MSD.get_Dockets()

In [3]:
# Collect each species' TransDecoder peptide file object, keyed by species prefix.
peptide_files = {pre: sample_MSD.species_Dockets[pre].transdecoder_pep for pre in sample_MSD.species_Dockets}
display(peptide_files)

# Copy every peptide file into the multi-species directory, skipping any file
# that is already present there.
for pepfile in peptide_files.values():
    start = os.path.abspath(pepfile.path)
    # os.path.join yields the same path whether or not the directory string
    # ends with a path separator.
    end = os.path.join(sample_MSD.directory, pepfile.filename)
    if not os.path.exists(end):
        print('copying', start, 'to', end)
        # check = True raises CalledProcessError if cp exits non-zero, so a
        # failed copy stops the notebook here instead of passing silently.
        subprocess.run(['cp', start, end], check = True)
    else:
        print('file', start, 'has already been moved')

{'Drer': <biofile_handling.TransdecoderOutFile at 0x7f4eadae6800>,
 'Mmus': <biofile_handling.TransdecoderOutFile at 0x7f4eadae6fb0>,
 'Xlae': <biofile_handling.TransdecoderOutFile at 0x7f4eadae7580>}

file /home/ec2-user/glial-origins/output/Drer_adultbrain/GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecoder.pep has already been moved
file /home/ec2-user/glial-origins/output/Mmus_adultbrain/GCF_000001635.23_GRCm38.p3_genomic_cDNA.fna.transdecoder.pep has already been moved
file /home/ec2-user/glial-origins/output/Xlae_adultbrain/XENLA_9.2_genome_cDNA.fa.transdecoder.pep has already been moved


In [11]:
# Locate (or create) the OrthoFinder results for this species set, then load
# the Orthogroups table.
results_folder = '_'.join(['Results', sample_MSD.species_concat, sample_MSD.global_conditions])
of_folder = 'OrthoFinder/'
results_folder_abspath = os.path.abspath(os.path.join(sample_MSD.directory, of_folder, results_folder))
of_folder_abspath = os.path.abspath(os.path.join(sample_MSD.directory, of_folder))

# Safety switch: when True, refuse to launch OrthoFinder if an OrthoFinder/
# folder exists but the expected Results folder does not.
warn = True

if not os.path.exists(results_folder_abspath):
    
    if warn and os.path.exists(of_folder_abspath):
        raise Warning('A Results folder may already exist, set warn to False if you want to run anyway.')
    else:
        print('Running OrthoFinder')
        # check = True raises CalledProcessError if OrthoFinder exits non-zero,
        # so a failed run is reported here rather than surfacing later as a
        # missing Orthogroups.tsv when the table is read.
        subprocess.run(
            [ORTHOFINDER_LOC, '-f', sample_MSD.directory, '-o', results_folder_abspath],
            check = True)
else:
    print('Results folder already exists at', results_folder_abspath)

# File object pointing at the Orthogroups table inside the results folder.
orthogroups_file = OrthoFinderOutputFile(
    filename = '/Orthogroups/Orthogroups.tsv', 
    multispeciesdocket = sample_MSD,
    directory = results_folder_abspath)

# One row per orthogroup; each species column holds a comma-separated list of
# protein IDs (see the rendered table below).
orthogroups_df = pd.read_csv(orthogroups_file.path, sep = '\t')
display(orthogroups_df)

Results folder already exists at /home/ec2-user/glial-origins/output/DrerMmusXlae_adultbrain_OrthoFinder/OrthoFinder/Results_DrerMmusXlae_adultbrain


Unnamed: 0,Orthogroup,GCF_000001635.23_GRCm38.p3_genomic_cDNA.fna.transdecoder,GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecoder,XENLA_9.2_genome_cDNA.fa.transdecoder
0,OG0000000,"gene16575.p2, rna13103.p1, rna75711.p2, rna921...","gene10036.p1, gene10201.p2, gene10535.p1, gene...","rna31757.p1, rna51854.p1, rna51855.p1, rna5838..."
1,OG0000001,rna41863.p1,"gene36062.p1, gene36087.p1, gene36102.p1, gene...","rna34151.p1, rna37031.p1"
2,OG0000002,,,"gene2667.p1, gene2689.p1, gene2689.p2, gene299..."
3,OG0000003,"gene44223.p3, gene4542.p1, rna109419.p1, rna11...","gene10036.p3, gene10085.p1, gene10201.p1, gene...","gene20017.p1, rna11833.p1, rna34226.p1, rna342..."
4,OG0000004,,"gene10085.p2, gene10303.p2, gene10850.p1, gene...",gene35990.p1
...,...,...,...,...
32412,OG0032412,,,"rna9580.p2, rna9581.p2"
32413,OG0032413,,,"rna9870.p1, rna9871.p1"
32414,OG0032414,,,"rna9934.p1, rna9935.p1"
32415,OG0032415,,,"rna9954.p2, rna9955.p2"


In [48]:
# For each listed species prefix, join the GTF-derived ID mapping table to the
# OrthoFinder orthogroups and save the result as an 'og_idmm' key file.
# Only 'Mmus' is processed here.
for pre in ['Mmus']:
    # gene_name / gene_id / transcript_id mapping table for this species.
    gtf_idmm_df = pd.read_csv(sample_MSD.species_Dockets[pre].gtf_idmm.path, index_col = 0, sep = '\t')
    display(gtf_idmm_df)
    # Name of this species' column in the Orthogroups table.
    species_column = sample_MSD.species_Dockets[pre].cdna.filename + '.transdecoder'
    
    # Work on a copy so the loaded Orthogroups table stays unmodified.
    orthogroups_df_copy = orthogroups_df.copy(deep = True)
    
    # Turn each comma-separated protein list into one row per protein ID.
    orthogroups_df_copy['protein_id'] = orthogroups_df_copy[species_column].str.split(', ')
    orthogroups_df_copy = orthogroups_df_copy.explode('protein_id')
    # Protein IDs look like '<transcript_id>.p<N>' (e.g. 'rna0.p1'). Split on the
    # LAST dot only, so transcript IDs that themselves contain a dot are kept whole.
    orthogroups_df_copy['transcript_id'] = orthogroups_df_copy['protein_id'].str.rsplit('.', n = 1, expand = True)[0]
    
    # Unique (transcript, protein, orthogroup) rows; orthogroups with no member
    # in this species have missing IDs and are dropped.
    og_keys = (
        orthogroups_df_copy[['transcript_id', 'protein_id', 'Orthogroup']]
        .drop_duplicates()
        .dropna()
    )
    
    # Default inner merge: keeps only transcripts present in both tables.
    og_idmm_df = gtf_idmm_df.merge(og_keys, on = 'transcript_id')
    display(og_idmm_df)
    
    og_idmm_filename = pre + '_' + sample_MSD.SampleDicts[pre].conditions + '_og-idmm.tsv'
    
    # File object for the new mapping table, recording the files it derives from.
    og_idmm = IdmmFile(
        filename = og_idmm_filename,
        sampledict = sample_MSD.SampleDicts[pre],
        kind = 'og_idmm',
        sources = [orthogroups_file, sample_MSD.species_Dockets[pre].gtf_idmm]
        )
    
    # Write the table without the positional index.
    og_idmm_df.to_csv(og_idmm.path, sep = '\t', index = False)
    
    # Register both the new mapping file and the orthogroups file on the species docket.
    sample_MSD.species_Dockets[pre].add_keyfile(og_idmm, 'og_idmm')
    sample_MSD.species_Dockets[pre].add_keyfile(orthogroups_file, sample_MSD.species_concat + '_OGfile')
    
    # gene_name -> Orthogroup pairs; not used further within this cell.
    og_gene_keys = og_idmm_df[['gene_name', 'Orthogroup']]
    
    # Load this species' gxc table for inspection; the rendered output shows
    # gene names as rows and cell barcodes as columns.
    gxc_df = pd.read_csv(sample_MSD.species_Dockets[pre].gxc.path, sep = ',')
    display(gxc_df)

Unnamed: 0,gene_name,gene_id,transcript_id
0,Xkr4,gene0,rna0
8,Xkr4,gene0,rna1
15,Xkr4,gene0,rna2
22,Xkr4,gene0,rna3
29,LOC105243853,gene2,rna4
...,...,...,...
2224789,ND6,gene48830,gene48830
2224791,TrnE,gene48831,rna111578
2224793,CYTB,gene48832,gene48832
2224795,TrnT,gene48833,rna111579


Unnamed: 0,gene_name,gene_id,transcript_id,protein_id,Orthogroup
0,Xkr4,gene0,rna0,rna0.p1,OG0007024
1,Xkr4,gene0,rna1,rna1.p1,OG0007024
2,Xkr4,gene0,rna2,rna2.p1,OG0007024
3,Xkr4,gene0,rna3,rna3.p1,OG0007024
4,LOC105242467,gene5,rna6,rna6.p1,OG0022540
...,...,...,...,...,...
105366,Cbx7,gene48797,rna111555,rna111555.p1,OG0001305
105367,Cbx7,gene48797,rna111555,rna111555.p6,OG0003972
105368,Cbx7,gene48797,rna111556,rna111556.p2,OG0001305
105369,Cbx7,gene48797,rna111556,rna111556.p5,OG0003972


Unnamed: 0,Brain_1.CCGCTAAATAAATAAGGG,Brain_1.AACGCCGATCTTGCCCTC,Brain_1.ACCTGAAGTTTATCGTAA,Brain_1.CTCGCACTGAAACCGCTA,Brain_1.ATCAACATCTCTTCGGGT,Brain_1.GCGAATAGGGTCTATGTA,Brain_1.CGAGTAAGGGTCTAGTCG,Brain_1.ATCTCTTCGTAAGTTGCC,Brain_1.AACGCCTAAGGGCTCGCA,Brain_1.AACGCCTCACTTATACAG,...,Brain_1.TTGGACGCCTAGGAGATC,Brain_1.TTAACTAAAGTTTATGTA,Brain_1.GTCCCGGGACATAGGACT,Brain_1.CCGCTAGGGTTTGCTCAA,Brain_1.TGATCAGCTGTGTCAAAG,Brain_1.TCACTTGAATTATGAAGC,Brain_1.AAGTACGCTGTGTATGTA,Brain_1.CCTAGATAGAGAATTTGC,Brain_1.CATCCCATTTGCGGCTGC,Brain_1.CAAAGTGGGTTTAGCGAG
0610005C13Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610007P14Rik,0,3,1,2,0,0,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009B22Rik,0,3,0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
0610009E02Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009L18Rik,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
n-R5s28,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
n-R5s37,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
n-R5s52,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
n-R5s88,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Inspection only: show every attribute stored on the Mmus gtf_idmm file object.
vars(sample_MSD.species_Dockets['Mmus'].gtf_idmm)

{'filename': 'Mmus_adultbrain_gtf-idmm.tsv',
 'species': 'Mus_musculus',
 'conditions': 'adultbrain',
 'directory': '../../output/Mmus_adultbrain/',
 'sampledict': <biofile_handling.SampleDict at 0x7f4eadae6c80>,
 'path': '../../output/Mmus_adultbrain/Mmus_adultbrain_gtf-idmm.tsv',
 'species_prefix': 'Mmus',
 's3uri': 's3://arcadia-reference-datasets/organisms/Mus_musculus/genomics_reference/mapping_file/Mmus_adultbrain_gtf-idmm.tsv',
 'sources': [<biofile_handling.GenomeGffFile at 0x7f4eadae6a10>],
 'kind': 'gtf_idmm'}

In [22]:
# Inspection only: show the path attribute of the Mmus gtf_idmm file object.
sample_MSD.species_Dockets['Mmus'].gtf_idmm.path

'../../output/Mmus_adultbrain/Mmus_adultbrain_gtf-idmm.tsv'