# 0. Setup

Import packages and specify any important functions here.

In [1]:
# import standard python packages
import pandas as pd
import subprocess
import os
import dill

# add the utils and env directories to the path
import sys
sys.path.append('../../utils/')
sys.path.append('../../env/')

# import functions from utils directory files
from string_functions import *
from biofile_handling import *

# import paths to software installs from env
from install_locs import *

# 1. Collect Dockets for each starting dataset
Collects Dockets for each of the species datasets in the `species_dict`.

This `dict` expects key:value pairs in the form of `species`:`conditions` and is passed to the MultiSpeciesDocket class, which creates a folder in `output/` for the analysis.

The folder name takes the following format:

- `species_concat` + `_` + `global_conditions` + `analysis_type`

Where `species_concat` is the string concatenation of the alphabetize list of species prefixes.

In [2]:
################
# general info #
################

species_dict = {
    'Danio_rerio': 'adultbrain',
    'Mus_musculus': 'adultbrain'
}

global_conditions = 'adultbrain'

analysis_type = 'OrthoFinder'

################

sample_MSD = MultiSpeciesDocket(
    species_dict = species_dict,
    global_conditions = global_conditions,
    analysis_type = analysis_type
    )

sample_MSD.make_directory()
sample_MSD.get_Dockets()

# 2. Get the peptide file paths from each Docket

In [3]:
peptide_files = {pre: sample_MSD.species_Dockets[pre].transdecoder_pep for pre in sample_MSD.species_Dockets}
display(peptide_files)

for pepfile in peptide_files.values():
    start = os.path.abspath(pepfile.path)
    end = sample_MSD.directory + pepfile.filename
    if not os.path.exists(end):
        print('copying', start, 'to', end)
        subprocess.run(['cp', start, end])
    else:
        print('file', start, 'has already been moved')

{'Drer': <biofile_handling.TransdecoderOutFile at 0x7f8596b5c2e0>,
 'Mmus': <biofile_handling.TransdecoderOutFile at 0x7f8596b5ca30>}

copying /home/ec2-user/glial-origins/output/Drer_adultbrain/GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecoder.pep to ../../output/DrerMmus_adultbrain_OrthoFinder/GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecoder.pep
copying /home/ec2-user/glial-origins/output/Mmus_adultbrain/GCF_000001635.23_GRCm38.p3_genomic_cDNA.fna.transdecoder.pep to ../../output/DrerMmus_adultbrain_OrthoFinder/GCF_000001635.23_GRCm38.p3_genomic_cDNA.fna.transdecoder.pep


# 3. Run OrthoFinder on all collected peptide files

Before running, will check if analysis has already been done - if so, continues.

In [6]:
results_folder = '_'.join(['Results', sample_MSD.species_concat, sample_MSD.global_conditions])
of_folder = 'OrthoFinder/'
results_folder_abspath = os.path.abspath(sample_MSD.directory + of_folder + results_folder)
of_folder_abspath = os.path.abspath(sample_MSD.directory + of_folder)

warn = True

if not os.path.exists(of_folder_abspath):
    subprocess.run(['mkdir', of_folder_abspath])

if not os.path.exists(results_folder_abspath):
    print('Running OrthoFinder')
    subprocess.run([ORTHOFINDER_LOC, '-f', sample_MSD.directory, '-o', results_folder_abspath])
else:
    print('Results folder already exists at', results_folder_abspath)

orthogroups_file = OrthoFinderOutputFile(
    filename = '/Orthogroups/Orthogroups.tsv', 
    multispeciesdocket = sample_MSD,
    directory = results_folder_abspath)

orthogroups_df = pd.read_csv(orthogroups_file.path, sep = '\t')
display(orthogroups_df)

Running OrthoFinder

OrthoFinder version 2.5.4 Copyright (C) 2014 David Emms

2022-09-22 20:43:55 : Starting OrthoFinder 2.5.4
16 thread(s) for highly parallel tasks (BLAST searches etc.)
2 thread(s) for OrthoFinder algorithm

Checking required programs are installed
----------------------------------------
Test can run "mcl -h" - ok
Test can run "fastme -i /home/ec2-user/glial-origins/output/DrerMmus_adultbrain_OrthoFinder/OrthoFinder/Results_DrerMmus_adultbrain/Results_Sep22/WorkingDirectory/SimpleTest.phy -o /home/ec2-user/glial-origins/output/DrerMmus_adultbrain_OrthoFinder/OrthoFinder/Results_DrerMmus_adultbrain/Results_Sep22/WorkingDirectory/SimpleTest.tre" - ok

Dividing up work for BLAST for parallel processing
--------------------------------------------------
2022-09-22 20:43:56 : Creating diamond database 1 of 2
2022-09-22 20:43:57 : Creating diamond database 2 of 2

Running diamond all-versus-all
------------------------------
Using 16 thread(s)
2022-09-22 20:43:58 : This m

FileNotFoundError: [Errno 2] No such file or directory: '/home/ec2-user/glial-origins/output/DrerMmus_adultbrain_OrthoFinder/OrthoFinder/Results_DrerMmus_adultbrain/Orthogroups/Orthogroups.tsv'

# 4. Generate Orthofinder_exc file from gxc file of each starting dataset

In [None]:
for pre in sample_MSD.species_Dockets.keys():
    
    exc_filename = sample_MSD.species_Dockets[pre].gxc.filename.replace('.' + sample_MSD.species_Dockets[pre].gxc.filetype, '_as' + sample_MSD.species_concat + 'Orthogroup.' + sample_MSD.species_Dockets[pre].gxc.filetype)
    
    exc = ExcFile(
            filename = exc_filename,
            sampledict = sample_MSD.SampleDicts[pre],
            gxcfile = sample_MSD.species_Dockets[pre].gxc,
            embedding = 'Orthogroup'
            )
                                                                        
    if os.path.exists(exc.path):
        print('Orthofinder_exc file already exists at', exc.path)
        sample_MSD.species_Dockets[pre].add_keyfile(exc, 'Orthogroup_exc')
        
        continue
                                                                        
    gtf_idmm_df = pd.read_csv(sample_MSD.species_Dockets[pre].gtf_idmm.path, index_col = 0, sep = '\t')
    species_column = sample_MSD.species_Dockets[pre].cdna.filename + '.transdecoder'
    
    orthogroups_df_copy = orthogroups_df.copy(deep = True)
    
    orthogroups_df_copy['protein_id'] = orthogroups_df_copy[species_column].str.split(', ')
    orthogroups_df_copy = orthogroups_df_copy.explode('protein_id')
    orthogroups_df_copy['transcript_id'] = orthogroups_df_copy['protein_id'].str.split('.', expand = True)[0]
    
    og_keys = orthogroups_df_copy[['transcript_id', 'protein_id', 'Orthogroup']].drop_duplicates()
    og_keys.dropna(inplace = True)
    
    og_idmm_df = gtf_idmm_df.merge(og_keys, on = 'transcript_id')
    
    og_idmm_filename = pre + '_' + sample_MSD.SampleDicts[pre].conditions + '_og-idmm.tsv'
    
    og_idmm = IdmmFile(
        filename = og_idmm_filename,
        sampledict = sample_MSD.SampleDicts[pre],
        kind = 'og_idmm',
        sources = [orthogroups_file, sample_MSD.species_Dockets[pre].gtf_idmm]
        )
    
    og_idmm_df.to_csv(og_idmm.path, sep = '\t', index = None)
    
    sample_MSD.species_Dockets[pre].add_keyfile(og_idmm, 'og_idmm')
    sample_MSD.species_Dockets[pre].add_keyfile(orthogroups_file, sample_MSD.species_concat + '_OGfile')
    
    og_gene_keys = og_idmm_df[['gene_name', 'Orthogroup']].drop_duplicates()
    
    gxc_df = pd.read_csv(sample_MSD.species_Dockets[pre].gxc.path, sep = '\t')
    gxc_original_dataname = gxc_df.columns[0]
    gxc_df.rename(columns = {gxc_original_dataname: 'gene_name'}, inplace = True)
    exc_df = og_gene_keys.merge(gxc_df, on = 'gene_name')
    
    exc_df = exc_df.drop(columns = 'gene_name')
    exc_df = exc_df.groupby('Orthogroup').agg({i: ('first' if i == 'Orthogroup' else 'sum') for i in exc_df.columns}).reset_index(drop = True)
    
    exc_df.to_csv(exc.path, sep = '\t', index = None)
    sample_MSD.species_Dockets[pre].add_keyfile(exc, 'Orthogroup' + sample_MSD.species_concat + '_exc')
    
    display(exc_df)

# 5. Generate merged exc file for all samples in dataset

(TO BE WRITTEN)

# 6. Pickle the species_Docket files

In [6]:
for pre in sample_MSD.species_Dockets.keys():

    dill_filename = sample_MSD.species_Dockets[pre].directory + '_'.join([pre, sample_MSD.species_Dockets[pre].conditions, 'sample_Docket.pkl'])

    with open(dill_filename, 'wb') as file:
        dill.dump(sample_MSD.species_Dockets[pre], file)