# 0. Setup

Import packages and specify any important functions here.

In [4]:
# import standard python packages
import pandas as pd
import subprocess
import os
import dill
import datetime

# add the utils and env directories to the path
import sys
sys.path.append('../../utils/')
sys.path.append('../../env/')

# import functions from utils directory files
from string_functions import *
from biofile_handling import *

# import paths to software installs from env
from install_locs import *

# 1. Collect BioFileDockets for each starting dataset
Collects BioFileDockets for each of the species datasets in the `species_dict`.

This `dict` expects key:value pairs in the form of `species`:`conditions` and is passed to the MultiSpeciesBioFileDocket class, which creates a folder in `output/` for the analysis.

The folder name takes the following format:

- `species_concat` + `_` + `global_conditions` + `_` + `analysis_type`

Where `species_concat` is the string concatenation of the alphabetized list of species prefixes (e.g. `DrerMmusXlae`, giving the folder `DrerMmusXlae_adultbrain_OrthoFinder`).

In [5]:
# ---- Analysis configuration -------------------------------------------------

# Maps each species name to the conditions label of its starting dataset
species_dict = {
    'Danio_rerio': 'adultbrain',
    'Mus_musculus': 'adultbrain',
    'Xenopus_laevis': 'adultbrain',
}

# Conditions label and analysis name; both are handed to the docket below
global_conditions = 'adultbrain'
analysis_type = 'OrthoFinder'

# ---- Docket construction ----------------------------------------------------

# Bundle the configuration above into a single multi-species docket
sample_MSD = MultiSpeciesBioFileDocket(
    species_dict = species_dict,
    global_conditions = global_conditions,
    analysis_type = analysis_type,
)

# Set up the analysis directory, then collect the per-species BioFileDockets
sample_MSD.make_directory()
sample_MSD.get_BioFileDockets()

# 2. Get the peptide file paths from each BioFileDocket

In [6]:
# Extract the peptide file info from the sample_MSD, keyed by species prefix
peptide_files = {pre: sample_MSD.species_BioFileDockets[pre].transdecoder_pep for pre in sample_MSD.species_BioFileDockets}
display(peptide_files)

# Copy the files from each species' starting directory into the OrthoFinder run directory.
# The originals are left in place (this is a copy, not a move).
for pepfile in peptide_files.values():
    start = os.path.abspath(pepfile.path)
    end = sample_MSD.directory + pepfile.filename

    # Skip files that are already present so that re-running the cell is cheap
    if not os.path.exists(end):
        print('copying', start, 'to', end)
        # check = True raises CalledProcessError if cp fails, instead of silently
        # continuing with an incomplete set of input files
        subprocess.run(['cp', start, end], check = True)
    else:
        print('file', start, 'has already been copied to', end)

{'Drer': <biofile_handling.TransdecoderOutFile at 0x7f966b6ee5c0>,
 'Mmus': <biofile_handling.TransdecoderOutFile at 0x7f966b6ef490>,
 'Xlae': <biofile_handling.TransdecoderOutFile at 0x7f966b6efac0>}

file /home/ec2-user/glial-origins/output/Drer_adultbrain/GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecoder.pep has already been moved
file /home/ec2-user/glial-origins/output/Mmus_adultbrain/GCF_000001635.23_GRCm38.p3_genomic_cDNA.fna.transdecoder.pep has already been moved
file /home/ec2-user/glial-origins/output/Xlae_adultbrain/XENLA_9.2_genome_cDNA.fa.transdecoder.pep has already been moved


# 3. Run OrthoFinder on all collected peptide files

Before running, this cell checks whether an OrthoFinder output folder already exists. If it does, OrthoFinder is not re-run and the existing results folder is used instead.

In [7]:
# Generate the expected OrthoFinder run folder based on today's date.
# This is only the *expected* location; the folder actually used is looked up on disk below.
OF_folder = sample_MSD.directory + 'OrthoFinder/'
today = datetime.date.today()
datefolder = OF_folder + 'Results_' + today.strftime('%b%d')
print(datefolder)

# Check if an OrthoFinder folder already exists; if not, runs analysis
if not os.path.exists(OF_folder):
    # check = True raises CalledProcessError if OrthoFinder exits with an error,
    # rather than carrying on and failing later with a confusing missing-file error
    subprocess.run([ORTHOFINDER_LOC, '-f', sample_MSD.directory], check = True)
else:
    print('OF folder already exists at', OF_folder)
    print('Checking for results folder')

# Locate the results folder on disk instead of trusting the date-based guess:
# the guess is wrong when reusing a run from an earlier day, and may be wrong
# after a fresh run if it crossed midnight or the folder name was suffixed.
results_folders = [
    name for name in os.listdir(OF_folder)
    if 'Results' in name and os.path.isdir(os.path.join(OF_folder, name))
]
if not results_folders:
    raise FileNotFoundError('No OrthoFinder Results folder found in ' + OF_folder)

# If several runs are present, use the most recently modified one so the choice
# is deterministic (os.listdir returns entries in arbitrary order)
latest_results = max(
    results_folders,
    key = lambda name: os.path.getmtime(os.path.join(OF_folder, name))
)
datefolder = OF_folder + latest_results
print('Results folder found at', datefolder)

# Generates OrthoGroups file object
orthogroups_file = OrthoFinderOutputFile(
    filename = '/Orthogroups/Orthogroups.tsv', 
    multispeciesbiofiledocket = sample_MSD,
    directory = datefolder)

# Loads the Orthogroups table (tab-separated) for use in later cells
orthogroups_df = pd.read_csv(orthogroups_file.path, sep = '\t')
display(orthogroups_df)

../../output/DrerMmusXlae_adultbrain_OrthoFinder/OrthoFinder/Results_Sep30
OF folder already exists at ../../output/DrerMmusXlae_adultbrain_OrthoFinder/OrthoFinder/
Checking for results folder
Results folder found at ../../output/DrerMmusXlae_adultbrain_OrthoFinder/OrthoFinder/Results_Sep28


Unnamed: 0,Orthogroup,GCF_000001635.23_GRCm38.p3_genomic_cDNA.fna.transdecoder,GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecoder,XENLA_9.2_genome_cDNA.fa.transdecoder
0,OG0000000,,,"gene10395_t.p1, gene12148_t.p1, gene1307_t.p1,..."
1,OG0000001,rna75711.p2,"gene10036.p1, gene10201.p2, gene10535.p1, gene...",
2,OG0000002,,,"gene2928_t.p3, gene38029_t.p2, gene41218_t.p1,..."
3,OG0000003,,,"gene264_t.p1, rna1157.p2, rna12598.p1, rna1516..."
4,OG0000004,,,"gene50425_t.p1, rna27145.p2, rna34455.p1, rna4..."
...,...,...,...,...
32348,OG0032348,,,"rna97220.p1, rna97221.p1"
32349,OG0032349,,,"rna97260.p1, rna97260.p2"
32350,OG0032350,,,"rna97447.p1, rna97448.p1"
32351,OG0032351,,,"rna97780.p1, rna97781.p2"


# 4. Generate Orthofinder_exc file from gxc file of each starting dataset

In [8]:
# For every species docket, build an Orthogroup-by-cell expression matrix (exc)
# from that species' gene-by-cell matrix (gxc) and the OrthoFinder Orthogroups table
for pre, docket in sample_MSD.species_BioFileDockets.items():

    gxc = docket.gxc
    sampledict = sample_MSD.SampleDicts[pre]

    # Output filename: the gxc filename with '_asOrthogroup' inserted before the extension
    gxc_extension = '.' + gxc.filetype
    exc_filename = gxc.filename.replace(gxc_extension, '_asOrthogroup' + gxc_extension)

    # File object describing the exc matrix (created whether or not the file exists yet)
    exc = ExcFile(
        filename = exc_filename,
        sampledict = sampledict,
        gxcfile = gxc,
        embedding = 'Orthogroup'
    )

    # If the exc file is already on disk, just register it and move to the next species
    if os.path.exists(exc.path):
        print('Orthofinder_exc file already exists at', exc.path)
        docket.add_keyfile(exc, 'Orthogroup_exc')
        continue

    # --- Map transcripts/proteins to Orthogroups ---

    # gtf id mapping matrix for this species
    gtf_idmm_df = pd.read_csv(docket.gtf_idmm.path, index_col = 0, sep = '\t')

    # Column of the Orthogroups table holding this species' protein ids
    species_column = docket.cdna.filename + '.transdecoder'

    # Work on a new frame (assign returns a copy) so orthogroups_df itself is untouched:
    # one row per (Orthogroup, protein_id) after splitting the comma-separated list
    orthogroups_df_copy = (
        orthogroups_df
        .assign(protein_id = orthogroups_df[species_column].str.split(', '))
        .explode('protein_id')
    )
    # transcript_id is the part of the protein id before the first '.'
    orthogroups_df_copy['transcript_id'] = orthogroups_df_copy['protein_id'].str.split('.', expand = True)[0]

    # Unique, complete transcript / protein / Orthogroup id triples
    og_keys = (
        orthogroups_df_copy[['transcript_id', 'protein_id', 'Orthogroup']]
        .drop_duplicates()
        .dropna()
    )

    # --- Write the Orthogroup-aware id mapping (og_idmm) ---

    og_idmm_df = gtf_idmm_df.merge(og_keys, on = 'transcript_id')
    og_idmm_filename = pre + '_' + sampledict.conditions + '_og-idmm.tsv'

    og_idmm = IdmmFile(
        filename = og_idmm_filename,
        sampledict = sampledict,
        kind = 'og_idmm',
        sources = [orthogroups_file, docket.gtf_idmm]
    )
    og_idmm_df.to_csv(og_idmm.path, sep = '\t', index = None)

    # Register the new mapping and the Orthogroups file on the species docket
    docket.add_keyfile(og_idmm, 'og_idmm')
    docket.add_keyfile(orthogroups_file, sample_MSD.species_concat + '_OGfile')

    # --- Collapse the gxc matrix from genes to Orthogroups ---

    # gene_name -> Orthogroup lookup
    og_gene_keys = og_idmm_df[['gene_name', 'Orthogroup']].drop_duplicates()

    # Original gxc matrix; its first column is renamed to 'gene_name' so it can be merged
    gxc_df = pd.read_csv(gxc.path, sep = '\t')
    gxc_original_dataname = gxc_df.columns[0]
    gxc_df = gxc_df.rename(columns = {gxc_original_dataname: 'gene_name'})

    # Attach the Orthogroup to each gene row, then discard the gene name
    exc_df = og_gene_keys.merge(gxc_df, on = 'gene_name').drop(columns = 'gene_name')

    # Sum every column within each Orthogroup; the Orthogroup column itself keeps its id
    aggregations = dict.fromkeys(exc_df.columns, 'sum')
    aggregations['Orthogroup'] = 'first'
    exc_df = exc_df.groupby('Orthogroup').agg(aggregations).reset_index(drop = True)

    # Save the exc matrix and register it on the species docket
    exc_df.to_csv(exc.path, sep = '\t', index = None)
    docket.add_keyfile(exc, 'Orthogroup_exc')

    display(exc_df)

Orthofinder_exc file already exists at ../../output/Drer_adultbrain/GSM3768152_Brain_8_dge_asOrthogroup.txt
Orthofinder_exc file already exists at ../../output/Mmus_adultbrain/GSM2906405_Brain1_dge_coerced_asOrthogroup.txt
Orthofinder_exc file already exists at ../../output/Xlae_adultbrain/GSM6214268_Xenopus_brain_COL65_dge_asOrthogroup.txt


# 5. Generate merged exc file for all samples in dataset

(TO BE WRITTEN)

# 6. Pickle the species_BioFileDocket files

In [9]:
# Call each species docket's own pickle() method
for docket in sample_MSD.species_BioFileDockets.values():
    docket.pickle()