# 0. Setup

Import packages and specify any important functions here.

In [2]:
# import standard python packages
import pandas as pd
import subprocess
import os
import dill
import sys
import datetime

# add the utils and env directories to the path
sys.path.append('../../utils/')
sys.path.append('../../env/')

# import functions from utils directory files
from string_functions import *
from biofile_handling import *

# import paths to software installs from env
from install_locs import *

# 1. Collect BioFileDockets for each starting dataset
Collects BioFileDockets for each of the species datasets in the `species_dict`.

This `dict` expects key:value pairs in the form of `species`:`conditions` and is passed to the MultiSpeciesBioFileDocket class, which creates a folder in `output/` for the analysis.

The folder name takes the following format:

- `species_concat` + `_` + `global_conditions` + `analysis_type`

Where `species_concat` is the string concatenation of the alphabetize list of species prefixes.

In [3]:
################
# general info #
################

species_dict = {
    'Danio_rerio': 'adultbrain',
    'Mus_musculus': 'adultbrain',
    'Xenopus_laevis': 'adultbrain'
}

global_conditions = 'adultbrain'

analysis_type = 'FoldSeek'

################

sample_MSD = MultiSpeciesBioFileDocket(
    species_dict = species_dict,
    global_conditions = global_conditions,
    analysis_type = analysis_type
    )

sample_MSD.get_BioFileDockets()
sample_MSD.s3_to_local()

/home/ec2-user/glial-origins/output/DrerMmusXlae_adultbrain_FoldSeek/ already exists
file GCF_000002035.5_GRCz10_genomic.gff already exists at /home/ec2-user/glial-origins/output/Drer_adultbrain/GCF_000002035.5_GRCz10_genomic.gff
file GCF_000002035.5_GRCz10_genomic.fna already exists at /home/ec2-user/glial-origins/output/Drer_adultbrain/GCF_000002035.5_GRCz10_genomic.fna
file GSM3768152_Brain_8_dge.txt already exists at /home/ec2-user/glial-origins/output/Drer_adultbrain/GSM3768152_Brain_8_dge.txt
file Drer_adultbrain_ZFIN_UniProtIDs.txt already exists at /home/ec2-user/glial-origins/output/Drer_adultbrain/Drer_adultbrain_ZFIN_UniProtIDs.txt
file Drer_adultbrain_gtf-idmm.tsv already exists at /home/ec2-user/glial-origins/output/Drer_adultbrain/Drer_adultbrain_gtf-idmm.tsv
file GCF_000002035.5_GRCz10_genomic_cDNA.fna already exists at /home/ec2-user/glial-origins/output/Drer_adultbrain/GCF_000002035.5_GRCz10_genomic_cDNA.fna
file GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecoder.bed

# 2. Get gene list from UniProt using Taxid
Also, set the taxid for each species BioFileDocket.

In [7]:
taxid_dict = {
    'Drer': 7955,
    'Mmus': 10090,
    'Xlae': 8355
}

for pre in sample_MSD.species_BioFileDockets:
    taxid = taxid_dict[pre]
    sample_MSD.species_BioFileDockets[pre].set_taxid(taxid)
    uniprot_genes = UniProtTaxidListFile(
        filename = pre + '_taxid_genes.txt',
        sampledict = sample_MSD.species_BioFileDockets[pre].sampledict,
        taxid = taxid_dict[pre]
    )
    sample_MSD.species_BioFileDockets[pre].add_keyfile('taxid_genes', uniprot_genes)
    
    display(pd.read_csv(sample_MSD.species_BioFileDockets[pre].taxid_genes.path, sep = '\t'))

overwriting taxid
key "taxid_genes" already exists, ignoring


Unnamed: 0,Entry,Organism (ID),Organism
0,A0A0D5W690,7955,Danio rerio (Zebrafish) (Brachydanio rerio)
1,A0A0G2KQY6,7955,Danio rerio (Zebrafish) (Brachydanio rerio)
2,A0A0G2KTI4,7955,Danio rerio (Zebrafish) (Brachydanio rerio)
3,A0A0G2KYH9,7955,Danio rerio (Zebrafish) (Brachydanio rerio)
4,A0A0G2L7I0,7955,Danio rerio (Zebrafish) (Brachydanio rerio)
...,...,...,...
52322,Z4YHP6,7955,Danio rerio (Zebrafish) (Brachydanio rerio)
52323,Z4YHQ6,7955,Danio rerio (Zebrafish) (Brachydanio rerio)
52324,Z4YHZ1,7955,Danio rerio (Zebrafish) (Brachydanio rerio)
52325,Z4YJ74,7955,Danio rerio (Zebrafish) (Brachydanio rerio)


overwriting taxid
key "taxid_genes" already exists, ignoring


Unnamed: 0,Entry,Organism (ID),Organism
0,A0A075F5C6,10090,Mus musculus (Mouse)
1,A0A087WPF7,10090,Mus musculus (Mouse)
2,A0A087WPU4,10090,Mus musculus (Mouse)
3,A0A087WRK1,10090,Mus musculus (Mouse)
4,A0A087WRT4,10090,Mus musculus (Mouse)
...,...,...,...
88529,Z4YN82,10090,Mus musculus (Mouse)
88530,Z4YN86,10090,Mus musculus (Mouse)
88531,Z4YN92,10090,Mus musculus (Mouse)
88532,Z4YN97,10090,Mus musculus (Mouse)


overwriting taxid


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 5678k    0 5678k    0     0  43673      0 --:--:--  0:02:13 --:--:--  100k

key "taxid_genes" already exists, ignoring


100 5724k    0 5724k    0     0  43743      0 --:--:--  0:02:14 --:--:-- 69191


Unnamed: 0,Entry,Organism (ID),Organism
0,A0A191ZDL1,8355,Xenopus laevis (African clawed frog)
1,A0A1L8ETL3,8355,Xenopus laevis (African clawed frog)
2,A0A1L8F1P8,8355,Xenopus laevis (African clawed frog)
3,A0A1L8F5J9,8355,Xenopus laevis (African clawed frog)
4,A0A1L8FDW4,8355,Xenopus laevis (African clawed frog)
...,...,...,...
111802,W8W3R8,8355,Xenopus laevis (African clawed frog)
111803,X2FP34,8355,Xenopus laevis (African clawed frog)
111804,X5FKY5,8355,Xenopus laevis (African clawed frog)
111805,X5FX98,8355,Xenopus laevis (African clawed frog)


# 3. Download files from all species into a single directory

In [9]:
output_folder = sample_MSD.directory
alphafold_folder = output_folder + 'alphafold/'
if not os.path.exists(alphafold_folder):
    os.mkdir(alphafold_folder)

for pre in sample_MSD.species_BioFileDockets:
    taxid = sample_MSD.species_BioFileDockets[pre].metadata.taxid
    target_files = 'gs://public-datasets-deepmind-alphafold/proteomes/proteome-tax_id-' + taxid + '-*'
    
    checkpoint_filepath = output_folder + taxid + '.checkpoint'
    
    if not os.path.exists(checkpoint_filepath):
        subprocess.run(['gsutil', '-m', 'cp', target_files, output_folder])
        print('Download and untar for taxid', taxid, 'completed;\ncheckpoint file created at', checkpoint_filepath)
        
        filepaths = [output_folder + file for file in os.listdir(output_folder) if '.tar' in file]
        for file in filepaths:
            # Important to put all of the untar'd files into a separate folder
            # Otherwise filesystem stutters b/c too many files in a single directory
            subprocess.run(['tar', '-xf', file, '--directory', alphafold_folder])
        
        subprocess.run(['touch', checkpoint_filepath])
    else:
        print('Download and untar for taxid', taxid, 'has already been processed at', output_folder)

Download and untar for taxid 7955 has already been processed at /home/ec2-user/glial-origins/output/DrerMmusXlae_adultbrain_FoldSeek/
Download and untar for taxid 10090 has already been processed at /home/ec2-user/glial-origins/output/DrerMmusXlae_adultbrain_FoldSeek/
Download and untar for taxid 8355 has already been processed at /home/ec2-user/glial-origins/output/DrerMmusXlae_adultbrain_FoldSeek/


In [10]:
db_folder = output_folder + 'all_foldomesDB'
foldseek_clustertsv = output_folder + 'clu_greedy.tsv'

if not os.path.exists(foldseek_clustertsv):

    subprocess.run(['foldseek', 'createdb', alphafold_folder, db_folder])

    foldseek_out = output_folder + 'all_by_all'
    foldseek_tmp = output_folder + 'tmp'
    subprocess.run(['foldseek', 'search', db_folder, db_folder, foldseek_out, foldseek_tmp, '-a'])

    foldseek_tmscore = output_folder + 'all_by_all_tmscore'
    subprocess.run(['foldseek', 'aln2tmscore', db_folder, db_folder, foldseek_out, foldseek_tmscore])

    foldseek_tsv = foldseek_tmscore + '.tsv'
    subprocess.run(['foldseek', 'createtsv', db_folder, db_folder, foldseek_tmscore, foldseek_tsv])
    
    foldseek_cluster = output_folder + 'clu'
    subprocess.run(['foldseek', 'clust', db_folder, foldseek_out, foldseek_cluster, '--cluster-mode', '0', '--similarity-type', '2'])
    
    subprocess.run(['foldseek', 'createtsv', db_folder, db_folder, foldseek_cluster, foldseek_clustertsv])

else:
    print('final cluster tsv file already found at', foldseek_clustertsv)

final cluster tsv file already found at /home/ec2-user/glial-origins/output/DrerMmusXlae_adultbrain_FoldSeek/clu_greedy.tsv


In [16]:
df = pd.read_csv(foldseek_clustertsv, sep = '\t', names = ['ClusterRep', 'uniprot_id'])
df['ClusterRep'] = df['ClusterRep'].str.split('-', expand = True)[1]
df['uniprot_id'] = df['uniprot_id'].str.split('-', expand = True)[1]

df_merged = df.groupby('ClusterRep').agg({i: ('first' if i == 'ClusterRep' else lambda x: [i for i in x]) for i in df.columns}).reset_index(drop = True)
df_merged.drop(columns = ['ClusterRep'], inplace = True)
df_merged.insert(0, 'StruCluster', 'SC' + df_merged.index.astype('str'))

for pre in taxid_dict:
    species_genes = pd.read_csv(sample_MSD.species_BioFileDockets[pre].taxid_genes.path, sep = '\t')
    species_genelist = set(species_genes['Entry'])
    df_merged[pre] = df_merged['uniprot_id'].apply(lambda x: ','.join(set(x).intersection(species_genelist)))

struclusters_df = df_merged.drop(columns = ['uniprot_id'])
struclusters_filename = '_'.join([sample_MSD.species_concat, sample_MSD.global_conditions, 'struclusters_file.tsv'])

struclusters_file = FoldSeekOutputFile(
    filename = struclusters_filename,
    species_dict = sample_MSD.species_dict,
    sampledict = sample_MSD.sampledict
)

struclusters_df.to_csv(struclusters_file.path, sep = '\t', index = None)

display(struclusters_df)

Unnamed: 0,StruCluster,Drer,Mmus,Xlae
0,SC0,,"A0A023NE65,A0A4P2VRD2,A0A023NE78,B3Y998",
1,SC1,"Q4KMH7,E7FGK1,A0A1D5NSW6,E7EXY5,A0A2R8RMY7","D3Z4J9,A0A024A3D6,Q7TNG5,A0A0R4J1G7,A0A024A2Q0...","Q5U577,A0A1L8G1A2,A0A1L8GJX2,Q2TAF3"
2,SC2,,A0A024CD92,
3,SC3,,"A0A024CDU6,A0A024CD39,A0A024CEZ3",
4,SC4,,,A0A060CJT7
...,...,...,...,...
36646,SC36646,,Z4YN28,
36647,SC36647,,Z4YN31,
36648,SC36648,"A5D8S0,A0A2R8PWB6","Q3UXL4,Z4YN37","A0A1L8G1N3,A1L2H3"
36649,SC36649,,Z4YN77,


# 4. Generate Orthofinder_exc file from gxc file of each starting dataset

In [17]:
gxc_to_exc(sample_MSD = sample_MSD,
           embedding_df = struclusters_df, 
           exc_file = struclusters_file)

Using StruCluster embeddings as expected from FoldSeek
StruCluster_excfile already exists at /home/ec2-user/glial-origins/output/Drer_adultbrain/GSM3768152_Brain_8_dge_asStruCluster.txt skipping
StruCluster_excfile already exists at /home/ec2-user/glial-origins/output/Mmus_adultbrain/GSM2906405_Brain1_dge_coerced_asStruCluster.txt skipping
StruCluster_excfile already exists at /home/ec2-user/glial-origins/output/Xlae_adultbrain/GSM6214268_Xenopus_brain_COL65_dge_asStruCluster.txt skipping


# 5. Generate merged exc file for all samples in dataset

In [20]:
collector_df = pd.DataFrame()

export_filename = '_'.join([sample_MSD.species_concat, sample_MSD.global_conditions, 'asStruCluster', 'combined_exc.tsv'])

jointexc = JointExcFile(
    species_dict = sample_MSD.species_dict,
    sampledict = sample_MSD.sampledict,
    filename = export_filename,
    embedding = 'StruCluster',
    sources = [sample_MSD.species_BioFileDockets[pre].StruCluster_exc for pre in sample_MSD.species_BioFileDockets]
)

if not os.path.exists(jointexc.path):

    # Iterates through all of the species in the Species BioFileDocket
    for i, pre in enumerate(sample_MSD.species_BioFileDockets.keys()):
    
        df = pd.read_csv(sample_MSD.species_BioFileDockets[pre].StruCluster_exc.path, sep = '\t')
        df = df.add_prefix(pre + '_')
        df.rename(columns = {pre + '_' + 'StruCluster': 'StruCluster'}, inplace = True)
    
        if i == 0:
            collector_df = df
        else:
            collector_df = collector_df.merge(df, on = 'StruCluster', how = 'inner')

    collector_df.to_csv(jointexc.path, sep = '\t', index = None)

# Iterates through all of the species in the Species BioFileDocket
for pre in sample_MSD.species_BioFileDockets.keys():
    sample_MSD.species_BioFileDockets[pre].add_keyfile('StruCluster_jointexc', jointexc)

# 6. Pickle the species_BioFileDocket files

In [21]:
for pre in sample_MSD.species_BioFileDockets.keys():
    sample_MSD.species_BioFileDockets[pre].local_to_s3()
    sample_MSD.species_BioFileDockets[pre].pickle()
    sample_MSD.species_BioFileDockets[pre].push_to_s3(overwrite = True)

GCF_000002035.5_GRCz10_genomic.gff already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000002035.5_GRCz10_genomic.fna already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GSM3768152_Brain_8_dge.txt already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
Drer_adultbrain_ZFIN_UniProtIDs.txt already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
Drer_adultbrain_gtf-idmm.tsv already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000002035.5_GRCz10_genomic_cDNA.fna already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdecoder.bed already exists in S3 bucket, skipping upload. set overwrite = True to overwrite the existing file.
GCF_000002035.5_GRCz10_genomic_cDNA.fna.transdec

In [22]:
sample_MSD.pickle()
sample_MSD.push_to_s3()

upload: ../../output/DrerMmusXlae_adultbrain_FoldSeek/DrerMmusXlae_adultbrain_FoldSeek_MultiSpeciesBioFileDocket.pkl to s3://arcadia-reference-datasets/glial-origins-pkl/DrerMmusXlae_adultbrain_FoldSeek_MultiSpeciesBioFileDocket.pkl
