In [1]:
import os
import re
import yaml
import argparse
import pandas as pd
from Bio import SeqIO

def parse_arguments() -> argparse.Namespace:
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--config',
        type=argparse.FileType(mode='r'),
        default='config.yaml', 
        help='The config file to use. Must be placed in the root folder.',
    )

    args, unknown = parser.parse_known_args()
    if args.config:
        data = yaml.load(args.config, Loader=yaml.FullLoader)
        arg_dict = args.__dict__

        for key, value in data.items():
            arg_dict[key] = value

    return args

args = parse_arguments()

acc_to_name = dict()

ref_species_path = os.path.join('inputs/reference/',
                                [f for f in os.listdir('inputs/reference/') if f.startswith(args.species) and not f.endswith('.dmnd')][0])

reference_records = SeqIO.parse(ref_species_path, 'fasta')

gene_name_re = r'\[gene=(.*?)\]'

for record in reference_records:
    gene_name_match = re.search(gene_name_re, record.description)

    if gene_name_match:
        gene_name = gene_name_match.group(1)
        prot_id = record.id

        acc_to_name[prot_id] = gene_name


num_files = len(os.listdir('outputs'))

ortho = pd.read_csv("Orthogroups.tsv", sep='\t').set_index('Orthogroup')

In [2]:
acc_to_name

{'NP_009673.1': 'LYS2',
 'NP_010290.3': 'TRP1',
 'NP_010851.1': 'CAN1',
 'NP_010893.3': 'URA3',
 'NP_012965.3': 'GAP1',
 'NP_015387.1': 'FCY1'}

In [3]:
ortho['aspergillus_burnettii']

KeyError: 'aspergillus_burnettii'

In [4]:
def find_cols_with_x_not_found(df, x):
    return df.columns[df.eq('Not found').sum() == x].tolist()

d = dict()
missing_in = dict()

d = {key: [] for key in acc_to_name}
missing_in = {key: [] for key in acc_to_name}


In [5]:
not_found_counts = ortho.eq('Not found').sum()

In [6]:
not_found_counts

asterophora_parasitica                  3
lentinula_novae_zelandiae               2
echinodontium_tinctorium_aho_80_v1_0    3
pyrenophora_tritici_repentis            2
aspergillus_albertensis_v1_0            1
                                       ..
hericium_coralloides_fp_101451_v1_0     3
aspergillus_nutans_cbs_121_56_v1_0      1
clitocybe_gibba_ijfm_a808_v1_0          2
penicillium_egyptiacum                  1
monilinia_fructigena_str_mfrg269        1
Length: 2310, dtype: int64

In [7]:
for species in [e for e in ortho.columns.tolist()]:
    df = pd.read_csv(f'outputs/reciprocal/{species}.tsv', sep='\t', names=['scer_gene', 'ortho', 'identity', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'])

    for gene, l in d.items():
        if ortho[species][gene] != 'Not found' and gene in df['scer_gene'].values:
            d[gene].append(df[df['scer_gene'] == gene]['identity'].values[0])
        else:
            missing_in[gene].append(species)
            
total_missing = 0
for gene, missing_in_species in missing_in.items():
    length = len(missing_in_species)
    print(f'Gene {gene} {acc_to_name[gene]} is missing its ortholog in {length} species')
    total_missing += length

print(f'Total missing orthologs {total_missing}')

Gene NP_009673.1 LYS2 is missing its ortholog in 46 species
Gene NP_010290.3 TRP1 is missing its ortholog in 2128 species
Gene NP_010851.1 CAN1 is missing its ortholog in 1175 species
Gene NP_010893.3 URA3 is missing its ortholog in 72 species
Gene NP_012965.3 GAP1 is missing its ortholog in 570 species
Gene NP_015387.1 FCY1 is missing its ortholog in 838 species
Total missing orthologs 4829


## Mark orthologs with low (< 30%) identity as "Not found"

In [8]:
ortho_high_identity = ortho.copy(deep=True)

for species in ortho.columns.tolist():
    df = pd.read_csv(f'outputs/reciprocal/{species}.tsv', sep='\t')
    low_identities = df[df['identity'].astype(int) < 29]['query'].tolist()
    ortho_high_identity.loc[ortho_high_identity.index.isin(low_identities), species] = 'Not found'

ortho_high_identity

Unnamed: 0_level_0,asterophora_parasitica,lentinula_novae_zelandiae,echinodontium_tinctorium_aho_80_v1_0,pyrenophora_tritici_repentis,aspergillus_albertensis_v1_0,trichoderma_cremeoides_cbs_131486_v1_0,talaromyces_stipitatus,aspergillus_iizukae_cbs_541_69_v1_0,microthyrium_microscopicum,pseudovirgaria_hyperparasitica,...,flavomyces_fulophazii_dse8309_v1_0,trichoderma_sinense_daom_230004_v1_0,fusarium_tjaetaba,galerina_marginata_cbs_339_88,neocallimastix_constans_g3_v1_0,hericium_coralloides_fp_101451_v1_0,aspergillus_nutans_cbs_121_56_v1_0,clitocybe_gibba_ijfm_a808_v1_0,penicillium_egyptiacum,monilinia_fructigena_str_mfrg269
Orthogroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NP_009673.1,KAG5643080.1,KAJ3864078.1,fgenesh1_kg.17_#_321_#_TRINITY_DN20072_c0_g1_i2,XP_001935976.1,CE60723_2593,fgenesh1_kg.1_#_304_#_TRINITY_DN64_c3_g2_i1,XP_002480033.1,CE153030_3531,KAF2666154.1,XP_033596428.1,...,fgenesh1_kg.93_#_58_#_TRINITY_DN4977_c3_g2_i2,CE149144_25113,XP_037212304.1,KDR72580,e_gw1.13.106.1,CE352478_12376,fgenesh1_kg.8_#_608_#_TRINITY_DN11024_c0_g1_i7,fgenesh1_pm.31_#_47,CAG8904822.1,RAL64359
NP_010290.3,Not found,Not found,Not found,Not found,Not found,Not found,Not found,Not found,Not found,Not found,...,Not found,Not found,Not found,Not found,Not found,Not found,Not found,Not found,Not found,Not found
NP_010851.1,Not found,Not found,Not found,XP_001932482.1,CE53660_1995,fgenesh1_kg.8_#_666_#_TRINITY_DN5445_c0_g2_i1,Not found,CE228629_2288,Not found,XP_033604410.1,...,fgenesh1_kg.35_#_251_#_TRINITY_DN5543_c0_g1_i4,CE43583_1555,Not found,KDR69137,Not found,Not found,CE166465_8796,Not found,CAG8907289.1,RAL64634
NP_010893.3,KAG5648969.1,KAJ3869448.1,fgenesh1_kg.20_#_209_#_TRINITY_DN19773_c0_g2_i1,XP_001940698.2,fgenesh1_kg.18_#_167_#_TRINITY_DN323_c0_g1_i1,CE62015_19051,XP_002478821.1,fgenesh1_kg.3_#_607_#_TRINITY_DN1531_c0_g2_i1,KAF2665404.1,XP_033599265.1,...,e_gw1.15.312.1,CE282625_9301,XP_037207718.1,KDR83643,estExt_Genewise1Plus.C_400096,fgenesh1_kg.19_#_334_#_TRINITY_DN4771_c0_g5_i1,CE85965_321,e_gw1.2.862.1,CAG8889705.1,RAL62221
NP_012965.3,KAG5641088.1,KAJ3865198.1,Not found,XP_001933206.1,fgenesh1_kg.49_#_110_#_TRINITY_DN4837_c0_g1_i3,CE137514_5108,XP_002478659.1,fgenesh1_pm.2_#_44,KAF2671377.1,XP_033599698.1,...,fgenesh1_kg.34_#_99_#_TRINITY_DN242_c0_g1_i4,CE307390_2479,XP_037203345.1,KDR79443,Not found,Not found,fgenesh1_kg.69_#_19_#_TRINITY_DN9768_c0_g1_i1,e_gw1.121.4.1,CAG8903919.1,RAL59412
NP_015387.1,Not found,KAJ3862919.1,CE133509_1510,Not found,CE3665_142,Not found,XP_002484082.1,CE278982_806,KAF2674182.1,XP_033596706.1,...,fgenesh1_kg.7_#_472_#_TRINITY_DN6029_c0_g1_i2,Not found,Not found,KDR73199,Not found,CE16674_1415,gm1.281_g,gm1.2445_g,CAG8893190.1,RAL66932


## Remove species with all missing orthologs

In [9]:
cols_with_all_not_found = find_cols_with_x_not_found(ortho_high_identity, len(ortho_high_identity))
len(cols_with_all_not_found)

40

In [10]:
ortho_high_identity.drop(cols_with_all_not_found, axis=1, inplace=True)

ortho_high_identity

Unnamed: 0_level_0,asterophora_parasitica,lentinula_novae_zelandiae,echinodontium_tinctorium_aho_80_v1_0,pyrenophora_tritici_repentis,aspergillus_albertensis_v1_0,trichoderma_cremeoides_cbs_131486_v1_0,talaromyces_stipitatus,aspergillus_iizukae_cbs_541_69_v1_0,microthyrium_microscopicum,pseudovirgaria_hyperparasitica,...,flavomyces_fulophazii_dse8309_v1_0,trichoderma_sinense_daom_230004_v1_0,fusarium_tjaetaba,galerina_marginata_cbs_339_88,neocallimastix_constans_g3_v1_0,hericium_coralloides_fp_101451_v1_0,aspergillus_nutans_cbs_121_56_v1_0,clitocybe_gibba_ijfm_a808_v1_0,penicillium_egyptiacum,monilinia_fructigena_str_mfrg269
Orthogroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NP_009673.1,KAG5643080.1,KAJ3864078.1,fgenesh1_kg.17_#_321_#_TRINITY_DN20072_c0_g1_i2,XP_001935976.1,CE60723_2593,fgenesh1_kg.1_#_304_#_TRINITY_DN64_c3_g2_i1,XP_002480033.1,CE153030_3531,KAF2666154.1,XP_033596428.1,...,fgenesh1_kg.93_#_58_#_TRINITY_DN4977_c3_g2_i2,CE149144_25113,XP_037212304.1,KDR72580,e_gw1.13.106.1,CE352478_12376,fgenesh1_kg.8_#_608_#_TRINITY_DN11024_c0_g1_i7,fgenesh1_pm.31_#_47,CAG8904822.1,RAL64359
NP_010290.3,Not found,Not found,Not found,Not found,Not found,Not found,Not found,Not found,Not found,Not found,...,Not found,Not found,Not found,Not found,Not found,Not found,Not found,Not found,Not found,Not found
NP_010851.1,Not found,Not found,Not found,XP_001932482.1,CE53660_1995,fgenesh1_kg.8_#_666_#_TRINITY_DN5445_c0_g2_i1,Not found,CE228629_2288,Not found,XP_033604410.1,...,fgenesh1_kg.35_#_251_#_TRINITY_DN5543_c0_g1_i4,CE43583_1555,Not found,KDR69137,Not found,Not found,CE166465_8796,Not found,CAG8907289.1,RAL64634
NP_010893.3,KAG5648969.1,KAJ3869448.1,fgenesh1_kg.20_#_209_#_TRINITY_DN19773_c0_g2_i1,XP_001940698.2,fgenesh1_kg.18_#_167_#_TRINITY_DN323_c0_g1_i1,CE62015_19051,XP_002478821.1,fgenesh1_kg.3_#_607_#_TRINITY_DN1531_c0_g2_i1,KAF2665404.1,XP_033599265.1,...,e_gw1.15.312.1,CE282625_9301,XP_037207718.1,KDR83643,estExt_Genewise1Plus.C_400096,fgenesh1_kg.19_#_334_#_TRINITY_DN4771_c0_g5_i1,CE85965_321,e_gw1.2.862.1,CAG8889705.1,RAL62221
NP_012965.3,KAG5641088.1,KAJ3865198.1,Not found,XP_001933206.1,fgenesh1_kg.49_#_110_#_TRINITY_DN4837_c0_g1_i3,CE137514_5108,XP_002478659.1,fgenesh1_pm.2_#_44,KAF2671377.1,XP_033599698.1,...,fgenesh1_kg.34_#_99_#_TRINITY_DN242_c0_g1_i4,CE307390_2479,XP_037203345.1,KDR79443,Not found,Not found,fgenesh1_kg.69_#_19_#_TRINITY_DN9768_c0_g1_i1,e_gw1.121.4.1,CAG8903919.1,RAL59412
NP_015387.1,Not found,KAJ3862919.1,CE133509_1510,Not found,CE3665_142,Not found,XP_002484082.1,CE278982_806,KAF2674182.1,XP_033596706.1,...,fgenesh1_kg.7_#_472_#_TRINITY_DN6029_c0_g1_i2,Not found,Not found,KDR73199,Not found,CE16674_1415,gm1.281_g,gm1.2445_g,CAG8893190.1,RAL66932


In [11]:
ortho_high_identity = ortho_high_identity.reset_index()
ortho_high_identity.to_csv('Orthogroups_HI.tsv', sep='\t', index=False)

In [12]:
ortho_high_identity['kluyveromyces_marxianus']

0    XP_022678348.1
1    XP_022677214.1
2    XP_022673643.1
3    XP_022674242.1
4    XP_022673914.1
5    XP_022678383.1
Name: kluyveromyces_marxianus, dtype: object