In [4]:
from Bio import SeqIO
import pandas as pd
import re
import os

In [5]:
# Set the working directory so that every relative path used below
# ('Desktop/funpath/...') resolves from the same place.
# NOTE(review): this is a machine-specific absolute path; os.chdir raises if the
# directory does not exist, so the notebook only runs where this path is present.
os.chdir('/Users/Andi')
print(os.getcwd())

/Users/Andi


In [6]:
# Load the OrthoMCL (OG6) ID <-> gene name tables, one tab-separated file per species.
orth_caurisdf, orth_chaedf, orth_calbicansdf, orth_sceredf = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        'Desktop/funpath/DB/orth_genes/cauris_OG6_genes.tsv',
        'Desktop/funpath/DB/orth_genes/chaemulonii_OG6_genes.tsv',
        'Desktop/funpath/DB/orth_genes/calbicans_OG6_genes.tsv',
        'Desktop/funpath/DB/orth_genes/scerevisiae_OG6_genes.tsv',
    )
)

In [7]:
# Preview the C. albicans OrthoMCL table to check its columns.
orth_calbicansdf

Unnamed: 0,Accession,Gene Names
0,OG6r21_167609,C100010WA
1,OG6r21_119614,C100020CA
2,OG6r21_119615,C100030CA
3,OG6_115022,C100040WA
4,OG6r21_139948,C100050CA
...,...,...
6207,OG6_102452,CR10820WA
6208,OG6_123089,CR10830CA
6209,OG6_100349,CR10840CA
6210,OG6_101225,CR10850CA


In [8]:
# Load the current annotation tables, one tab-separated file per species.
# The columns of interest are the UniProt entry IDs and the gene names.
caurisdf, chaedf, calbicansdf, sceredf = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        'Desktop/funpath/DB/annotations/cauris_annotations_2025_05_08.tsv',
        'Desktop/funpath/DB/annotations/chaemulonii_annotations_2025_05_06.tsv',
        'Desktop/funpath/DB/annotations/calbicans_annotations_2025_05_06.tsv',
        'Desktop/funpath/DB/annotations/scerevisiae_annotations_2025_05_06.tsv',
    )
)

In [9]:
# Preview the C. albicans annotation table to check its columns.
calbicansdf

Unnamed: 0,Entry,Gene Names,Protein names,Mass
0,A0A1D8PCA8,CMP1 CAALFM_C100730CA orf19.13454,Serine/threonine-protein phosphatase (EC 3.1.3...,69624
1,A0A1D8PCC3,SIN3 CAALFM_C100930CA orf19.13432,Transcriptional regulator,159985
2,A0A1D8PCL1,HGT1 HGT11 CAALFM_C101980WA orf19.4527,High-affinity glucose transporter 1,60670
3,A0A1D8PCN0,GAL10 CAALFM_C102150WA orf19.11156,Bifunctional UDP-glucose 4-epimerase/aldose 1-...,75451
4,A0A1D8PCV9,PDE2 CAALFM_C102840WA orf19.10489,Phosphodiesterase (EC 3.1.4.-),65514
...,...,...,...,...
6036,Q5APF3,PPT2 CAALFM_C109480WA orf19.12275,Holo-[acyl-carrier-protein] synthase,15782
6037,Q5APH1,CAALFM_C109330WA orf19.12257,Translation machinery-associated protein 16,23335
6038,Q5APH8,FGR43 CAALFM_C109270WA orf19.12250,Fgr43p,62930
6039,Q5API4,CAALFM_C109220WA orf19.12244,Major facilitator superfamily (MFS) profile do...,49746


In [10]:
# Clean the OrthoMCL accessions: delete everything up to and including the last '|'.
for _orth_table in (orth_caurisdf, orth_chaedf, orth_calbicansdf, orth_sceredf):
    _orth_table['Accession'] = _orth_table['Accession'].str.replace(r'^.*\|', '', regex=True)

In [11]:
# Make the C. albicans gene names comparable across the two dataframes.
# Replacing '_' with a space turns a name such as 'CAALFM_C100730CA' into two
# whitespace-separated tokens, so the part after the underscore becomes a token
# of its own when the names are later split on whitespace.
# Disabled alternative, kept for reference:
#orth_calbicansdf['Accession'] = orth_calbicansdf['Accession'].str.replace('_', '', regex=False)
calbicansdf['Gene Names'] = calbicansdf['Gene Names'].str.replace('_', ' ', regex=False)

In [12]:
def merge_on_token_match(orthomcldf, GeneName, annotationdf, Gene_Names):
    """
    Inner-join two tables wherever a value in ``orthomcldf[GeneName]`` equals any
    whitespace-separated token of ``annotationdf[Gene_Names]``.

    Parameters:
        orthomcldf (pd.DataFrame): Table holding one identifier per row.
        GeneName (str): Column in ``orthomcldf`` holding the identifier to match.
        annotationdf (pd.DataFrame): Table whose ``Gene_Names`` column holds
            whitespace-separated names.
        Gene_Names (str): Column in ``annotationdf`` to split into tokens.

    Returns:
        pd.DataFrame: One row per (orthomcldf row, annotationdf row) pair sharing a
        token: the columns of ``orthomcldf`` followed by the columns of
        ``annotationdf`` other than ``Gene_Names``. Column names present in both
        inputs receive pandas' default ``_x``/``_y`` suffixes. Neither input is
        modified.
    """
    # Pick a temporary key name that cannot clash with an existing column.
    token_col = 'token'
    while token_col in orthomcldf.columns or token_col in annotationdf.columns:
        token_col = '_' + token_col

    # One row per token; the other annotation columns are repeated for each token.
    tokens = annotationdf.copy()
    tokens[Gene_Names] = tokens[Gene_Names].str.split()
    tokens = tokens.explode(Gene_Names).rename(columns={Gene_Names: token_col})

    # Rows without any gene name explode to a missing token. pandas pairs missing
    # keys with each other during a merge, so drop them before joining.
    tokens = tokens.dropna(subset=[token_col])

    merged = pd.merge(orthomcldf, tokens, left_on=GeneName, right_on=token_col, how='inner')

    # The previous follow-up left-merge on a hard-coded 'Gene Names' column added no
    # columns and could only duplicate rows, so the inner join is the final result.
    return merged.drop(columns=token_col)

In [13]:
# Join each species' OrthoMCL table to its annotation table on the gene-name tokens.
(
    calbicans_gene_orth_uniprot,
    cauris_gene_orth_uniprot,
    chae_gene_orth_uniprot,
    scere_gene_orth_uniprot,
) = (
    merge_on_token_match(orth_table, 'Gene Names', annotation_table, 'Gene Names')
    for orth_table, annotation_table in (
        (orth_calbicansdf, calbicansdf),
        (orth_caurisdf, caurisdf),
        (orth_chaedf, chaedf),
        (orth_sceredf, sceredf),
    )
)


In [14]:
# Preview the merged C. albicans table (OrthoMCL accession, gene name, annotation columns).
calbicans_gene_orth_uniprot

Unnamed: 0,Accession,Gene Names,Entry,Protein names,Mass
0,OG6r21_119614,C100020CA,Q5AB58,Uncharacterized protein,11597
1,OG6r21_119615,C100030CA,Q5AB59,Uncharacterized protein,11776
2,OG6_115022,C100040WA,A0A1D8PC38,Cta2p,29210
3,OG6_108387,C100060WA,P0CY34,Transcriptional repressor TUP1,57603
4,OG6_102327,C100070WA,A0A1D8PC43,Diphosphomevalonate decarboxylase (EC 4.1.1.33...,39544
...,...,...,...,...,...
6025,OG6_102452,CR10820WA,A0A1D8PUC1,Small nuclear ribonucleoprotein Sm D1 (snRNP c...,12611
6026,OG6_123089,CR10830CA,A0A1D8PUC5,Mitochondrial 54S ribosomal protein YmL25,18506
6027,OG6_100349,CR10840CA,A0A1D8PUB4,L-iditol 2-dehydrogenase,38771
6028,OG6_101225,CR10850CA,A0A1D8PUD2,"ATP synthase subunit delta, mitochondrial (F-A...",17582


In [12]:
# Save the merged OrthoMCL/UniProt tables together with their annotation columns.
# NOTE(review): the '202050513' stamp in these file names has nine digits and is
# possibly a typo for 20250513; it is left unchanged because other code may read
# these exact paths.
for _table, _tsv_path in (
    (calbicans_gene_orth_uniprot, 'Desktop/funpath/DB/annotations/calbicans_orth_annotations_202050513.tsv'),
    (cauris_gene_orth_uniprot, 'Desktop/funpath/DB/annotations/cauris_orth_annotations_202050513.tsv'),
    (chae_gene_orth_uniprot, 'Desktop/funpath/DB/annotations/chae_orth_annotations_202050513.tsv'),
    (scere_gene_orth_uniprot, 'Desktop/funpath/DB/annotations/scere_orth_annotations_202050513.tsv'),
):
    _table.to_csv(_tsv_path, sep='\t', index=False)


In [15]:
# Read the OrthoMCL ID <-> gene name files again into a separate set of names
# (same paths as the orth_* tables, but without the accession clean-up applied).
orthuni_caurisdf, orthuni_chaedf, orthuni_calbicansdf, orthuni_sceredf = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        'Desktop/funpath/DB/orth_genes/cauris_OG6_genes.tsv',
        'Desktop/funpath/DB/orth_genes/chaemulonii_OG6_genes.tsv',
        'Desktop/funpath/DB/orth_genes/calbicans_OG6_genes.tsv',
        'Desktop/funpath/DB/orth_genes/scerevisiae_OG6_genes.tsv',
    )
)

In [16]:
# Drop the 'Mass' and 'Protein names' annotation columns; every other column is kept.
# Reassigning with errors='ignore' (instead of inplace=True) makes this cell safe to
# re-run: a second execution finds the columns already gone and does nothing rather
# than raising KeyError.
calbicans_gene_orth_uniprot = calbicans_gene_orth_uniprot.drop(columns=['Mass', 'Protein names'], errors='ignore')
cauris_gene_orth_uniprot = cauris_gene_orth_uniprot.drop(columns=['Mass', 'Protein names'], errors='ignore')
chae_gene_orth_uniprot = chae_gene_orth_uniprot.drop(columns=['Mass', 'Protein names'], errors='ignore')
scere_gene_orth_uniprot = scere_gene_orth_uniprot.drop(columns=['Mass', 'Protein names'], errors='ignore')

In [15]:
# Save the gene name / OrthoMCL / UniProt tables, one tab-separated file per species.
# NOTE(review): the C. auris file name says 'genegen' where the others say 'gene',
# and all four end in 'unipro'; left unchanged because other code may read these
# exact paths.
for _table, _tsv_path in (
    (calbicans_gene_orth_uniprot, 'Desktop/funpath/DB/gene_orth_uniprot/calbicans_gene_og6orth_unipro.tsv'),
    (cauris_gene_orth_uniprot, 'Desktop/funpath/DB/gene_orth_uniprot/cauris_genegen_og6orth_unipro.tsv'),
    (chae_gene_orth_uniprot, 'Desktop/funpath/DB/gene_orth_uniprot/chae_gene_og6orth_unipro.tsv'),
    (scere_gene_orth_uniprot, 'Desktop/funpath/DB/gene_orth_uniprot/scere_gene_og6orth_unipro.tsv'),
):
    _table.to_csv(_tsv_path, sep='\t', index=False)


In [17]:
# Build UniProt-vs-OrthoMCL tables by removing the gene name column from each merged table.
calbicans_orth_uniprot, cauris_orth_uniprot, chae_orth_uniprot, scere_orth_uniprot = (
    merged_table.drop(columns='Gene Names')
    for merged_table in (
        calbicans_gene_orth_uniprot,
        cauris_gene_orth_uniprot,
        chae_gene_orth_uniprot,
        scere_gene_orth_uniprot,
    )
)


In [17]:
# Save the UniProt-vs-OrthoMCL tables, one tab-separated file per species.
for _table, _tsv_path in (
    (calbicans_orth_uniprot, 'Desktop/funpath/DB/orth_uniprot/calbicans_og6orth_uniprot.tsv'),
    (cauris_orth_uniprot, 'Desktop/funpath/DB/orth_uniprot/cauris_og6orth_uniprot.tsv'),
    (chae_orth_uniprot, 'Desktop/funpath/DB/orth_uniprot/chae_og6orth_uniprot.tsv'),
    (scere_orth_uniprot, 'Desktop/funpath/DB/orth_uniprot/scere_og6orth_uniprot.tsv'),
):
    _table.to_csv(_tsv_path, sep='\t', index=False)


In [55]:
# Preview the S. cerevisiae OrthoMCL table to check its columns.
orth_sceredf

Unnamed: 0,Gene Names,Accession
0,Q0045,OG6_102770
1,Q0050,OG6_129836
2,Q0055,OG6_129836
3,Q0060,OG6_181774
4,Q0065,OG6_102770
...,...,...
5902,YPR200C,OG6_119686
5903,YPR201W,OG6_105259
5904,YPR202W,OG6_108906
5905,YPR203W,OG6_108906


In [18]:
def improved_switch_function(entry_list, df):
    """
    Translate Entry IDs into the Accession values paired with them in ``df``.

    Parameters:
        entry_list (str or list): Whitespace-separated IDs in one string, or a list of IDs.
        df (pd.DataFrame): Table with 'Entry' and 'Accession' columns.

    Returns:
        list: One item per input ID, in input order: the matching Accession,
        or None when the ID is absent from ``df['Entry']``.

    Both columns and every input ID are compared as whitespace-stripped strings.
    A summary of the matching, plus diagnostics for misses, is printed.
    """
    wanted = entry_list.split() if isinstance(entry_list, str) else entry_list

    # Normalise both columns to stripped strings before building the lookup.
    entry_ids = df['Entry'].astype(str).str.strip()
    accession_ids = df['Accession'].astype(str).str.strip()
    lookup = dict(zip(entry_ids, accession_ids))

    cleaned = [str(raw).strip() for raw in wanted]
    switched_accessions = [lookup.get(key) for key in cleaned]
    missing = [key for key in cleaned if key not in lookup]

    if missing:
        print(f"Warning: {len(missing)} entries were not found in the DataFrame")
        print(f"First few not found: {missing[:5]}")

        # For the first three misses, look for IDs that contain, or are contained in, the miss.
        known_entries = set(entry_ids)
        for absent in missing[:3]:
            near = [known for known in known_entries if absent in known or known in absent]
            if near:
                print(f"Partial matches for '{absent}': {near[:3]}")

    print(f"Successfully matched: {len(switched_accessions) - len(missing)} out of {len(wanted)}")

    return switched_accessions



In [19]:
import pandas as pd

def debug_entry_matching(entry_list, df):
    """
    Print diagnostics that help explain why IDs fail to match ``df['Entry']``.

    Parameters:
        entry_list (str or list): Whitespace-separated IDs in one string, or a list of IDs.
        df (pd.DataFrame): Table with an 'Entry' column to search.

    Returns:
        tuple: ``(entries, df_entries)`` - the parsed search IDs and
        ``df['Entry']`` as a list.

    Only the first five search IDs are checked, to keep the output short.
    An empty ID list, an empty DataFrame and non-string IDs are reported
    instead of raising.
    """
    # Handle string input by splitting on whitespace
    if isinstance(entry_list, str):
        entries = entry_list.split()
    else:
        entries = entry_list

    print(f"Number of entries to look for: {len(entries)}")
    print(f"First few entries: {entries[:5]}")
    print(f"DataFrame shape: {df.shape}")
    print(f"DataFrame columns: {df.columns.tolist()}")
    print()

    # Check the Entry column
    print("Entry column info:")
    print(f"Entry column dtype: {df['Entry'].dtype}")
    print(f"Number of unique entries in DF: {df['Entry'].nunique()}")
    print(f"First few entries in DF: {df['Entry'].head().tolist()}")
    print()

    # String form of the column, computed once and reused by every check below.
    df_entries = df['Entry'].astype(str).tolist()
    entry_set = set(df_entries)
    df_entries_lower = [(value, value.lower()) for value in df_entries]

    # Check for exact matches
    print("Checking for matches:")
    found_count = 0
    for entry in entries[:5]:  # Check first 5
        entry_clean = str(entry).strip()  # str() so non-string IDs do not raise
        is_found = entry_clean in entry_set
        if is_found:
            found_count += 1
        print(f"Entry '{entry_clean}' found: {is_found}")

        # Only misses need the search for near matches (case differences, substrings).
        if not is_found:
            needle = entry_clean.lower()
            similar = [value for value, lowered in df_entries_lower
                       if needle in lowered or lowered in needle]
            if similar:
                print(f"  Similar entries found: {similar[:3]}")

    print(f"\nTotal found in first 5: {found_count}")

    # Check for any whitespace or special characters
    print("\nChecking for whitespace issues:")
    if len(entries) > 0:
        print(f"Sample search entry repr: {repr(str(entries[0]).strip())}")
    else:
        print("Sample search entry repr: <no entries supplied>")
    if len(df) > 0:
        print(f"Sample DF entry repr: {repr(str(df['Entry'].iloc[0]))}")
    else:
        print("Sample DF entry repr: <DataFrame is empty>")

    return entries, df['Entry'].tolist()

def improved_switch_function(entry_list, df):
    """
    Translate Entry IDs into the Accession values paired with them in ``df``.

    Parameters:
        entry_list (str or list): Whitespace-separated IDs in one string, or a list of IDs.
        df (pd.DataFrame): Table with 'Entry' and 'Accession' columns.

    Returns:
        list: One item per input ID, in input order: the matching Accession,
        or None when the ID is absent from ``df['Entry']``.

    Both columns and every input ID are compared as whitespace-stripped strings.
    A summary of the matching, plus diagnostics for misses, is printed.
    """
    wanted = entry_list.split() if isinstance(entry_list, str) else entry_list

    # Normalise both columns to stripped strings before building the lookup.
    entry_ids = df['Entry'].astype(str).str.strip()
    accession_ids = df['Accession'].astype(str).str.strip()
    lookup = dict(zip(entry_ids, accession_ids))

    cleaned = [str(raw).strip() for raw in wanted]
    switched_accessions = [lookup.get(key) for key in cleaned]
    missing = [key for key in cleaned if key not in lookup]

    if missing:
        print(f"Warning: {len(missing)} entries were not found in the DataFrame")
        print(f"First few not found: {missing[:5]}")

        # For the first three misses, look for IDs that contain, or are contained in, the miss.
        known_entries = set(entry_ids)
        for absent in missing[:3]:
            near = [known for known in known_entries if absent in known or known in absent]
            if near:
                print(f"Partial matches for '{absent}': {near[:3]}")

    print(f"Successfully matched: {len(switched_accessions) - len(missing)} out of {len(wanted)}")

    return switched_accessions

def format_as_space_separated_string(result_list):
    """
    Join the items of ``result_list`` into one space-separated string.

    ``None`` items are skipped; every other item is converted with ``str``.
    """
    return ' '.join(str(value) for value in result_list if value is not None)

def switch_accessions_to_entries(accession_list, df):
    """
    Takes a list of accession IDs and switches them with corresponding Entry values from a DataFrame.

    Parameters:
    accession_list (str or list): Either a string of space-separated accession IDs or a list of accession IDs
    df (pd.DataFrame): DataFrame with 'Accession' and 'Entry' columns

    Returns:
    list: One item per input accession, in input order: the Entry paired with it,
    or None when the accession is absent from ``df['Accession']``.

    Several rows of ``df`` may share one accession. Only the Entry of the LAST such
    row is returned; a warning is printed for every requested accession that has
    more than one distinct Entry, so the loss is visible.
    """
    # Handle string input by splitting on whitespace
    if isinstance(accession_list, str):
        accessions = accession_list.split()
    else:
        accessions = accession_list

    # Compare both columns as whitespace-stripped strings.
    entry_ids = df['Entry'].astype(str).str.strip()
    accession_ids = df['Accession'].astype(str).str.strip()

    accession_to_entry = {}    # accession -> Entry of the last row carrying it
    entries_by_accession = {}  # accession -> every distinct Entry, in first-seen order
    for acc_id, entry_id in zip(accession_ids, entry_ids):
        accession_to_entry[acc_id] = entry_id
        entries_by_accession.setdefault(acc_id, {})[entry_id] = None

    # Switch accessions to entries
    switched_entries = []
    not_found = []
    ambiguous = []

    for acc in accessions:
        acc_clean = str(acc).strip()  # Ensure string and strip whitespace
        if acc_clean in accession_to_entry:
            switched_entries.append(accession_to_entry[acc_clean])
            if len(entries_by_accession[acc_clean]) > 1 and acc_clean not in ambiguous:
                ambiguous.append(acc_clean)
        else:
            not_found.append(acc_clean)
            switched_entries.append(None)

    # Print detailed info about not found entries
    if not_found:
        print(f"Warning: {len(not_found)} accessions were not found in the DataFrame")
        print(f"First few not found: {not_found[:5]}")

        # Try to find partial matches
        df_accessions = set(accession_ids)
        for nf_acc in not_found[:3]:  # Check first 3 not found
            partial_matches = [a for a in df_accessions if nf_acc in a or a in nf_acc]
            if partial_matches:
                print(f"Partial matches for '{nf_acc}': {partial_matches[:3]}")

    # Make the one-to-many collapse explicit instead of silently dropping entries.
    if ambiguous:
        print(f"Warning: {len(ambiguous)} accessions map to more than one Entry; only the last one listed is returned")
        for amb_acc in ambiguous[:3]:
            print(f"All entries for '{amb_acc}': {list(entries_by_accession[amb_acc])}")

    print(f"Successfully matched: {len(switched_entries) - len(not_found)} out of {len(accessions)}")

    return switched_entries

In [46]:
# Whitespace-separated Entry IDs to translate into OrthoMCL accessions.
# Presumably C. auris UniProt entries, since they are looked up in cauris_orth_uniprot - confirm.
entries= 'A0A2H0ZJF3 A0A2H1A814 A0A2H0ZFG6 A0A2H0ZCQ8 A0A2H1A1D4 A0A2H1A814 A0A2H0ZFE5 A0A2H0ZLQ9 A0A2H0ZLR5 A0A2H0ZLX7 A0A2H0ZNF0 A0A2H0ZP51 A0A2H0ZW51 A0A2H1A200 A0A2H1A2I4 A0A2H1A2J4 A0A2H1A6P1 A0A2H1A7A6 A0A5Q7YHZ5 A0A2H0ZLS2'

In [20]:
import pandas as pd

def debug_entry_matching(entry_list, df):
    """
    Print diagnostics that help explain why IDs fail to match ``df['Entry']``.

    Parameters:
        entry_list (str or list): Whitespace-separated IDs in one string, or a list of IDs.
        df (pd.DataFrame): Table with an 'Entry' column to search.

    Returns:
        tuple: ``(entries, df_entries)`` - the parsed search IDs and
        ``df['Entry']`` as a list.

    Only the first five search IDs are checked, to keep the output short.
    An empty ID list, an empty DataFrame and non-string IDs are reported
    instead of raising.
    """
    # Handle string input by splitting on whitespace
    if isinstance(entry_list, str):
        entries = entry_list.split()
    else:
        entries = entry_list

    print(f"Number of entries to look for: {len(entries)}")
    print(f"First few entries: {entries[:5]}")
    print(f"DataFrame shape: {df.shape}")
    print(f"DataFrame columns: {df.columns.tolist()}")
    print()

    # Check the Entry column
    print("Entry column info:")
    print(f"Entry column dtype: {df['Entry'].dtype}")
    print(f"Number of unique entries in DF: {df['Entry'].nunique()}")
    print(f"First few entries in DF: {df['Entry'].head().tolist()}")
    print()

    # String form of the column, computed once and reused by every check below.
    df_entries = df['Entry'].astype(str).tolist()
    entry_set = set(df_entries)
    df_entries_lower = [(value, value.lower()) for value in df_entries]

    # Check for exact matches
    print("Checking for matches:")
    found_count = 0
    for entry in entries[:5]:  # Check first 5
        entry_clean = str(entry).strip()  # str() so non-string IDs do not raise
        is_found = entry_clean in entry_set
        if is_found:
            found_count += 1
        print(f"Entry '{entry_clean}' found: {is_found}")

        # Only misses need the search for near matches (case differences, substrings).
        if not is_found:
            needle = entry_clean.lower()
            similar = [value for value, lowered in df_entries_lower
                       if needle in lowered or lowered in needle]
            if similar:
                print(f"  Similar entries found: {similar[:3]}")

    print(f"\nTotal found in first 5: {found_count}")

    # Check for any whitespace or special characters
    print("\nChecking for whitespace issues:")
    if len(entries) > 0:
        print(f"Sample search entry repr: {repr(str(entries[0]).strip())}")
    else:
        print("Sample search entry repr: <no entries supplied>")
    if len(df) > 0:
        print(f"Sample DF entry repr: {repr(str(df['Entry'].iloc[0]))}")
    else:
        print("Sample DF entry repr: <DataFrame is empty>")

    return entries, df['Entry'].tolist()

def improved_switch_function(entry_list, df):
    """
    Translate Entry IDs into the Accession values paired with them in ``df``.

    Parameters:
        entry_list (str or list): Whitespace-separated IDs in one string, or a list of IDs.
        df (pd.DataFrame): Table with 'Entry' and 'Accession' columns.

    Returns:
        list: One item per input ID, in input order: the matching Accession,
        or None when the ID is absent from ``df['Entry']``.

    Both columns and every input ID are compared as whitespace-stripped strings.
    A summary of the matching, plus diagnostics for misses, is printed.
    """
    wanted = entry_list.split() if isinstance(entry_list, str) else entry_list

    # Normalise both columns to stripped strings before building the lookup.
    entry_ids = df['Entry'].astype(str).str.strip()
    accession_ids = df['Accession'].astype(str).str.strip()
    lookup = dict(zip(entry_ids, accession_ids))

    cleaned = [str(raw).strip() for raw in wanted]
    switched_accessions = [lookup.get(key) for key in cleaned]
    missing = [key for key in cleaned if key not in lookup]

    if missing:
        print(f"Warning: {len(missing)} entries were not found in the DataFrame")
        print(f"First few not found: {missing[:5]}")

        # For the first three misses, look for IDs that contain, or are contained in, the miss.
        known_entries = set(entry_ids)
        for absent in missing[:3]:
            near = [known for known in known_entries if absent in known or known in absent]
            if near:
                print(f"Partial matches for '{absent}': {near[:3]}")

    print(f"Successfully matched: {len(switched_accessions) - len(missing)} out of {len(wanted)}")

    return switched_accessions


In [21]:
def format_as_space_separated_string(result_list):
    """
    Join the items of ``result_list`` into one space-separated string.

    ``None`` items are skipped; every other item is converted with ``str``.
    """
    return ' '.join(str(value) for value in result_list if value is not None)

In [22]:
# Translate the Entry IDs into OrthoMCL accessions. The result is bound to
# `switch` because the following cell formats that name; without the assignment
# a fresh Restart & Run All stops there with a NameError.
switch = improved_switch_function(entries, cauris_orth_uniprot)
switch

NameError: name 'entries' is not defined

In [49]:
# Show the converted accessions as one space-separated string (None items are left out).
format_as_space_separated_string(switch)

'OG6_101773 OG6_102249 OG6_102343 OG6_100190 OG6_100210 OG6_102249 OG6_102180 OG6_102817 OG6_102459 OG6_101824 OG6_100210 OG6_500269 OG6_102056 OG6_100691 OG6_103939 OG6_101831 OG6_100190 OG6_101548 OG6_103317 OG6r20_110715'

In [23]:
# Whitespace-separated OrthoMCL accessions to translate back into Entry IDs.
listacc= 'OG6_100420 OG6_101400 OG6_101074 OG6_102545 OG6_103234 OG6_101821 OG6_100420 OG6_101226 OG6_102463 OG6_129058'

In [28]:
# Look up the Entry paired with each accession in the C. auris table.
switch1=switch_accessions_to_entries(listacc, cauris_orth_uniprot)

Successfully matched: 10 out of 10


In [29]:
# Show the converted Entry IDs as one space-separated string (None items are left out).
format_as_space_separated_string(switch1)

'A0A2H1A3V1 A0A2H0ZHD5 A0A2H1A1V9 A0A2H0ZIB8 A0A2H0ZRD4 A0A2H0ZE44 A0A2H1A3V1 A0A2H0ZNV5 A0A2H0ZME1 A0A2H1A7K3'