Parses `raw_data/yeast_uniprot_proteins.fasta` and builds a dictionary that maps each gene symbol to its UniProt ID (e.g. `'CYC1': 'P00044'`).

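Then goes through the annotation datasets and adds a `#Uniprot_ID` column holding the UniProt ID looked up from each row's `#Gene_Ref` value (`-` when the gene symbol is not in the dictionary).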

In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import re

# Maps gene symbol -> UniProt accession, e.g. 'CYC1': 'P00044'.
# If several FASTA records share a gene symbol, the last one parsed wins.
symbol_names_dict = dict()

# The gene symbol is the run of non-space characters after "GN=" (e.g. GN=TFC3).
# \S+ also matches when GN= is the final field of the header, which a pattern
# requiring a trailing space would silently skip.
gene_symbol_pattern = re.compile(r"GN=(\S+)")
# The accession is the text between the first pair of pipes (e.g. |P34111|).
# [^|]+ stops at the second pipe even if the description contains more pipes.
accession_pattern = re.compile(r"\|([^|]+)\|")

for record in SeqIO.parse("raw_data/yeast_uniprot_proteins.fasta", 'fasta'):
    symbol_match = gene_symbol_pattern.search(record.description)
    accession_match = accession_pattern.search(record.description)

    # Records missing either field are skipped rather than stored half-empty.
    if symbol_match and accession_match:
        symbol_names_dict[symbol_match.group(1)] = accession_match.group(1)

In [2]:
def find_uniprot_ids(df, symbol_names_dict, dataframe_gene_ref_col_name="#Gene_Ref"):
    """Look up a UniProt ID for every gene reference in one column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table. Only the column named by
        ``dataframe_gene_ref_col_name`` is read; ``df`` is not modified.
        Any mapping of column name -> iterable of values works as well.
    symbol_names_dict : dict
        Mapping of gene symbol -> UniProt ID.
    dataframe_gene_ref_col_name : str, optional
        Name of the column holding the gene symbols (default ``"#Gene_Ref"``).

    Returns
    -------
    list
        One entry per row, in row order: the mapped UniProt ID, or ``"-"``
        when the row's gene symbol is not a key of ``symbol_names_dict``.

    Also prints how many symbols were found and how many were not.
    """
    refs = list(df[dataframe_gene_ref_col_name])

    # "-" is the placeholder for symbols with no known UniProt ID.
    uniprot_ids = [symbol_names_dict.get(ref, "-") for ref in refs]

    # Count by key membership (not by comparing against "-") so the tally
    # stays correct whatever values the dictionary holds.
    count = sum(1 for ref in refs if ref in symbol_names_dict)
    total = len(refs)

    print("Found", count, "uniprot ids out of", total)
    print("Lost", total - count, "examples")

    return uniprot_ids
    

import pandas as pd
# Columns that should come first in the written tables, in this order.
LEADING_COLUMNS = ["#DEG_AC", "#Gene_Name", "#Gene_Ref", "#Uniprot_ID"]


def _with_uniprot_column(df, uniprot_ids):
    """Return a copy of ``df`` with a ``#Uniprot_ID`` column, leading columns first.

    ``uniprot_ids`` must hold one value per row of ``df``, in row order.
    The remaining columns keep their original relative order.
    """
    annotated = df.copy()
    # Assign the plain list so values are placed by position; wrapping it in a
    # pd.Series would align on index labels and only works for a 0..n-1 index.
    annotated["#Uniprot_ID"] = uniprot_ids
    trailing_columns = [c for c in annotated.columns if c not in LEADING_COLUMNS]
    return annotated[LEADING_COLUMNS + trailing_columns]


ess_df = pd.read_csv("processed_ess/degannotation-e.dat", sep="\t")
ness_df = pd.read_csv("processed_ness/degannotation-ne.dat", sep="\t")

# Each call prints its own found/lost summary (first ess, then ness).
ess_ids = find_uniprot_ids(ess_df, symbol_names_dict)
ness_ids = find_uniprot_ids(ness_df, symbol_names_dict)

# Add the new column and re-order columns
ess_df = _with_uniprot_column(ess_df, ess_ids)
ness_df = _with_uniprot_column(ness_df, ness_ids)

ess_df.to_csv("processed_ess/degannotation-e-uniprot.dat", sep="\t", index=False)
ness_df.to_csv("processed_ness/degannotation-ne-uniprot.dat", sep="\t", index=False)

Found 1092 uniprot ids out of 1110
Lost 18 examples
Found 3953 uniprot ids out of 4020
Lost 67 examples
