In [1]:
from collections import defaultdict
from Bio import Entrez, SeqIO, pairwise2
from IPython.display import Markdown
import requests
import time
import json
import re


#### A workflow to find the protein ID 'WP_XXXXXXXXX' of a specific gene (e.g. 'lysA')

**Input:** a list of **genes** to find their corresponding protein IDs

**Output:** a json of the inputed genes and their protein IDs (if found, else None)


### Steps of the algorithm:
1. Building a local list **protein_id_to_gene_map** (.json file) containing all the published genes of **C. glutamicum ATCC 13032** in RefSeq database.
2. Searching through protein_id_to_gene_map for matches based on **gene name**

If not found

3. Collecting all the gene names from homologous C. glutamicum (12 strains available in KEGG)
4. Searching through them for the gene name, when found, quering its amino acid sequence
5. Searching through protein_id_to_gene_map for matches based on **AA sequence** homology


### Getting the gene data from the RefSeq database based on its protein ID

Fetching every protein Id in **gene_to_reactions** and returning the data for its gene: gene_name, product, organism, sequence.

The data is stored locally in **protein_id_to_gene_map** for future use, avoiding searching every time.

### Serching the protein ID of a known gene 
The locally stored **protein_id_to_gene_map** are used to search for the **gene name** or its **sequence**

### 1. Direct search through the collected list of annoctated genes:
### 2. Searching through AA sequence homology:

In [2]:
from cobra.io import read_sbml_model

model = read_sbml_model('iCGB21FR.xml')

SBML package 'layout' not supported by cobrapy, information is not parsed
https://juser.fz-juelich.de/record/188973 does not conform to 'http(s)://identifiers.org/collection/id' or'http(s)://identifiers.org/COLLECTION:id


In [81]:
# 1. Direct search through the collected list of annoctated genes:¶

def printBold(text1, text2=""):
    if len(text2)>0:
        display(Markdown(f"{text1} **{text2}**"))
    else:
        display(Markdown(f"{text1}"))


def getFullGeneName(protein_id):
    for gene in model.genes:
        if protein_id in gene.id:
            return gene
        
def getProteinID(full_gene):
    return "WP_" + re.search(r"WP_(\d{9})", full_gene).group(1) if re.search(r"WP_(\d{9})", full_gene) else None

def fetchProteinID(protein_id, max_retries=50, delay_between_retries=1):
    for attempt in range(max_retries):
        try:
            Entrez.email = "h.bensaadi@gmail.com"

            handle = Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text")
            record = SeqIO.read(handle, "genbank")
            handle.close()

            gene_name, product, organism, sequence = None, None, None, None

            for feature in record.features:
                if 'gene' in feature.qualifiers:
                    gene_name = feature.qualifiers['gene'][0]
                if 'product' in feature.qualifiers:
                    product = feature.qualifiers['product'][0]
                if 'organism' in feature.qualifiers:
                    organism = feature.qualifiers['organism'][0]

            sequence = str(record.seq)

            return protein_id, gene_name, product, organism, sequence

        except Entrez.HTTPError as e:
            if attempt < max_retries - 1:
                print(f"{protein_id}: retrying in {delay_between_retries} seconds...")
                time.sleep(delay_between_retries)
            else:
                print(f"Max retries reached. Unable to fetch data for {protein_id}")
                raise
        except Exception as e:
            print(f"Error: {e}")
            raise

def loadProteinIDToGeneMap():
    try:
        with open("protein_id_to_gene_map.json", "r") as file:
            protein_id_to_gene_map = json.load(file)
    except FileNotFoundError:
        protein_id_to_gene_map = {}
    
    return protein_id_to_gene_map
    
def updateProteinIDToGeneMap(protein_id_to_gene_map):
    with open("protein_id_to_gene_map.json", "w") as file:
        json.dump(protein_id_to_gene_map, file)
           
            
def getNumberOfUniqueGenes():
    
    unique_protein_ids = []
    
    for gene in model.genes:
        if getProteinID(gene.id) not in unique_protein_ids:
            unique_protein_ids.append(getProteinID(gene.id))
    return len(unique_protein_ids)
    
def mapProteinIDtoGene():
    global protein_id_to_gene_map
    
    printBold(f"Total number of unique genes in the model is: ", f"{getNumberOfUniqueGenes()} genes")
    
    protein_id_to_gene_map = loadProteinIDToGeneMap()

    if len(protein_id_to_gene_map) < 1:
        printBold(f"\nFetching RefSeq data for C. glutamicum ATCC 13032: ", f"")        

        count_found_genes = 0
        for gene in model.genes:
            if "WP_" in gene.id:
                protein_id = getProteinID(gene.id)

                if protein_id not in protein_id_to_gene_map:
                    protein_id, gene_name, product, organism, sequence = fetchProteinID(protein_id)
                    protein_id_to_gene_map[protein_id] = [gene_name, product, organism, sequence]

                    updateProteinIDToGeneMap(protein_id_to_gene_map)

                    count_found_genes +=1

                    print(f"\n{protein_id}, {gene_name}, {product}, {organism}, {sequence}")

        printBold(f"Total number of fetched genes online is: ", f"{count_found_genes}")
    
    else:
        printBold(f"\nRefSeq data from  C. glutamicum ATCC 13032 is already stored locally: ", f"{len(protein_id_to_gene_map)+1} genes")        

# 2. Searching through AA sequence homology:

def calculate_similarity(seq1, seq2):
    alignments = pairwise2.align.globalxx(seq1, seq2, one_alignment_only=True)
    if alignments:
        alignment = alignments[0]
        alignment_length = max(len(alignment[0]), len(alignment[1]))
        similarity = alignment[2] / alignment_length
        return similarity
    return 0

def searchGene(gene=None, reaction=None, seq=None, similarity_threshold=0.80):
    global protein_id_to_gene_map
    
    matches = []
    
    for protein_id, data in protein_id_to_gene_map.items():
        gene_name, product, organism, sequence = data
        
        if gene and gene_name and gene.lower() in gene_name.lower():
            matches.append((protein_id, data))
        
        if reaction and product and reaction.lower() in product.lower():
            matches.append((protein_id, data))
        
        if seq and sequence:
            similarity = calculate_similarity(seq.lower(), sequence.lower())            
            if similarity >= similarity_threshold:
                matches.append((protein_id, f"Similarity: {round(similarity*100, 2)}%", data))
    
    if gene and len(matches)>0:
        text_1_plural = "matches" if len(matches) != 1 else "match"
        text_2_plural = "were" if len(matches) != 1 else "was"
        
        printBold(f"Searching by gene name: ", f"{len(matches)} {text_1_plural} {text_2_plural} found for {gene}")
    
        return matches

def getDataFromProtreinID(searched_protein_id):
    protein_id_to_gene_map = loadMapFile()
    for protein_id, data in protein_id_to_gene_map.items():
        if searched_protein_id == protein_id:
            return data

def getGeneNameFromProteinID(searched_protein_id):
    for gene, reactions in list(gene_to_reactions.items()):
        if searched_protein_id == getProteinID(gene):
            return gene
    
def getReactionsFromProteinID(searched_protein_id):
    for gene, reactions in list(gene_to_reactions.items()):
        if searched_protein_id == getProteinID(gene):
            return reactions

        
def loadDataFromTaxonomySearch():
    try:
        with open("data_from_taxonomy_search.json", "r") as file:
            data_from_taxonomy_search = json.load(file)
    except FileNotFoundError:
        data_from_taxonomy_search = []
    return data_from_taxonomy_search
    
def saveDataFromTaxonomySearch(data_from_taxonomy_search):
    with open("data_from_taxonomy_search.json", "w") as file:
        json.dump(data_from_taxonomy_search, file)
        
def getDataBasedOnTaxonomy():
    global data_from_taxonomy_search
    taxonomy_codes = {
        "cgl": "Corynebacterium glutamicum ATCC 13032 (Kyowa Hakko)",
        "cgb": "Corynebacterium glutamicum ATCC 13032 (Bielefeld)",
        "cgu": "Corynebacterium glutamicum K051",
        "cgt": "Corynebacterium glutamicum R",
        "cgs": "Corynebacterium glutamicum SCgG1",
        "cgg": "Corynebacterium glutamicum SCgG2",
        "cgm": "Corynebacterium glutamicum MB001",
        "cgj": "Corynebacterium glutamicum ATCC 21831",
        "cgq": "Corynebacterium glutamicum AR1",
        "cgx": "Corynebacterium glutamicum B253"}

    data_from_taxonomy_search = loadDataFromTaxonomySearch()
    
    if len(data_from_taxonomy_search) < 1:
        print(f"Fetching KEGG data for {len(taxonomy_codes)} C. glutamicum strain: \n")
        
        for taxonomy_code in taxonomy_codes:
            search_url = f"http://rest.kegg.jp/find/genes/{taxonomy_code}"

            try:
                print(f"Getting data for {taxonomy_codes[taxonomy_code]}")
                response = requests.get(search_url)
                if response.status_code == 200:
                    data_from_taxonomy_search.append(response.text.split('\n'))
                else:
                    print(f"Error: {response.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"Request Exception: {e}")

        saveDataFromTaxonomySearch(data_from_taxonomy_search)
    else:
        printBold(f"\nKEGG data from other C. glutamicum strains is already stored locally: ", f"{len(data_from_taxonomy_search)} strains")
            
def getAminoAcidsSequence(gene_id):
    response = requests.get(f"http://rest.kegg.jp/get/{gene_id}")
    
    if response.status_code == 200:        
        lines = response.text.split('\n')
        amino_acid_sequence = ""
        is_aa_sequence_section = False

        for line in lines:
            if line.startswith("AASEQ"):
                is_aa_sequence_section = True
            elif line.startswith("NTSEQ"):
                is_aa_sequence_section = False
            elif is_aa_sequence_section and line.strip(): 
                amino_acid_sequence += line.strip()
        return amino_acid_sequence

    else:
        return f"Error: {response.status_code}"
    
    
def getMatchedGenesByKEGGIdentifier():
    global data_from_taxonomy_search
    global genes_not_found
    
    found_homologous_genes = {}
    
    for gene_name_to_search in genes_not_found:
        for data in data_from_taxonomy_search:
            for entry in data:
                if gene_name_to_search in entry:
                    if gene_name_to_search not in found_homologous_genes:
                        found_homologous_genes[gene_name_to_search] = [entry]
                    else:
                        found_homologous_genes[gene_name_to_search].append(entry)

        print(f"Homologous genes for {gene_name_to_search}: {len(found_homologous_genes[gene_name_to_search])} results")

    return found_homologous_genes


def fetchAAsequenceOfHomologousGenes(found_homologous_genes):
    
    aa_sequence_results = {}
    
    print("\nFetching AA sequence for the homologous genes: \n")

    for gene_name, entries in list(found_homologous_genes.items()):
        for entry in entries:
            gene_id_kegg = entry.split("; ")[0].split("\t")[0]
            gene_name_kegg = entry.split("; ")[0].split("\t")[1]
            try:
                gene_description_kegg = entry.split("; ")[1]
            except:
                pass

            aa_seq = getAminoAcidsSequence(gene_id_kegg)

            print(f"{gene_name_kegg} ({gene_id_kegg}): {aa_seq[:25]}...")

            if gene_name not in aa_sequence_results:
                aa_sequence_results[gene_name] = [{"kegg_id": gene_id_kegg, "aa_seq": aa_seq}]
            else:
                aa_sequence_results[gene_name].append({"kegg_id": gene_id_kegg, "aa_seq": aa_seq})

    return aa_sequence_results

In [None]:
genes_to_find = ["lysC", "ddh", "fbp", "tkt"]
found_genes_by_gene_name = []
found_genes_by_aa_sequence = {}
matches = []

mapProteinIDtoGene()

# 1. searching by gene_name
for gene_name in genes_to_find:
    match_results = searchGene(gene=gene_name)    
    if match_results:
        found_genes_by_gene_name.append(gene_name)
        matches.append(match_results)

# 2. searching by AA sequence homolgy

getDataBasedOnTaxonomy()

genes_not_found = list(set(genes_to_find) - set(found_genes_by_gene_name))

found_homologous_genes = getMatchedGenesByKEGGIdentifier()
aa_sequence_results = fetchAAsequenceOfHomologousGenes(found_homologous_genes)

print("\nSearching for AA sequence homolgy against genes from C. glutamicum ATCC 13032:\n")

for gene_id, data in list(aa_sequence_results.items()):    
    for element in data:
        print(f'Searching {gene_id} -> {element["kegg_id"]}: \n')
        match_results = searchGene(seq= element["aa_seq"], similarity_threshold=0.65)
        if match_results:
            for match in match_results:
                print(match)


Total number of unique genes in the model is:  **786 genes**


RefSeq data from  C. glutamicum ATCC 13032 is already stored locally:  **786 genes**

Searching by gene name:  **1 match was found for tkt**


KEGG data from other C. glutamicum strains is already stored locally:  **10 strains**

Homologous genes for lysC: 2 results
Homologous genes for ddh: 2 results
Homologous genes for fbp: 11 results

Fetching AA sequence for the homologous genes: 

lysC (cgb:cg0306): MALVVQKYGGSSLESAERIRNVAER...
lysC (cgf:CGUA_00995): MALIVQKYGGSSLESAERIRAVAER...
ddh (cgf:CGUA_11365): MAEAKIRAAIVGYGNLGRSAEKLLA...
Obg family GTPase CgtA (ddh:Desde_3708): MFYDQAKIYVKGGDGGAGAVAFRRE...
fbpA1 (cgf:CGUA_01390): MKLLRKLAAPAAIAIALGAGMSGVA...
fbpA2 (cgf:CGUA_05650): MNRLPTRAGAVLTAVTVASLGVSGA...
fbpA3 (cgf:CGUA_05655): MNTRTTVRYAAALVVAAVAAAPFPA...
fbpA4 (cgf:CGUA_05670): MSPTPRRLAATAAALAVLANAFPAA...
fbpA5 (cgf:CGUA_09355): MSRFTHRSLPSVLALGASAFLLAAC...
fbpA6 (cgf:CGUA_10100): MRRSLTTVIATAVLAAAVTPAAAAQ...
fbpC (cgf:CGUA_11260): MSTTRVLAVTLAAVLLAWPSAGAQA...
fbpA7 (cgf:CGUA_12390): MRDTASSSRSPKNVPNLRRRGAALI...
fbpB (cgf:CGUA_12395): MMSATAGLRRIHTTILALIVAVATA...
Ferric transport system permease protein fbpB (hiq:CGSHiGG_03230): MNAKKLSIMHSAYFWIILSLLAFAL...
fbpC (hiq:CGSHiGG_03235): MKFNKISLSVSTALLAAGLAVS

## 3. Manual search of "fbp": 

In [None]:
for metabolite in model.metabolites.query("D-Fructose 1-phosphate", "name"):
    print(f"Metabolite ID for D-Fructose 1-phosphate is: {metabolite}")
    print("The reactions involving D-Fructose 1-phosphate  are:")
    for reaction in model.metabolites.f1p_c.reactions:
        print(f"\t- {reaction.id} -> {reaction.reaction}")

In [None]:
model.reactions.FRUK 

# Summary:

**Found through direct search:**

- dapB = 'WP_011014794'
- lysA = 'WP_011014180'
- tkt = 'WP_011014456'

**Found through AA sequence homology search:**

- lysC = 'WP_003855724', 'Similarity: 100.0%'
- ddh  = 'WP_011015446', 'Similarity: 66.49%'
- pck  = 'WP_011013816', 'Similarity: 100.0%'
- pyc  = 'WP_011013816', 'Similarity: 100.0%'
- icd  = 'WP_011013800', 'Similarity: 100.0%'
- hom  = 'WP_003854900', 'Similarity: 100.0%'

**Found through manual search:**

- fbp  = 'WP_011014765'


In [None]:
genes = {
    'dapB':'WP_011014794',
    'lysA': 'WP_011014180',
    'tkt': 'WP_011014456',
    'lysC': 'WP_003855724',
    'ddh' : 'WP_011015446',
    'pck': 'WP_011013816',
    'pyc' : 'WP_011013816', 
    'icd': 'WP_011013800',
    'hom' : 'WP_003854900', 
    'fbp' : 'WP_011014765'
}