# Translate gene IDs

## Mygene

In [4]:
import mygene

# Initialize the MyGene.info client
mg = mygene.MyGeneInfo()

# Query example: Convert Ensembl Gene IDs to other types
query = ['ENSG00000141510', 'ENSG00000139618']  # Example Ensembl Gene IDs (TP53, BRCA2)

# Query MyGene.info for mappings
print("Querying MyGene.info...")
results = mg.querymany(
    query, 
    scopes='ensembl.gene', 
    fields='symbol,entrezgene,uniprot', 
    species='human'
)

# Display the results
print("\nResults from MyGene.info:")
for res in results:
    print(f"Ensembl ID: {res.get('query')}")
    print(f"  Symbol: {res.get('symbol')}")
    print(f"  Entrez Gene ID: {res.get('entrezgene')}")
    print(f"  UniProt ID: {res.get('uniprot', {}).get('Swiss-Prot')}")
    print("-" * 40)


Querying MyGene.info...

Results from MyGene.info:
Ensembl ID: ENSG00000141510
  Symbol: TP53
  Entrez Gene ID: 7157
  UniProt ID: P04637
----------------------------------------
Ensembl ID: ENSG00000139618
  Symbol: BRCA2
  Entrez Gene ID: 675
  UniProt ID: P51587
----------------------------------------


### Example with adata file

In [5]:
import pandas as pd
import numpy as np
import anndata as ad
import mygene

In [8]:
# Load the adata file
adata = ad.read_h5ad("/storage/users/data/PANC/H5AD_file/adata_filtered_no2D_hvg_clust_time_pub.h5ad")
adata.var_names

Index(['ENSG00000187608', 'ENSG00000224969', 'ENSG00000184163',
       'ENSG00000142609', 'ENSG00000287586', 'ENSG00000049246',
       'ENSG00000238290', 'ENSG00000204624', 'ENSG00000117122',
       'ENSG00000117115',
       ...
       'ENSG00000013619', 'ENSG00000029993', 'ENSG00000011677',
       'ENSG00000198883', 'ENSG00000182492', 'ENSG00000067842',
       'ENSG00000130821', 'ENSG00000198910', 'ENSG00000099721',
       'ENSG00000198899'],
      dtype='object', name='feature_id', length=1992)

In [9]:
# Extract Ensembl Gene IDs
ensembl_ids = adata.var_names.tolist()

# Initialize MyGene.info client
mg = mygene.MyGeneInfo()

# Query MyGene.info for mappings
print("Querying MyGene.info...")
results = mg.querymany(
    ensembl_ids,
    scopes="ensembl.gene",
    fields="symbol,entrezgene,uniprot",
    species="human"
)

# Create dictionaries for mappings
ensembl_to_symbol = {}
ensembl_to_entrez = {}
ensembl_to_uniprot = {}

# Populate dictionaries
print("Processing results...")
for res in results:
    ensembl_id = res.get("query")
    if "notfound" in res:
        continue  # Skip if not found
    if ensembl_id:
        ensembl_to_symbol[ensembl_id] = res.get("symbol")
        ensembl_to_entrez[ensembl_id] = res.get("entrezgene")
        uniprot = res.get("uniprot", {}).get("Swiss-Prot")  # Get Swiss-Prot ID
        ensembl_to_uniprot[ensembl_id] = uniprot

# Display dictionary samples
print("\nSample mappings:")
print("Ensembl to Gene Symbol:", list(ensembl_to_symbol.items())[:15])
print("Ensembl to Entrez Gene ID:", list(ensembl_to_entrez.items())[:15])
print("Ensembl to UniProt ID:", list(ensembl_to_uniprot.items())[:15])


Querying MyGene.info...


3 input query terms found dup hits:	[('ENSG00000227110', 2), ('ENSG00000226519', 2), ('ENSG00000262352', 3)]
34 input query terms found no hit:	['ENSG00000286863', 'ENSG00000275557', 'ENSG00000231877', 'ENSG00000233005', 'ENSG00000237838', 'ENS


Processing results...

Sample mappings:
Ensembl to Gene Symbol: [('ENSG00000187608', 'ISG15'), ('ENSG00000224969', None), ('ENSG00000184163', 'C1QTNF12'), ('ENSG00000142609', 'CFAP74'), ('ENSG00000287586', None)]
Ensembl to Entrez Gene ID: [('ENSG00000187608', '9636'), ('ENSG00000224969', None), ('ENSG00000184163', '388581'), ('ENSG00000142609', '85452'), ('ENSG00000287586', None)]
Ensembl to UniProt ID: [('ENSG00000187608', 'P05161'), ('ENSG00000224969', None), ('ENSG00000184163', 'Q5T7M4'), ('ENSG00000142609', 'Q9C0B2'), ('ENSG00000287586', None)]


In [13]:
# Extract Ensembl Gene IDs
ensembl_ids = adata.var_names.tolist()

# Initialize MyGene.info client
mg = mygene.MyGeneInfo()

# Query MyGene.info for mappings
print("Querying MyGene.info...")
results = mg.querymany(
    ensembl_ids,
    scopes="ensembl.gene",
    fields="symbol,entrezgene,uniprot",
    species="human"
)

# Create dictionaries for mappings
ensembl_to_symbol = {}
ensembl_to_entrez = {}
ensembl_to_uniprot = {}

# Counters for matched and unmatched IDs
matched_count = 0
unmatched_count = 0

# Populate dictionaries
print("Processing results...")
for res in results:
    ensembl_id = res.get("query")
    if "notfound" in res:
        # If not found, use Ensembl ID as a fallback
        ensembl_to_symbol[ensembl_id] = ensembl_id
        ensembl_to_entrez[ensembl_id] = ensembl_id
        ensembl_to_uniprot[ensembl_id] = ensembl_id
        unmatched_count += 1
        continue

    # Populate dictionaries with retrieved or fallback values
    ensembl_to_symbol[ensembl_id] = res.get("symbol", ensembl_id)
    ensembl_to_entrez[ensembl_id] = res.get("entrezgene", ensembl_id)
    uniprot = res.get("uniprot", {}).get("Swiss-Prot", ensembl_id)
    ensembl_to_uniprot[ensembl_id] = uniprot
    matched_count += 1

# Display dictionary samples
print("\nSample mappings:")
print("Ensembl to Gene Symbol:", list(ensembl_to_symbol.items())[:5])
print("Ensembl to Entrez Gene ID:", list(ensembl_to_entrez.items())[:5])
print("Ensembl to UniProt ID:", list(ensembl_to_uniprot.items())[:5])

# Print summary of matches
total_ids = len(ensembl_ids)
print(f"\nSummary:")
print(f"Total Ensembl IDs: {total_ids}")
print(f"Matched IDs: {matched_count}")
print(f"Unmatched IDs: {unmatched_count}")


Querying MyGene.info...


3 input query terms found dup hits:	[('ENSG00000227110', 2), ('ENSG00000226519', 2), ('ENSG00000262352', 3)]
34 input query terms found no hit:	['ENSG00000286863', 'ENSG00000275557', 'ENSG00000231877', 'ENSG00000233005', 'ENSG00000237838', 'ENS


Processing results...

Sample mappings:
Ensembl to Gene Symbol: [('ENSG00000187608', 'ISG15'), ('ENSG00000224969', 'ENSG00000224969'), ('ENSG00000184163', 'C1QTNF12'), ('ENSG00000142609', 'CFAP74'), ('ENSG00000287586', 'ENSG00000287586')]
Ensembl to Entrez Gene ID: [('ENSG00000187608', '9636'), ('ENSG00000224969', 'ENSG00000224969'), ('ENSG00000184163', '388581'), ('ENSG00000142609', '85452'), ('ENSG00000287586', 'ENSG00000287586')]
Ensembl to UniProt ID: [('ENSG00000187608', 'P05161'), ('ENSG00000224969', 'ENSG00000224969'), ('ENSG00000184163', 'Q5T7M4'), ('ENSG00000142609', 'Q9C0B2'), ('ENSG00000287586', 'ENSG00000287586')]

Summary:
Total Ensembl IDs: 1992
Matched IDs: 1962
Unmatched IDs: 34


## Other ID conversion tools

### Entrez

In [2]:
from Bio import Entrez
# Set your email for NCBI Entrez queries
Entrez.email = "your_email@example.com"

# Function to search for a gene using NCBI Entrez
def search_entrez_gene(term):
    print(f"Searching Entrez for term: {term}")
    handle = Entrez.esearch(db="gene", term=term, retmax=5)
    record = Entrez.read(handle)
    handle.close()
    return record["IdList"]

# Search for TP53 gene in humans
entrez_ids = search_entrez_gene("TP53[Gene Name] AND human[Organism]")
print(f"Entrez Gene IDs for TP53: {entrez_ids}")

Searching Entrez for term: TP53[Gene Name] AND human[Organism]
Entrez Gene IDs for TP53: ['7157']


### Expasy and Swissport 

In [3]:
from Bio import ExPASy
from Bio import SwissProt


# Using ExPASy and SwissProt to fetch protein details
def fetch_swissprot_record(uniprot_id):
    print(f"\nFetching SwissProt record for UniProt ID: {uniprot_id}")
    try:
        handle = ExPASy.get_sprot_raw(uniprot_id)
        record = SwissProt.read(handle)
        handle.close()
        return record
    except Exception as e:
        print(f"Error fetching record: {e}")
        return None

# Example UniProt ID for TP53
uniprot_id = "P04637"  # UniProt ID for TP53
record = fetch_swissprot_record(uniprot_id)
if record:
    print(f"Description: {record.description}")
    print(f"Gene Names: {record.gene_name}")
    print(f"Organism: {record.organism}")
    print(f"Keywords: {record.keywords}")





Fetching SwissProt record for UniProt ID: P04637
Description: RecName: Full=Cellular tumor antigen p53; AltName: Full=Antigen NY-CO-13; AltName: Full=Phosphoprotein p53; AltName: Full=Tumor suppressor p53;
Gene Names: [{'Name': 'TP53', 'Synonyms': ['P53']}]
Organism: Homo sapiens (Human).
Keywords: ['3D-structure', 'Acetylation', 'Activator', 'Alternative promoter usage', 'Alternative splicing', 'Apoptosis', 'Biological rhythms', 'Cell cycle', 'Cytoplasm', 'Cytoskeleton', 'Direct protein sequencing', 'Disease variant', 'DNA-binding', 'Endoplasmic reticulum', 'Glycoprotein', 'Host-virus interaction', 'Isopeptide bond', 'Li-Fraumeni syndrome', 'Metal-binding', 'Methylation', 'Mitochondrion', 'Necrosis', 'Nucleus', 'Phosphoprotein', 'Proteomics identification', 'Reference proteome', 'Repressor', 'Transcription', 'Transcription regulation', 'Tumor suppressor', 'Ubl conjugation', 'Zinc']
