In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Link for 1 article with many genes from PubTator. https://drive.google.com/file/d/1IKBb1XsG_PCLUCf8100GUtThi31SqxtF/view?usp=drive_link

# For the GO relation (Local Annotation)
ex) alpha-synuclein |	SNCA | enables | GO:0000149 | IDA

In [72]:
import pandas as pd

def load_pubtator_genes(filename):
    """Load genes from a PubTator3 file, extracting NCBI Gene IDs and names."""
    genes = {}
    gene_symbols = {}
    with open(filename, 'r') as file:
        for line in file:
            parts = line.strip().split()  # Splitting on whitespace (space or tab)
            if len(parts) >= 5 and parts[1] == "Gene":  # Adjusted for correct indexing
                ncbi_id = parts[2]
                gene_name = " ".join(parts[3:-1]) if len(parts) > 4 else parts[3]  # Handles both multi-word and single-word names
                gene_symbol = parts[3].strip()
                genes[ncbi_id] = gene_name
                gene_symbols[gene_name.lower()] = gene_symbol  # Map gene name to symbol in lowercase

    print("Loaded genes:", genes)  # Debugging
    print("gene symbols: ", gene_symbols)
    return genes, gene_symbols


def load_go_annotations(goa_filename, gene_symbols):
    """Load GO annotations for a set of gene symbols from the GOA file."""
    go_annotations = set()
    gene_symbol_set = set(gene_symbols.values())
    with open(goa_filename, 'r') as file:
        for line in file:
            if line.startswith("#"):
                continue  # Skip comment lines
            parts = line.strip().split('\t')

            # Search for exact gene symbol match in the parts of the line
            for symbol in gene_symbol_set:
                # Check if any part of the line exactly matches the gene symbol (case-insensitive)
                if any(symbol == part.lower() for part in parts):
                    # If found, append the relevant data (e.g., parts[2], parts[3], parts[4], etc.)
                    go_annotations.add((symbol, parts[2], parts[3], parts[4], parts[6]))  # Gene name, Gene symbol, GO term, GO ID, Evidence
                    break  # Once a match is found, no need to check further parts

    return go_annotations

def main():
    pubtator_file = "/content/drive/MyDrive/pubtator_genes.txt"  # Input from PubTator3
    goa_file = "/content/drive/MyDrive/goa_human.gaf"  # GO annotation file

    # Load genes from PubTator3
    genes, gene_symbols = load_pubtator_genes(pubtator_file)

    # Load GO annotations for extracted genes
    go_annotations = load_go_annotations(goa_file, gene_symbols)

    print("Gene Symbol\tGO Term\tGO ID\tEvidence")
    if not go_annotations:
        print("No matching GO annotations found.")  # Debugging
    else:
        for gene_name, gene_symbol, go_term, go_id, evidence in sorted(go_annotations):
            print(f"{gene_name}\t{gene_symbol}\t{go_term}\t{go_id}\t{evidence}")  # Unique GO terms only

if __name__ == "__main__":
    main()


Loaded genes: {'361': 'aquaporin-4', '10215': 'Olig2', '17433': 'myelin-associated oligodendrocyte basic protein', '16287': 'RD4', '6622': 'alpha-synuclein', '5354': 'PLP', '4155': 'MBP|myelin basic protein', '20682': 'Sox9', '6662': 'Sox9', '351': 'amyloid beta', '17196': 'myelin basic protein MBP', '2705': 'connexin-32|Cx32', '343035': 'RD3', '50913': 'Olig2', '2697': 'Connexin-43 |connexin-43', '2201': 'DA9', '146713': 'Neun', '4137': 'Mapt |MAPT|microtubule associated protein tau|tau|Tau', '19122': 'prion protein', '12799': 'CNP'}
gene symbols:  {'aquaporin-4': 'aquaporin-4', 'olig2': 'Olig2', 'myelin-associated oligodendrocyte basic protein': 'myelin-associated', 'rd4': 'RD4', 'alpha-synuclein': 'alpha-synuclein', 'plp': 'PLP', 'mbp|myelin basic protein': 'MBP|myelin', 'sox9': 'Sox9', 'amyloid beta': 'amyloid', 'myelin basic protein mbp': 'myelin', 'connexin-32|cx32': 'connexin-32|Cx32', 'rd3': 'RD3', 'connexin-43 |connexin-43': 'Connexin-43', 'da9': 'DA9', 'neun': 'Neun', 'mapt |



---



# For the BP, MF, CC (API calling Annotation)

***Get NCBI ID only. ***

In [128]:
import pandas as pd

def load_pubtator_genes(filename):
    """Load genes from a PubTator3 file, extracting NCBI Gene IDs."""
    ncbi_ids = []
    with open(filename, 'r') as file:
        for line in file:
            parts = line.strip().split()  # Splitting on whitespace (space or tab)
            ncbi_ids.append(parts[2])

    return ncbi_ids

def main():
    pubtator_file = "/content/drive/MyDrive/pubtator_genes.txt"  # Input from PubTator3

    ncbi_ids = load_pubtator_genes(pubtator_file)

    print(ncbi_ids)


if __name__ == "__main__":
    main()


['361', '10215', '17433', '16287', '6622', '5354', '4155', '20682', '6662', '351', '17196', '2705', '343035', '50913', '2697', '2201', '146713', '4137', '19122', '12799', '17762']


**Trying to change NCBI gene ID to Uniprot ID**

In [127]:
import requests
import time

API_URL = "https://rest.uniprot.org"
POLLING_INTERVAL = 3

def check_response(response):
    try:
        response.raise_for_status()
    except requests.HTTPError:
        print(response.json())
        raise

def submit_id_mapping(from_db, to_db, ids):
    """Submits an ID mapping request to UniProt"""
    request = requests.post(
        f"{API_URL}/idmapping/run",
        data={"from": from_db, "to": to_db, "ids": ",".join(ids)},
    )
    check_response(request)
    return request.json()["jobId"]

def check_id_mapping_results_ready(job_id):
    """Checks if the ID mapping job is completed"""
    while True:
        request = requests.get(f"{API_URL}/idmapping/status/{job_id}")
        check_response(request)
        j = request.json()
        if "jobStatus" in j:
            if j["jobStatus"] in ("NEW", "RUNNING"):
                print(f"Retrying in {POLLING_INTERVAL}s...")
                time.sleep(POLLING_INTERVAL)
            else:
                raise Exception(j["jobStatus"])
        else:
            return bool(j["results"] or j["failedIds"])

def get_id_mapping_results_link(job_id):
    """Retrieves the results link for mapped UniProt IDs"""
    url = f"{API_URL}/idmapping/details/{job_id}"
    request = requests.get(url)
    check_response(request)
    return request.json()["redirectURL"]

def fetch_uniprot_mappings(job_id):
    """Gets mapped UniProt IDs from NCBI Gene IDs"""
    link = get_id_mapping_results_link(job_id)
    request = requests.get(link)
    check_response(request)
    return request.json()["results"]

# Example: NCBI Gene IDs from your data
ncbi_gene_ids = ["361", "10215"]

# Step 1: Convert NCBI Gene IDs to UniProt IDs
job_id = submit_id_mapping(from_db="GeneID", to_db="UniProtKB", ids=ncbi_gene_ids)

if check_id_mapping_results_ready(job_id):
    mapped_results = fetch_uniprot_mappings(job_id)

    # Extract only the 'to' field (UniProt IDs)
    uniprot_ids = [entry["to"] for entry in mapped_results if "to" in entry]  # Extract UniProt IDs from 'to' field
    print(f"Extracted UniProt IDs: {uniprot_ids}")


Extracted UniProt IDs: [{'entryType': 'UniProtKB reviewed (Swiss-Prot)', 'primaryAccession': 'P55087', 'secondaryAccessions': ['P78564'], 'uniProtkbId': 'AQP4_HUMAN', 'entryAudit': {'firstPublicDate': '1996-10-01', 'lastAnnotationUpdateDate': '2025-02-05', 'lastSequenceUpdateDate': '1997-11-01', 'entryVersion': 202, 'sequenceVersion': 2}, 'annotationScore': 5.0, 'organism': {'scientificName': 'Homo sapiens', 'commonName': 'Human', 'taxonId': 9606, 'lineage': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']}, 'proteinExistence': '1: Evidence at protein level', 'proteinDescription': {'recommendedName': {'fullName': {'value': 'Aquaporin-4'}, 'shortNames': [{'value': 'AQP-4'}]}, 'alternativeNames': [{'fullName': {'evidences': [{'evidenceCode': 'ECO:0000303', 'source': 'PubMed', 'id': '7559426'}], 'value': 'Mercurial-insensitive water channel'}, 'shortNames

# Getting BP, MF, CC

In [121]:
import requests

def fetch_go_terms_quickgo(uniprot_ids):
    """ Fetch GO terms from QuickGO API for a list of UniProt IDs """
    go_terms = []
    base_url = "https://www.ebi.ac.uk/QuickGO/services/annotation/search"

    for uniprot_id in uniprot_ids:
        # Construct the URL for QuickGO query
        url = f"{base_url}?query=uniProt:{uniprot_id}&limit=10"  # Filter for GO terms related to UniProt ID
        # print(f"Querying QuickGO with URL: {url}")  # Debugging line to print the URL

        response = requests.get(url)

        # Debugging response status and content
        # print(f"Response Status Code: {response.status_code}")
        if response.status_code == 200:
            data = response.json()
            # print(f"Response Data: {data}")  # Debugging line to print the response data
            if 'results' in data:
                for result in data['results']:
                    # Correct keys for accessing GO term data
                    go_terms.append({
                        'uniprot_id': uniprot_id,
                        'go_term': result['goId'],  # Corrected key for GO term ID
                        'go_aspect': result['goAspect'],  # Corrected key for GO aspect
                        'go_evidence': result['goEvidence']  # Additional GO evidence information
                    })
            else:
                print(f"No results found for UniProt ID {uniprot_id}")
        else:
            print(f"Error: Unable to fetch data for UniProt ID {uniprot_id}, Status Code: {response.status_code}")

    return go_terms

# Example UniProt IDs (replace these with actual ones)
uniprot_ids = ["P55087", "Q8N5M1"]

# Fetch GO terms for the UniProt IDs
go_results = fetch_go_terms_quickgo(uniprot_ids)

if go_results:
    # Track previously printed UniProt IDs to avoid repetition
    printed_uniprot_ids = set()

    for result in go_results:
        uniprot_id = result['uniprot_id']

        # Print the UniProt ID once
        if uniprot_id not in printed_uniprot_ids:
            print(f"\n UniProt ID: {uniprot_id}")
            print("="*23)
            printed_uniprot_ids.add(uniprot_id)

        # Create the aspect string (BP, MF, CC) based on the 'go_aspect' value
        aspect_map = {
            "biological_process": "BP",
            "molecular_function": "MF",
            "cellular_component": "CC"
        }
        aspect = aspect_map.get(result['go_aspect'], "Unknown")

        # Display results in the requested format
        print(f"{aspect} | {result['go_term']} | {result['go_evidence']}")
        print("-"*23)  # Divider for each GO term
else:
    print("No GO terms found.")



 UniProt ID: P55087
MF | GO:0003870 | IEA
-----------------------
MF | GO:0016746 | IEA
-----------------------
MF | GO:0030170 | IEA
-----------------------
BP | GO:0009058 | IEA
-----------------------
BP | GO:0033014 | IEA
-----------------------
MF | GO:0005524 | IEA
-----------------------
MF | GO:0016887 | IEA
-----------------------
MF | GO:0042626 | IEA
-----------------------
MF | GO:0140359 | IEA
-----------------------
BP | GO:0055085 | IEA
-----------------------

 UniProt ID: Q8N5M1
MF | GO:0003870 | IEA
-----------------------
MF | GO:0016746 | IEA
-----------------------
MF | GO:0030170 | IEA
-----------------------
BP | GO:0009058 | IEA
-----------------------
BP | GO:0033014 | IEA
-----------------------
MF | GO:0005524 | IEA
-----------------------
MF | GO:0016887 | IEA
-----------------------
MF | GO:0042626 | IEA
-----------------------
MF | GO:0140359 | IEA
-----------------------
BP | GO:0055085 | IEA
-----------------------
