In [8]:
!pip install spacy
# !pip install transformers torch
# !pip install goatools



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# packages
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoModelForTokenClassification

# from goatools import obo_parser

### 1. download the obo file from the gene ontology consortium https://current.geneontology.org/products/pages/downloads.html

In [3]:
print("Loading Gene Ontology database...")
GO_OBO_FILE = "/content/drive/MyDrive/goa_human.gaf"    # DESTINATION OF THE GO OBO FILE
# go_dag = obo_parser.GODag(GO_OBO_FILE)

def load_human_go_annotations(filepath=GO_OBO_FILE):
    """Load human-specific GO annotations from the GOA database"""
    go_annotations = {}

    with open(filepath, "r") as f:
        for line in f:
            if line.startswith("!"):  # Ignore header lines
                continue

            columns = line.strip().split("\t")
            if len(columns) < 9:
                continue  # Ensure we have enough columns

            gene_name = columns[2]  # DB_Object_Symbol (gene name)
            go_id = columns[4]  # GO term
            qualifier = columns[3]  # Relationship (enables, involved_in, etc.)
            aspect = columns[8]  # BP, MF, or CC

            if gene_name not in go_annotations:
                go_annotations[gene_name] = {"BP": [], "MF": [], "CC": []}

            if aspect == "P":
                go_annotations[gene_name]["BP"].append((go_id, qualifier))
            elif aspect == "F":
                go_annotations[gene_name]["MF"].append((go_id, qualifier))
            elif aspect == "C":
                go_annotations[gene_name]["CC"].append((go_id, qualifier))

    return go_annotations

# Load species-specific GO annotations
human_go_annotations = load_human_go_annotations()

# Check if insulin is in the dataset
if "INSULIN" in human_go_annotations:
    print("Found INSULIN in GO annotations!")
else:
    print("INSULIN NOT found in GO annotations!")



Loading Gene Ontology database...
INSULIN NOT found in GO annotations!


### 2. Use pip to install the ner model
- got it from https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy
- github page: https://github.com/allenai/scispacy

In [10]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_jnlpba_md-0.5.4.tar.gz
import spacy
print("Loading biomedical NLP model...")
nlp = spacy.load("en_ner_jnlpba_md")  # Scientific/biomedical entity recognition

# Load BioBERT model and tokenizer (set a distinct name for BioBERT)
# from transformers import BertTokenizer, BertForTokenClassification

# # Load the fine-tuned model and tokenizer
# biobert_model = BertForTokenClassification.from_pretrained('/content/drive/MyDrive/fine_tuned_biobert')
# biobert_tokenizer = BertTokenizer.from_pretrained('/content/drive/MyDrive/fine_tuned_biobert')

# print("BioBERT model loaded!")

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_jnlpba_md-0.5.4.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_jnlpba_md-0.5.4.tar.gz (119.8 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Loading biomedical NLP model...


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


### load the tokenizer and the model and put it to cuda

In [4]:
print("Loading LLaMA model.")
model_dir = "/content/drive/MyDrive/Llama 3.2-3B-Instruct-model"
device = torch.device("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)
import torch
print(torch.cuda.is_available())

model = AutoModelForCausalLM.from_pretrained(model_dir, local_files_only=True).to(device)


Loading LLaMA model.
True


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# import scispacy
# from scispacy.linking import EntityLinker

# # Load the UMLS Entity Linker (can also use "EntrezGene" for genes)
# linker = EntityLinker(name="umls")

# nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True})

# def get_standardized_gene_name(entity):
#     """Use SciSpacy's entity linker to find the standard gene name."""
#     if entity._.kb_ents:
#         first_match = entity._.kb_ents[0]
#         matched_id = first_match[0]  # UMLS or EntrezGene ID
#         matched_name = linker.kb.cui_to_entity[matched_id].canonical_name
#         return matched_name
#     return entity.text  # Fallback: Use original text if no match is found

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [15]:
def analyze_article(article_text: str) -> dict:
    """Process article text and infer Gene Ontology terms"""

    # Step 1: Extract genes/proteins using SciSpacy
    doc = nlp(article_text)

    detected_genes = {}
    genes = list({ent.text for ent in doc.ents
                if ent.label_ in ["GENE", "PROTEIN"]})

    print(f"Detected entities: {genes}")  # Debugging entity extraction

    # Ensure function always returns a dictionary with expected keys
    if not genes:
        return {"genes": [], "result_text": ""}

    # Step 3: Build structured prompt
    prompt = f"""You are an expert biomedical researcher. Analyze this biomedical article and extract Gene Ontology (GO) terms.

    Article excerpt:
    {article_text[:3000]}... [truncated]

    Detected entities: {', '.join(genes)}

    [OUTPUT FORMAT]
    Return **ONLY** GO terms in this structured format:
    - BP: GO:####### (Biological Process Name)
    - MF: GO:####### (Molecular Function Name)
    - CC: GO:####### (Cellular Component Name)

    Example Output:
    - BP: GO:0006006 (glucose metabolic process)
    - MF: GO:0005543 (insulin receptor binding)
    - CC: GO:0005886 (plasma membrane)
    """


    # Step 4: Generate predictions
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.1,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id
    )

    # Step 5: Process and validate results
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    result_text = generated_text[len(prompt):].strip()

    print(f"\nGenerated GO Terms:\n{result_text}")  # Debugging LLM output

    return {"genes": genes, "result_text": result_text}

In [16]:
import re  # Import regex for better parsing

def remove_duplicates(go_terms):
    """Remove duplicate GO terms while preserving order"""
    seen = set()
    unique_terms = []
    for term in go_terms:
        if term not in seen:
            seen.add(term)
            unique_terms.append(term)
    return unique_terms

def parse_and_validate_results(result_text, genes):
    """Validate generated GO annotations against GO database"""

    # Detect broken output
    if "[IMPORTANT]" in result_text or "Please provide" in result_text:
        print("⚠️ Warning: The model failed to generate GO terms correctly.")
        return {gene: {"BP": [], "MF": [], "CC": []} for gene in genes}

    validated_results = {gene: {"BP": [], "MF": [], "CC": []} for gene in genes}

    bp_terms = re.findall(r"BP:\s*(GO:\d+ \(.+?\))", result_text)
    mf_terms = re.findall(r"MF:\s*(GO:\d+ \(.+?\))", result_text)
    cc_terms = re.findall(r"CC:\s*(GO:\d+ \(.+?\))", result_text)

    for gene in genes:
        validated_results[gene]["BP"] = remove_duplicates(bp_terms)
        validated_results[gene]["MF"] = remove_duplicates(mf_terms)
        validated_results[gene]["CC"] = remove_duplicates(cc_terms)

    print(f"\n✅ Parsed GO terms for {genes}: {validated_results}")  # Debugging

    return validated_results



# ----------------------------
# Test with the sample article
# ----------------------------
sample_article = "insulin is a hormone that regulates blood sugar levels"
print("\nAnalyzing sample article...")
analysis = analyze_article(sample_article)

if not analysis["genes"]:  # Ensure genes exist before proceeding
    print("No genes/proteins detected.")
else:
    genes = analysis["genes"]
    result_text = analysis["result_text"]

    print("\nResults:")
    print(parse_and_validate_results(result_text, genes))



Analyzing sample article...
Detected entities: ['insulin']

Generated GO Terms:
GO terms extracted from the article excerpt:
    - BP: GO:0008150 (carbohydrate metabolic process)
    - MF: GO:0005518 (protein binding)
    - CC: GO:0005886 (plasma membrane)
    - BP: GO:0006006 (glucose metabolic process)
    - MF: GO:0005543 (insulin receptor binding)
    - MF: GO:0005519 (insulin binding)
    - CC: GO:0009611 (extracellular space)
    - MF: GO:0005515 (transmembrane receptor activity)
    - MF: GO:0005516 (transmembrane receptor activity)
    - MF: GO:0005517 (receptor activity)
    - BP: GO:0008152 (glucose-6-phosphate metabolic process)
    - MF: GO:0005518 (protein binding)
    - MF: GO:0005519 (insulin binding)
    - MF: GO:0005515 (transmembrane receptor activity)
    - MF: GO:0005516 (transmembrane receptor activity)
    - MF: GO:0005517 (receptor activity)
    - BP: GO:0008150 (carbohydrate metabolic process)
    - MF: GO:0005543 (insulin receptor binding)
    - MF: GO:0005518