In [1]:
!pip install spacy
!pip install goatools

Collecting goatools
  Downloading goatools-1.4.12-py3-none-any.whl.metadata (14 kB)
Collecting docopt (from goatools)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftpretty (from goatools)
  Downloading ftpretty-0.4.0-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting xlsxwriter (from goatools)
  Downloading XlsxWriter-3.2.2-py3-none-any.whl.metadata (2.8 kB)
Downloading goatools-1.4.12-py3-none-any.whl (15.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ftpretty-0.4.0-py2.py3-none-any.whl (8.2 kB)
Downloading XlsxWriter-3.2.2-py3-none-any.whl (165 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.1/165.1 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2

In [2]:
# Mount Google Drive in the Colab runtime; later cells read files from
# paths underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# packages
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import spacy
from goatools import obo_parser

### 1. Download the OBO file from the Gene Ontology Consortium: https://geneontology.org/docs/download-ontology/

In [4]:
# Location of the Gene Ontology OBO file on the mounted Drive.
GO_OBO_FILE = "/content/drive/MyDrive/GOLLM/go-basic.obo"

# Parse the ontology once; `go_dag` is the lookup table used when GO term
# names are validated further down in the notebook.
print("Loading Gene Ontology database...")
go_dag = obo_parser.GODag(obo_file=GO_OBO_FILE)

Loading Gene Ontology database...
/content/drive/MyDrive/GOLLM/go-basic.obo: fmt(1.2) rel(2024-10-27) 44,017 Terms


### 2. Use pip to install the NER model
- Downloaded from https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy
- GitHub page: https://github.com/allenai/scispacy

In [5]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_jnlpba_md-0.5.4.tar.gz
print("Loading biomedical NLP model...")
nlp = spacy.load("en_ner_jnlpba_md")  # Scientific/biomedical entity recognition


Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_jnlpba_md-0.5.4.tar.gz
  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_jnlpba_md-0.5.4.tar.gz (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: en_ner_jnlpba_md
  Building wheel for en_ner_jnlpba_md (setup.py) ... [?25l[?25hdone
  Created wheel for en_ner_jnlpba_md: filename=en_ner_jnlpba_md-0.5.4-py3-none-any.whl size=119812563 sha256=712b5f524ac1433f1b84473a92f5e3d7523ec0b76a5353e8515d2fb26fe76241
  Stored in directory: /root/.cache/pip/wheels/54/08/75/70edaa77aaa14899be57afbfc00d0973376abb89d215540fa2
Successfully built en_ner_jnlpba_md
Installing collected packages: en_ner_jnlpba_md
Successfully installed en_ner_jnlpba_md-0.5.4
Loading biomedical NLP model...


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


### 3. Load the tokenizer and the model, and move the model to CUDA

In [6]:
print("Loading LLaMA model.")
model_dir = "/content/drive/MyDrive/GOLLM/Llama 3.2-3B-Instruct-model"
# Prefer the GPU, but fall back to the CPU so this cell still runs (slowly)
# on a runtime that has no CUDA device instead of raising.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# local_files_only=True: load strictly from model_dir, never from the Hub.
tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, local_files_only=True).to(device)


Loading LLaMA model.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [13]:

# ----------------------------
# 4. Gene Ontology Analysis Pipeline
# ----------------------------
def analyze_article(article_text: str) -> dict:
    """Extract gene/protein mentions from an article and ask the LLM for GO terms.

    Pipeline:
      1. Run the module-level spaCy pipeline ``nlp`` over the text and keep the
         entities labelled "GENE" or "PROTEIN" (deduplicated, first-seen order).
      2. Build a prompt from the first 3000 characters of the article plus
         the detected entities.
      3. Sample a completion from the module-level ``model`` / ``tokenizer``.
      4. Hand ONLY the newly generated text to ``parse_and_validate_results``.

    Args:
        article_text: Raw article text.

    Returns:
        ``{"error": "No genes/proteins detected"}`` when no entity matched,
        otherwise whatever ``parse_and_validate_results`` returns for the
        generated text and the detected entity list.
    """
    max_article_chars = 3000

    # Step 1: Extract genes/proteins using SciSpacy.
    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # prompt is identical across runs (a set has no stable order).
    # NOTE(review): the JNLPBA model's label set appears to be
    # DNA/RNA/PROTEIN/CELL_LINE/CELL_TYPE, in which case "GENE" never matches;
    # confirm with nlp.get_pipe("ner").labels before extending this filter.
    doc = nlp(article_text)
    genes = list(dict.fromkeys(
        ent.text for ent in doc.ents
        if ent.label_ in ("GENE", "PROTEIN")
    ))

    if not genes:
        return {"error": "No genes/proteins detected"}

    # Only claim truncation when the article was actually cut.
    excerpt = article_text[:max_article_chars]
    if len(article_text) > max_article_chars:
        excerpt += "... [truncated]"

    # Step 2: Build structured prompt
    prompt = f"""Analyze this biomedical article and infer Gene Ontology terms:

    Article excerpt:
    {excerpt}

    Detected entities: {', '.join(genes)}

    [IMPORTANT]
    For each entity, provide:
    1. Biological Process (BP)
    2. Molecular Function (MF)
    3. Cellular Component (CC)

    [IMPORTANT]
    For insulin, the cellular component is 'insulin receptor complex'

    Format exactly like this:
    Gene: [NAME]
    - BP: [TERM] (confidence: high/medium/low)
    - MF: [TERM]
    - CC: [TERM]
    """

    # Step 3: Generate predictions
    # Put the inputs on whatever device the model lives on instead of
    # assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        do_sample=True,  # temperature/top_p only apply when sampling is on
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )

    # Step 4: Process and validate results
    # generate() returns prompt + continuation for decoder-only models. Drop
    # the prompt tokens so text from the article or the format template can
    # never be mistaken for a model answer by the parser.
    prompt_length = inputs["input_ids"].shape[-1]
    result_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return parse_and_validate_results(result_text, genes)

def parse_and_validate_results(text: str, expected_genes: list) -> dict:
    """Parse the model's "Gene: / - BP: / - MF: / - CC:" output into a dict.

    Args:
        text: Model output, one statement per line.
        expected_genes: Entity names that are allowed to appear as "Gene:"
            headers; sections for any other name are ignored.

    Returns:
        ``{gene: {"BP": ..., "MF": ..., "CC": ...}}`` where each value is the
        dict produced by ``validate_term`` for that line, or ``None`` if the
        model did not emit that line for the gene.
    """
    # Line marker -> (result key, GO namespace handed to validate_term).
    sections = {
        "- BP:": ("BP", "biological_process"),
        "- MF:": ("MF", "molecular_function"),
        "- CC:": ("CC", "cellular_component"),
    }

    results = {}
    current_gene = None

    for raw_line in text.split('\n'):
        # Tolerate indentation in front of the "Gene:" header.
        line = raw_line.strip()

        if line.startswith("Gene:"):
            # maxsplit=1 keeps names that themselves contain a colon intact.
            name = line.split(":", 1)[1].strip()
            if name in expected_genes:
                current_gene = name
                results[current_gene] = {"BP": None, "MF": None, "CC": None}
            else:
                # Unknown gene: stop attributing the following term lines to
                # anything. Leaving current_gene set here would raise KeyError
                # on the next "- BP:" line because no results entry exists.
                current_gene = None
        elif current_gene:
            for marker, (key, namespace) in sections.items():
                if marker in line:
                    results[current_gene][key] = validate_term(raw_line, namespace)
                    break

    return results

def validate_term(line: str, ontology_type: str) -> dict:
    """Check whether the term named on an output line exists in the GO database.

    The term is the text after the first ":" on the line. Two readings are
    tried, in order, against the names in the module-level ``go_dag`` that
    belong to ``ontology_type`` (case-insensitive exact match):

      1. the text with only a trailing "(confidence ...)" note removed, so GO
         names that contain parentheses (e.g. "poly(A) binding") survive;
      2. the text cut at the first "(", which also drops any other
         parenthetical note the model may have appended.

    Args:
        line: One "- BP: ..." / "- MF: ..." / "- CC: ..." line.
        ontology_type: GO namespace to match, e.g. "biological_process".

    Returns:
        Dict with "term" (the reading that matched, or reading 1 when nothing
        matched), "valid" (bool) and "original_line" (the stripped input).
    """
    # partition splits on the first colon only, so colons inside the term
    # text are kept.
    _, _, payload = line.partition(":")
    payload = payload.strip()

    note_start = payload.lower().rfind("(confidence")
    primary = payload[:note_start].strip() if note_start != -1 else payload
    fallback = payload.split("(")[0].strip()
    candidates = [primary] if fallback == primary else [primary, fallback]

    def _is_known(name: str) -> bool:
        wanted = name.lower()
        return any(
            go_term.namespace == ontology_type and go_term.name.lower() == wanted
            for go_term in go_dag.values()
        )

    matched = next((name for name in candidates if _is_known(name)), None)
    return {
        "term": matched if matched is not None else primary,
        "valid": matched is not None,
        "original_line": line.strip()
    }

# ----------------------------
# 5. Example Usage
# ----------------------------
if __name__ == "__main__":
    # Alternative inputs kept for reference:
    # sample_article = """
    # The tumor suppressor protein p53 (TP53) plays a crucial role in DNA repair mechanisms
    # through regulation of downstream targets including MDM2 and BRCA1. Recent studies
    # demonstrate that TP53-mediated activation of these genes facilitates homologous
    # recombination repair via interaction with RAD51.
    # """
    # sample_article = open("C:/Users/aivan/Desktop/BIOIN 401/GOLLM/data/fetched_articles/20833636.txt", "r").read()
    sample_article = "insulin is a hormone that regulates blood sugar levels"
    print("\nAnalyzing sample article...")
    analysis = analyze_article(sample_article)

    print("\nResults:")
    if 'error' in analysis:
        print(f"Error: {analysis['error']}")
    else:
        for gene, terms in analysis.items():
            print(f"\nGene: {gene}")
            for ontology in ["BP", "MF", "CC"]:
                data = terms[ontology]
                # The parser leaves an entry as None when the model emitted no
                # line for that ontology; report it instead of crashing on
                # data["valid"].
                if data is None:
                    print(f"  {ontology}: - (no term returned)")
                    continue
                status = "✓" if data["valid"] else "✗"
                print(f"  {ontology}: {status} {data['term']}")


Analyzing sample article...

Results:

Gene: insulin
  BP: ✗ glucose metabolism
  MF: ✗ glucose uptake
  CC: ✓ insulin receptor complex
