In [1]:
# Mount Google Drive at /content/drive (Colab only); the input XML and model
# paths used later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# packages
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# from goatools import obo_parser

In [None]:
!pip install biopython
import xml.etree.ElementTree as ET
from Bio import Entrez
import re
import time

class Gene:
    """Record for one gene mentioned in an annotated article.

    Holds the gene ID, the text snippets in which the gene was mentioned, the
    name used in the article, and descriptive fields (symbol, organism, full
    name, aliases) that stay ``None`` until ``update_info`` is called.
    """

    def __init__(self, gene_id):
        self.gene_id = gene_id
        # Snippets where the gene was mentioned, in first-seen order, no duplicates.
        self.occurrences = []
        # Descriptive fields are unknown until update_info() / set_name_from_article().
        self.symbol = self.organism = self.full_name = None
        self.also_known_as = self.name_from_article = None

    def add_occurrence(self, snippet):
        """Record ``snippet`` unless an identical snippet is already stored."""
        if snippet in self.occurrences:
            return
        self.occurrences.append(snippet)

    def set_name_from_article(self, name_from_article):
        """Store the name the article uses for this gene.

        It serves as a stand-in when no official name is available.
        """
        self.name_from_article = name_from_article

    def get_name_from_article(self):
        """Return the name the article uses for this gene (``None`` if unset)."""
        return self.name_from_article

    def get_occurrences(self):
        """Return the list of recorded snippets (the live list, not a copy)."""
        return self.occurrences

    def get_also_known_as(self):
        """Return the stored aliases (``None`` until ``update_info`` is called)."""
        return self.also_known_as

    def get_gene_id(self):
        """Return the gene ID this record was created with."""
        return self.gene_id

    def update_info(self, symbol, organism, full_name, also_known_as):
        """Overwrite the four descriptive fields in one call."""
        self.symbol, self.organism = symbol, organism
        self.full_name, self.also_known_as = full_name, also_known_as

    def __repr__(self):
        # Multi-line, human-readable summary; occurrences is printed as a raw list.
        pieces = (
            f"Gene({self.gene_id})\n",
            f"  Symbol          : {self.symbol}\n",
            f"  Organism        : {self.organism}\n",
            f"  Full Name       : {self.full_name}\n",
            f"  Also Known As   : {self.also_known_as}\n",
            f"  In-text Name    : {self.name_from_article}\n",
            f"  Occurrences     : {self.occurrences}",
        )
        return "".join(pieces)

def _split_sentences_with_offsets(text, base_offset=0):
    """Split ``text`` into sentences and return ``(sentences, start_offsets)``.

    A sentence boundary is any run of whitespace that follows '.', '!' or '?'.
    Each start offset is ``base_offset`` plus the sentence's real position in
    ``text``, so boundaries made of several whitespace characters (double
    spaces, newlines) do not shift the offsets of later sentences.
    """
    sentences = []
    start_offsets = []
    cursor = 0
    for boundary in re.finditer(r'(?<=[.!?])\s+', text):
        sentences.append(text[cursor:boundary.start()])
        start_offsets.append(base_offset + cursor)
        cursor = boundary.end()
    # Whatever follows the last boundary (or the whole text if there is none).
    sentences.append(text[cursor:])
    start_offsets.append(base_offset + cursor)
    return sentences, start_offsets


def parse_xml_file(xml_path):
    """Parse an annotated XML file and return a gene dictionary keyed by gene ID.

    For every ``Gene`` annotation outside METHODS passages, the sentence that
    contains the annotation plus one sentence before and one after is stored as
    a snippet on the matching ``Gene`` object.

    :param xml_path: Path (or file object) accepted by ``ET.parse``.
    :return: ``dict`` mapping gene ID (str) to ``Gene``; the in-text name is
             taken from the first annotation seen for that ID.
    :raises xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
    """
    tree = ET.parse(xml_path)
    root = tree.getroot()
    gene_dict = {}

    for document in root.findall('document'):
        for passage in document.findall('passage'):
            # An <infon> with no text yields None; treat it as "no section type".
            section_type_elem = passage.find("infon[@key='section_type']")
            section_type = (section_type_elem.text or "") if section_type_elem is not None else ""
            if section_type.upper() == "METHODS":
                continue  # Skip passages under METHODS

            # A missing or empty <text> element means there is nothing to extract.
            passage_text_elem = passage.find("text")
            passage_text = (passage_text_elem.text or "") if passage_text_elem is not None else ""
            if not passage_text:
                continue

            # Annotation offsets are compared against passage offset + position in text.
            passage_offset_elem = passage.find("offset")
            if passage_offset_elem is not None and passage_offset_elem.text:
                passage_offset = int(passage_offset_elem.text)
            else:
                passage_offset = 0

            sentences, start_indices = _split_sentences_with_offsets(passage_text, passage_offset)

            # (gene_id, first_sentence, end_sentence) windows already recorded in this
            # passage. Keying on the gene ID as well keeps a second gene mentioned in
            # the same sentence from being dropped.
            processed_ranges = set()

            for annotation in passage.findall('annotation'):
                ann_type = annotation.find("infon[@key='type']")
                if ann_type is None or ann_type.text != "Gene":
                    continue

                # Check for both 'identifier' and 'NCBI Gene' keys
                gene_id_elem = annotation.find("infon[@key='identifier']")
                if gene_id_elem is None:
                    gene_id_elem = annotation.find("infon[@key='NCBI Gene']")
                gene_id = gene_id_elem.text if gene_id_elem is not None else None
                if not gene_id:
                    continue  # Without an ID the mention cannot be stored

                # Name as written in the article (temporary name if no official name is available).
                in_text_gene_name_elem = annotation.find("text")
                in_text_gene_name = in_text_gene_name_elem.text if in_text_gene_name_elem is not None else None

                # Annotation offset, used to find the sentence containing the gene.
                location_elem = annotation.find("location")
                ann_offset = int(location_elem.attrib.get('offset', 0)) if location_elem is not None else 0

                sentence_index = None
                for i, start in enumerate(start_indices):
                    if start <= ann_offset < start + len(sentences[i]):
                        sentence_index = i
                        break
                if sentence_index is None:
                    continue

                start_sentence = max(0, sentence_index - 1)  # one sentence before
                end_sentence = min(len(sentences), sentence_index + 2)  # one sentence after
                range_key = (gene_id, start_sentence, end_sentence)
                if range_key in processed_ranges:
                    continue  # Same gene, same window: already recorded
                processed_ranges.add(range_key)

                snippet = " ".join(sentences[start_sentence:end_sentence])
                gene_obj = gene_dict.get(gene_id)
                if gene_obj is None:
                    gene_obj = Gene(gene_id)
                    gene_obj.set_name_from_article(in_text_gene_name)
                    gene_dict[gene_id] = gene_obj
                gene_obj.add_occurrence(snippet)
    return gene_dict

def fetch_and_update_gene_info(gene_dict):
    """Retrieve gene information from NCBI and update the genes in ``gene_dict``.

    All gene IDs are posted to the NCBI ``gene`` database in one EPost request
    and their summaries are read back in one ESummary request. Each returned
    summary whose uid is a key of ``gene_dict`` updates that gene's symbol,
    organism, full name and aliases; the in-article name is used when the full
    name or aliases are missing or empty.

    :param gene_dict: dict mapping gene ID (str) to ``Gene``; updated in place.
    :return: None. Nothing is requested when ``gene_dict`` is empty.
    """
    gene_ids = list(gene_dict.keys())
    if not gene_ids:
        return

    # Post the gene IDs to NCBI.
    handle = Entrez.epost(db="gene", id=",".join(gene_ids))
    try:
        result = Entrez.read(handle)
    finally:
        handle.close()  # Close even if parsing the response fails

    # Only two requests are made in total, so one short pause between them is
    # enough to stay well under NCBI's request-rate limit.
    time.sleep(0.5)

    handle = Entrez.esummary(db="gene", webenv=result["WebEnv"], query_key=result["QueryKey"])
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()

    for docsum in record["DocumentSummarySet"]["DocumentSummary"]:
        gene_id = docsum.attributes["uid"]
        gene_obj = gene_dict.get(gene_id)
        if gene_obj is None:
            continue  # Summary for an ID we did not ask about; nothing to update

        fallback_name = gene_obj.get_name_from_article()
        symbol = docsum.get('NomenclatureSymbol', 'No symbol')
        organism = docsum.get('Organism', {}).get('ScientificName', 'No organism')
        # Missing or empty values fall back to the name used in the article.
        full_name = docsum.get('NomenclatureName') or fallback_name
        also_known_as = docsum.get('OtherAliases') or fallback_name

        gene_obj.update_info(symbol, organism, full_name, also_known_as)


# Set the contact email Entrez sends with its requests (and an API key if available).
# NOTE(review): "email here" is a placeholder — replace it before running.
Entrez.email = "email here"
# Entrez.api_key = "your_api_key"

# example2.xml takes a while since it has 64 genes.
xml_path = "/content/drive/MyDrive/GOLLM/input/full_text_annotated_example2.xml"
# Build {gene_id: Gene} from the annotated article, then enrich each Gene from NCBI.
gene_dict = parse_xml_file(xml_path)
fetch_and_update_gene_info(gene_dict)           # Fetch gene information from NCBI (Optional but has good information)
# Print every gene followed by a one-line summary of how many snippets were collected for it.
for gene in gene_dict.values():
    print(gene)
    print(f"There are {len(gene.get_occurrences())} occurrences (each are AT MOST 3-setences long SO MAX of {int(3*len(gene.get_occurrences()))} sentences in total) of {gene.get_gene_id()}:{gene.get_name_from_article()}.")
    print("")

Gene(433766)
  Symbol          : Trim63
  Organism        : Mus musculus
  Full Name       : tripartite motif-containing 63
  Also Known As   : MuRF1, RF1, Rnf28
  In-text Name    : Muscle-specific RING finger protein-1
  Occurrences     : ['In particular, atrogenes are thought to be important for protein loss. Muscle-specific RING finger protein-1 (MuRF1) is an E3 ubiquitin ligase selectively expressed in cardiac and skeletal muscles. This protein is upregulated during skeletal muscle atrophy and is able to control sarcomere muscle protein degradation.', 'The first muscle-specific ubiquitin ligases that were discovered to have a role in muscle loss were Atrogin-1/MAFbx and muscle ring finger-1 (MuRF1). Substrates of MuRF1 are myosin heavy chains, myosin light chains, actin, myosin binding protein C, and troponin I. It has been found that MuRF1 knockout mice are more resistant to muscle atrophy produced by different pathological conditions.', 'Substrates of MuRF1 are myosin heavy chain

In [4]:
def make_go_term_prompt(gene_dict, batch_size=1, batch_index=0, direct=False, customized_prompt=None):
    """
    Create a prompt for Llama to output Gene Ontology (GO) terms (without IDs) for a batch of genes
    from the gene dictionary. The output is expected to be categorized into three sections for each gene:
      1. Biological Processes
      2. Cellular Components
      3. Molecular Functions

    The prompt consists of an instruction block followed by a "Gene Context:" section that lists,
    for each gene in the batch, its in-article name and its occurrence snippets.

    Instruction block selection (first match wins):
      * ``direct=True``       -> context + direct instructions + output format + example output
      * ``customized_prompt`` -> context + the custom text + output format + example output
      * otherwise             -> a single built-in instruction with the output format

    :param gene_dict: A dictionary where each key is a gene ID and each value is a Gene object
                      providing ``get_name_from_article()`` and an ``occurrences`` list.
    :param batch_size: The number of genes to include in this prompt batch.
    :param batch_index: The index (0-based) of the batch to process. For instance, if batch_size=1,
                        batch_index=2 will include the 3rd gene in the dictionary. A batch past the
                        end of the dictionary produces a prompt with an empty gene section.
    :param direct: If True, use a more direct instruction for GO term extraction.
    :param customized_prompt: If provided (and ``direct`` is False), use this custom instruction text.
    :return: A string containing the complete prompt.
    """

    # Instruction blocks

    context = """You are an efficient and insightful assistant to a geneticist."""

    direct_instructions = """Extract the Gene Ontology (GO) terms for Biological Processes, Cellular Components, and Molecular Functions from the given occurrences.
    """

    format_instructions = """Use the following output format:
    Gene: <Gene Symbol or ID>

    Biological Processes:
    - <GO term>
    - <GO term>
    ...

    Cellular Components:
    - <GO term>
    - <GO term>
    ...

    Molecular Functions:
    - <GO term>
    - <GO term>
    ...
    """

    # Example output to illustrate the expected format
    example_output = """ Example Output:

    Gene: GFAP

    Biological Processes:
    - intermediate filament organization
    - gene expression
    - astrocyte development

    Cellular Components:
    - cytoplasm
    - cell projection
    - cell body

    Molecular Functions:
    - integrin binding
    - kinase binding
    - protein binding
    """

    # Assemble the instruction block. Pieces are collected in a list and joined once at the end.
    if direct:
        # The context sentence has no trailing newline, so separate it from the instructions.
        parts = [context, "\n", direct_instructions, format_instructions, example_output]
    elif customized_prompt:
        # Make sure the custom text neither runs into the context nor into the format block.
        separator = "" if customized_prompt.endswith("\n") else "\n"
        parts = [context, "\n", customized_prompt, separator, format_instructions, example_output]
    else:
        default_prompt = """Extract the Gene Ontology (GO) terms for Biological Processes, Cellular Components, and Molecular Functions from the given occurrences. Use the following output format:
Gene: <Gene Symbol or ID>

Biological Processes:
- <GO term>
- <GO term>
...

Cellular Components:
- <GO term>
- <GO term>
...

Molecular Functions:
- <GO term>
- <GO term>
...
        """
        parts = [default_prompt]
    parts.append("\nGene Context:\n")

    # Convert the gene dictionary to a list and determine the batch slice.
    gene_items = list(gene_dict.items())
    start_index = batch_index * batch_size
    end_index = start_index + batch_size
    batch_genes = gene_items[start_index:end_index]

    # Add the in-article name and occurrence context for each gene in the selected batch.
    for _gene_id, gene_obj in batch_genes:
        parts.append(f"\nGene: {gene_obj.get_name_from_article()}\n")
        parts.append("Occurrences:\n")
        parts.extend(f"- {occ}\n" for occ in gene_obj.occurrences)

    return "".join(parts)

# Example usage:
# gene_dict must already be defined (it is built by parse_xml_file in an earlier cell)
# and map gene IDs to Gene objects.
# To process only one gene at a time, pass the batch explicitly:
# prompt = make_go_term_prompt(gene_dict, batch_size=1, batch_index=0)
# print(prompt)

# With the default arguments the prompt covers the first gene in gene_dict only.
prompt = make_go_term_prompt(gene_dict)
print(prompt)

Extract the Gene Ontology (GO) terms for Biological Processes, Cellular Components, and Molecular Functions from the given occurrences. Use the following output format:
Gene: <Gene Symbol or ID>

Biological Processes:
- <GO term>
- <GO term>
...

Cellular Components:
- <GO term>
- <GO term>
...

Molecular Functions:
- <GO term>
- <GO term>
...
        
Gene Context:

Gene: Muscle-specific RING finger protein-1
Occurrences:
- In particular, atrogenes are thought to be important for protein loss. Muscle-specific RING finger protein-1 (MuRF1) is an E3 ubiquitin ligase selectively expressed in cardiac and skeletal muscles. This protein is upregulated during skeletal muscle atrophy and is able to control sarcomere muscle protein degradation.
- The first muscle-specific ubiquitin ligases that were discovered to have a role in muscle loss were Atrogin-1/MAFbx and muscle ring finger-1 (MuRF1). Substrates of MuRF1 are myosin heavy chains, myosin light chains, actin, myosin binding protein C, an

In [5]:
# Load the tokenizer and causal-LM weights from a local directory on the mounted Drive.
print("Loading LLaMA model.")
# model_dir = "/content/drive/MyDrive/Llama 3.2-3B-Instruct-model"
model_dir = "/content/drive/MyDrive/GOLLM/Llama 3.2-3B-Instruct-model"
# NOTE(review): the device is fixed to CUDA; the availability check below is only printed,
# so on a runtime without a GPU the .to(device) call is expected to fail — confirm a GPU runtime is selected.
device = torch.device("cuda")
print(torch.cuda.is_available())

# local_files_only=True: both objects are loaded from model_dir rather than downloaded.
tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, local_files_only=True).to(device)


Loading LLaMA model.
True


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
test_prompt = """
Please analyze the following gene information and text snippets.
Identify all potentially relevant Gene Ontology (GO) terms for each of these three categories:
1) Biological Process
2) Molecular Function
3) Cellular Component

For each GO term you list, include a short justification that references clues from the text.
If you cannot identify any GO terms for a particular category, say “None found.”

Be concise and factual.
Avoid overly general statements like "it is involved in many cellular processes."

Below is an example format to illustrate how you might structure your output:

Example Analysis Output
----------------------------------------
Biological Processes:
- Negative regulation of cell proliferation
  Justification: The text mentions suppressed proliferation in certain cells.

Cellular Component:
- Intermediate filament
  Justification: The protein is described as an integral part of the cytoskeletal structure in astrocytes.

Molecular Function:
- Structural molecule activity
  Justification: GFAP provides structural support, as indicated by references to glial fibrillary structures.
----------------------------------------

Gene Information:
----------------------------------------
Gene: Muscle-specific RING finger protein-1
Occurrences:
- In particular, atrogenes are thought to be important for protein loss. Muscle-specific RING finger protein-1 (MuRF1) is an E3 ubiquitin ligase selectively expressed in cardiac and skeletal muscles. This protein is upregulated during skeletal muscle atrophy and is able to control sarcomere muscle protein degradation.
- The first muscle-specific ubiquitin ligases that were discovered to have a role in muscle loss were Atrogin-1/MAFbx and muscle ring finger-1 (MuRF1). Substrates of MuRF1 are myosin heavy chains, myosin light chains, actin, myosin binding protein C, and troponin I. It has been found that MuRF1 knockout mice are more resistant to muscle atrophy produced by different pathological conditions.
- Substrates of MuRF1 are myosin heavy chains, myosin light chains, actin, myosin binding protein C, and troponin I. It has been found that MuRF1 knockout mice are more resistant to muscle atrophy produced by different pathological conditions. Accordingly, the MuRF1 overexpression in mice enhances ubiquitination of myofibrillar and sarcoplasmic proteins, and causes neuromuscular junction instability and muscle atrophy.
- It has been found that MuRF1 knockout mice are more resistant to muscle atrophy produced by different pathological conditions. Accordingly, the MuRF1 overexpression in mice enhances ubiquitination of myofibrillar and sarcoplasmic proteins, and causes neuromuscular junction instability and muscle atrophy. However, the absence of MuRF1 does not protect soleus muscle atrophy induced by spaceflight, showing that this model is quite different with respect to other models of atrophy.
- Accordingly, the MuRF1 overexpression in mice enhances ubiquitination of myofibrillar and sarcoplasmic proteins, and causes neuromuscular junction instability and muscle atrophy. However, the absence of MuRF1 does not protect soleus muscle atrophy induced by spaceflight, showing that this model is quite different with respect to other models of atrophy. During atrophy, other E3s are produced in smaller quantities, including Trim32, a ligase important for actin, tropomyosin, and troponins degradation.
- For example, the inhibition of some specific muscle proteins such as MAFbx or Foxo, ATF4, Gadd45a, p21, and MEKK4 transcription factors can avoid the loss of muscle mass without leading to a hypertrophic condition. Among the different ubiquitin ligases, MuRF1 and TRAF6 have been studied for the development of muscle-specific inhibitors. However, the removal of MuRF1 or TRAF6 only partially preserves muscle atrophy during denervation in mice.
- Among the different ubiquitin ligases, MuRF1 and TRAF6 have been studied for the development of muscle-specific inhibitors. However, the removal of MuRF1 or TRAF6 only partially preserves muscle atrophy during denervation in mice. As an example, MG132 proteasome inhibitor can prevent muscle atrophy associated with muscle disuse.
- Therefore, the administration of protein supplements or compounds able to stimulate protein synthesis could be useful in the prevention of atrophy. For instance, ursolic acid stimulates muscle hypertrophy and reduces the expression of MuRF-1 in soleus and extensor digitorum longus (EDL) muscle of chronic kidney disease mice. It also reduced myostatin and insulin-like growth factor-binding protein 3 (IGFBP3) in dexamethasone-treated C2C12 cell lines.
- Identification of the MuRF1 Skeletal Muscle Ubiquitylome Through Quantitative Proteomics
- Skeletal muscle in MuRF1 null mice is not spared in low-gravity conditions, indicating atrophy proceeds by unique mechanisms in space

----------------------------------------

Now, please provide a similar list of GO terms (or “None found” if no terms apply) under each of the three categories, with a one-line justification for each.
"""
print(test_prompt)



Please analyze the following gene information and text snippets. 
Identify all potentially relevant Gene Ontology (GO) terms for each of these three categories:
1) Biological Process
2) Molecular Function
3) Cellular Component

For each GO term you list, include a short justification that references clues from the text. 
If you cannot identify any GO terms for a particular category, say “None found.”

Be concise and factual. 
Avoid overly general statements like "it is involved in many cellular processes."

Below is an example format to illustrate how you might structure your output:

Example Analysis Output
----------------------------------------
Biological Processes:
- Negative regulation of cell proliferation 
  Justification: The text mentions suppressed proliferation in certain cells.

Cellular Component:
- Intermediate filament
  Justification: The protein is described as an integral part of the cytoskeletal structure in astrocytes.

Molecular Function:
- Structural molecule ac

In [7]:
test_prompt1 = """
Please analyze the following gene information and text snippets under "Occurrences".
Identify all potentially relevant Gene Ontology (GO) terms for each of these three categories:
1) Biological Process
2) Molecular Function
3) Cellular Component

If you cannot identify any GO terms for a particular category, say “None found.”
For each term you list, provide a short one-line explanation or justification referencing the text.

Be concise and factual.
Avoid overly general statements like "it is involved in many cellular processes."

Below is an example format to illustrate how you should structure your output:

Example Analysis Output
----------------------------------------
Biological Processes (BP):
- Negative regulation of cell proliferation
  Justification: The text mentions suppressed proliferation in certain cells.

Cellular Components (CC):
- Intermediate filament
  Justification: The text identifies GFAP as part of the cytoskeleton structure in astrocytes.

Molecular Functions (MF):
- Structural molecule activity
  Justification: The text describes a role in providing structural support.

----------------------------------------

Gene Information:
----------------------------------------
Gene: Muscle-specific RING finger protein-1
Organism: Mus musculus
Occurrences:
- In particular, atrogenes are thought to be important for protein loss. Muscle-specific RING finger protein-1 (MuRF1) is an E3 ubiquitin ligase selectively expressed in cardiac and skeletal muscles. This protein is upregulated during skeletal muscle atrophy and is able to control sarcomere muscle protein degradation.
- The first muscle-specific ubiquitin ligases that were discovered to have a role in muscle loss were Atrogin-1/MAFbx and muscle ring finger-1 (MuRF1). Substrates of MuRF1 are myosin heavy chains, myosin light chains, actin, myosin binding protein C, and troponin I. It has been found that MuRF1 knockout mice are more resistant to muscle atrophy produced by different pathological conditions.
- Substrates of MuRF1 are myosin heavy chains, myosin light chains, actin, myosin binding protein C, and troponin I. It has been found that MuRF1 knockout mice are more resistant to muscle atrophy produced by different pathological conditions. Accordingly, the MuRF1 overexpression in mice enhances ubiquitination of myofibrillar and sarcoplasmic proteins, and causes neuromuscular junction instability and muscle atrophy.
- It has been found that MuRF1 knockout mice are more resistant to muscle atrophy produced by different pathological conditions. Accordingly, the MuRF1 overexpression in mice enhances ubiquitination of myofibrillar and sarcoplasmic proteins, and causes neuromuscular junction instability and muscle atrophy. However, the absence of MuRF1 does not protect soleus muscle atrophy induced by spaceflight, showing that this model is quite different with respect to other models of atrophy.
- Accordingly, the MuRF1 overexpression in mice enhances ubiquitination of myofibrillar and sarcoplasmic proteins, and causes neuromuscular junction instability and muscle atrophy. However, the absence of MuRF1 does not protect soleus muscle atrophy induced by spaceflight, showing that this model is quite different with respect to other models of atrophy. During atrophy, other E3s are produced in smaller quantities, including Trim32, a ligase important for actin, tropomyosin, and troponins degradation.
- For example, the inhibition of some specific muscle proteins such as MAFbx or Foxo, ATF4, Gadd45a, p21, and MEKK4 transcription factors can avoid the loss of muscle mass without leading to a hypertrophic condition. Among the different ubiquitin ligases, MuRF1 and TRAF6 have been studied for the development of muscle-specific inhibitors. However, the removal of MuRF1 or TRAF6 only partially preserves muscle atrophy during denervation in mice.
- Among the different ubiquitin ligases, MuRF1 and TRAF6 have been studied for the development of muscle-specific inhibitors. However, the removal of MuRF1 or TRAF6 only partially preserves muscle atrophy during denervation in mice. As an example, MG132 proteasome inhibitor can prevent muscle atrophy associated with muscle disuse.
- Therefore, the administration of protein supplements or compounds able to stimulate protein synthesis could be useful in the prevention of atrophy. For instance, ursolic acid stimulates muscle hypertrophy and reduces the expression of MuRF-1 in soleus and extensor digitorum longus (EDL) muscle of chronic kidney disease mice. It also reduced myostatin and insulin-like growth factor-binding protein 3 (IGFBP3) in dexamethasone-treated C2C12 cell lines.
- Identification of the MuRF1 Skeletal Muscle Ubiquitylome Through Quantitative Proteomics
- Skeletal muscle in MuRF1 null mice is not spared in low-gravity conditions, indicating atrophy proceeds by unique mechanisms in space

----------------------------------------

Now, please only provide a similar list of GO terms (or “None found” if no terms apply) under each of the three categories with a brief justification for each.
"""
print(test_prompt1)



Please analyze the following gene information and text snippets under "Occurrences". 
Identify all potentially relevant Gene Ontology (GO) terms for each of these three categories:
1) Biological Process
2) Molecular Function
3) Cellular Component

If you cannot identify any GO terms for a particular category, say “None found.”
For each term you list, provide a short one-line explanation or justification referencing the text.

Be concise and factual. 
Avoid overly general statements like "it is involved in many cellular processes."

Below is an example format to illustrate how you should structure your output:

Example Analysis Output
----------------------------------------
Biological Processes (BP):
- Negative regulation of cell proliferation
  Justification: The text mentions suppressed proliferation in certain cells.

Cellular Components (CC):
- Intermediate filament 
  Justification: The text identifies GFAP as part of the cytoskeleton structure in astrocytes.

Molecular Functions

In [8]:
test_prompt2 = """
Write a critical analysis of the biological processes performed by this system of interacting proteins.

Base your analysis on prior knowledge available in your training data. After completing your analysis, propose a brief and detailed name for the most prominent biological process performed by the system.

After completing your analysis, please also assign a confidence level to the process name you selected. This confidence level should follow the name in parentheses and be one of the following: Low, Medium, or High. A low confidence level indicates the least confidence, while a high confidence level reflects the most confidence. This confidence level helps gauge how accurately the chosen name represents the functions and activities within the system of interacting proteins. When determining your confidence level, consider the proportion of genes in the protein system that participate in the identified biological process. For instance, if you select “Ribosome biogenesis” as the process name but only a few genes in the system contribute to this process, the confidence level should be low compared to a scenario where a majority of the genes are involved in “Ribosome biogenesis.”

Put your chosen name at the top of the analysis as 'Name: <name>'.
Put your confidence in a new line after the name as ‘LLM self-assessed confidence: <confidence level>’

Be concise: Avoid unnecessary words.
Be factual: Do not editorialize.
Be specific: Avoid overly general statements such as ‘the proteins are involved in various cellular processes’.
Group proteins: If group of  proteins has similar functions then discuss their interplay, synergistic, or antagonistic effects, and functional integration within the system, instead of listing facts about individual proteins
Avoid generic process names: Choose detailed and specific names, avoiding generic names like ‘Cellular Signaling and Regulation’.
Contextual relevance: Ensure the analysis is relevant to the specific biological context of the system of interacting proteins if provided.

If you cannot identify a prominent biological process for the proteins in the system, I want you to communicate this in you analysis and name the process: "System of unrelated proteins". Provide a confidence of ‘None’ for a "System of unrelated proteins".

To help you in your work, I am providing an example system of interacting proteins and the corresponding example analysis output.

The example system of interacting proteins is:
PDX1, SLC2A2, NKX6-1, GLP1, GCG.

The example analysis output is:
Name: Pancreatic development and glucose homeostasis
LLM self-assessed confidence: High

1. PDX1 is a homeodomain transcription factor involved in the specification of the early pancreatic epithelium and
its subsequent differentiation. It activates the transcription of several genes including insulin, somatostatin, glucokinase
and glucose transporter type 2. It is essential for maintenance of the normal hormone-producing phenotype in the
pancreatic beta-cell. In pancreatic acinar cells, forms a complex with PBX1b and MEIS2b and mediates the activation
of the ELA1 enhancer.

2. NKX6-1 is also a transcription factor involved in the development of pancreatic beta-cells during the secondary transition.
Together with NKX2-2 and IRX3, controls the generation of motor neurons in the neural tube and belongs to the neural progenitor
factors induced by Sonic Hedgehog (SHH) signals.

3.GCG and GLP1, respectively glucagon and glucagon-like peptide 1, are involved in glucose metabolism and homeostasis.
GCG raises blood glucose levels by promoting gluconeogenesis and is the counter regulatory hormone of Insulin.
GLP1 is a potent stimulator of Glucose-Induced Insulin Secretion (GSIS). Plays roles in gastric motility and
suppresses blood glucagon levels. Promotes growth of the intestinal epithelium and pancreatic islet mass both by islet
neogenesis and islet cell proliferation.

4. SLC2A2, also known as GLUT2, is a facilitative hexose transporter. In hepatocytes, it mediates bi-directional
transport of glucose accross the plasma membranes, while in the pancreatic beta-cell, it is the main transporter responsible
for glucose uptake and part of the cell's glucose-sensing mechanism. It is involved in glucose transport in the small intestine
and kidney too.

To summarize, the genes in this set are involved in the specification, differentiation, growth and functionality of the pancreas,
with a particular emphasis on the pancreatic beta-cell. Particularly, the architecture of the pancreatic islet ensures proper
glucose sensing and homeostasis via a number of different hormones and receptors that can elicit both synergistic and antagonistic
effects in the pancreas itself and other peripheral tissues.


Here are the interacting proteins:

Proteins:
GNG5 TBX5 ISL1 RBPJ CTNNB1 NOTCH1 SMAD4 EYA1 BMP10 SOX9 HES1 ENG MKS1 SIX1 TBX3 HAND2 PIM1 BMPR2
"""

In [9]:
def run_inference(prompt, max_new_tokens=200):
    """
    Generate a completion for ``prompt`` with the notebook-level ``model`` and ``tokenizer``.

    Decoding uses deterministic beam search (4 beams, no sampling). Only the
    newly generated tokens are decoded, so the returned text never contains
    the prompt, even when decoding the prompt's tokens would not reproduce the
    prompt string character-for-character.

    :param prompt: The prompt string to send to the model.
    :param max_new_tokens: Upper bound on the number of generated tokens.
        Defaults to 200; raise it if answers come back cut off mid-sentence.
    :return: The generated completion, stripped of surrounding whitespace.
    """
    # Put the encoded prompt on whatever device the model lives on instead of
    # assuming a GPU is present.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]
    prompt_token_count = input_ids.shape[1]

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        do_sample=False,          # no sampling: the same prompt yields the same answer
        num_beams=4,              # beam search over 4 candidate continuations
        length_penalty=1.0,       # library default; exponent on sequence length in beam scoring
        no_repeat_ngram_size=3,   # forbid repeating any 3-gram
        early_stopping=True,      # stop once num_beams finished candidates exist
        pad_token_id=tokenizer.eos_token_id,
        # Explicitly unset sampling-only knobs; they are irrelevant when
        # do_sample=False.
        temperature=None,
        top_p=None,
    )

    # The returned sequence starts with the prompt tokens. Slice them off at
    # the token level rather than cutting the decoded string by len(prompt),
    # which breaks whenever detokenization alters whitespace or characters.
    completion_ids = outputs[0][prompt_token_count:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# Run the same prompt repeatedly to gauge per-call generation latency.
# run_inference decodes with do_sample=False, so no sampling randomness is
# involved between iterations.
for _ in range(10):
    start_time = time.perf_counter()
    output = run_inference(test_prompt1)
    elapsed = time.perf_counter() - start_time  # generation only; excludes printing
    print(output)

    # The timer wraps generation, not model loading, so report it as such.
    print(f"Inference on {device} took {elapsed:.2f} seconds.")
    print("-"*50)

# prompt = "what is 2+2"
# print("Running inference...")
# start_time = time.time()
# output = run_inference(prompt)
# print(f"Model loaded on {device} in {time.time()-start_time:.2f} seconds.")
# print(output)


Biological Process:
- Muscle atrophy
- Regulation of protein degradation
- Ubiquitination
- Response to denervation
- Muscle disuse atrophy

Justification:
These terms are relevant to the biological processes described in the text, such as muscle loss, protein degradation, and the effects of denervation and disuse on muscle tissue. 

Molecular Function:
- Ligase activity
- Protein degradation (ubiquitin-dependent)
- Muscle protein degradation (specific)
- Structural support
- Actin degradation (part of ubiquitin-dependent process)

Justification: 
These terms describe the molecular functions attributed to the protein, including its role in ligating ubiquitin to target proteins, degrading muscle proteins specifically, and contributing to structural support. 

Cellular Component:
- Cytoskeleton
- Sarcomeres
- Myofibrils
- Sarcoplasm
- Intermediate filaments
- Cytoplasm

 Justification:
The text mentions the involvement of
Model loaded on cuda and it took 36.62 seconds.
------------------