In [1]:
import os

# Restrict this process to GPUs 1-7. This is done before importing transformers
# so the restriction is already in place when any CUDA initialisation happens.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3,4,5,6,7"

from transformers import AutoModelForCausalLM, AutoTokenizer

# Device the tokenized inputs are moved to in the generation cell. The model
# itself is placed by `device_map="auto"` below, not by this variable.
device = "cuda"

# Single source of truth for the checkpoint directory so the model and the
# tokenizer can never be loaded from two different locations by accident.
# Other checkpoint names noted by the author (presumably sibling directories
# -- TODO confirm): Qwen1.5-72B-Chat-GPTQ-Int4, Qwen1.5-14B-Chat
MODEL_PATH = "/storage1/fs1/yinjie.tang/Active/Shawn_Xiao/Qwen1.5-72B-Chat"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

Loading checkpoint shards:   0%|          | 0/38 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
from pyvis.network import Network
import pandas as pd
import re
import networkx as nx

# Load the Excel file
filepath = 'modified_updated(Qwen1.5 72b)_GGPPS_causal.xlsx'
df = pd.read_excel(filepath, engine='openpyxl')

# Initialize NetworkX Graph (undirected: (A, B) and (B, A) are the same edge)
G = nx.Graph()

# Substrings that disqualify an entity pair when found in either entity,
# e.g. ['Synechocystis', 'Cyanobacteria', 'cyanobacteria'].
# An empty list keeps every pair.
words_to_exclude = []

# Regular expression to match the pattern (entity A, entity B)
# NOTE(review): entity A is "anything up to the first comma", so names that
# themselves contain commas or parentheses are split imperfectly -- confirm
# against the answer format before tightening this.
pattern = r'\(([^,]+), ([^\)]+)\)'
pair_regex = re.compile(pattern)

# Every source title seen for an (unordered) pair, so a pair reported by several
# papers keeps all of them instead of only the last one processed.
edge_sources = {}

# Iterate over the rows to extract entity pairs and their sources.
# Pairs are read from 'Answer to Question 2'; 'Response to New Question' is the
# alternative column the author used previously.
for value, source in zip(df['Answer to Question 2'], df['Title']):
    # Empty Excel cells are read as NaN (a float); re.findall raises TypeError on
    # anything that is not a string, so such rows are skipped.
    if not isinstance(value, str):
        continue
    source_text = "" if pd.isna(source) else str(source)

    for raw_a, raw_b in pair_regex.findall(value):
        # Strip stray whitespace so " X" and "X" end up as the same node.
        entity_a, entity_b = raw_a.strip(), raw_b.strip()
        if not entity_a or not entity_b:
            continue
        # Check if any word to exclude is part of either entity name
        if any(word in entity_a or word in entity_b for word in words_to_exclude):
            continue

        sources = edge_sources.setdefault(frozenset((entity_a, entity_b)), [])
        if source_text and source_text not in sources:
            sources.append(source_text)

        G.add_node(entity_a, label=entity_a)
        G.add_node(entity_b, label=entity_b)
        # The edge title carries every contributing source title.
        G.add_edge(entity_a, entity_b, title="; ".join(sources))

def search_network(graph, keyword, depth=2):
    """Return the subgraph around every node whose label contains `keyword`.

    Args:
        graph: Graph object providing `nodes(data=True)`, `neighbors(node)` and
            `subgraph(nodes)` (a NetworkX graph in this notebook).
        keyword: Text to look for; matching is case-insensitive substring search
            on the node's 'label' attribute.
        depth: Number of neighbour hops to add around the matching nodes
            (0 keeps only the matches themselves).

    Returns:
        `graph.subgraph(...)` of the matching nodes plus everything reachable
        from them within `depth` hops.
    """
    needle = keyword.lower()
    # Nodes without a 'label' attribute fall back to their own name instead of
    # raising KeyError.
    selected = {
        node
        for node, attr in graph.nodes(data=True)
        if needle in str(attr.get('label', node)).lower()
    }

    # Breadth-first expansion: each pass only visits nodes discovered in the
    # previous pass rather than re-scanning everything collected so far.
    frontier = set(selected)
    for _ in range(depth):
        if not frontier:
            break
        reached = set()
        for node in frontier:
            reached.update(graph.neighbors(node))
        frontier = reached - selected
        selected |= frontier

    return graph.subgraph(selected)

# Keyword whose neighbourhood in the entity graph should be inspected
# (edit this value to explore a different term)
keyword = "expression"
filtered_graph = search_network(G, keyword)

# Flatten the node identifiers of the filtered graph into one
# comma-separated string
node_names = list(filtered_graph)
node_names_text = ", ".join(node_names)

# Echo the list; the same text is used further down as context for the
# language-model prompt
print(node_names_text)

# Initialize a Pyvis network (2160px tall, full width, dark background, white
# font) and copy the nodes/edges of the filtered graph into it
net = Network(height="2160px", width="100%", bgcolor="#222222", font_color="white")
net.from_nx(filtered_graph)

# Configure the visualization through an options string: Barnes-Hut physics
# solver parameters, layout stabilization settings and the node label font.
# The string is passed verbatim, so it must not contain Python comments.
net.set_options("""
{
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -80000,
      "centralGravity": 0.5,
      "springLength": 75,
      "springConstant": 0.05,
      "damping": 0.09,
      "avoidOverlap": 0.5
    },
    "maxVelocity": 100,
    "minVelocity": 0.1,
    "solver": "barnesHut",
    "timestep": 0.3,
    "stabilization": {
        "enabled": true,
        "iterations": 500,
        "updateInterval": 10,
        "onlyDynamicEdges": false,
        "fit": true
    }
  },
  "nodes": {
    "font": {
      "size": 30,
      "color": "white"
    }
  }
}
""")





# Write the network to an HTML file in the working directory.
# NOTE(review): the file name is spelled 'filterd' (sic); left unchanged here
# because other tooling may refer to it -- confirm before renaming.
net.write_html('filterd_entity_network.html')

HpGGPPS, mutant fungi, taxa-4(5, cloned, functional role, DXS expression, IdsA, stationary phase, increased cell apoptosis, SdGGPPS1, overexpression of GGPPS, Rac1 most activity, GGPPS paralogues, Jatropha, FPPS, GGPPS, terpenoid pheromone precursors, bisphosphonates, GGPPS transcription, cell viability, tanshinone biosynthesis pathway, simvastatin effects, P1A peptide sequences, Plasmodium-specific, (quantitative real time-PCR, gene expression analysis, heterologous complementation, carotenoid accumulation, baccatin III accumulation, genome, geranylgeranyl pyrophosphate synthase gene, idiopathic pulmonary fibrosis, AACT1 expression, overexpression, matrix mineralization, proteins prenylation, HPT1, expression, isoprenoids, prevention of metabolic adverse effects of olanzapine, adipogenesis, CAS, photosynthesis-related metabolites, cigarette smoke induced pulmonary disease, carotenoid biosynthetic pathway, Microbial heterologous expression, metalloenzyme, polycistronic expression const

In [3]:
from IPython.display import Markdown

def trim_text(text, max_length):
    """Shorten `text` to roughly `max_length` characters on a space boundary.

    Text that already fits is returned unchanged. Otherwise the first
    `max_length` characters are taken, everything after the last space in that
    prefix is dropped (so no word is cut in half), and "..." is appended. If
    the prefix contains no space it is kept whole.
    """
    # Nothing to do when the text already fits.
    if len(text) <= max_length:
        return text

    prefix = text[:max_length]
    # rsplit with maxsplit=1 separates the (possibly partial) trailing word.
    kept = prefix.rsplit(' ', 1)[0]
    return kept + "..."
    
# Cap the node list at a fixed number of characters (not tokens) before it is
# placed in the prompt.
cut_off_chunk_size = 30000
trimmed_node_names_text = trim_text(node_names_text, cut_off_chunk_size)

# Construct the prompt from the *trimmed* term list, so the cap above actually
# takes effect. Separators are inserted between the file name, the keyword and
# the term list so they do not run together in the text the model sees.
# (An earlier, generic variant of this prompt asked the model to "categorize
# them and write a summary report" instead of the b-carotene question.)
prompt = (
    f"These are the terms related to {filepath} {keyword}, "
    "I am producing b-carotene in Y. lipolytica, but I have trouble expressing GGPPS, "
    "given these information, write a summary report and give me some suggestions: "
    f"{trimmed_node_names_text}"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
# Render the chat messages to a single prompt string, ending with the marker
# that tells the model to start its reply.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

# Pass the attention mask explicitly rather than letting generate() infer it.
generated_ids = model.generate(
    model_inputs.input_ids,
    attention_mask=model_inputs.attention_mask,
    max_new_tokens=10000
)
# generate() returns prompt + continuation; drop the prompt tokens from each
# sequence so only the newly generated reply is decoded.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response1 = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
display(Markdown(response1))

Summary:

The provided terms relate to various aspects of a research project focused on enhancing the production of b-carotene in Yarrowia lipolytica by manipulating the expression of geranylgeranyl pyrophosphate synthase (GGPPS). GGPPS is a key enzyme in the biosynthesis of isoprenoids, which includes carotenoids like b-carotene. Several strategies are discussed, such as overexpression of GGPPS, manipulation of other enzymes in the metabolic pathway, and the use of different organisms or mutants.

Some highlights from the information include:

1. The study involves comparing wild-type and mutant fungi, as well as GGPPS paralogues from Jatropha and other sources, for optimal GGPPS functionality.
2. Overexpression of GGPPS has been shown to impact cell viability, stationary phase, and potentially increase apoptosis in some cases.
3. The balance between FPP and GGPP is crucial for terpenoid biosynthesis, including the production of carotenoids and paclitaxel, an important anticancer drug.
4. GGPPS expression levels are interconnected with the expression of other genes in the mevalonate pathway, such as DXS and HMGR, and enzymes involved in terpenoid pheromone precursors.
5. Different environmental factors, chemical treatments, and signaling pathways (e.g., MAPK, RhoA/Rock, and ERK1/2) can modulate GGPPS expression and activity.
6. Heterologous expression of GGPPS in Y. lipolytica aims to enhance carotenoid accumulation, similar to approaches used in other organisms like E. coli and plants.
7. The regulation of GGPPS expression can affect various biological processes, including cell proliferation, differentiation, stress resistance, and hormone signaling.
8. GGPPS expression is also linked to the biosynthesis of tocopherols, flavonoids, and other bioactive compounds, as well as plant defense mechanisms against pathogens and environmental stresses.
9. GGPPS activity can be influenced by inhibitors, such as bisphosphonates, which can have implications in metabolic disorders, osteoporosis, and cancer treatments.
10. The study explores the potential therapeutic applications of GGPPS manipulation, including renal angiomyolipomas, inflammation, and neurodegenerative diseases.

Suggestions:

1. Perform gene expression analysis using techniques like qRT-PCR to quantify GGPPS expression levels under different conditions.
2. Investigate the effect of overexpression of GGPPS paralogues to determine if they have distinct functional roles or if they compensate for each other's activity.
3. Study the interaction between GGPPS and upstream/downstream enzymes in the isoprenoid pathway to optimize the biosynthesis of b-carotene.
4. Evaluate the impact of environmental cues and signaling molecules on GGPPS expression and activity to identify potential regulators.
5. Explore the use of inducible promoters to control GGPPS expression dynamically, allowing for fine-tuning of carotenoid production.
6. Consider the effect of GGPPS overexpression on cellular homeostasis, cell viability, and potential toxicities to ensure sustainable production.
7. Investigate the potential of combining GGPPS overexpression with metabolic engineering of other pathways to further enhance b-carotene synthesis.
8. Conduct in vivo studies to assess the effectiveness of GGPPS manipulation in increasing b-carotene levels in the context of whole organisms or tissues.
9. Collaborate with computational biology and bioinformatics experts to analyze gene co-expression networks and predict potential regulatory elements of GGPPS.
10. Monitor the stability and long-term effects of GGPPS overexpression, as well as potential off-target effects on other cellular processes.