In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# os.environ["USE_FLASH_ATTENTION"] = "1"

from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda" # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained(r"D:\Local LLM\models\Qwen\Qwen2.5\Qwen2.5-32B-Instruct-GPTQ-Int4",
    device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(r"D:\Local LLM\models\Qwen\Qwen2.5\Qwen2.5-32B-Instruct-GPTQ-Int4")

#Qwen1.5-72B-Chat-GPTQ-Int4
#Qwen1.5-14B-Chat



Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [2]:
from pyvis.network import Network
import pandas as pd
import re
import networkx as nx

# Load the Excel file
filepath = 'modified_updated(Qwen2.5 32b)_saccharomyces cerevisiae production gene expressionsaccharomyces cerevisiae production gene deletion_causal.xlsx'
df = pd.read_excel(filepath, engine='openpyxl')

# Initialize NetworkX Graph
G = nx.Graph()

# Nodes to exclude
words_to_exclude = []

# Regular expression to match the pattern (entity A, entity B)
pattern = r'\(([^,]+), ([^\)]+)\)'

# Iterate over the DataFrame rows to extract entity pairs and their sources
for _, row in df.iterrows():
    value = row['Answer to Question 2']
    source = row['Title']  # Extract source for each pair

    matches = re.findall(pattern, value)
    for entity_a, entity_b in matches:
        # Check if any word to exclude is part of the entity names
        if not any(word in entity_a for word in words_to_exclude) and not any(word in entity_b for word in words_to_exclude):
            G.add_node(entity_a, label=entity_a)
            G.add_node(entity_b, label=entity_b)
            G.add_edge(entity_a, entity_b, title=source)

def search_network(graph, keywords, depth=1):
    # Ensure all keywords are lowercase for case-insensitive search
    keyword_list = [kw.lower() for kw in keywords]

    # Helper function to check if a node label contains all keywords
    def contains_all_keywords(label):
        return all(kw in label.lower() for kw in keyword_list)

    # Collect nodes that contain all keywords in their label
    nodes_of_interest = set()
    for node, attr in graph.nodes(data=True):
        if 'label' in attr and contains_all_keywords(attr['label']):
            nodes_of_interest.add(node)

    # Expand search to include neighbors up to the specified depth
    for _ in range(depth):
        neighbors = set()
        for node in nodes_of_interest:
            neighbors.update(nx.neighbors(graph, node))
        nodes_of_interest.update(neighbors)
    
    # Return a subgraph containing only relevant nodes and edges
    return graph.subgraph(nodes_of_interest).copy()

# Perform search with a list of keywords
word_combinations = ["ethanol","increase"]  # Replace with your keywords
filtered_graph = search_network(G, word_combinations)

# Extract node names from the filtered graph
node_names = list(filtered_graph.nodes())

# Prepare a simple text summary of node names
node_names_text = ", ".join(node_names)

# Now, `node_names_text` contains a clean, comma-separated list of node names, ready for summarization
print(node_names_text)

# Initialize Pyvis network with the filtered graph
net = Network(height="2160px", width="100%", bgcolor="#222222", font_color="white")
net.from_nx(filtered_graph)

# Continue with setting options and saving the network as before
net.set_options("""
{
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -80000,
      "centralGravity": 0.5,
      "springLength": 75,
      "springConstant": 0.05,
      "damping": 0.09,
      "avoidOverlap": 0.5
    },
    "maxVelocity": 100,
    "minVelocity": 0.1,
    "solver": "barnesHut",
    "timestep": 0.3,
    "stabilization": {
        "enabled": true,
        "iterations": 500,
        "updateInterval": 10,
        "onlyDynamicEdges": false,
        "fit": true
    }
  },
  "nodes": {
    "font": {
      "size": 30,
      "color": "white"
    }
  }
}
""")

# Save and show the network
net.write_html('filtered_entity_' + "_".join(word_combinations) + '_network.html')


overexpression of gene HST3, deletion of gene RPS4a, deletion of gene DSE2, loss of gene YKL222C, knockout of SDH subunit gene, no increase in autophagy activity upon ethanol stress, ethanol production increased, loss of gene YHL044W, increased yield of ethanol, loss of gene ARN1, overexpression of TAL2, engineering of subcellular trafficking, overexpression of genes involved in pyruvate metabolism, expression of gene HuPFK1, expression of gene pathways responsible for pyruvate-to-ethanol pathway, overexpression of gene HXS1, replacement of gene TDH3 with GDP1, expression of gene pathways responsible for pyruvate-to-tricarboxylic acid (TCA) pathway, overexpression of gene ENA5, deletion of gene ERG4, deletion of gene ERG3, turning down expression of pyc gene, increased specific ethanol production rate, deletion of gene PHO13, increase in ethanol productivity, deletion of gene SCP160, increase in ethanol production from cellobiose by 200%, deletion of ATG13, increase in ethanol producti

In [3]:
from IPython.display import Markdown

def trim_text(text, max_length):
    if len(text) > max_length:
        return text[:max_length].rsplit(' ', 1)[0] + "..."  # Trim to max_length, avoid cutting words in half
    else:
        return text
    
# Apply the trimming function to node_names_text
cut_off_chunk_size = 5000
trimmed_node_names_text = trim_text(node_names_text, cut_off_chunk_size)
keyword = ", ".join(word_combinations)

# Construct the prompt with the potentially trimmed node_names_text
prompt = "These are the terms related to " + filepath + keyword + ", categorize them and write a summary report.   " + trimmed_node_names_text

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=5000
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response1 = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
display(Markdown(response1))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


### Summary Report: Gene Modifications and Their Impact on Ethanol Production in Saccharomyces cerevisiae

#### Introduction
This report summarizes the effects of various gene modifications on ethanol production in Saccharomyces cerevisiae. The modifications include gene deletions, overexpressions, and other genetic interventions aimed at enhancing ethanol yield, productivity, and tolerance.

#### Gene Deletions
1. **Deletion of Genes Involved in Metabolism and Stress Response**
   - *RPS4a*, *DSE2*, *YKL222C*, *SDH subunit*, *YHL044W*, *ARN1*, *ERG4*, *ERG3*, *PHO13*, *SCP160*, *ATG13*, *Tcb1*, *PFK1*, *HapB*, *Bud27*, *isu1*, *znf1*, *RTT109*, *GRE3*, *pheA*, *CTS1*, *EGT2*, *SSK1*, *FPS1*, *B3GNT3*, *ACE2*, *Fpt1*, *ADY2*, *SRY1*, *HAP4-A*, *ira2*, *metJ*, *JEN1*, *ADH2*, *SPT15*, *RPN4*, *YHR087w*, *ERG5*, *CAT8*.
   - These deletions often led to increased ethanol production rates, improved ethanol yield, or enhanced tolerance to ethanol stress.

2. **Deletions Leading to Increased Ethanol Yield**
   - *YHL044W*, *ARN1*, *ERG4*, *ERG3*, *PHO13*, *SCP160*, *ATG13*, *GRE3*, *CTS1*, *EGT2*, *FPS1*, *B3GNT3*, *ACE2*, *Fpt1*, *ADY2*, *SRY1*, *HAP4-A*, *ira2*, *metJ*, *JEN1*, *ADH2*, *SPT15*, *RPN4*, *YHR087w*, *ERG5*, *CAT8*.
   - Specific deletions like *PHO13*, *SCP160*, and *FPS1* resulted in significant increases in ethanol production, particularly from glucose and xylose.

#### Gene Overexpressions
1. **Overexpression of Genes Involved in Metabolism and Stress Response**
   - *HST3*, *TAL2*, *genes involved in pyruvate metabolism*, *HuPFK1*, *pathways responsible for pyruvate-to-ethanol pathway*, *HXS1*, *ENA5*, *DAS1*, *GAPN*, *genes involved in reducing power production*, *HXK2S14A*, *QDR3*, *GLT-1*, *genes involved in glycolysis*, *PGM2*.
   - Overexpression of these genes often led to increased ethanol yield and productivity.

2. **Overexpressions Leading to Enhanced Ethanol Production**
   - *HST3*, *TAL2*, *genes involved in pyruvate metabolism*, *HuPFK1*, *pathways responsible for pyruvate-to-ethanol pathway*, *HXS1*, *ENA5*, *DAS1*, *GAPN*, *genes involved in reducing power production*, *HXK2S14A*, *QDR3*, *GLT-1*, *genes involved in glycolysis*, *PGM2*.
   - For example, overexpression of *HXS1* and *ENA5* significantly increased ethanol production rates.

#### Other Genetic Interventions
1. **Engineering of Subcellular Trafficking**
   - Engineering of subcellular trafficking mechanisms can enhance ethanol production by optimizing the localization and function of key metabolic enzymes.

2. **Introduction of External Enzymes and Amino Acid Substitutions**
   - Introduction of external enzymes and amino acid substitutions (e.g., *aro9 I544W*) can improve ethanol yield and tolerance.

3. **Expression of Pathways Responsible for Specific Functions**
   - Expression of pathways responsible for acetic acid stress, DNA repair, oxidative stress, and growth under ethanol stress can enhance overall ethanol production efficiency.

#### Summary of Key Findings
- **Increased Ethanol Production**: Several gene deletions and overexpressions led to increased ethanol production, particularly from glucose and xylose. Examples include the deletion of *PHO13*, *SCP160*, and *FPS1* and the overexpression of *HXS1* and *ENA5*.
- **Enhanced Ethanol Yield**: Modifying genes involved in metabolic pathways and stress responses often resulted in higher ethanol yields. Notable examples include the deletion of *GRE3* and *metJ* and the overexpression of *TAL2* and *DAS1*.
- **Improved Ethanol Tolerance**: Certain gene modifications, such as the deletion of *ERG4* and *ERG5*, and the overexpression of *QDR3*, enhanced yeast tolerance to ethanol stress, allowing for higher ethanol concentrations.

#### Conclusion
The genetic modifications described in this report demonstrate significant potential for improving ethanol production in Saccharomyces cerevisiae. By strategically deleting or overexpressing specific genes, researchers can optimize yeast strains for higher ethanol yields, productivity, and tolerance. Future research should focus on combining multiple beneficial modifications to achieve even greater improvements in ethanol production efficiency.

In [4]:
# 