In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3,4,5,6,7"

from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda"  # device that tokenized inputs are moved to before generation

# Local checkpoint directory shared by the model and its tokenizer.
model_path = "/storage1/fs1/yinjie.tang/Active/Shawn_Xiao/Qwen1.5-72B-Chat"

# device_map="auto" lets the library place the weights across the GPUs made
# visible through CUDA_VISIBLE_DEVICES; torch_dtype="auto" uses the dtype
# recorded in the checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

#Qwen1.5-72B-Chat-GPTQ-Int4
#Qwen1.5-14B-Chat

Loading checkpoint shards:   0%|          | 0/38 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
from pyvis.network import Network
import pandas as pd
import re
import networkx as nx

# Load the Excel file
filepath = 'modified_updated(Qwen1.5 14b)_Yarrowia_pdf_causal.xlsx'
df = pd.read_excel(filepath, engine='openpyxl')

# Initialize NetworkX Graph
G = nx.Graph()

# Substrings that disqualify an entity pair when found in either entity name,
# e.g. ['Synechocystis', 'Cyanobacteria', 'cyanobacteria'].
words_to_exclude = []

# Regular expression to match the pattern (entity A, entity B)
pattern = r'\(([^,]+), ([^\)]+)\)'

# Walk the answer column (alternative column: 'Response to New Question')
# together with the title each answer came from.
for value, source in zip(df['Answer to Question 2'], df['Title']):
    # Empty spreadsheet cells are read as float NaN; re.findall would raise
    # TypeError on them, so only string answers are parsed.
    if not isinstance(value, str):
        continue

    for entity_a, entity_b in re.findall(pattern, value):
        # Skip the pair if any excluded word occurs in either entity name.
        if any(word in entity_a or word in entity_b for word in words_to_exclude):
            continue
        G.add_node(entity_a, label=entity_a)
        G.add_node(entity_b, label=entity_b)
        # nx.Graph keeps a single edge per pair, so when a pair recurs the
        # edge title is replaced by the most recent source.
        G.add_edge(entity_a, entity_b, title=source)

def search_network(graph, keyword, depth=1):
    """Return the subgraph around every node whose label contains ``keyword``.

    Matching is a case-insensitive substring test against each node's
    ``label`` attribute. Nodes without a ``label`` are matched on the node
    identifier itself, and non-string labels are converted with ``str``.

    Args:
        graph: Graph object providing ``nodes(data=True)``, ``neighbors(node)``
            and ``subgraph(nodes)``.
        keyword: Text to look for inside node labels.
        depth: Number of neighbor hops to add around the matching nodes;
            ``0`` keeps only the matches.

    Returns:
        The result of ``graph.subgraph`` for the matched nodes plus every node
        reachable from them within ``depth`` hops.
    """
    needle = keyword.lower()
    nodes_of_interest = {
        node
        for node, attr in graph.nodes(data=True)
        if needle in str(attr.get('label', node)).lower()
    }

    # Expand hop by hop from the newly reached nodes only; nodes collected in
    # earlier hops have already contributed their neighbors.
    frontier = set(nodes_of_interest)
    for _ in range(depth):
        reached = set()
        for node in frontier:
            reached.update(graph.neighbors(node))
        frontier = reached - nodes_of_interest
        if not frontier:
            break
        nodes_of_interest |= frontier

    return graph.subgraph(nodes_of_interest)

# Keyword whose neighborhood is extracted from the entity graph.
keyword = "transformation"  # Replace with your keyword
filtered_graph = search_network(G, keyword)

# Collect the names of the nodes that survived the filter.
node_names = [name for name in filtered_graph.nodes()]

# Flatten the names into one comma-separated string for the summarization prompt.
node_names_text = ", ".join(node_names)

print(node_names_text)

# Build the Pyvis view of the filtered graph on a dark canvas.
net = Network(
    height="2160px",
    width="100%",
    bgcolor="#222222",
    font_color="white",
)
net.from_nx(filtered_graph)

# vis.js options: Barnes-Hut physics settings and the node label font.
network_options = """
{
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -80000,
      "centralGravity": 0.5,
      "springLength": 75,
      "springConstant": 0.05,
      "damping": 0.09,
      "avoidOverlap": 0.5
    },
    "maxVelocity": 100,
    "minVelocity": 0.1,
    "solver": "barnesHut",
    "timestep": 0.3,
    "stabilization": {
        "enabled": true,
        "iterations": 500,
        "updateInterval": 10,
        "onlyDynamicEdges": false,
        "fit": true
    }
  },
  "nodes": {
    "font": {
      "size": 30,
      "color": "white"
    }
  }
}
"""
net.set_options(network_options)

# Write the interactive network to an HTML file.
net.write_html('filterd_entity_network.html')

Integrative transformation of Yarrowia lipolytica)5. (Dower WJ, pINA1054, pINA1055, ('steroids biotransformation', 'one-step protocol', Yarrowia transformation efficiency, Dimorphic yeast Yarrowia lipolytica, Transformation method, Transformation efficiency, 'electroporation procedure', DH10β E. coli, Beckerich JM, et al.', Yarrowia lipolytica one-step transformation, 'YNB casa', transformation of E. coli strains, 'Saccharomyces cerevisiae transformation methods', LiAc concentration, 'TAGs', cloning and transformation, methods, Agrobacterium tumefaciens, similar transformation efficiency)) (pINA1053, Transformation protocol, 'lipase', lactone transformation, lithium acetate procedure, Barth and Gaillardin (1996, fatty acid biotransformation, 'electroporation method', Tn3 transposon, transformations (b-oxidation, lactones, post-transformation growth, 'Pichia', genetic transformation, '2, dimorphic yeast, Saccharomyces cerevisiae and Yarrowia lipolytica). Some key entities mentioned are:

In [4]:
from IPython.display import Markdown

def trim_text(text, max_length):
    """Shorten ``text`` to roughly ``max_length`` characters.

    Text no longer than ``max_length`` is returned unchanged. Longer text is
    cut at ``max_length``, everything after the last space inside that cut is
    dropped so no word is split, and "..." is appended. When the cut contains
    no space it is kept whole.
    """
    if len(text) <= max_length:
        return text

    clipped = text[:max_length]
    head, space, _ = clipped.rpartition(' ')
    # rpartition yields an empty separator when there is no space to split on.
    kept = head if space else clipped
    return kept + "..."
    
# Cap the node list so the prompt stays within a bounded number of characters.
cut_off_chunk_size = 30000
trimmed_node_names_text = trim_text(node_names_text, cut_off_chunk_size)

# Construct the prompt from the trimmed node list; the untrimmed text would
# bypass the cap computed above. Explicit separators keep the file name, the
# keyword and the term list from fusing into one token run.
# Alternative task wording: ", categorize them and write a summary report."
prompt = (
    "These are the terms related to " + filepath + " " + keyword
    + ", I try to do yarrowia lipolytica strain engineering, given these information, write a experiment procedure: "
    + trimmed_node_names_text
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
# Render the chat messages into the model's prompt format, ending with the
# assistant turn marker so generation starts the reply.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

# Pass the attention mask produced by the tokenizer alongside the input ids
# instead of leaving generate() to infer it.
generated_ids = model.generate(
    model_inputs.input_ids,
    attention_mask=model_inputs.attention_mask,
    max_new_tokens=10000
)
# generate() returns prompt + continuation; strip the prompt tokens so only
# the newly generated part is decoded.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response1 = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
display(Markdown(response1))

Experiment Procedure for Integrative Transformation of Yarrowia lipolytica:

1. **Strain and Media Preparation**:
   - Start with a fresh culture of Yarrowia lipolytica strain, such as PO1g or W29, grown in YPD media (yeast extract, peptone, and dextrose) until mid-log phase.
   - Prepare minimal medium (YNB Casa) for post-transformation growth.

2. **Plasmid Construction and Linearization**:
   - Clone the desired gene or construct into an integrative vector like pINA1054 or pINA1055, which contains a selectable marker like hygromycin-resistance or ura3d4 defective allele.
   - Linearize the plasmid using restriction enzymes like ApaI or EcoRI to facilitate homologous recombination.

3. **Transformation Efficiency Optimization**:
   - Adjust transformation conditions to maximize efficiency, possibly using modified electroporation methods or lithium acetate procedures.
   - Optimize cell density, typically using stationary phase cells at a high cell density.

4. **Electroporation**:
   - Resuspend the cells in ice-cold electroporation buffer (IM buffer) without lysine.
   - Add the linearized plasmid DNA to the cell suspension, maintaining a low DNA concentration (∼2 μg).
   - Perform electroporation using a specific voltage (e.g., 1.5 kV) and pulse duration.
   - Incubate the cells in a recovery medium (YPD containing DMSO) for 1 hour at 30°C.

5. **Transformation Selection and Growth**:
   - Plate the transformed cells on selective media containing the appropriate antibiotic (e.g., hygromycin or nourseothricin) to select for successful transformants.
   - Incubate the plates at 30°C for 2-5 days to allow colony formation.

6. **Screening and Confirmation**:
   - Pick white colonies for further analysis.
   - Perform colony PCR to confirm the presence of the integrated gene.
   - Optionally, use Southern blotting or other molecular techniques to verify correct integration.

7. **Biological Characterization**:
   - Assess the functionality of the introduced gene through assays related to the desired trait (e.g., enzyme activity, metabolite production, or biotransformation capacity).
   - Monitor growth and transformation efficiency under different conditions to optimize the process.

This experimental procedure outlines the general steps for integrating a gene into Yarrowia lipolytica using electroporation and a linearized plasmid. Note that specific details, such as exact DNA concentrations, voltages, and recovery times, may need to be adjusted based on the laboratory's equipment and experience. Additionally, optimization of the transformation process might involve testing different transformation methods (e.g., LiAc/SS carrier DNA/PEG method, direct transformation/in vivo transposition, or Agrobacterium-mediated transformation) and parameters to achieve the highest efficiency.