In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# os.environ["USE_FLASH_ATTENTION"] = "1"

from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda" # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained(r"D:\Local LLM\models\Qwen\Qwen2.5\Qwen2.5-32B-Instruct-GPTQ-Int4",
    device_map="auto", torch_dtype="auto",attn_implementation="flash_attention_2",
)
tokenizer = AutoTokenizer.from_pretrained(r"D:\Local LLM\models\Qwen\Qwen2.5\Qwen2.5-32B-Instruct-GPTQ-Int4")

#Qwen1.5-72B-Chat-GPTQ-Int4
#Qwen1.5-14B-Chat



Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [2]:
from Bio import Entrez
import pandas as pd

# Define your email to use with NCBI Entrez
Entrez.email = "xiao.zhengyang@wustl.edu"

def search_pubmed(keyword):
    # Adjust the search term to focus on abstracts
    search_term = f"{keyword}[Abstract]"
    handle = Entrez.esearch(db="pubmed", term=search_term, retmax=1000)
    record = Entrez.read(handle)
    handle.close()
    # Get the list of Ids returned by the search
    id_list = record["IdList"]
    return id_list

def fetch_details(id_list):
    ids = ','.join(id_list)
    handle = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
    records = Entrez.read(handle)
    handle.close()

    # Create a list to hold our article details
    articles = []

    for pubmed_article in records['PubmedArticle']:
        article = {}
        article_data = pubmed_article['MedlineCitation']['Article']
        article['Title'] = article_data.get('ArticleTitle')
        
        # Directly output the abstract
        abstract_text = article_data.get('Abstract', {}).get('AbstractText', [])
        if isinstance(abstract_text, list):
            abstract_text = ' '.join(abstract_text)
        article['Abstract'] = abstract_text

        article['Journal'] = article_data.get('Journal', {}).get('Title')

        articles.append(article)

    return articles

def perform_search_and_fetch(keyword):
    id_list = search_pubmed(keyword)
    return fetch_details(id_list)

# Example usage: Performing two searches
keyword1 = "saccharomyces cerevisiae production gene expression"
keyword2 = "saccharomyces cerevisiae production gene deletion"
keyword = keyword1 + keyword2
# Fetch articles for both keywords
articles1 = perform_search_and_fetch(keyword1)
articles2 = perform_search_and_fetch(keyword2)

# Convert both lists of articles to DataFrames
df1 = pd.DataFrame(articles1)
df2 = pd.DataFrame(articles2)

# Add a column to differentiate the search terms in the final DataFrame
df1['SearchTerm'] = keyword1
df2['SearchTerm'] = keyword2

# Concatenate the DataFrames
combined_df = pd.concat([df1, df2], ignore_index=True)

# Save the combined DataFrame to an Excel file
excel_filename = keyword+"_pubmed_search_results.xlsx"
combined_df.to_excel(excel_filename, index=False)

print(f"Saved combined search results to {excel_filename}")


Saved combined search results to saccharomyces cerevisiae production gene expressionsaccharomyces cerevisiae production gene deletion_pubmed_search_results.xlsx


In [3]:
# Qwen reads abstract and identify knowledge

import pandas as pd
import os
import torch
import gc

# Assuming `tokenizer`, `model`, and `device` are already defined and initialized
# Example device initialization: device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to interact with the LLM API using the new method, now with customizable system prompts
def ask_questions(abstract, questions, system_prompts):
    responses = []
    for question, system_prompt in zip(questions, system_prompts):
        # Combine the question and abstract to form the prompt
        prompt_text = question + " " + str(abstract)
        
        # Prepare the messages for the new API, using a customizable system prompt
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt_text}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(device)
        
        # Generate response with the new API
        generated_ids = model.generate(
            model_inputs.input_ids,
            max_new_tokens=5000
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        
        # Decode the generated response
        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        responses.append(response)
    return responses

# Read the Excel file
file_path = excel_filename  # Replace with your file path
df = pd.read_excel(file_path)

questions = [" "]  # Placeholder question that won't be used

system_prompts = [
    "You are specialized for analyzing scientific paper abstracts, Extract entities and causal relationships from scientific paper abstracts. Focus on genes overexpression/deletion, rationale for expression/deletion, products, and metabolites related to expression/deletion. Output in (expression/deletion of gene xxx, consequence 1), (expression/deletion of gene xxx, consequence 2)... format with no additional text."
]

# Process each abstract and store the response
total_rows = len(df)
for i, row in df.iterrows():
    # Clear the console at the beginning of each iteration
    os.system('cls' if os.name == 'nt' else 'clear')

    # Since we're only asking one question now, directly get the response for the second (index 0) system prompt
    response = ask_questions(row['Abstract'], [questions[0]], [system_prompts[0]])[0]

    # Store the response in the DataFrame
    df.at[i, 'Answer to Question 2'] = response

    # Show the response
    print(f"Response for Row {i+1}:")
    print(f"Answer to Question 2: {response}")

    # Calculate and show the progress percentage
    progress = ((i + 1) / total_rows) * 100
    print(f"Progress: {progress:.2f}% completed")

# Save the updated DataFrame back to an Excel file
output_file_path = 'updated(Qwen2.5 32b)_'+keyword+'_causal.xlsx'  # Replace with your desired output file path
df.to_excel(output_file_path, index=False)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Response for Row 1:
Answer to Question 2: (deletion of gene Acptp2,3, reduced sexual spore production by 4.4 times), (deletion of gene Acptp2,3, reduced asexual spore production by 4.6 times), (deletion of gene Acptp2,3, decreased response to sorbitol-induced osmotic stress), (deletion of gene Acptp2,3, slower growth in high concentration sucrose medium), (deletion of gene Acptp2,3, slower growth in hydrogen peroxide), (deletion of gene Acptp2,3, slower growth in Congo red), (deletion of gene Acptp2,3, slower growth in SDS), (deletion of gene Acptp2,3, deeper colony color), (deletion of gene Acptp2,3, altered expression levels of stress-related genes), (deletion of gene Acptp2,3, altered expression levels of pigments-related genes)
Progress: 0.05% completed
Response for Row 2:
Answer to Question 2: (deletion of gene tri18, absence of type D trichothecene toxin epiroridin E), (deletion of gene tri18, reduction in pathogenicity of P. roridum towards pumpkin), (heterologous expression of 

In [4]:
# remove repeat words
import pandas as pd
import re

# Load an Excel file
df = pd.read_excel(output_file_path, engine='openpyxl')

# Fill NaN values in 'Response to New Question' column with zero
df['Answer to Question 2'] = df['Answer to Question 2'].fillna(0)

# Convert the column values to strings (to ensure compatibility with re.findall)
column_values = df['Answer to Question 2'].astype(str).tolist()

# Initialize an empty list to hold entities
entities = []

# Regular expression to match the pattern (entity A, entity B)
pattern = r'\(([^,]+), ([^\)]+)\)'

# Iterate over each cell in the column
for value in column_values:
    # Find all matches of the pattern in the cell
    matches = re.findall(pattern, value)
    # For each match, extend the entities list with the extracted entities
    for match in matches:
        entities.extend(match)  # This adds both entity A and entity B to the list

# Remove duplicates if necessary
entities = list(dict.fromkeys(entities))

# Join the entities with commas
entities_string = ', '.join(entities)

print(entities_string)

import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load your Excel file
file_path = output_file_path
df = pd.read_excel(file_path, engine='openpyxl')

# Assuming you have a list of entities and their embeddings already
entities = [entity.strip() for entity in entities_string.split(',')]
t2vmodel = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = t2vmodel.encode(entities)

# Identify similar phrases and store them in a dictionary
similar_phrases = {}

# Calculate total iterations for progress tracking
total_iterations = sum(range(len(entities)))

# Initialize a counter to track progress
current_iteration = 0

for i in range(len(entities)):
    for j in range(i + 1, len(entities)):
        similarity = util.pytorch_cos_sim(embeddings[i], embeddings[j])
        if similarity.item() > 0.8:
            # Assuming entities[i] is the first phrase and entities[j] is the similar one
            similar_phrases[entities[j]] = entities[i]

        # Update the current iteration counter after each inner loop iteration
        current_iteration += 1

    # Print the percentage completed
    percentage_completed = (current_iteration / total_iterations) * 100
    print(f"Progress: {percentage_completed:.2f}%")

# Note: Printing progress in the inner loop might slow down your code execution,
# especially if 'entities' is very large. You might want to update the progress
# less frequently, for example, only after each completion of the outer loop.


# Specify the column you want to modify
specific_column = 'Answer to Question 2'

# Calculate total iterations for progress tracking (only for the specific column)
total_rows = len(df)
current_iteration = 0

# Iterate through the specific column to substitute similar phrases
for index, row in df.iterrows():
    current_iteration += 1
    # Print progress every 100 rows to avoid performance degradation
    if current_iteration % 100 == 0 or current_iteration == total_rows:
        progress_percentage = (current_iteration / total_rows) * 100
        print(f"Progress: {progress_percentage:.2f}% complete.")
    
    cell_value = str(row[specific_column])
    for similar, original in similar_phrases.items():
        # Check if the phrase contains 'Yarrowia', if so, skip substitution
        if 'Yarrowia' in cell_value or 'Yarrowia' in similar:
            continue
        if similar in cell_value:
            # Substitute similar phrase with the original phrase, ignoring errors if not found
            try:
                df.at[index, specific_column] = cell_value.replace(similar, original)
            except Exception as e:
                print(f"Error substituting phrase: {e}")
                continue

# Save the modified Excel file
modified_file_path = 'modified_' + file_path
df.to_excel(modified_file_path, index=False, engine='openpyxl')

print("Excel file has been modified and saved as:", modified_file_path)


deletion of gene tri18, absence of type D trichothecene toxin epiroridin E, reduction in pathogenicity of P. roridum towards pumpkin, heterologous expression of tri3, toxin resistance in Saccharomyces cerevisiae, heterologous expression of tri17KR, expression of genes related to the uptake of phosphate, improved wastewater treatment performance, expression of genes related to the uptake of ammonium ions, overexpression of metabolism-related genes, altered metabolic pathways, overexpression of stress response genes, enhanced stress tolerance, overexpression of cell adhesion genes, increased cell attachment, overexpression of gene C3H15, lower chlorophyll content, reduced photosynthetic efficiency, accelerated senescence, altered mobilization of copper to seeds, altered mobilization of zinc to seeds, deletion of gene zwf1, change in protein content, change in enzymatic activity, deletion of gene gnd1, deletion of gene ald6, overexpression of gene POS5Δ17, enhanced reduced nicotinamide ad

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Progress: 0.01%
Progress: 0.03%
Progress: 0.04%
Progress: 0.06%
Progress: 0.07%
Progress: 0.09%
Progress: 0.10%
Progress: 0.11%
Progress: 0.13%
Progress: 0.14%
Progress: 0.16%
Progress: 0.17%
Progress: 0.18%
Progress: 0.20%
Progress: 0.21%
Progress: 0.23%
Progress: 0.24%
Progress: 0.26%
Progress: 0.27%
Progress: 0.28%
Progress: 0.30%
Progress: 0.31%
Progress: 0.33%
Progress: 0.34%
Progress: 0.36%
Progress: 0.37%
Progress: 0.38%
Progress: 0.40%
Progress: 0.41%
Progress: 0.43%
Progress: 0.44%
Progress: 0.45%
Progress: 0.47%
Progress: 0.48%
Progress: 0.50%
Progress: 0.51%
Progress: 0.53%
Progress: 0.54%
Progress: 0.55%
Progress: 0.57%
Progress: 0.58%
Progress: 0.60%
Progress: 0.61%
Progress: 0.63%
Progress: 0.64%
Progress: 0.65%
Progress: 0.67%
Progress: 0.68%
Progress: 0.70%
Progress: 0.71%
Progress: 0.72%
Progress: 0.74%
Progress: 0.75%
Progress: 0.77%
Progress: 0.78%
Progress: 0.80%
Progress: 0.81%
Progress: 0.82%
Progress: 0.84%
Progress: 0.85%
Progress: 0.87%
Progress: 0.88%
Progress