This notebook is module 1 of NEKO.

It does the following steps:
1. Search PubMed and save an article list.
2. IUse LLM to identify causal relationships between knowledge entities.
3. Remove repetitive entities.
4. Plot all entities on a knowledge graph using Pyvis.

To use this notebook, you need to:

- Input your OpenAI API key.
- Input your Email to use NCBI Entrez PubMed search API.
- Set the maximum number of search results.
- Input your search keyword.
- Modify the prompt according to your needs.


In [1]:
from openai import OpenAI
# Set your OpenAI API key
client = OpenAI(api_key='your API key here') #******You can obtain your key from OpenAI

In [2]:
# Search PubMed and get article abstracts
from Bio import Entrez
import pandas as pd

# ******Define your email to use with NCBI Entrez
Entrez.email = "your_email@sample.com"
Entrez.api_key = "My_api_key"
#****** Add your search keyword here ******
keyword = "E coli fermentation mutation"


def search_pubmed(keyword):
    
    # Adjust the search term to focus on abstracts
    search_term = f"{keyword}[Abstract]"
    #****** You can modify retmax to change maximum number of search results
    handle = Entrez.esearch(db="pubmed", term=search_term, retmax=200) #******
    record = Entrez.read(handle)
    handle.close()
    # Get the list of Ids returned by the search
    id_list = record["IdList"]
    return id_list

def fetch_details(id_list):
    ids = ','.join(id_list)
    handle = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
    records = Entrez.read(handle)
    handle.close()

    # Create a list to hold our article details
    articles = []

    for pubmed_article in records['PubmedArticle']:
        article = {}
        article_data = pubmed_article['MedlineCitation']['Article']
        article['Title'] = article_data.get('ArticleTitle')
        
        # Directly output the abstract
        abstract_text = article_data.get('Abstract', {}).get('AbstractText', [])
        if isinstance(abstract_text, list):
            abstract_text = ' '.join(abstract_text)
        article['Abstract'] = abstract_text

        article['Journal'] = article_data.get('Journal', {}).get('Title')

        articles.append(article)

    return articles

id_list = search_pubmed(keyword)
articles = fetch_details(id_list)

# Convert our list of articles to a DataFrame
df = pd.DataFrame(articles)

# Saving the DataFrame to an Excel file
excel_filename = keyword+"_pubmed_search_results.xlsx"
df.to_excel(excel_filename, index=False)

print(f"Saved search results to {excel_filename}")


Saved search results to E coli fermentation mutation_pubmed_search_results.xlsx


In [3]:
import pandas as pd
def ask_a_follow_up_question(abstract, question):
    prompt_text = question + " " + str(abstract)
    #****** you can modify the system prompt for your specific needs ******
    messages = [{"role": "system", "content": "This GPT is specialized for analyzing scientific paper abstracts, focusing on identifying specific entities related to biological studies, such as performance, species, genes, methods of genetic engineering, enzymes, proteins, and bioprocess conditions (e.g., growth conditions), and determining causal relationships between them. It outputs all possible combinations of causal relationships among identified entities in structured pairs. The output strictly follows the format: (Entity A ,Entity B), with no additional text."},
                {"role": "user", "content": prompt_text}]

    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",  # Use the appropriate model name
        messages=messages,
        max_tokens=2000
    )
    return response.choices[0].message.content.strip()

# Read the Excel file
df = pd.read_excel(excel_filename)

# Define your question, we already set the system prompt so left blank here
question = " "

# If the process stop due to internet connection problems, you can restart from where it stop
start_row = 0  # Adjusted to start from row number xxx
for i in range(start_row - 1, len(df)):  
    row = df.iloc[i]
    abstract = row['Abstract']
    response = ask_a_follow_up_question(abstract, question)
    df.at[i, 'Answer to Question 2'] = response

    # Optional: Print the response and progress to monitor execution
    print(f"Row {i+1} Response: {response}")
    progress = ((i + 1 - (start_row - 1)) / (len(df) - (start_row - 1))) * 100
    print(f"Progress: {progress:.2f}% completed\n")

# Save the DataFrame with new responses back to an Excel file
output_file_path = 'updated(GPT-4)_'+keyword+'_causal.xlsx' 
df.to_excel(output_file_path, index=False)



Row 0 Response: (crp* mutation, Disabled CCR system)  
(crp* mutation, Simultaneous consumption of glucose and xylose)  
(crp* mutant strain, Deregulated CCR system)  
(crp* mutant strain, Routed glucose to C5 carbon utilization pathways)  
(crp* mutant strain, Supported de novo nucleotide synthesis)  
(crp* mutant strain, Supported energy production)  
(crp* mutation, Contributed to slower growth due to overflow metabolism)  
(crp* mutant strain, Beneficial for chemical production utilizing C5 and C6 substrates)  
(Escherichia coli, Struggles with efficient utilization of hexose and pentose sugars due to CCR)  
(Hydrolyzed lignocellulosic biomass, Ideal feedstock for biofuels and commodity chemicals)  
(Escherichia coli BL21 star (DE3) with crp* chromosomal mutant, Enables co-utilization of glucose and xylose)  
(Escherichia coli strain with mutated Crp*, Enables co-utilization of multiple sugar sources)  
(Disabled CCR system, Enables simultaneous sugar consumption in fed-batch ferme

In [4]:
# remove repeat words
import pandas as pd
import re

# Load an Excel file
df = pd.read_excel(output_file_path, engine='openpyxl')

# Fill NaN values in 'Response to New Question' column with zero
df['Answer to Question 2'] = df['Answer to Question 2'].fillna(0)

# Convert the column values to strings (to ensure compatibility with re.findall)
column_values = df['Answer to Question 2'].astype(str).tolist()

# Initialize an empty list to hold entities
entities = []

# Regular expression to match the pattern (entity A, entity B)
pattern = r'\(([^,]+), ([^\)]+)\)'

# Iterate over each cell in the column
for value in column_values:
    # Find all matches of the pattern in the cell
    matches = re.findall(pattern, value)
    # For each match, extend the entities list with the extracted entities
    for match in matches:
        entities.extend(match)  # This adds both entity A and entity B to the list

# Remove duplicates if necessary
entities = list(dict.fromkeys(entities))

# Join the entities with commas
entities_string = ', '.join(entities)

print(entities_string)

import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load your Excel file
file_path = output_file_path
df = pd.read_excel(file_path, engine='openpyxl')

# Assuming you have a list of entities and their embeddings already
entities = [entity.strip() for entity in entities_string.split(',')]
t2vmodel = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = t2vmodel.encode(entities)

# Identify similar phrases and store them in a dictionary
similar_phrases = {}

# Calculate total iterations for progress tracking
total_iterations = sum(range(len(entities)))

# Initialize a counter to track progress
current_iteration = 0

for i in range(len(entities)):
    for j in range(i + 1, len(entities)):
        similarity = util.pytorch_cos_sim(embeddings[i], embeddings[j])
        if similarity.item() > 0.8:
            # Assuming entities[i] is the first phrase and entities[j] is the similar one
            similar_phrases[entities[j]] = entities[i]

        # Update the current iteration counter after each inner loop iteration
        current_iteration += 1

    # Print the percentage completed
    percentage_completed = (current_iteration / total_iterations) * 100
    print(f"Progress: {percentage_completed:.2f}%")

# Note: Printing progress in the inner loop might slow down your code execution,
# especially if 'entities' is very large. You might want to update the progress
# less frequently, for example, only after each completion of the outer loop.


# Specify the column you want to modify
specific_column = 'Answer to Question 2'

# Calculate total iterations for progress tracking (only for the specific column)
total_rows = len(df)
current_iteration = 0

# Iterate through the specific column to substitute similar phrases
for index, row in df.iterrows():
    current_iteration += 1
    # Print progress every 100 rows to avoid performance degradation
    if current_iteration % 100 == 0 or current_iteration == total_rows:
        progress_percentage = (current_iteration / total_rows) * 100
        print(f"Progress: {progress_percentage:.2f}% complete.")
    
    cell_value = str(row[specific_column])
    for similar, original in similar_phrases.items():
        # Check if the phrase contains 'Yarrowia', if so, skip substitution
        if 'Yarrowia' in cell_value or 'Yarrowia' in similar:
            continue
        if similar in cell_value:
            # Substitute similar phrase with the original phrase, ignoring errors if not found
            try:
                df.at[index, specific_column] = cell_value.replace(similar, original)
            except Exception as e:
                print(f"Error substituting phrase: {e}")
                continue

# Save the modified Excel file
modified_file_path = 'modified_' + file_path
df.to_excel(modified_file_path, index=False, engine='openpyxl')

print("Excel file has been modified and saved as:", modified_file_path)


Escherichia coli, multidrug-resistance (MDR, urinary tract infections (UTIs, high-risk clones, therapeutic management, cUTI etiology, uncomplicated infections (uUTIs, E. coli phylogroup B2, cUTIs, UPEC status, chuA gene, fyuA gene, vat gene, yfcV gene, MDR, E. coli isolates, fluoroquinolone resistance (FQR), E. coli, B2-CC131, fluoroquinolone resistance (FQR, B2-ST1193 (CH14-64), D-ST69 (CH35-27), D-ST405 (CH37-27), B2-ST429 (CH40-20), genetic diversity, CC131, blaCTX-M-15, CC131 isolates, gyrA p.S83L mutation, gyrA p.D87N mutation, parC p.S80I mutation, parC p.E84V mutation, parE p.I529L mutation, IncF plasmids, CC131 genomes, non-lactose fermenting screening, high-risk clone surveillance, rfbO25b gene, fliC_H4 gene, fliC_H5 gene, O25b:H4-B2-ST9126-CC131, E. coli CC131, community uUTIs, E. coli ST1193, MDR clones eradication, antibiotic use optimization, Modification of 5'UTR sequence, Altered expression of CASPON<sup>TM</sup>-tumour necrosis factor α, Expression enhancer element, Hig

In [None]:
from pyvis.network import Network
import pandas as pd
import re

# Load the Excel file
df = pd.read_excel(modified_file_path, engine='openpyxl')

# Column containing entity pairs
entity_pairs_column = 'Answer to Question 2'
# Column containing the source of these pairs
source_column = 'Title'  # Replace 'SourceColumnName' with the actual name of the source column

# Initialize the network
net = Network(height="2160px", width="100%", bgcolor="#222222", font_color="white")

# Regular expression to match the pattern (entity A, entity B)
pattern = r'\(([^,]+), ([^\)]+)\)'

# Iterate over the DataFrame rows to extract entity pairs and their sources
for _, row in df.iterrows():
    value = row[entity_pairs_column]
    source = row[source_column]  # Extract source for each pair

    matches = re.findall(pattern, value)
    for entity_a, entity_b in matches:
        # Directly add nodes for each entity. pyvis will automatically ignore duplicates.
        net.add_node(entity_a, label=entity_a)
        net.add_node(entity_b, label=entity_b)
        
        # Add an edge between the entities with the source as the label
        net.add_edge(entity_a, entity_b, title=source)

# Set options for the network
net.set_options("""
{
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -80000,
      "centralGravity": 0.5,
      "springLength": 75,
      "springConstant": 0.05,
      "damping": 0.09,
      "avoidOverlap": 0.5
    },
    "maxVelocity": 100,
    "minVelocity": 0.1,
    "solver": "barnesHut",
    "timestep": 0.3,
    "stabilization": {
        "enabled": true,
        "iterations": 500,
        "updateInterval": 10,
        "onlyDynamicEdges": false,
        "fit": true
    }
  },
  "nodes": {
    "font": {
      "size": 30,
      "color": "white"
    }
  }
}
""")

# Save and show the network
net.write_html(keyword+'(GPT-4)_entity_network.html')
