In [1]:
import pandas as pd

data = pd.read_excel("./resources/de_jong_et_al_2025_supp_mat.xlsx", sheet_name="Table S5")

In [2]:
data.columns

Index(['cell_status', 'annotation_coarse', 'annotation_granular',
       'top_sig_enriched_pathways', 'sig_upreglated_top10'],
      dtype='object')

In [3]:
data[0:3]

Unnamed: 0,cell_status,annotation_coarse,annotation_granular,top_sig_enriched_pathways,sig_upreglated_top10
0,Malignant,AC-gliosis-like,AC-gliosis-like 1,"['HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION',...","['POSTN', 'JPH1', 'EYA4', 'RTN1', 'LAMA2', 'AC..."
1,Malignant,AC-gliosis-like,AC-gliosis-like 2,"['GOBP_WOUND_HEALING', 'HALLMARK_EPITHELIAL_ME...","['CFAP54', 'IGFBP7', 'VOPP1', 'AQP4', 'LANCL2'..."
2,Malignant,AC-gliosis-like,AC-gliosis-like 3,"['HALLMARK_TNFA_SIGNALING_VIA_NFKB', 'GOBP_MON...","['LAMA2', 'AC012405.1', 'COL8A1', 'AC007402.1'..."


In [4]:
data['plain_query'] = ''
data['contextual_query'] = ''
data['cell_status_sub'] = data['cell_status'].replace('TME', 'tumour microenvironment')
data['annotation_coarse_sub'] = data['annotation_coarse'].replace('AC', 'Astrocyte', regex=True)
out = data.to_dict('records')

In [5]:
#gene_list = ['CFAP54', 'IGFBP7', 'VOPP1', 'AQP4', 'LANCL2', 'AC074351.1', 'DTHD1', 'AC012405.1', 'RTN1', 'MTRNR2L12']
#annotated_type = 'astrocyte precursor'

for o in out:

    o['contextual_query'] = ' '.join([f"The following is a list of genes enriched in {o['cell_status_sub']} cells from a",
                             "(human) patient with glioblastoma ",
        f"annotated as {o['annotation_coarse_sub']}. ",
        "What might this list say about the state, properties and interactions of this cell type? ",
        "Do not limit your answers to the cancer literature, ",
        "include normal cell-cell interactions, development and cross-cutting metabolic states. ",
        f"Gene list: {o['sig_upreglated_top10']}"])

    o['plain_query'] = "What might the following enriched gene lists say about the type, " \
              f"properties and cell-cell interactions of the cells the list was derived from { o['sig_upreglated_top10'] }"


In [6]:
out[0]

{'cell_status': 'Malignant',
 'annotation_coarse': 'AC-gliosis-like',
 'annotation_granular': 'AC-gliosis-like 1',
 'top_sig_enriched_pathways': "['HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION', 'GOBP_AMEBOIDAL_TYPE_CELL_MIGRATION', 'GOBP_TISSUE_MIGRATION', 'GOBP_REGULATION_OF_EPITHELIAL_CELL_MIGRATION', 'GOBP_ENDOTHELIAL_CELL_MIGRATION']",
 'sig_upreglated_top10': "['POSTN', 'JPH1', 'EYA4', 'RTN1', 'LAMA2', 'AC092957.1', 'TNC', 'IGFBP7', 'COL23A1', 'NAMPT']",
 'plain_query': "What might the following enriched gene lists say about the type, properties and cell-cell interactions of the cells the list was derived from ['POSTN', 'JPH1', 'EYA4', 'RTN1', 'LAMA2', 'AC092957.1', 'TNC', 'IGFBP7', 'COL23A1', 'NAMPT']",
 'contextual_query': "The following is a list of genes enriched in Malignant cells from a (human) patient with glioblastoma  annotated as Astrocyte-gliosis-like.  What might this list say about the state, properties and interactions of this cell type?  Do not limit your answers to 

In [7]:
import requests
import concurrent.futures
import os

# --- API Setup ---
key = os.getenv("PERPLEXITY_API_KEY")
url = "https://api.perplexity.ai/chat/completions"
headers = {
    "accept": "application/json",
    "authorization": f"Bearer {key}",
    "content-type": "application/json"
}

base_payload = {
    "model": "sonar-deep-research",
    "return_citations": True,
        "search_domain_filter": [
            "pubmed.ncbi.nlm.nih.gov",
            "ncbi.nlm.nih.gov/pmc/",
            "sciencedirect.com",
            "nature.com",
            "cell.com",
            "frontiersin.org",
            "journals.plos.org",
            "wikipedia.org",
        ],
    "messages": [
        {"role": "system", "content": "You are an expert biologist. Your answers must be based on primary scientific literature and major reviews from peer-reviewed sources."},
        {"role": "user", "content": ""}
    ]
}

In [8]:
import time

def query_perplexity(o):
    local_payload = base_payload.copy()
    
    # Plain query
    local_payload['messages'][1]['content'] = o['plain_query']
    try:
        plain_resp = requests.post(url, headers=headers, json=local_payload).json()
    except Exception as e:
        plain_resp = {"error": str(e)}

    # Add a 1-second delay between the two requests
    time.sleep(1) 


    # Contextual query
    local_payload['messages'][1]['content'] = o['contextual_query']
    try:
        contextual_resp = requests.post(url, headers=headers, json=local_payload).json()
    except Exception as e:
        contextual_resp = {"error": str(e)}
    
    # Store in the object
    o['plain_response'] = plain_resp
    o['contextual_response'] = contextual_resp

    return o

In [9]:
# Use a ThreadPoolExecutor to run the query function on all items in the 'out' list
# This will run multiple requests at the same time, making the process much faster.

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    # The map function applies 'query_perplexity' to each item in 'out'
    # and returns the results in the same order.
    print("Starting API calls...")
    results = list(executor.map(query_perplexity, out))
    print("All API calls completed.")

Starting API calls...
All API calls completed.


In [10]:
results[0]

{'cell_status': 'Malignant',
 'annotation_coarse': 'AC-gliosis-like',
 'annotation_granular': 'AC-gliosis-like 1',
 'top_sig_enriched_pathways': "['HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION', 'GOBP_AMEBOIDAL_TYPE_CELL_MIGRATION', 'GOBP_TISSUE_MIGRATION', 'GOBP_REGULATION_OF_EPITHELIAL_CELL_MIGRATION', 'GOBP_ENDOTHELIAL_CELL_MIGRATION']",
 'sig_upreglated_top10': "['POSTN', 'JPH1', 'EYA4', 'RTN1', 'LAMA2', 'AC092957.1', 'TNC', 'IGFBP7', 'COL23A1', 'NAMPT']",
 'plain_query': "What might the following enriched gene lists say about the type, properties and cell-cell interactions of the cells the list was derived from ['POSTN', 'JPH1', 'EYA4', 'RTN1', 'LAMA2', 'AC092957.1', 'TNC', 'IGFBP7', 'COL23A1', 'NAMPT']",
 'contextual_query': "The following is a list of genes enriched in Malignant cells from a (human) patient with glioblastoma  annotated as Astrocyte-gliosis-like.  What might this list say about the state, properties and interactions of this cell type?  Do not limit your answers to 

In [16]:
def gen_bib(citations):
    """
    Corrected function to format a list of citation URLs into a Markdown list.
    """
    if not citations:
        return "## References\n\nNo citations provided."
        
    out = ['\n\n## References\n']
    # The citations are a list of URL strings, so we iterate through them directly.
    for i, url in enumerate(citations):
        out.append(f"- [{i+1}] {url}") # Format each URL as a list item
        
    return '\n'.join(out)

In [17]:
def rep(row, typ):
    """Creates one section of a report (plain or contextual)."""
    response_data = row.get(f'{typ}_response', {})
    
    # Check for errors first
    if "error" in response_data or 'choices' not in response_data:
        error_message = response_data.get("error", "Unknown error: Response format is incorrect.")
        return f"## {typ.capitalize()} Query Report: {row['annotation_granular']}\n\n**Error:**\n```\n{error_message}\n```"

    # Safely get content and citations
    content = response_data['choices'][0]['message'].get('content', 'No content found.')
    citations_list = response_data.get('citations', [])
    
    # Build the report section
    title = f"## {typ.capitalize()} Query Report: {row['annotation_granular']}"
    query_text = f"**Query:**\n> {row[f'{typ}_query']}"
    bibliography = gen_bib(citations_list)
    
    return "\n\n".join([title, query_text, "**Response:**\n" + content, bibliography])

In [18]:
def generate_report(row):
    """Generates a complete, formatted Markdown report for a single row."""
    # Combine the plain and contextual sections into one report
    plain_section = rep(row, typ='plain')
    contextual_section = rep(row, typ='contextual')
    
    main_title = f"# Full Report for: {row['annotation_granular']}"
    return f"{main_title}\n\n---\n\n{plain_section}\n\n---\n\n{contextual_section}"


In [25]:
def generate_and_save_all_reports(records, directory_path):
    """
    Generates and saves separate plain and contextual reports for each record.
    """
    # Create the target directory if it doesn't exist
    os.makedirs(directory_path, exist_ok=True)
    print(f"Saving separate reports to: {directory_path}")

    for r in records:
        # --- Save the Plain Report ---
        plain_filename = f"{r['annotation_granular'].replace(' ', '_')}_plain.md"
        plain_path = os.path.join(directory_path, plain_filename)
        plain_content = rep(r, typ='plain') # Get only the plain section
        
        with open(plain_path, 'w', encoding='utf-8') as file:
            file.write(plain_content)

        # --- Save the Contextual Report ---
        contextual_filename = f"{r['annotation_granular'].replace(' ', '_')}_contextual.md"
        contextual_path = os.path.join(directory_path, contextual_filename)
        contextual_content = rep(r, typ='contextual') # Get only the contextual section

        with open(contextual_path, 'w', encoding='utf-8') as file:
            file.write(contextual_content)
            
    print("All separate reports have been saved successfully.")

In [26]:
# testing a single report first to see the output
print("--- Generating a single report preview ---")
single_report = generate_report(results[0])
print(single_report)
print("------------------------------------------\n")

--- Generating a single report preview ---
# Full Report for: AC-gliosis-like 1

---

## Plain Query Report: AC-gliosis-like 1

**Query:**
> What might the following enriched gene lists say about the type, properties and cell-cell interactions of the cells the list was derived from ['POSTN', 'JPH1', 'EYA4', 'RTN1', 'LAMA2', 'AC092957.1', 'TNC', 'IGFBP7', 'COL23A1', 'NAMPT']

**Response:**
<think>
We are given a list of genes: ['CFAP54', 'IGFBP7', 'VOPP1', 'AQP4', 'LANCL2', 'AC074351.1', 'DTHD1', 'AC012405.1', 'RTN1', 'MTRNR2L12'].

We are to analyze what this enriched gene list might say about the type, properties, and cell-cell interactions of the cells from which the list was derived.

### Step 1: Understand the function of each gene

We have several genes. We will use the provided search results to understand each gene's function and relevance.

1. **CFAP54**: From search result [1], CFAP54 is required for proper ciliary motility and assembly. Mutations in CFAP54 lead to defects in 

In [28]:
output_directory = './output/deepsearch_caroline/'

# Call the function to generate and save all reports to the new directory
generate_and_save_all_reports(results, output_directory)

Saving separate reports to: ./output/deepsearch_caroline/
All separate reports have been saved successfully.
