In [1]:
import pandas as pd

data = pd.read_excel("./resources/de_jong_et_al_2025_supp_mat.xlsx", sheet_name="Table S5")

In [2]:
data.columns

Index(['cell_status', 'annotation_coarse', 'annotation_granular',
       'top_sig_enriched_pathways', 'sig_upreglated_top10'],
      dtype='object')

In [3]:
data[0:3]

Unnamed: 0,cell_status,annotation_coarse,annotation_granular,top_sig_enriched_pathways,sig_upreglated_top10
0,Malignant,AC-gliosis-like,AC-gliosis-like 1,"['HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION',...","['POSTN', 'JPH1', 'EYA4', 'RTN1', 'LAMA2', 'AC..."
1,Malignant,AC-gliosis-like,AC-gliosis-like 2,"['GOBP_WOUND_HEALING', 'HALLMARK_EPITHELIAL_ME...","['CFAP54', 'IGFBP7', 'VOPP1', 'AQP4', 'LANCL2'..."
2,Malignant,AC-gliosis-like,AC-gliosis-like 3,"['HALLMARK_TNFA_SIGNALING_VIA_NFKB', 'GOBP_MON...","['LAMA2', 'AC012405.1', 'COL8A1', 'AC007402.1'..."


In [4]:
data['plain_query'] = ''
data['contextual_query'] = ''
data['cell_status_sub'] = data['cell_status'].replace('TME', 'tumour microenvironment')
data['annotation_coarse_sub'] = data['annotation_coarse'].replace('AC', 'Astrocyte', regex=True)
out = data.to_dict('records')


In [5]:
#gene_list = ['CFAP54', 'IGFBP7', 'VOPP1', 'AQP4', 'LANCL2', 'AC074351.1', 'DTHD1', 'AC012405.1', 'RTN1', 'MTRNR2L12']
#annotated_type = 'astrocyte precursor'

for o in out:

    o['contextual_query'] = ' '.join([f"The following is a list of genes enriched in {o['cell_status_sub']} cells from a",
                             "(human) patient with glioblastoma ",
        f"annotated as {o['annotation_coarse_sub']}. ",
        "What might this list say about the state, properties and interactions of this cell type? ",
        "Do not limit your answers to the cancer literature, ",
        "include normal cell-cell interactions, development and cross-cutting metabolic states. ",
        f"Gene list: {o['sig_upreglated_top10']}"])

    o['plain_query'] = "What might the following enriched gene lists say about the type, " \
              f"properties and cell-cell interactions of the cells the list was derived from { o['sig_upreglated_top10'] }"



In [6]:
out[0]

{'cell_status': 'Malignant',
 'annotation_coarse': 'AC-gliosis-like',
 'annotation_granular': 'AC-gliosis-like 1',
 'top_sig_enriched_pathways': "['HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION', 'GOBP_AMEBOIDAL_TYPE_CELL_MIGRATION', 'GOBP_TISSUE_MIGRATION', 'GOBP_REGULATION_OF_EPITHELIAL_CELL_MIGRATION', 'GOBP_ENDOTHELIAL_CELL_MIGRATION']",
 'sig_upreglated_top10': "['POSTN', 'JPH1', 'EYA4', 'RTN1', 'LAMA2', 'AC092957.1', 'TNC', 'IGFBP7', 'COL23A1', 'NAMPT']",
 'plain_query': "What might the following enriched gene lists say about the type, properties and cell-cell interactions of the cells the list was derived from ['POSTN', 'JPH1', 'EYA4', 'RTN1', 'LAMA2', 'AC092957.1', 'TNC', 'IGFBP7', 'COL23A1', 'NAMPT']",
 'contextual_query': "The following is a list of genes enriched in Malignant cells from a (human) patient with glioblastoma  annotated as Astrocyte-gliosis-like.  What might this list say about the state, properties and interactions of this cell type?  Do not limit your answers to 

In [7]:
import requests
import os

key = os.getenv("PERPLEXITY_API_KEY")

url = "https://api.perplexity.ai/chat/completions"
headers = {
    "accept": "application/json",
    "authorization": f"Bearer {key}",
    "content-type": "application/json"
}

data = {
    "model": "sonar-deep-research ", # specifying deep search. 
    "messages": [
        {
            "role": "system",
            "content": "You are an expert biologist. Your answers must be based on primary scientific literature" \
            "and major reviews from peer-reviewed sources.",
        },
        {
            "role": "user",
            "content": '',
        },
    ],
#    "return_citations": True,  # No longer needed
    "search_mode": 'academic',
    "stream": False
} 

for o in out:
    data['messages'][1]['content'] = o['plain_query']
    o['plain_response'] = requests.post(url, headers=headers, json=data)
    data['messages'][1]['content'] = o['contextual_query']
    o['contextual_response'] = requests.post(url, headers=headers, json=data)
    

KeyboardInterrupt: 

In [9]:
def gen_bib(bib):
    out = ['\n\n## References\n']
    index = 1
    for ref in bib: 
        #print(ref.values())
        out.append(f"- [{str(index)}] {' '.join(str(x) for x in ref.values())}")
        index += 1
    return '\n'.join(out)

In [14]:
def rep(fu, typ):
    print(typ + '_response')
    print(type(fu))
    print(type(fu[typ + '_response']))
    r = fu[typ + '_response']
    j = r.json()
    title = f"# {typ} query { fu['annotation_granular'] }"
    q = fu[typ + '_query']
    query = f"Query: {q}"
    content = j['choices'][0]['message']['content']
    bib = gen_bib(j['citations'])
    return "\n\n".join([title, query, content, bib])

def generate_report(row):
    return "\n\n".join([rep(row, typ ='plain'), rep(row, typ='contextual')])

def generate_reports(records, directory_path):
    for r in records:
        with open(directory_path + r['annotation_granular']+ '_plain.md', 'r') as file:
            file.write(rep('r', 'plain')
        with open(directory_path + r['annotation_granular_plain']+ '_contextual.md', 'r') as file:
            file.write(rep('r', '_contextual')    

SyntaxError: invalid syntax (2687705649.py, line 21)

In [13]:
from rich.markdown import Markdown
from rich.console import Console
console = Console()
console.print(Markdown(generate_report(out[0])))

NameError: name 'generate_report' is not defined

In [None]:
generate_reports(out, './output/deepsearch/')