In [9]:
import pandas as pd

data = pd.read_excel("./resources/de_jong_et_al_2025_supp_mat.xlsx", sheet_name="Table S5")

In [10]:
data.columns

Index(['cell_status', 'annotation_coarse', 'annotation_granular',
       'top_sig_enriched_pathways', 'sig_upreglated_top10'],
      dtype='object')

In [11]:
data[0:3]

Unnamed: 0,cell_status,annotation_coarse,annotation_granular,top_sig_enriched_pathways,sig_upreglated_top10
0,Malignant,AC-gliosis-like,AC-gliosis-like 1,"['HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION',...","['POSTN', 'JPH1', 'EYA4', 'RTN1', 'LAMA2', 'AC..."
1,Malignant,AC-gliosis-like,AC-gliosis-like 2,"['GOBP_WOUND_HEALING', 'HALLMARK_EPITHELIAL_ME...","['CFAP54', 'IGFBP7', 'VOPP1', 'AQP4', 'LANCL2'..."
2,Malignant,AC-gliosis-like,AC-gliosis-like 3,"['HALLMARK_TNFA_SIGNALING_VIA_NFKB', 'GOBP_MON...","['LAMA2', 'AC012405.1', 'COL8A1', 'AC007402.1'..."


In [12]:
data['plain_query'] = ''
data['contextual_query'] = ''
data['cell_status_sub'] = data['cell_status'].replace('TME', 'tumour microenvironment')
data['annotation_coarse_sub'] = data['annotation_coarse'].replace('AC', 'Astrocyte', regex=True)
out = data.to_dict('records')


In [13]:
#gene_list = ['CFAP54', 'IGFBP7', 'VOPP1', 'AQP4', 'LANCL2', 'AC074351.1', 'DTHD1', 'AC012405.1', 'RTN1', 'MTRNR2L12']
#annotated_type = 'astrocyte precursor'

for o in out:

    o['contextual_query'] = ' '.join([f"The following is a list of genes enriched in {o['cell_status_sub']} cells from a",
                             "(human) patient with glioblastoma ",
        f"annotated as {o['annotation_coarse_sub']}. ",
        "What might this list say about the state, properties and interactions of this cell type? ",
        "Do not limit your answers to the cancer literature, ",
        "include normal cell-cell interactions, development and cross-cutting metabolic states. ",
        f"Gene list: {o['sig_upreglated_top10']}"])

    o['plain_query'] = "What might the following enriched gene lists say about the type, " \
              f"properties and cell-cell interactions of the cells the list was derived from { o['sig_upreglated_top10'] }"



In [14]:
out[0]

{'cell_status': 'Malignant',
 'annotation_coarse': 'AC-gliosis-like',
 'annotation_granular': 'AC-gliosis-like 1',
 'top_sig_enriched_pathways': "['HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION', 'GOBP_AMEBOIDAL_TYPE_CELL_MIGRATION', 'GOBP_TISSUE_MIGRATION', 'GOBP_REGULATION_OF_EPITHELIAL_CELL_MIGRATION', 'GOBP_ENDOTHELIAL_CELL_MIGRATION']",
 'sig_upreglated_top10': "['POSTN', 'JPH1', 'EYA4', 'RTN1', 'LAMA2', 'AC092957.1', 'TNC', 'IGFBP7', 'COL23A1', 'NAMPT']",
 'plain_query': "What might the following enriched gene lists say about the type, properties and cell-cell interactions of the cells the list was derived from ['POSTN', 'JPH1', 'EYA4', 'RTN1', 'LAMA2', 'AC092957.1', 'TNC', 'IGFBP7', 'COL23A1', 'NAMPT']",
 'contextual_query': "The following is a list of genes enriched in Malignant cells from a (human) patient with glioblastoma  annotated as Astrocyte-gliosis-like.  What might this list say about the state, properties and interactions of this cell type?  Do not limit your answers to 

In [15]:
import requests
import os
import time

key = os.getenv("PERPLEXITY_API_KEY")

url = "https://api.perplexity.ai/chat/completions"
headers = {
    "accept": "application/json",
    "authorization": f"Bearer {key}",
    "content-type": "application/json"
}

data = {
    "model": "sonar-pro", # specifying deep search. 
    "messages": [
        {
            "role": "system",
            "content": "You are an expert biologist. Your answers must be based on primary scientific literature" \
            "and major reviews from peer-reviewed sources.",
        },
        {
            "role": "user",
            "content": '',
        },
    ],
#    "return_citations": True,  # No longer needed
    "search_mode": 'academic',
    "stream": False
} 

for o in out:
#    data['messages'][1]['content'] = o['plain_query']
#    o['plain_response'] = requests.post(url, headers=headers, json=data)
    data['messages'][1]['content'] = o['contextual_query']
    start = time.time()
    o['contextual_response'] = requests.post(url, headers=headers, json=data)
    end = time.time()
    print(f"Query took {end - start:.2f} seconds")
    time.sleep(2.0)
    

Query took 11.69 seconds
Query took 17.55 seconds
Query took 19.23 seconds
Query took 17.58 seconds
Query took 12.78 seconds
Query took 18.44 seconds
Query took 13.56 seconds
Query took 17.62 seconds
Query took 15.27 seconds
Query took 13.82 seconds
Query took 23.25 seconds
Query took 20.17 seconds
Query took 12.56 seconds
Query took 18.37 seconds
Query took 23.08 seconds
Query took 12.71 seconds
Query took 15.99 seconds
Query took 20.24 seconds
Query took 30.91 seconds
Query took 24.40 seconds
Query took 12.29 seconds
Query took 23.88 seconds
Query took 26.33 seconds
Query took 12.48 seconds
Query took 22.52 seconds
Query took 13.90 seconds
Query took 20.46 seconds
Query took 17.08 seconds
Query took 8.72 seconds
Query took 23.07 seconds
Query took 10.84 seconds
Query took 12.32 seconds
Query took 15.41 seconds
Query took 10.33 seconds
Query took 30.41 seconds
Query took 28.72 seconds
Query took 22.74 seconds
Query took 19.54 seconds
Query took 11.19 seconds
Query took 24.66 seconds
Q

In [16]:
def gen_bib(bib):
    out = ['\n\n## References\n']
    index = 1
    for ref in bib: 
        #print(ref.values())
        out.append(f"- [{str(index)}] {' '.join(str(x) for x in ref.values())}")
        index += 1
    return '\n'.join(out)

In [21]:
out

[{'cell_status': 'Malignant',
  'annotation_coarse': 'AC-gliosis-like',
  'annotation_granular': 'AC-gliosis-like 1',
  'top_sig_enriched_pathways': "['HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION', 'GOBP_AMEBOIDAL_TYPE_CELL_MIGRATION', 'GOBP_TISSUE_MIGRATION', 'GOBP_REGULATION_OF_EPITHELIAL_CELL_MIGRATION', 'GOBP_ENDOTHELIAL_CELL_MIGRATION']",
  'sig_upreglated_top10': "['POSTN', 'JPH1', 'EYA4', 'RTN1', 'LAMA2', 'AC092957.1', 'TNC', 'IGFBP7', 'COL23A1', 'NAMPT']",
  'plain_query': "What might the following enriched gene lists say about the type, properties and cell-cell interactions of the cells the list was derived from ['POSTN', 'JPH1', 'EYA4', 'RTN1', 'LAMA2', 'AC092957.1', 'TNC', 'IGFBP7', 'COL23A1', 'NAMPT']",
  'contextual_query': "The following is a list of genes enriched in Malignant cells from a (human) patient with glioblastoma  annotated as Astrocyte-gliosis-like.  What might this list say about the state, properties and interactions of this cell type?  Do not limit your answ

In [26]:
def rep(fu, typ):
    print(typ + '_response')
    print(type(fu))
    print(type(fu[typ + '_response']))
    r = fu[typ + '_response']
    j = r.json()
    title = f"# {typ} query { fu['annotation_granular'] }"
    q = fu[typ + '_query']
    query = f"Query: {q}"
    content = j['choices'][0]['message']['content']
    bib = gen_bib(j['search_results'])
    return "\n\n".join([title, query, content, bib])

def generate_report(row):
    return "\n\n".join([rep(row, typ ='plain'), rep(row, typ='contextual')])

def generate_reports(records, directory_path):
    for r in records:
        plain_filename = f"{directory_path}{r['annotation_granular']}_plain.md"
        contextual_filename = f"{directory_path}{r['annotation_granular']}_contextual.md"

#        with open(plain_filename, 'w') as file:
#            file.write(rep(r, 'plain'))

        with open(contextual_filename, 'w') as file:
            file.write(rep(r, 'contextual')) 

In [25]:
out[0]['contextual_response'].json().keys()

dict_keys(['id', 'model', 'created', 'usage', 'citations', 'search_results', 'object', 'choices'])

In [19]:
from rich.markdown import Markdown
from rich.console import Console
console = Console()
console.print(Markdown(generate_report(out[0])))

plain_response
<class 'dict'>


KeyError: 'plain_response'

In [27]:
generate_reports(out, './output/pro/')

contextual_response
<class 'dict'>
<class 'requests.models.Response'>
contextual_response
<class 'dict'>
<class 'requests.models.Response'>
contextual_response
<class 'dict'>
<class 'requests.models.Response'>
contextual_response
<class 'dict'>
<class 'requests.models.Response'>
contextual_response
<class 'dict'>
<class 'requests.models.Response'>
contextual_response
<class 'dict'>
<class 'requests.models.Response'>
contextual_response
<class 'dict'>
<class 'requests.models.Response'>
contextual_response
<class 'dict'>
<class 'requests.models.Response'>
contextual_response
<class 'dict'>
<class 'requests.models.Response'>
contextual_response
<class 'dict'>
<class 'requests.models.Response'>
contextual_response
<class 'dict'>
<class 'requests.models.Response'>
contextual_response
<class 'dict'>
<class 'requests.models.Response'>
contextual_response
<class 'dict'>
<class 'requests.models.Response'>
contextual_response
<class 'dict'>
<class 'requests.models.Response'>
contextual_response
