# Background Information

In this notebook I will set up a basic flow for uploading and downloading files, formatting datasets, and ultimately running CyteType.

In [41]:
# Importing Required Libraries
import cytetype
import scanpy as sc
import session_info
import pandas as pd
import os
import numpy as np
from dotenv import load_dotenv

load_dotenv()

True

In [27]:
# Session Info
# Report details of the current session (package versions) so the run can be
# reproduced later.
session_info.show()

  mod_version = _find_version(mod.__version__)


In [3]:
# Load the Simone et al. dataset.
# The .h5ad file lives under the user's home directory; "~" is expanded so the
# same cell works for any user that keeps this folder layout.
file_path_Simone = os.path.expanduser(
    "~/Göran_Karlsson_Lab/benchLLM/Simone_et_al/7d821d98-5b42-4480-8173-641c1b37b237.h5ad"
)
# Read the file into an AnnData object used by every later cell.
adata = sc.read_h5ad(file_path_Simone)

In [4]:
# Normalize (10,000 Counts per Cell)
# Operates on `adata` in place (no return value is assigned).
sc.pp.normalize_total(adata, target_sum=1e4)

# Log1p transform
# Also in place. NOTE(review): the recorded output of this cell shows a NumPy
# warning raised inside np.log1p, and the following cell finds NaNs in adata.X.
# Confirm that adata.X holds raw, non-negative counts before this step, and
# run this cell only once per kernel -- re-running it would normalize and
# log-transform the already-transformed matrix again.
sc.pp.log1p(adata)

  np.log1p(X, out=X)


In [26]:
# Diagnose the log1p warning: report the minimum of adata.X and whether the
# values stored in the sparse matrix (adata.X.data) contain NaN or Inf.
print("Min:", np.min(adata.X))
print("Any NaNs in sparse data?", np.any(np.isnan(adata.X.data)))
print("Any Infs in sparse data?", np.any(np.isinf(adata.X.data)))

Min: nan
Any NaNs in sparse data? True
Any Infs in sparse data? False


In [9]:
# Rank genes for each `cell_type` group (most up/down regulated) with a t-test.
# Results are stored under adata.uns['Rank_Genes_cell_type'], which later cells
# read. 'wilcoxon' is an alternative method worth trying if it is more accurate.
sc.tl.rank_genes_groups(
    adata,
    groupby='cell_type',
    method='t-test',
    key_added='Rank_Genes_cell_type',
)




In [10]:

# Pull the ranking produced by sc.tl.rank_genes_groups out of adata.uns.
result = adata.uns['Rank_Genes_cell_type']
# 'names' is a structured array with one field per group; the field names are
# the group labels.
groups = result['names'].dtype.names

# For every group keep the first 10 entries of its ranked gene list.
top_genes_per_cluster = dict(
    (group, result['names'][group][:10]) for group in groups
)
print(top_genes_per_cluster)

{'B cell': array(['ENSG00000105369', 'ENSG00000153064', 'ENSG00000156738',
       'ENSG00000196092', 'ENSG00000007312', 'ENSG00000116191',
       'ENSG00000161405', 'ENSG00000023445', 'ENSG00000163534',
       'ENSG00000042980'], dtype=object), 'CD4-positive, alpha-beta T cell': array(['ENSG00000167286', 'ENSG00000127152', 'ENSG00000198851',
       'ENSG00000182866', 'ENSG00000111716', 'ENSG00000160654',
       'ENSG00000152495', 'ENSG00000168685', 'ENSG00000168421',
       'ENSG00000134954'], dtype=object), 'CD14-positive monocyte': array(['ENSG00000038427', 'ENSG00000163563', 'ENSG00000119535',
       'ENSG00000143546', 'ENSG00000135218', 'ENSG00000106780',
       'ENSG00000084234', 'ENSG00000090382', 'ENSG00000163220',
       'ENSG00000169403'], dtype=object), 'CD14-low, CD16-positive monocyte': array(['ENSG00000129757', 'ENSG00000148737', 'ENSG00000170873',
       'ENSG00000204482', 'ENSG00000203747', 'ENSG00000254087',
       'ENSG00000103187', 'ENSG00000104763', 'ENSG00000185201'

In [11]:
# Inspect the per-gene annotation table. The index holds Ensembl gene IDs and
# the `gene_name` column holds gene symbols (the column that is passed to
# CyteType as gene_symbols_column_name).
adata.var

Unnamed: 0_level_0,gene_name,highly_deviant,feature_is_filtered,feature_name,feature_reference,feature_biotype,feature_length,feature_type
Ensemble_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000188976,NOC2L,False,False,NOC2L_ENSG00000188976,NCBITaxon:9606,gene,1244,protein_coding
ENSG00000187961,KLHL17,False,False,KLHL17_ENSG00000187961,NCBITaxon:9606,gene,934,protein_coding
ENSG00000187583,PLEKHN1,False,False,PLEKHN1_ENSG00000187583,NCBITaxon:9606,gene,2194,protein_coding
ENSG00000188290,HES4,False,False,HES4_ENSG00000188290,NCBITaxon:9606,gene,961,protein_coding
ENSG00000187608,ISG15,True,False,ISG15_ENSG00000187608,NCBITaxon:9606,gene,657,protein_coding
...,...,...,...,...,...,...,...,...
ENSG00000212907,MT-ND4L,True,False,MT-ND4L_ENSG00000212907,NCBITaxon:9606,gene,297,protein_coding
ENSG00000198886,MT-ND4,True,False,MT-ND4_ENSG00000198886,NCBITaxon:9606,gene,1378,protein_coding
ENSG00000198786,MT-ND5,True,False,MT-ND5_ENSG00000198786,NCBITaxon:9606,gene,1812,protein_coding
ENSG00000198695,MT-ND6,False,False,MT-ND6_ENSG00000198695,NCBITaxon:9606,gene,525,protein_coding


In [12]:
# Build the CyteType annotator from the prepared AnnData object:
#   - cells are grouped by the `cell_type` column,
#   - marker genes come from the ranking stored under 'Rank_Genes_cell_type',
#   - gene symbols are read from the `gene_name` column of adata.var.
annotator = cytetype.CyteType(
    adata,
    cell_group_key='cell_type',
    rank_genes_key='Rank_Genes_cell_type',
    gene_symbols_column_name='gene_name',
)

Calculating expression percentages.
Extracting marker genes.
Data preparation completed. Ready for annotation.


In [43]:
# Run annotation with custom model
# The API key is read from the environment (populated by load_dotenv() in the
# imports cell). Fail fast with a clear message when it is missing: otherwise
# os.environ.get() would silently return None and the job would be submitted
# without a usable key.
if not os.environ.get('OPENROUTER_API_KEY'):
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Add it to your environment or .env file "
        "before running the annotation."
    )

# `adata` is rebound to the object returned by annotator.run; later cells read
# the annotation results from it.
adata = annotator.run(
    # Biological context passed along with the annotation request.
    bio_context={
        'organisms': ['Homo sapiens'],
        'tissues': ['Bone Marrow']
    },
    # Model served through the OpenRouter endpoint.
    # NOTE(review): provider is 'openai' while baseUrl points at OpenRouter --
    # presumably because the endpoint is OpenAI-compatible; confirm.
    model_config=[{
        'provider': 'openai',
        'name': 'meta-llama/llama-4-maverick',
        'apiKey': os.environ['OPENROUTER_API_KEY'],
        'baseUrl': "https://openrouter.ai/api/v1",
    }]
)

Waiting for results for job ID: a51a21ac-8847-4013-b656-cf434b53a289
View the automatically updating visualization report at: https://nygen-labs-prod--cell-annotation-agent-fastapi-app.modal.run/report/a51a21ac-8847-4013-b656-cf434b53a289
250526:1450:12 |INFO| [STARTING WORKFLOWS]: Job ID: a51a21ac-8847-4013-b656-cf434b53a289
250526:1450:12 |INFO| [WORKFLOW STEP 1/3]: Generating context summary
250526:1450:20 |INFO| [WORKFLOW STEP 2/3]: Running annotation, review, and ontology assignment
250526:1450:21 |INFO| [Cluster: 1] Annotating cluster...
250526:1450:22 |INFO| [Cluster: 10] Annotating cluster...
250526:1450:23 |INFO| [Cluster: 2] Annotating cluster...
250526:1450:23 |INFO| [Cluster: 3] Annotating cluster...
250526:1450:24 |INFO| [Cluster: 4] Annotating cluster...
250526:1450:28 |INFO| [Cluster: 1] Reviewing cluster annotation...
250526:1450:28 |INFO| [Cluster: 4] Reviewing cluster annotation...
250526:1450:30 |INFO| [Cluster: 1] Finding ontology term for annotation...
250526:1450:

In [14]:
# Original `cell_type` label of every cell -- the grouping used above for gene
# ranking and as CyteType's cell_group_key.
adata.obs.cell_type

GATCACACACGGGTAA-1_10X_3-rep1     CD4-positive, alpha-beta T cell
GTGTGGCTCTATTGTC-1_10X_3-rep1              CD14-positive monocyte
CTCGAGGGTTCGGTTA-1_10X_3-rep1                       megakaryocyte
TCTACATGTGTCCGGT-1_10X_3-rep1              CD14-positive monocyte
GGTGAAGTCGTTCCTG-1_10X_3-rep1                 natural killer cell
                                               ...               
TTTTGCAAATTT_Scipio-rep2                   CD14-positive monocyte
TTTTGGCCGACG_Scipio-rep2         CD14-low, CD16-positive monocyte
TTTTGGCGGCCC_Scipio-rep2          CD4-positive, alpha-beta T cell
TTTTGTAACTCA_Scipio-rep2                      natural killer cell
TTTTTTTTTTTT_Scipio-rep2                   CD14-positive monocyte
Name: cell_type, Length: 124132, dtype: category
Categories (10, object): ['B cell', 'CD4-positive, alpha-beta T cell', 'CD14-positive monocyte', 'CD14-low, CD16-positive monocyte', ..., 'megakaryocyte', 'natural killer cell', 'plasmacytoid dendritic cell', 'unknown']

In [32]:
# Per-cell label in the `CyteType_cell_type` column (presumably added by
# annotator.run -- confirm).
# NOTE(review): the recorded output below comes from an earlier execution
# (In [32]) than the annotation cell (In [43]); re-run this cell so the
# displayed categories match the current annotation.
adata.obs['CyteType_cell_type']

GATCACACACGGGTAA-1_10X_3-rep1                 T cell
GTGTGGCTCTATTGTC-1_10X_3-rep1               Monocyte
CTCGAGGGTTCGGTTA-1_10X_3-rep1               Platelet
TCTACATGTGTCCGGT-1_10X_3-rep1               Monocyte
GGTGAAGTCGTTCCTG-1_10X_3-rep1    Natural killer cell
                                        ...         
TTTTGCAAATTT_Scipio-rep2                    Monocyte
TTTTGGCCGACG_Scipio-rep2                    Monocyte
TTTTGGCGGCCC_Scipio-rep2                      T cell
TTTTGTAACTCA_Scipio-rep2         Natural killer cell
TTTTTTTTTTTT_Scipio-rep2                    Monocyte
Name: CyteType_cell_type, Length: 124132, dtype: category
Categories (8, object): ['Cytotoxic T cell', 'Dendritic cell', 'Monocyte', 'Naive B cell', 'Natural killer cell', 'Plasmacytoid dendritic cell', 'Platelet', 'T cell']

In [44]:
# For every original `cell_type` label, find the CyteType label that covers the
# most cells: cross-tabulate the two label columns, transpose so the original
# labels become columns, then take the row label of each column's maximum.
Author_vs_Llama4Mav = (
    pd.crosstab(adata.obs['cell_type'], adata.obs['CyteType_cell_type'])
    .T
    .idxmax()
)

In [39]:
# Mapping from each original `cell_type` label to a cluster ID string; these
# IDs appear to be the "Cluster: N" numbers shown in the annotation job log.
annotator.cluster_map

{'B cell': '1',
 'CD14-low, CD16-positive monocyte': '2',
 'CD14-positive monocyte': '3',
 'CD4-positive, alpha-beta T cell': '4',
 'cytotoxic T cell': '5',
 'dendritic cell': '6',
 'megakaryocyte': '7',
 'natural killer cell': '8',
 'plasmacytoid dendritic cell': '9',
 'unknown': '10'}

In [48]:
# Save the original-label -> majority-CyteType-label mapping as a CSV file.
# The path is built from the home directory (like the .h5ad path used when
# loading the data) instead of a hard-coded /Users/<name>/ prefix, so the cell
# also works on other machines and accounts.
Author_vs_Llama4Mav.reset_index().to_csv(
    os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/Simone_et_al/author_vs_Llama4.csv"),
    index=False,
)

In [46]:
# Show the mapping: the index is the original `cell_type` label and the value
# is the CyteType label assigned to the largest number of its cells.
Author_vs_Llama4Mav

cell_type
B cell                                           B cell
CD4-positive, alpha-beta T cell                  T cell
CD14-positive monocyte                         Monocyte
CD14-low, CD16-positive monocyte               Monocyte
cytotoxic T cell                                 T cell
dendritic cell                                 Monocyte
megakaryocyte                             Megakaryocyte
natural killer cell                 Natural Killer Cell
plasmacytoid dendritic cell                      B cell
unknown                                          B cell
dtype: category
Categories (5, object): ['B cell', 'Megakaryocyte', 'Monocyte', 'Natural Killer Cell', 'T cell']