# Install Requirements

In [13]:
# NOTE(review): the saved output of this cell ends with pip failing to find a
# distribution named "tavily-search". Later cells import `tavily`
# (TavilyClient); presumably the intended requirement is `tavily-python` —
# confirm and fix requirements.txt, and consider pinning versions.
%pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas (from -r requirements.txt (line 1))
  Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting requests (from -r requirements.txt (line 3))
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting python-dotenv (from -r requirements.txt (line 4))
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting openai (from -r requirements.txt (line 5))
  Using cached openai-1.65.2-py3-none-any.whl.metadata (27 kB)
Collecting langchain-core (from -r requirements.txt (line 6))
  Using cached langchain_core-0.3.40-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-google-genai (from -r requirements.txt (line 7))
  Using cached langchain_google_genai-2.0.11-py3-none-any.whl.metadata (3.6 kB)
Collecting langchain-openai (from -r requirements.txt (line 8))
  Using cached langchain_openai-0.3.7-py3-none-any.whl.metadata (2.3 kB)
Collect

ERROR: Could not find a version that satisfies the requirement tavily-search (from versions: none)
ERROR: No matching distribution found for tavily-search


# Dependencies

In [2]:
import pandas as pd
import numpy as np
import os, requests, json, csv

In [3]:
from pydantic import BaseModel, Field
from openai import OpenAI

# Load CSV

In [7]:
def csv_read(file_path):
    """
    Reads a delimited text file into a DataFrame, auto-detecting the delimiter.

    The first 1024 characters of the file are handed to ``csv.Sniffer``. When
    the sniffer cannot settle on a delimiter (for example a single-column
    file), a comma is assumed instead of letting ``csv.Error`` propagate.

    :param file_path: Path of the delimited text file to load.
    :return: DataFrame parsed with the detected (or fallback) delimiter.
    """
    with open(file_path, 'r') as file:
        sample = file.read(1024)

    try:
        detected_delimiter = csv.Sniffer().sniff(sample).delimiter
    except csv.Error:
        # Sniffer raises when it finds no consistent delimiter; treat as plain CSV.
        detected_delimiter = ','

    # Read the file with the detected delimiter
    return pd.read_csv(file_path, sep=detected_delimiter)

In [8]:
# Per-cluster gene table; the path is relative to the notebook's working directory.
# Downstream cells require the loaded frame to have 'Cluster' and 'Gene' columns.
file_path = "10-top-cluster.csv"
df = csv_read(file_path)

In [9]:
df

Unnamed: 0,myAUC,avg_diff,power,avg_log2FC,pct.1,pct.2,Cluster,Gene
0,0.939,0.918256,0.878,1.351082,1.000,0.996,0,KRT14
1,0.855,0.713125,0.710,1.219866,1.000,0.848,0,IFI27
2,0.783,0.621616,0.566,1.154583,0.969,0.787,0,COL17A1
3,0.766,0.519959,0.532,1.144090,0.931,0.630,0,HSPA1A
4,0.755,0.507771,0.510,1.248768,0.875,0.509,0,LGALS3BP
...,...,...,...,...,...,...,...,...
260,0.765,0.103585,0.530,0.156168,1.000,0.880,26,COL1A1
261,0.750,0.118977,0.500,0.190017,1.000,0.802,26,COL3A1
262,0.724,0.189142,0.448,0.306940,1.000,0.828,26,SPARC
263,0.703,0.282657,0.406,0.431395,1.000,0.998,26,MT-CYB


# Preprocess Data

## Group Genes by Clusters

In [10]:
def extract_clusters(df):
    """
    Pivots a long (Cluster, Gene) table into one column of genes per cluster.

    Columns are padded with ``None`` so that every cluster column has the
    length of the largest cluster. An input without any rows yields an empty
    DataFrame instead of failing on ``max()`` of an empty sequence.

    :param df: DataFrame containing at least 'Cluster' and 'Gene' columns.
    :return: DataFrame whose columns are cluster labels and whose rows hold gene names.
    :raises ValueError: If 'Cluster' or 'Gene' is missing from ``df``.
    """
    # Ensure the column names are correct
    if "Cluster" not in df.columns or "Gene" not in df.columns:
        raise ValueError("The file must contain 'Cluster' and 'Gene' columns")

    # Collect the genes of each cluster into a list, keeping their input order
    grouped = df.groupby("Cluster")["Gene"].apply(list)

    # Size of the largest cluster; default=0 keeps an empty table from raising
    max_length = max(map(len, grouped), default=0)

    # Pad shorter clusters with None so all columns have equal length
    return pd.DataFrame({
        cluster: genes + [None] * (max_length - len(genes))
        for cluster, genes in grouped.items()
    })

In [11]:
clustered_df = extract_clusters(df)

clustered_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,KRT14,TXNIP,LY6D,S100A8,TIMP1,ITM2B,TINAGL1,KRT1,KRT16,KRT14,...,IGHG4,FADS2,IGKC,SPRR1B,IGHG4,FASN,LOR,S100A8,COL3A1,COL1A1
1,IFI27,ATP1B3,KRT14,S100A7,APOD,PERP,TM4SF1,GPX2,SBSN,LY6D,...,IGKC,THRSP,IGHG4,S100A7,IGKC,APOC1,FLG2,S100A7,COL1A1,COL3A1
2,COL17A1,MAF,KRT16,SPRR1B,SAA1,GPNMB,IGFBP6,LTF,LY6D,S100A2,...,MYL9,FADS1,IGHG3,SPRR2D,MALAT1,MGST1,ARG1,CSTA,COL1A2,SPARC
3,HSPA1A,EIF4A2,LGALS7,KRTDAP,IGFBP5,TXNIP,MMP28,CCL27,LGALS7,LGALS7,...,IGHG3,PECR,IGHG1,MUCL1,C1QA,SAA1,LCE1B,KRTDAP,SPARC,MT-CYB
4,LGALS3BP,MMP13,IFI27,SBSN,C3,GJB6,BIRC5,FOSB,KRTDAP,IFI27,...,IGHG1,HSD11B1,JCHAIN,S100A7A,HLA-DRA,AWAT2,KPRP,SPRR1B,POSTN,MT-ND2
5,EFNA1,CAPN2,S100A2,CRABP2,MMP3,ATP1B3,CA2,EPHB6,DMKN,KRT5,...,JCHAIN,HSD3B1,IGHA1,SPRR1A,MT2A,ACSBG1,WFDC12,GJB6,DCN,
6,RBM42,SULF2,HIST1H1C,SLPI,COMP,CLDN1,CENPW,ATF3,GJB2,TAGLN2,...,IGFBP6,FABP7,IGLC2,IL36RN,MYL9,CYP4F8,LCE1A,MUCL1,COL6A3,
7,HIST1H1C,CEBPG,LSR,IVL,CXCL12,TNFSF10,SERPINA3,TNFRSF19,HIST1H1C,FXYD3,...,SULF1,ACSM6,IGLC1,LCE3E,IGHG1,TLCD4,LCE2C,SLPI,COL5A2,
8,SCPEP1,PALLD,FAM83H,PI3,CCL19,DSG3,RRM2,AC103591.3,CRABP2,HSPB1,...,IGHA1,ACOT1,IGHG2,SLC6A14,ATP6V0C,CLMP,LCE2B,S100A7A,PRRX1,
9,GAMT,PLAU,TMEM79,MUCL1,MMP1,MAF,CCNB2,HLA-DQB2,SLPI,COX6B1,...,GGT5,PKLR,MZB1,SPP1,IGHG3,GLDC,LCE1E,SPP1,COL11A1,


In [12]:
print(clustered_df.to_csv(sep='\t', index=False))

0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26
KRT14	TXNIP	LY6D	S100A8	TIMP1	ITM2B	TINAGL1	KRT1	KRT16	KRT14	COL3A1	COL1A2	COL1A1	TXNIP	CST6	KRT14	RPL13A	IGHG4	FADS2	IGKC	SPRR1B	IGHG4	FASN	LOR	S100A8	COL3A1	COL1A1
IFI27	ATP1B3	KRT14	S100A7	APOD	PERP	TM4SF1	GPX2	SBSN	LY6D	COL1A1	COL1A1	COL1A2	COL3A1	PSAPL1	KRT5	RPL18A	IGKC	THRSP	IGHG4	S100A7	IGKC	APOC1	FLG2	S100A7	COL1A1	COL3A1
COL17A1	MAF	KRT16	SPRR1B	SAA1	GPNMB	IGFBP6	LTF	LY6D	S100A2	COL1A2	COL6A2	COL3A1	DCN	SLC15A1	COL6A1	KRT1	MYL9	FADS1	IGHG3	SPRR2D	MALAT1	MGST1	ARG1	CSTA	COL1A2	SPARC
HSPA1A	EIF4A2	LGALS7	KRTDAP	IGFBP5	TXNIP	MMP28	CCL27	LGALS7	LGALS7	SPARC	COL6A1	POSTN	LUM	DGAT2L6	COL6A2	RPS11	IGHG3	PECR	IGHG1	MUCL1	C1QA	SAA1	LCE1B	KRTDAP	SPARC	MT-CYB
LGALS3BP	MMP13	IFI27	SBSN	C3	GJB6	BIRC5	FOSB	KRTDAP	IFI27	POSTN	COL5A1	FN1	APCDD1	GAL	IFI27	RPL18	IGHG1	HSD11B1	JCHAIN	S100A7A	HLA-DRA	AWAT2	KPRP	SPRR1B	POSTN	MT-ND2
EFNA1	CAPN2	S100A2	CRABP2	MMP3	ATP1B3	CA2	EPHB6	DMKN	KRT5	COL6A3	COL4A1	COL11A1	COL6A3	SEC14L6	CO

## Translate Genes into Ensembl ID

In [13]:
def translate_genes(clustered_df, dictionary_path='dictionary.json'):
    """
    Translates every gene symbol in the clustered DataFrame to its Ensembl ID.

    The JSON file must hold an "ensembl_to_gene" object (Ensembl ID -> gene
    symbol); it is inverted here to look symbols up. Symbols missing from the
    dictionary, and padding cells that are already missing, become ``None``.

    :param clustered_df: DataFrame of gene symbols, one column per cluster.
    :param dictionary_path: Path of the JSON mapping file.
    :return: DataFrame of the same shape holding Ensembl IDs (or missing values).
    """
    # Load the Ensembl ID mapping from JSON
    with open(dictionary_path, 'r') as file:
        ensembl_to_gene = json.load(file)["ensembl_to_gene"]

    # Invert the mapping; if several Ensembl IDs share a symbol, the last one wins
    gene_to_ensembl = {gene: ensembl_id for ensembl_id, gene in ensembl_to_gene.items()}

    def to_ensembl(gene):
        return gene_to_ensembl.get(gene) if pd.notna(gene) else None

    # Translate gene names to Ensembl IDs in the clustered DataFrame
    return clustered_df.apply(lambda col: col.map(to_ensembl))

In [14]:
ensembl_df = translate_genes(clustered_df)
ensembl_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,ENSG00000186847,ENSG00000265972,ENSG00000167656,ENSG00000143546,ENSG00000102265,ENSG00000136156,ENSG00000142910,ENSG00000167768,ENSG00000186832,ENSG00000186847,...,ENSG00000211892,ENSG00000134824,ENSG00000211592,ENSG00000169469,ENSG00000211892,ENSG00000169710,,ENSG00000143546,ENSG00000168542,ENSG00000108821
1,ENSG00000165949,ENSG00000069849,ENSG00000186847,ENSG00000143556,ENSG00000189058,ENSG00000112378,ENSG00000169908,ENSG00000176153,ENSG00000189001,ENSG00000167656,...,ENSG00000211592,ENSG00000151365,ENSG00000211892,ENSG00000143556,ENSG00000211592,ENSG00000130208,ENSG00000143520,ENSG00000143556,ENSG00000108821,ENSG00000168542
2,ENSG00000065618,ENSG00000178573,ENSG00000186832,ENSG00000169469,ENSG00000173432,ENSG00000136235,ENSG00000167779,ENSG00000012223,ENSG00000167656,ENSG00000196754,...,ENSG00000101335,ENSG00000149485,ENSG00000211897,ENSG00000163216,ENSG00000251562,ENSG00000008394,ENSG00000118520,ENSG00000121552,ENSG00000164692,ENSG00000113140
3,ENSG00000204389,ENSG00000156976,ENSG00000205076,ENSG00000188508,ENSG00000115461,ENSG00000265972,ENSG00000271447,ENSG00000213927,ENSG00000205076,ENSG00000205076,...,ENSG00000211897,ENSG00000115425,ENSG00000211896,ENSG00000172551,ENSG00000173372,ENSG00000173432,ENSG00000196734,ENSG00000188508,ENSG00000113140,ENSG00000198727
4,ENSG00000108679,ENSG00000137745,ENSG00000165949,ENSG00000189001,,ENSG00000121742,ENSG00000089685,ENSG00000125740,ENSG00000188508,ENSG00000165949,...,ENSG00000211896,ENSG00000117594,ENSG00000132465,ENSG00000184330,ENSG00000204287,ENSG00000147160,ENSG00000203786,ENSG00000169469,ENSG00000133110,ENSG00000198763
5,ENSG00000169242,ENSG00000162909,ENSG00000196754,ENSG00000143320,ENSG00000149968,ENSG00000069849,ENSG00000104267,ENSG00000106123,ENSG00000161249,ENSG00000186081,...,ENSG00000132465,ENSG00000203857,ENSG00000211895,ENSG00000169474,ENSG00000125148,ENSG00000103740,ENSG00000168703,ENSG00000121742,ENSG00000011465,
6,ENSG00000126254,ENSG00000196562,,ENSG00000124107,ENSG00000105664,ENSG00000163347,ENSG00000203760,ENSG00000162772,ENSG00000165474,ENSG00000158710,...,ENSG00000167779,ENSG00000164434,ENSG00000211677,ENSG00000136695,ENSG00000101335,ENSG00000186526,ENSG00000186844,ENSG00000172551,ENSG00000163359,
7,,ENSG00000153879,ENSG00000105699,ENSG00000163207,ENSG00000107562,ENSG00000121858,ENSG00000196136,ENSG00000127863,,ENSG00000089356,...,ENSG00000137573,ENSG00000173124,ENSG00000211675,ENSG00000185966,ENSG00000211896,ENSG00000152078,ENSG00000187180,ENSG00000124107,ENSG00000204262,
8,ENSG00000121064,ENSG00000129116,ENSG00000180921,ENSG00000124102,ENSG00000172724,ENSG00000134757,ENSG00000171848,,ENSG00000143320,ENSG00000106211,...,ENSG00000211895,ENSG00000184227,ENSG00000211893,ENSG00000268104,ENSG00000185883,ENSG00000166250,ENSG00000159455,ENSG00000184330,ENSG00000116132,
9,ENSG00000130005,ENSG00000122861,ENSG00000163472,ENSG00000172551,ENSG00000196611,ENSG00000178573,ENSG00000157456,ENSG00000232629,ENSG00000124107,ENSG00000126267,...,ENSG00000099998,ENSG00000143627,ENSG00000170476,ENSG00000118785,ENSG00000211897,ENSG00000178445,ENSG00000186226,ENSG00000118785,ENSG00000060718,


# Fetch Gene Expression Data

## Function Initialization

Function to fetch gene expression data

In [15]:
def fetch_expression_data(cluster, ensembl_ids, timeout=60):
    """
    Fetches gene expression data from the Cellxgene API for a given cluster and list of Ensembl IDs.

    :param cluster: The cluster name (only used in error messages).
    :param ensembl_ids: List of Ensembl IDs for the cluster.
    :param timeout: Seconds to wait for the server before giving up; without a
        timeout a stalled connection would block the notebook indefinitely.
    :return: The 'expression_summary' part of the JSON response, or None if an error occurred.
    """

    payload = {
        "filter": {
            "dataset_ids": [],
            "development_stage_ontology_term_ids": [],
            "disease_ontology_term_ids": [],
            "gene_ontology_term_ids": ensembl_ids,
            "organism_ontology_term_id": "NCBITaxon:9606",
            "self_reported_ethnicity_ontology_term_ids": [],
            "sex_ontology_term_ids": [],
            "publication_citations": [],
        },
        "is_rollup": True
    }

    # URL for the POST request
    API_URL = "https://api.cellxgene.cziscience.com/wmg/v2/query"

    try:
        response = requests.post(API_URL, json=payload, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return response.json()['expression_summary']
    except requests.RequestException as e:
        print(f"Error fetching data for Cluster {cluster}: {e}")
        return None
    except (KeyError, ValueError) as e:
        # Body was not JSON or lacked 'expression_summary'; honour the "None on error" contract
        print(f"Unexpected response for Cluster {cluster}: {e}")
        return None

Function to convert the data to a DataFrame

In [16]:
def expression_data_to_df(data):
    """
    Flattens the nested expression summary into one row per (gene, tissue, cell).

    The input is a three-level mapping gene -> tissue -> cell, where each leaf
    holds an 'aggregated' dict with the optional metrics 'me', 'n', 'pc' and
    'tpc'. Metrics absent from a leaf become missing values.

    :param data: Nested expression summary; ``None`` or an empty mapping is allowed.
    :return: DataFrame with the columns gene, tissue, cell, expression,
        cell count, cell percentage and tissue composition. The columns are
        present even when there are no rows, so later column lookups still work.
    """
    columns = ['gene', 'tissue', 'cell', 'expression', 'cell count',
               'cell percentage', 'tissue composition']
    if not data:
        return pd.DataFrame(columns=columns)

    flattened_data = []
    for gene_id, anatomical_structures in data.items():
        for anatomical_id, cell_types in anatomical_structures.items():
            for cell_type_id, aggregated_data in cell_types.items():
                metrics = aggregated_data['aggregated']
                flattened_data.append({
                    'gene': gene_id,
                    'tissue': anatomical_id,
                    'cell': cell_type_id,
                    'expression': metrics.get('me'),
                    'cell count': metrics.get('n'),
                    'cell percentage': metrics.get('pc'),
                    'tissue composition': metrics.get('tpc'),
                })
    return pd.DataFrame(flattened_data, columns=columns)

Function to filter the data

In [17]:
def filter_expression_data(response_df, TARGET_UBERON_ID):
    """
    Keeps the rows of one tissue and removes the aggregate (non cell-type) rows.

    :param response_df: Frame with at least 'tissue' and 'cell' columns.
    :param TARGET_UBERON_ID: Value of 'tissue' to keep.
    :return: Matching rows without the 'tissue' column, excluding rows whose
        'cell' is 'tissue_stats' or 'CL:0000000'. The original index is kept.
    """
    excluded_cells = ['tissue_stats', 'CL:0000000']

    in_tissue = response_df['tissue'] == TARGET_UBERON_ID
    tissue_rows = response_df[in_tissue].drop(columns=['tissue'])

    is_excluded = tissue_rows['cell'].isin(excluded_cells)
    return tissue_rows[~is_excluded]

Function to translate the ontology terms

In [18]:
def translate_ontology(df, cell_dict_path='cell_dict.json', gene_dict_path='ensembl_to_gene.json'):
    """
    Replaces ontology identifiers in the 'cell' and 'gene' columns with names.

    Works on a copy, so the frame passed in (a filtered slice in the pipeline)
    is left untouched and no chained-assignment warning is triggered.

    :param df: Frame with 'cell' and 'gene' identifier columns.
    :param cell_dict_path: JSON file mapping 'cell' identifiers to names.
    :param gene_dict_path: JSON file mapping 'gene' identifiers to names.
    :return: New frame with translated columns. 'gene' values without a
        mapping keep their identifier; 'cell' values without a mapping become
        missing.
    """
    with open(cell_dict_path, 'r') as cell_file, open(gene_dict_path, 'r') as gene_file:
        cl_to_cell = json.load(cell_file)
        ensembl_to_gene = json.load(gene_file)

    translated = df.copy()
    translated['cell'] = translated['cell'].map(cl_to_cell)
    translated['gene'] = translated['gene'].map(ensembl_to_gene).fillna(translated['gene'])

    return translated

In [19]:
def transform_results(results):
    """
    Deduplicates (cell, gene) pairs and orders rows by how often a cell occurs.

    :param results: Frame with 'cell', 'gene', 'expression' and
        'cell percentage' columns (further columns are carried along).
    :return: One row per (cell, gene) pair, taken from the first occurrence,
        sorted by number of rows of the cell (descending), cell name
        (ascending), expression (descending) and cell percentage (descending).
        The helper 'count' column is removed again before returning.
    """
    # How many rows each cell has in the input
    rows_per_cell = results['cell'].value_counts().rename_axis('cell').reset_index(name='count')

    # Attach that count to every row of its cell
    with_counts = results.merge(rows_per_cell, on='cell')

    # Collapse duplicate (cell, gene) pairs to their first row
    unique_pairs = with_counts.groupby(['cell', 'gene']).first()

    ordered = unique_pairs.sort_values(
        by=['count', 'cell', 'expression', 'cell percentage'],
        ascending=[False, True, False, False],
    )
    return ordered.reset_index().drop(columns=['count'])

In [20]:
def _min_max(values):
    """Linearly rescale a Series so that its minimum maps to 0 and its maximum to 1."""
    lowest = values.min()
    return (values - lowest) / (values.max() - lowest)


def _attach_per_cell(frame, per_cell, label):
    """Rename a two-column per-cell table to ['cell', label] and inner-merge it onto frame."""
    per_cell.columns = ['cell', label]
    return frame.merge(per_cell, on='cell')


def calculate_cell_score(results):
    """
    Scores every row and ranks cell types by an aggregate 'cell score'.

    Steps, in order:
      1. Drop cell types whose summed 'cell count' is below 100.
      2. Row score = 1 * min-max(expression) + 1.5 * cell percentage
         + 2.5 * min-max(cell count).
      3. Per cell type: std, mean, CV (std / mean) and kurtosis of the row
         scores, each also min-max normalised over the whole frame.
      4. cell score = (0.1 * (1 - std norm) + 0.7 * mean norm
         + 0.1 * (1 - CV norm) + 0.1 * (1 - kurtosis norm)) * rows of the cell,
         where missing std and kurtosis terms count as 0 (the CV term is not
         filled, so a missing CV norm leaves the cell score missing).

    :param results: Frame with 'cell', 'gene', 'expression', 'cell count' and
        'cell percentage' columns; it is not modified.
    :return: Copy with the intermediate and final score columns added, sorted
        by count, cell score and cell (all descending) and gene (ascending).
    """
    scored = results.copy()

    # Total cell count per cell type, then the minimum-size filter
    totals = scored.groupby('cell')['cell count'].sum().reset_index()
    scored = _attach_per_cell(scored, totals, 'total cell count')
    scored = scored[scored['total cell count'] >= 100]

    # Row-level score from its three weighted components
    expression_term = 1 * _min_max(scored['expression'])
    percentage_term = 1.5 * scored['cell percentage']
    count_term = 2.5 * _min_max(scored['cell count'])
    scored['score'] = expression_term + percentage_term + count_term

    # Spread and centre of the row scores within each cell type
    score_std = scored.groupby('cell')['score'].std().reset_index()
    scored = _attach_per_cell(scored, score_std, 'std')

    score_mean = scored.groupby('cell')['score'].mean().reset_index().fillna(0)
    scored = _attach_per_cell(scored, score_mean, 'mean')

    # Coefficient of variation
    scored['CV'] = scored['std'] / scored['mean']

    score_kurtosis = scored.groupby('cell')['score'].apply(pd.Series.kurt).reset_index()
    scored = _attach_per_cell(scored, score_kurtosis, 'kurtosis')

    # Min-max normalise each per-cell statistic across the whole frame
    for source, target in (('std', 'std norm'), ('mean', 'mean norm'),
                           ('CV', 'CV norm'), ('kurtosis', 'kurtosis norm')):
        scored[target] = _min_max(scored[source])

    # Number of rows remaining for each cell type
    rows_per_cell = scored['cell'].value_counts().reset_index()
    scored = _attach_per_cell(scored, rows_per_cell, 'count')

    low_spread = (1 - scored['std norm']).fillna(0)
    low_variation = 1 - scored['CV norm']
    low_kurtosis = (1 - scored['kurtosis norm']).fillna(0)
    weighted = 0.1 * low_spread + 0.7 * scored['mean norm'] + 0.1 * low_variation + 0.1 * low_kurtosis
    scored['cell score'] = weighted * scored['count']

    ranked = scored.sort_values(
        by=['count', 'cell score', 'cell', 'gene'],
        ascending=[False, False, False, True],
    )
    return ranked.reset_index(drop=True)

In [21]:
def run_data_processing(cluster, ensembl_ids, target_uberon_id):
    """
    Runs the full expression pipeline for one cluster.

    :param cluster: Cluster label, forwarded to the fetch step for its error messages.
    :param ensembl_ids: Ensembl IDs of the cluster's genes.
    :param target_uberon_id: Tissue identifier used to filter the fetched rows.
    :return: Ranked DataFrame, or an empty DataFrame (no columns) when the
        fetch failed or any later step raised.
    """
    try:
        # Step 1: Fetch expression data
        data = fetch_expression_data(cluster, ensembl_ids)
        if data is None:
            # The fetch step already reported the failure; stop here instead of
            # producing a second, misleading error from the conversion step.
            return pd.DataFrame()

        # Step 2: Convert data to DataFrame
        response_df = expression_data_to_df(data)

        # Step 3: Filter data based on target UBERON ID
        filtered_df = filter_expression_data(response_df, target_uberon_id)

        # Step 4: Translate ontology terms
        translated_df = translate_ontology(filtered_df)

        # Step 5: Transform the results
        transformed_df = transform_results(translated_df)

        # Step 6: Rank the cells based on the calculated score
        ranked_df = calculate_cell_score(transformed_df)

        return ranked_df

    except Exception as e:
        # Best-effort by design: one failing cluster must not abort the whole run
        print(f"An error occurred during the pipeline execution: {e}")
        return pd.DataFrame()

In [22]:
def main_data_analysis(ensembl_df, target_uberon_id):
    """
    Runs the expression pipeline once per cluster column.

    :param ensembl_df: DataFrame with one column of Ensembl IDs per cluster;
        missing values are ignored.
    :param target_uberon_id: Tissue identifier passed on to the pipeline.
    :return: Dict mapping each cluster label to its result DataFrame (index
        reset). Clusters without any Ensembl ID are left out.
    """
    per_cluster = {}

    for cluster in ensembl_df.columns:
        # Padding / untranslated genes show up as missing values; skip them
        ids_in_cluster = ensembl_df[cluster].dropna().tolist()

        if ids_in_cluster:
            ranked = run_data_processing(cluster, ids_in_cluster, target_uberon_id)
            per_cluster[cluster] = ranked.reset_index(drop=True)

    return per_cluster

Function to Display the Results

In [23]:
from IPython.display import Markdown, display
def display_dataframe_as_markdown(df):
    """
    Renders a DataFrame in the notebook as a Markdown table, without its index.

    :param df: DataFrame to show.
    """
    display(Markdown(df.to_markdown(index=False)))

## Run the Section

In [24]:
# UBERON tissue ID used to filter the expression data: only rows whose
# 'tissue' value equals this ID are kept by filter_expression_data.
# NOTE(review): UBERON:0002097 is presumably "skin of body" — confirm against the ontology.
TARGET_UBERON_ID = "UBERON:0002097"

In [25]:
# Run the main pipeline: one API request per non-empty cluster column.
# `results` maps each cluster label to its ranked DataFrame; a cluster whose
# processing failed is stored as an empty DataFrame.
results = main_data_analysis(ensembl_df, TARGET_UBERON_ID)

In [26]:
def export_dataframe(df, top_n=20):
    """
    Summarises the first ``top_n`` distinct cells of a ranked result frame.

    :param df: Ranked frame with 'cell', 'gene' and 'cell score' columns. A
        frame without rows, or the column-less frame the pipeline returns on
        failure, is accepted and gives an empty summary.
    :param top_n: Number of distinct cells to keep, in order of first appearance.
    :return: DataFrame with the columns 'cells', 'genes' (unique genes of the
        cell joined with ", ") and 'cell score' (unique scores of the cell,
        converted to text and joined with ", ").
    """
    summary_columns = ['cells', 'genes', 'cell score']
    if 'cell' not in df.columns:
        return pd.DataFrame(columns=summary_columns)

    summary_rows = []
    for cell in df['cell'].unique()[:top_n]:
        cell_rows = df[df['cell'] == cell]
        summary_rows.append({
            'cells': cell,
            'genes': ", ".join(map(str, cell_rows['gene'].unique())),
            'cell score': ", ".join(map(str, cell_rows['cell score'].unique())),
        })

    # Passing columns keeps the schema stable even when there are no rows
    return pd.DataFrame(summary_rows, columns=summary_columns)

In [27]:
final_df = export_dataframe(results[0])
display_dataframe_as_markdown(final_df)

| cells                                     | genes                                                               |   cell score |
|:------------------------------------------|:--------------------------------------------------------------------|-------------:|
| epithelial cell                           | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      7.67564 |
| epidermal cell                            | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      6.94493 |
| squamous epithelial cell                  | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      6.94387 |
| stratified squamous epithelial cell       | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      6.93894 |
| stratified epithelial cell                | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      6.93894 |
| keratinocyte                              | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      6.93894 |
| basal cell of epidermis                   | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      5.20789 |
| stem cell of epidermis                    | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      5.20348 |
| stem cell                                 | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      5.05776 |
| epithelial fate stem cell                 | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      5.03504 |
| basal cell                                | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      5.03504 |
| progenitor cell                           | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      4.9887  |
| precursor cell                            | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      4.91531 |
| prickle cell                              | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      4.3003  |
| skin fibroblast                           | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      4.22101 |
| columnar/cuboidal epithelial cell         | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      4.10285 |
| neuron associated cell (sensu Vertebrata) | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      4.10111 |
| Merkel cell                               | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      4.10111 |
| connective tissue cell                    | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      4.05615 |
| fibroblast                                | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      3.98718 |

In [105]:
def get_top3_cells(df, n=3):
    """
    Returns the first ``n`` distinct values of the 'cell' column.

    :param df: Ranked result frame. The column-less frame the pipeline returns
        on failure is accepted and gives an empty array.
    :param n: Number of distinct cells to return (3 by default).
    :return: Array of at most ``n`` cell names, in order of first appearance.
    """
    # DataFrame.get falls back to an empty Series when 'cell' is absent
    cells = df.get('cell', pd.Series(dtype=object))
    return cells.unique()[:n]

In [59]:
def export_top3_cells(results):
    """
    Builds a table with one row per cluster listing its top cells.

    Rows are assembled individually and padded with ``None``, so clusters that
    yield fewer cells than others no longer break the table with a length
    mismatch.

    :param results: Dict mapping cluster labels to ranked result frames.
    :return: DataFrame with a 'Cluster' column (label as text) followed by
        'Cell 1', 'Cell 2', ... columns.
    """
    top_cells = {f'{cluster}': list(get_top3_cells(results[cluster])) for cluster in results}

    # Widest row decides how many 'Cell i' columns exist
    width = max(map(len, top_cells.values()), default=0)

    rows = [
        [cluster] + cells + [None] * (width - len(cells))
        for cluster, cells in top_cells.items()
    ]
    return pd.DataFrame(rows, columns=['Cluster'] + [f'Cell {i+1}' for i in range(width)])

In [69]:
# NOTE(review): the saved output of this cell lists five cells per cluster,
# while get_top3_cells keeps three — the output looks stale (the execution
# counts of these cells are also out of order). Re-run after Restart & Run All.
display_dataframe_as_markdown(export_top3_cells(results))

|   Cluster | Cell 1                 | Cell 2                              | Cell 3                              | Cell 4                              | Cell 5                                 |
|----------:|:-----------------------|:------------------------------------|:------------------------------------|:------------------------------------|:---------------------------------------|
|         0 | epithelial cell        | epidermal cell                      | squamous epithelial cell            | stratified squamous epithelial cell | stratified epithelial cell             |
|         1 | hematopoietic cell     | leukocyte                           | mononuclear cell                    | epithelial cell                     | connective tissue cell                 |
|         2 | epithelial cell        | epidermal cell                      | squamous epithelial cell            | stratified squamous epithelial cell | stratified epithelial cell             |
|         3 | epidermal cell         | stratified squamous epithelial cell | stratified epithelial cell          | keratinocyte                        | squamous epithelial cell               |
|         4 | connective tissue cell | fibroblast                          | skin fibroblast                     | neuron associated cell              | neural cell                            |
|         5 | epithelial cell        | squamous epithelial cell            | epidermal cell                      | stratified squamous epithelial cell | stratified epithelial cell             |
|         6 | connective tissue cell | fibroblast                          | epithelial cell                     | endothelial cell                    | skin fibroblast                        |
|         7 | epidermal cell         | stratified squamous epithelial cell | stratified epithelial cell          | keratinocyte                        | squamous epithelial cell               |
|         8 | epidermal cell         | stratified squamous epithelial cell | stratified epithelial cell          | keratinocyte                        | squamous epithelial cell               |
|         9 | epithelial cell        | squamous epithelial cell            | stratified squamous epithelial cell | stratified epithelial cell          | keratinocyte                           |
|        10 | connective tissue cell | fibroblast                          | skin fibroblast                     | myofibroblast cell                  | contractile cell                       |
|        11 | connective tissue cell | fibroblast                          | skin fibroblast                     | myofibroblast cell                  | contractile cell                       |
|        12 | connective tissue cell | fibroblast                          | skin fibroblast                     | myofibroblast cell                  | vascular associated smooth muscle cell |
|        13 | connective tissue cell | fibroblast                          | skin fibroblast                     | stromal cell                        | contractile cell                       |
|        14 | epithelial cell        | connective tissue cell              | squamous epithelial cell            | fibroblast                          | endothelial cell                       |
|        15 | connective tissue cell | fibroblast                          | skin fibroblast                     | epithelial cell                     | squamous epithelial cell               |
|        16 | epithelial cell        | stratified squamous epithelial cell | stratified epithelial cell          | keratinocyte                        | epidermal cell                         |
|        17 | connective tissue cell | fibroblast                          | skin fibroblast                     | hematopoietic cell                  | leukocyte                              |
|        18 | sebaceous gland cell   | connective tissue cell              | lymphocyte                          | regulatory T cell                   | T cell                                 |
|        19 | hematopoietic cell     | leukocyte                           | mononuclear cell                    | lymphocyte                          | professional antigen presenting cell   |
|        20 | epithelial cell        | epidermal cell                      | stratified squamous epithelial cell | stratified epithelial cell          | keratinocyte                           |
|        21 | hematopoietic cell     | leukocyte                           | mononuclear cell                    | myeloid cell                        | phagocyte                              |
|        22 | connective tissue cell | fibroblast                          | skin fibroblast                     | neoplastic cell                     | malignant cell                         |
|        23 | hematopoietic cell     | squamous epithelial cell            | stratified squamous epithelial cell | stratified epithelial cell          | keratinocyte                           |
|        24 | epithelial cell        | epidermal cell                      | stratified squamous epithelial cell | stratified epithelial cell          | keratinocyte                           |
|        25 | connective tissue cell | fibroblast                          | skin fibroblast                     | myofibroblast cell                  | mesenchymal stem cell                  |
|        26 | connective tissue cell | fibroblast                          | skin fibroblast                     | smooth muscle cell                  | muscle cell                            |

# Analysis using OpenAI and Tavily

In [4]:
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_community.utilities.tavily_search import TavilySearchAPIWrapper
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
from langchain_core.runnables import RunnableConfig, chain

In [5]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import chain
from tavily import TavilyClient
import ast
import asyncio

In [6]:
from langchain_openai import ChatOpenAI

In [45]:
from dotenv import load_dotenv
load_dotenv()

True

In [79]:
# Gemini chat model.
# NOTE(review): `llm` is rebound to ChatOpenAI in the next cell, so which model
# the agent ends up using depends on which of the two cells ran last.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash", 
    temperature=0.0
)

In [96]:
# OpenAI chat model.
# NOTE(review): this rebinds `llm`, replacing the Gemini model assigned in the previous cell.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

In [77]:
tavily = TavilyClient()

In [70]:
final_df = export_dataframe(results[0])
display_dataframe_as_markdown(final_df)

| cells                                     | genes                                                               |   cell score |
|:------------------------------------------|:--------------------------------------------------------------------|-------------:|
| epithelial cell                           | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      7.67564 |
| epidermal cell                            | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      6.94493 |
| squamous epithelial cell                  | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      6.94387 |
| stratified squamous epithelial cell       | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      6.93894 |
| stratified epithelial cell                | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      6.93894 |
| keratinocyte                              | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      6.93894 |
| basal cell of epidermis                   | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      5.20789 |
| stem cell of epidermis                    | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      5.20348 |
| stem cell                                 | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      5.05776 |
| epithelial fate stem cell                 | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      5.03504 |
| basal cell                                | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      5.03504 |
| progenitor cell                           | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      4.9887  |
| precursor cell                            | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      4.91531 |
| prickle cell                              | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      4.3003  |
| skin fibroblast                           | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      4.22101 |
| columnar/cuboidal epithelial cell         | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      4.10285 |
| neuron associated cell (sensu Vertebrata) | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      4.10111 |
| Merkel cell                               | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      4.10111 |
| connective tissue cell                    | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      4.05615 |
| fibroblast                                | COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LGALS3BP, RBM42, SCPEP1 |      3.98718 |

In [74]:
final_df.head(1)

Unnamed: 0,cells,genes,cell score
0,epithelial cell,"COL17A1, EFNA1, GAMT, HSPA1A, IFI27, KRT14, LG...",7.675640368487279


In [97]:
# Web-search tool handed to the agent.
# NOTE(review): presumably reads the Tavily API key from the environment
# populated by load_dotenv() — confirm.
search = TavilySearchAPIWrapper()
tavily_tool = TavilySearchResults(api_wrapper=search)

# initialize the agent
# NOTE(review): the saved `resp` outputs show 'chat_history' accumulating over
# successive invoke() calls, so earlier questions are part of later requests;
# recreate the agent when independent answers per query are needed.
agent = create_conversational_retrieval_agent(
    llm,
    tools=[tavily_tool],
)

In [81]:
def generate_queries(cell_name, gene_names: list):
    """
    Builds the yes/no question sent to the agent for one cell type.

    :param cell_name: Name of the candidate cell type.
    :param gene_names: Gene symbols; inserted using the list's default text form.
    :return: The question as a single string.
    """
    template = "answer with yes or no. Is {cell_name} related to this genes: {gene_names}?"
    return template.format(cell_name=cell_name, gene_names=gene_names)

In [88]:
cell_names = results[0]['cell'].unique()[:20].tolist()

In [90]:
gene_names = results[0][results[0]['cell'] == cell_names[1]]['gene'].tolist()

In [91]:
gene_names

['COL17A1',
 'EFNA1',
 'GAMT',
 'HSPA1A',
 'IFI27',
 'KRT14',
 'LGALS3BP',
 'RBM42',
 'SCPEP1']

In [92]:
cell_names

['epithelial cell',
 'epidermal cell',
 'squamous epithelial cell',
 'stratified squamous epithelial cell',
 'stratified epithelial cell',
 'keratinocyte',
 'basal cell of epidermis',
 'stem cell of epidermis',
 'stem cell',
 'epithelial fate stem cell',
 'basal cell',
 'progenitor cell',
 'precursor cell',
 'prickle cell',
 'skin fibroblast',
 'columnar/cuboidal epithelial cell',
 'neuron associated cell (sensu Vertebrata)',
 'Merkel cell',
 'connective tissue cell',
 'fibroblast']

In [94]:
test = generate_queries(cell_names[1], gene_names)
test

"answer with yes or no. Is epidermal cell related to this genes: ['COL17A1', 'EFNA1', 'GAMT', 'HSPA1A', 'IFI27', 'KRT14', 'LGALS3BP', 'RBM42', 'SCPEP1']?"

In [100]:
# run the agent
resp = agent.invoke(test)

In [104]:
resp

{'input': "answer with yes or no. Is epidermal cell related to this genes: ['COL17A1', 'EFNA1', 'GAMT', 'HSPA1A', 'IFI27', 'KRT14', 'LGALS3BP', 'RBM42', 'SCPEP1']?",
 'chat_history': [HumanMessage(content="answer with yes or no. Is epidermal cell related to this genes: ['COL17A1', 'EFNA1', 'GAMT', 'HSPA1A', 'IFI27', 'KRT14', 'LGALS3BP', 'RBM42', 'SCPEP1']?", additional_kwargs={}, response_metadata={}),
  AIMessage(content='Yes.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content="answer with yes or no. Is epidermal cell related to this genes: ['COL17A1', 'EFNA1', 'GAMT', 'HSPA1A', 'IFI27', 'KRT14', 'LGALS3BP', 'RBM42', 'SCPEP1']?", additional_kwargs={}, response_metadata={}),
  AIMessage(content='Yes.', additional_kwargs={}, response_metadata={})],
 'output': 'Yes.',
 'intermediate_steps': []}

In [66]:
resp

{'input': "answer with yes or no. Is squamous epithelial cell related to this genes: ['ARG1', 'FLG2', 'KPRP', 'LCE1A', 'LCE1B', 'LCE1E', 'LCE2B', 'LCE2C', 'WFDC12']?",
 'chat_history': [HumanMessage(content="answer with yes or no. Is hematopoietic cell related to this genes: ['ARG1', 'FLG2', 'KPRP', 'LCE1A', 'LCE1B', 'LCE1E', 'LCE2B', 'LCE2C', 'WFDC12']?", additional_kwargs={}, response_metadata={}),
  AIMessage(content='No.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content="answer with yes or no. Is squamous epithelial cell related to this genes: ['ARG1', 'FLG2', 'KPRP', 'LCE1A', 'LCE1B', 'LCE1E', 'LCE2B', 'LCE2C', 'WFDC12']?", additional_kwargs={}, response_metadata={}),
  AIMessage(content='Yes.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content="answer with yes or no. Is squamous epithelial cell related to this genes: ['ARG1', 'FLG2', 'KPRP', 'LCE1A', 'LCE1B', 'LCE1E', 'LCE2B', 'LCE2C', 'WFDC12']?", additional_kwargs={}, response_metadata={}),

# Analysis using LangChain

In [4]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain_core.output_parsers import JsonOutputParser

In [56]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (presumably the OpenAI API key used by the cells below - confirm).
load_dotenv()

True

In [30]:
# Chat model shared by the LangChain chains in this section: pinned model
# snapshot, temperature 0, at most two retries, and no explicit token cap or
# timeout.
llm = ChatOpenAI(
    model="gpt-4o-mini-2024-07-18", temperature=0, max_retries=2,
    max_tokens=None, timeout=None,
)

In [None]:
# JSON output schema handed to the LangChain JSON parser.
# Plain comments are used instead of docstrings so that no extra description
# text ends up in the schema generated from these models.
# NOTE(review): a second class named CellAnnotation is defined later in the
# notebook (OpenAI section) and shadows this one once that cell has run.
class CellAnnotation(BaseModel):
    # One candidate cell type together with the justification for picking it.
    class cell(BaseModel):
        cell_type: str = Field(description="name of the chosen cell.")
        reason: str = Field(
            description="explain the reasoning for choosing the cell. give comments about parameters from the data that support the choice."
        )

    # Step-by-step reasoning, one list entry per step.
    explanation: list[str] = Field(
        description="explanation of your reasoning step by step. focus on the parameters and number of genes expressed that support the choice of the cell."
    )
    # Candidate cell types proposed for the cluster.
    cells: list[cell]
  

In [33]:
# Alias (not a copy) of the marker-gene table loaded from the CSV above;
# it is passed to the annotation chain as `initial_df`.
initial_df = df

In [None]:
# Methodology prompt: the template contains no placeholders, so it asks the
# model to outline a scoring procedure rather than to analyze supplied data.
# NOTE(review): both `prompt` and `chain` are reassigned by the next cell.
prompt = ChatPromptTemplate.from_template("""
                                          
You are an expert in analyzing single-cell RNA sequencing (scRNA-seq) data. Your goal is to identify the top 5 most similar cell types to a given, unannotated cell cluster, based on a limited dataset.

The dataset is structured as follows:

Each row represents a known cell type expressing top genes from a cluster. The columns are: "Gene" (top gene expressed in the cluster), "Cell Name" (the known cell type expressing the gene), "Expression Value" (level of expression), "Cell Percentage" (percentage of cells expressing the gene), "Cell Count" (number of cells), and "Tissue Composition" (percentage of cells in the tissue).

Given this dataset, outline the steps to create and apply a scoring system, then finally return a table of top 5 possible cell type identities for the cluster, and its calculated value.

Ensure that:
1. Describe each row of the dataset, and what it represents.
2. What metric is being used, and what value do they indicate.
3. The steps for scoring each possible cell type.
4. How to obtain the top five possible candidates.                                      
""")

# Pipe the prompt into the chat model and parse the reply with StrOutputParser.
chain = prompt | llm | StrOutputParser()

In [None]:
# JSON parser bound to the CellAnnotation schema; it also supplies the format
# instructions embedded in the prompt below.
parser = JsonOutputParser(pydantic_object=CellAnnotation)

# Annotation prompt: the caller fills {initial_df} and {expression_data};
# {format_instructions} is pre-filled from the parser via partial_variables.
prompt = PromptTemplate(
    template="""You are an expert in single-cell RNA sequencing.

        I have a cluster of cells with its genes as follow

        {initial_df}

        with the gene expression of each cell

        {expression_data}

        Based on the given data, which cell is the most valid to be annotated for this cluster of cells according to you?
        
        \n{format_instructions}""",
    input_variables=["initial_df", "expression_data"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Pipeline: prompt -> chat model -> JSON parser.
chain = prompt | llm | parser

In [103]:
# Render BOTH tables as tab-separated text before templating. Interpolating a
# DataFrame object directly makes the prompt use str(df), which pandas
# shortens with "..." once the frame exceeds its display limits, so the model
# would silently see only part of the marker table.
response = chain.invoke({
    'initial_df': initial_df.to_csv(sep='\t', index=False),
    'expression_data': results[0].to_csv(sep='\t', index=False),
})

In [104]:
# Display the parsed output returned by the LangChain annotation chain.
response

{'cell': 'stem cell of epidermis, stem cell, basal cell of epidermis, precursor cell, progenitor cell, epithelial fate stem cell, basal cell, columnar/cuboidal epithelial cell, neuron associated cell (sensu Vertebrata), Merkel cell',
 'explanation': 'The provided data strongly suggests that the cluster of cells is predominantly composed of epidermal cells.  The genes KRT14, which is highly expressed in keratinocytes (epidermal cells), is the most highly expressed gene in the cluster.  The cells listed above are all epidermal cell types or closely related stem/progenitor cells.  The high expression of KRT14, coupled with the presence of other epidermal-related cell types in the list, makes this the most likely annotation for the cluster.  The high cell counts and percentages for these cells further support this conclusion.  While other cell types might express some of these genes, the overall pattern points to a significant epidermal cell population.'}

# Analysis using OpenAI

In [None]:
# MODEL = "o3-mini"

In [28]:
from dotenv import load_dotenv

# Pull variables from a local .env file into the environment, then build the
# OpenAI client from the OPENAI_API_KEY entry (None when it is not set).
load_dotenv()
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

In [37]:
# Alias (not a copy) of the marker-gene table loaded from the CSV above.
initial_df = df

In [279]:
# Prompt variant: four-step scoring-system instructions (GeneScore ->
# CellTypeScore) asking for the top 5 cell types.
# NOTE(review): two later cells also assign `instructions`; whichever cell was
# executed last is what the request cell sends, so on a top-to-bottom run this
# value is overwritten before it is used.
instructions = """
You are an expert in analyzing single-cell RNA sequencing (scRNA-seq) data. Your goal is to identify the top 5 most similar cell types to a given, unannotated cell cluster, based on a limited dataset. You will do this by creating and applying a scoring system.

The dataset is structured as follows:

Each row represents the cells that strongly express a top genes from the cluster. The columns are: "Gene" (a top gene expressed in the cluster), "Cell Name" (the known cell type expressing the gene), "Expression Value" (level of expression), "Cell Percentage" (percentage of cells expressing the gene), "Cell Count" (number of cells), and "Tissue Composition" (percentage of cells in the tissue).

Follow these steps precisely to prepare the information, which will be automatically formatted into JSON:

**Step 1: Data Understanding**

*   Explain what each row in the dataset represents. Specifically, explain that each row shows a potential connection between the top gene from the cluster and one known cell type, and the metrics for THAT cell type, not for the cluster itself.
*   Explain the meaning of each column: "Gene", "Cell Name", "Expression Value", "Cell Percentage", "Cell Count", and "Tissue Composition." For each, explain what a higher or lower value might indicate *in the context of identifying similarity to the unannotated cluster*.

**Step 2: Develop a Scoring Function (CellTypeScore)**

*   Explain that the goal is to assign each known "Cell Name" (cell type) a score that reflects its similarity to the unannotated cluster.
*   Propose a formula for calculating a `GeneScore` for each *Gene* within each *Cell Name*. This formula should incorporate both "Cell Percentage" and "Expression Value". Justify why you chose to include these two metrics and how they are combined.
*   Propose a formula for combining the `GeneScore` values across all the `Gene` entries for a given "Cell Name" to calculate a final overall `CellTypeScore` for that cell type.

**Step 3: Calculate CellTypeScore for All Cell Names**

*   Explain that the goal is to now apply your scoring system to the known "Cell Name" values and find out which ones are the highest, with relation to what each Gene expresses.
*   Describe in precise steps what it should do: go through each unique "Cell Name" in the dataset, calculate a `GeneScore` and `CellTypeScore` based on Gene, Expression Value, and the percentage that are expressed.

**Step 4: Identify Top 5 Candidates**

*   Explain how to identify the top 5 candidates from the `CellTypeScore` values, now that they have been calculated in Step 3.
"""

In [73]:
# Prompt variant: report-style ranking of the top 3 cell types, prioritizing
# how many cluster genes a cell type expresses.
# NOTE(review): `instructions` is also assigned in the neighbouring cells; the
# assignment executed last is the one the request cell sends.
instructions ="""
You are an expert in analyzing single-cell RNA sequencing (scRNA-seq) data. Your goal is to identify the top 3 most similar cell types to a given, unannotated cell cluster, based on a limited dataset.

The dataset is structured as follows:

Each row represents a known cell type expressing top genes from a cluster. The columns are: "Gene" (top gene expressed in the cluster), "Cell Name" (the known cell type expressing the gene), "Expression Value" (level of expression), "Cell Percentage" (percentage of cells expressing the gene), "Cell Count" (number of cells), and "Tissue Composition" (percentage of cells in the tissue).

Given this dataset, analyze the dataset to determine similarity, then finally return a list of top possible cell type identities for the cluster.

Ensure that:
1. Never make any assumption. If a gene is not expressed, it should be treated as such and not used for consideration.
2. Understand each row of the data and what it represents.
3. Prioritize cell types that's expressing more genes from the cluster, then consider the cell count, cell percentage, and expression value.
4. The steps for determining each possible cell type.
5. How to obtain the top three possible candidates.
6. give the explanation as if writing a report
"""

In [269]:
# Prompt variant: three-step, report-style analysis for the top 3 cell types
# with explicit exclusion rules.
# NOTE(review): this is the last `instructions` assignment in document order,
# so it is the one in effect on a top-to-bottom run, replacing the two
# variants defined in the cells above.
instructions ="""
You are an expert in analyzing single-cell RNA sequencing (scRNA-seq) data. Your goal is to identify the top 3 most similar cell types to a given, unannotated cell cluster, based on a limited dataset.

The dataset is structured as follows:
Each row represents a known cell type expressing top genes from a cluster. The columns are: "Gene" (top gene expressed in the cluster), "Cell Name" (the known cell type expressing the gene), "Expression Value" (level of expression), "Cell Percentage" (percentage of cells expressing the gene), "Cell Count" (number of cells), and "Tissue Composition" (percentage of cells in the tissue).
Given this dataset, analyze the dataset to determine similarity, then finally return a list of top possible cell type identities for the cluster. The dataset does not contain any representation of the unannotated cell cluster.

Cell similiarity is determined by the number of genes expressed in the cell then the cell count, cell percentage, and expression value. The more genes expressed in a cell, the higher the cell count, cell percentage, and expression value, the more similar the cell is to the unannotated cell cluster. the number of genes expressed in the cell is the most important factor in determining the similarity of the cell to the unannotated cell cluster.

Follow these steps precisely to prepare the information
Step 1 : Read the whole dataset
Step 2 : Explain to me what do you think about each cell's similiarity to the unannotated cell cluster based on the given data.
Step 3 : Based on you analysis, what are the top 3 possible cell type identities for the cluster?

Ensure that:
1. Never make any assumption. If a gene is not expressed, it should be treated as such and not used for consideration.
2. Do not consider cells that expressing lesser amount of genes from the cluster.
3. Do not consider cells that express a gene with parameters that deviate too much from other genes expressed by that cell.
4. Explanations must be written like a report.
"""

In [103]:
# JSON output schema passed as `response_format` to the OpenAI structured
# completion request.
# Plain comments are used instead of docstrings so that no extra description
# text ends up in the schema generated from these models.
# NOTE(review): this redefines the CellAnnotation class declared earlier in
# the notebook (LangChain section).
class CellAnnotation(BaseModel):
    # One ranked candidate cell type with its supporting genes.
    class top_cell(BaseModel):
        # A single cluster gene expressed by the candidate, with its metrics.
        class gene(BaseModel):
            gene_name: str = Field(description="Name of the gene expressed by the cell type")
            metrics: str = Field(description="cell count, cell percentage, and expression value of the gene")

        cell_type: str = Field(description="name of the chosen cell.")
        gene_info: list[gene]
        reason: str = Field(
            description="explain the reasoning for choosing the cell. give comments about parameters from the data that support the choice."
        )

    # Step-by-step reasoning, one list entry per step.
    explanation: list[str] = Field(
        description="explanation of your reasoning step by step. focus on the parameters and number of genes expressed that support the choice of the cell."
    )
    # Candidate cell types proposed for the cluster.
    cells: list[top_cell]

In [270]:
# Preview the first expression table rendered as tab-separated text.
print(results[0].to_csv(sep='\t', index=False))

cell	gene	expression	cell count	cell percentage	tissue composition	score	std	kurtosis	mean	CV
CD8-positive, alpha-beta T cell	RBM42	1.950844832292692	254	0.0186545240893067	0.0003493843819679144	9.243588807299892	3.6248595516837985	4.631287732326113	2.821587956603296	1.2846877742019789
CD8-positive, alpha-beta T cell	KRT14	1.8208990097045898	56	0.0047317279256442754	7.702962752048506e-05	0.48249592686384396	3.6248595516837985	4.631287732326113	2.821587956603296	1.2846877742019789
CD8-positive, alpha-beta T cell	GAMT	1.7516368865966796	40	0.022459292532285232	5.502116251463219e-05	1.5736210098566463	3.6248595516837985	4.631287732326113	2.821587956603296	1.2846877742019789
CD8-positive, alpha-beta T cell	LGALS3BP	1.713193620954241	112	0.0092295014421096	0.00015405925504097012	1.770935375463535	3.6248595516837985	4.631287732326113	2.821587956603296	1.2846877742019789
CD8-positive, alpha-beta T cell	IFI27	1.6771648889300468	87	0.0071090047393364926	0.00011967102846932501	1.0372986635325645

In [280]:
# Build the TSV outside the f-string: a backslash ('\t') inside an f-string
# replacement field is a SyntaxError on Python < 3.12, so the original
# one-liner only runs on 3.12+.
# The 'tissue composition' column is dropped before the table is sent.
# NOTE(review): index 23 is hard-coded - presumably the cluster under study; confirm.
expression_tsv = (
    results[23]
    .drop(columns=['tissue composition'])
    .to_csv(sep='\t', index=False)
)
content = f"expression data : {expression_tsv}"

In [281]:
# Standard (non-reasoning) model request: the developer message carries the
# instructions, the user message carries the expression table, and the reply
# is parsed into the CellAnnotation schema.
completion = client.beta.chat.completions.parse(
    model="gpt-4o-mini-2024-07-18",
    temperature=0.0,
    response_format=CellAnnotation,
    messages=[
        dict(role="developer", content=instructions),
        dict(role="user", content=content),
    ],
)

In [None]:
# Reasoning-model request: same user content, but the answer format is only
# described in the developer prompt (no response_format schema is passed).
# NOTE(review): the saved output below is a 404 model_not_found error for
# "o3-mini", so this cell stops a Restart & Run All unless the account has
# access to that model.
# NOTE(review): `response` is also the name used for the LangChain result
# earlier in the notebook.
response = client.chat.completions.create(
    model = "o3-mini",
    reasoning_effort="medium",
    messages=[
        {"role": "developer", 
         "content": """
            - You are an expert in single-cell RNA sequencing. 
            - You will be provided with cell clustering data and gene expression data for each cell. 
            - Based on the given data, you need to identify the 4 most valid cell to be annotated for the cluster. 
            - Provide a list of cells in the order of validity and explain your reasoning step by step. only give out the name of the cells from the given data.
            - Analyze only the data provided. Do not reference or infer from any external sources
            - the answer is in the format of 
                {
                cells : [cell1, cell2, cell3, cell4],
                explanation : <explanation>
                }
            """},
        {"role": "user", 
         "content": content}
    ],
)

NotFoundError: Error code: 404 - {'error': {'message': 'The model `o3-mini` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}

In [282]:
# Print the parsed structured response as JSON indented by 4 spaces.
print(completion.choices[0].message.parsed.model_dump_json(indent=4))

{
    "explanation": [
        "Each row in the dataset represents a known cell type and its expression of a specific gene, along with various metrics that describe the expression level and prevalence of that gene in the cell type.",
        "The 'Gene' column indicates the specific gene being analyzed, which is crucial for understanding the biological functions associated with the cell type.",
        "The 'Cell Name' column identifies the known cell type that expresses the gene, allowing for comparison with the unannotated cluster.",
        "The 'Expression Value' column shows the level of gene expression; higher values suggest stronger expression, which may indicate a more significant role of that gene in the cell type's function.",
        "The 'Cell Percentage' column indicates the proportion of cells expressing the gene; a higher percentage suggests that the gene is more commonly expressed within that cell type, which may correlate with its relevance to the unannotated cluster."

In [39]:
import json

# Convert the parsed response straight to a JSON-compatible dict.
# model_dump(mode="json") yields the same structure as serializing to a JSON
# string and parsing it back, without the extra round-trip.
output_dict = completion.choices[0].message.parsed.model_dump(mode="json")

In [40]:
# Display the list of candidate cell types from the parsed response.
# NOTE(review): the saved output below contains a 'rank' key that the
# CellAnnotation schema in this notebook does not declare, so it appears to
# come from an earlier schema version - confirm on re-run.
output_dict['cells']

[{'rank': 1,
  'cell_type': 'T cell',
  'reason': 'The T cell type expresses 5 genes from the cluster: KRT14 (1.929), IFI27 (1.713), LGALS3BP (1.698), RBM42 (1.662), and GAMT (1.604). It has a high cell count of 3174, a cell percentage of 0.0336, and a tissue composition of 0.0044, indicating a strong presence in the tissue.'},
 {'rank': 2,
  'cell_type': 'CD8-positive, alpha-beta T cell',
  'reason': 'This cell type expresses 5 genes: RBM42 (1.950), KRT14 (1.820), GAMT (1.751), LGALS3BP (1.713), and IFI27 (1.677). It has a cell count of 254, a cell percentage of 0.0187, and a tissue composition of 0.0003, showing a significant expression of relevant genes.'},
 {'rank': 3,
  'cell_type': 'CD4-positive, alpha-beta T cell',
  'reason': 'This cell type expresses 4 genes: IFI27 (2.030), LGALS3BP (1.680), RBM42 (1.637), and GAMT (1.554). It has a cell count of 91, a cell percentage of 0.0247, and a tissue composition of 0.0001, indicating a relevant but slightly lower expression compared to

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Demo: map numeric values onto a colormap and store the resulting hex codes.
# The frame is named `color_df` rather than `df` so that this sample data does
# not overwrite the notebook-wide `df` (the marker-gene table that
# `initial_df = df` reads) when cells are run out of order.
data = {'Values': [5, 3, 8, 2, 9, 1, 7, 4, 6, 5, 3, 2]}
color_df = pd.DataFrame(data)

# Normalize over the observed value range (1..9 for this sample) instead of
# hard-coding the bounds.
norm = mcolors.Normalize(
    vmin=color_df['Values'].min(),
    vmax=color_df['Values'].max(),
)
# Reversed colormap: low values map to dark colours, high values to light ones
# (other options: 'Blues_r', 'Greys_r', etc.).
cmap = plt.get_cmap('YlGnBu_r')

# Map each value to its hex colour code
color_df['Color'] = color_df['Values'].map(
    lambda value: mcolors.to_hex(cmap(norm(value)))
)

# Show the values next to their hex colour codes
print(color_df)

    Values    Color
0        5  #42b6c4
1        3  #225ea8
2        8  #edf8b2
3        2  #253494
4        9  #ffffd9
5        1  #081d58
6        7  #c8e9b4
7        4  #1d91c0
8        6  #80cebb
9        5  #42b6c4
10       3  #225ea8
11       2  #253494
