In [7]:
import scanpy as sc
adata = sc.read_h5ad("C:/Users/ellio/Downloads/GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad")

In [6]:
print(adata.obs['tissue'].unique())

['skeletalmuscle', 'breast', 'esophagusmucosa', 'esophagusmuscularis', 'heart', 'lung', 'prostate', 'skin']
Categories (8, object): ['breast', 'esophagusmucosa', 'esophagusmuscularis', 'heart', 'lung', 'prostate', 'skeletalmuscle', 'skin']


In [26]:
processed_breast_adata = process_tissue_data(adata, 'breast')

Breast Broad cell types:
['Epithelial cell (luminal)', 'Adipocyte', 'Immune (DC/macrophage)', 'Myoepithelial (basal)', 'Fibroblast', 'Endothelial cell (vascular)', 'Endothelial cell (lymphatic)', 'Pericyte/SMC']
Categories (8, object): ['Adipocyte', 'Endothelial cell (lymphatic)', 'Endothelial cell (vascular)', 'Epithelial cell (luminal)', 'Fibroblast', 'Immune (DC/macrophage)', 'Myoepithelial (basal)', 'Pericyte/SMC']

Breast Granular cell types:
['Epithelial cell (luminal II)', 'Adipocyte', 'Epithelial cell (luminal I)', 'Immune (macrophage II)', 'Immune (macrophage I)', ..., 'Endothelial cell (vascular) I', 'Immune (macrophage III)', 'Endothelial cell (vascular) III', 'Pericyte/SMC II', 'Pericyte/SMC I']
Length: 14
Categories (14, object): ['Adipocyte', 'Endothelial cell (lymphatic)', 'Endothelial cell (vascular) I', 'Endothelial cell (vascular) II', ..., 'Immune (macrophage III)', 'Myoepithelial (basal)', 'Pericyte/SMC I', 'Pericyte/SMC II']

Breast Cell types level 2:
['Epithelial

  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()


In [60]:
import json
import csv

def json_to_csv(input_json_file, output_full_csv, output_summary_csv):
    # Function to write CSV files
    def write_csv(filename, headers, row_data):
        with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(headers)
            writer.writerows(row_data)

    # Read the JSON file
    with open(input_json_file, 'r') as json_file:
        data = json.load(json_file)

    # Prepare data for both CSV files
    full_data = []
    summary_data = []

    for true_cell_type, details in data.items():
        main_cell_type = details['analysis_result']['main_cell_type']
        sub_cell_types = ', '.join(details['analysis_result']['sub_cell_types'])
        marker_number = details['analysis_result']['num_markers']
        confidence_score = details['analysis_result']['confidence_score']
        conversation_history = ' | '.join([f"{entry[0]}: {entry[1]}" for entry in details['conversation_history']])
        
        full_data.append([true_cell_type, main_cell_type, sub_cell_types, 
                          marker_number, confidence_score, conversation_history])
        summary_data.append([true_cell_type, main_cell_type, sub_cell_types])

    # Write the full data CSV
    write_csv(output_full_csv, 
              ['True Cell Type', 'Predicted Main Cell Type', 'Predicted Sub Cell Types', 
               'Marker Number', 'Confidence Score', 'Conversation History'],
              full_data)

    # Write the summary data CSV
    write_csv(output_summary_csv,
              ['True Cell Type', 'Predicted Main Cell Type', 'Predicted Sub Cell Types'],
              summary_data)

    print(f"Two CSV files have been created:")
    print(f"1. {output_full_csv} (full data)")
    print(f"2. {output_summary_csv} (summary data)")

# Example usage:
# json_to_csv('cell_type_analysis_results_10.json', 'cell_type_analysis_results_full11.csv', 'cell_type_analysis_results_summary11.csv')

In [None]:
sc.tl.rank_genes_groups(adata, groupby=groupby, method='t-test', use_raw=False)

In [31]:
default_df, modified_df = analyze_markers(processed_breast_adata, 'Broad cell type')

  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Unnamed: 0,Broad cell type,Top 10 Markers
0,Adipocyte,"PRMT2, UBR2, CCND3, TAF8, C6orf132, GUCA1A, GU..."
1,Endothelial cell (lymphatic),"PRMT2, UBR2, CCND3, TAF8, C6orf132, GUCA1A, GU..."
2,Endothelial cell (vascular),"PRMT2, UBR2, CCND3, TAF8, C6orf132, GUCA1A, GU..."
3,Epithelial cell (luminal),"PRMT2, UBR2, CCND3, TAF8, C6orf132, GUCA1A, GU..."
4,Fibroblast,"PRMT2, UBR2, CCND3, TAF8, C6orf132, GUCA1A, GU..."
5,Immune (DC/macrophage),"PRMT2, UBR2, CCND3, TAF8, C6orf132, GUCA1A, GU..."
6,Myoepithelial (basal),"PRMT2, UBR2, CCND3, TAF8, C6orf132, GUCA1A, GU..."
7,Pericyte/SMC,"PRMT2, UBR2, CCND3, TAF8, C6orf132, GUCA1A, GU..."


In [29]:
##final version
import pandas as pd
import scanpy as sc




def analyze_markers(adata, groupby, n_genes=10):
    # Perform differential expression analysis
    sc.tl.rank_genes_groups(adata, groupby=groupby, method='t-test', use_raw=False)
    
    # Get the default Scanpy results
    default_markers = sc.get.rank_genes_groups_df(adata, group=None)
    
    # Modified version
    modified_markers = default_markers.copy()
    modified_markers = modified_markers.sort_values(['group', 'logfoldchanges'], ascending=[True, False])
    
    top_markers = modified_markers.groupby('group').apply(
        lambda x: ', '.join(x['names'].head(n_genes))
    ).reset_index()
    top_markers.columns = [groupby, f'Top {n_genes} Markers']
    
    return default_markers, top_markers

# List of annotation levels
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']

# Dictionaries to store results for each annotation level
default_results = {}
modified_results = {}

# Perform analysis for each annotation level
for level in annotation_levels:
    print(f"Analyzing {level}...")
    default_df, modified_df = analyze_markers(processed_breast_adata, level)
    default_results[level] = default_df
    modified_results[level] = modified_df
    print(f"Analysis for {level} completed.")

# Print and export results for each level
for level in annotation_levels:
    print(f"\n--- Results for {level} ---")
    
    print("\nDefault Scanpy output (first 10 rows):")
    print(default_results[level].head(10))
    
    print("\nModified output:")
    print(modified_results[level].head())
    
    # Export both DataFrames to separate CSV files
    default_results[level].to_csv(f"default_markersbreast_{level.replace(' ', '_').lower()}.csv", index=False)
    modified_results[level].to_csv(f"modified_markersbreast_{level.replace(' ', '_').lower()}.csv", index=False)
    
    print(f"Results for {level} exported to CSV files.")

print("\nAll analyses completed and results exported.")

Analyzing Broad cell type...


  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Broad cell type completed.
Analyzing Granular cell type...


  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Granular cell type completed.
Analyzing Cell types level 2...


  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Cell types level 2 completed.
Analyzing Cell types level 3...


  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Cell types level 3 completed.

--- Results for Broad cell type ---

Default Scanpy output (first 10 rows):
       group     names  scores  logfoldchanges  pvals  pvals_adj
0  Adipocyte     PRMT2     0.0             NaN    1.0        1.0
1  Adipocyte      UBR2     0.0             NaN    1.0        1.0
2  Adipocyte     CCND3     0.0             NaN    1.0        1.0
3  Adipocyte      TAF8     0.0             NaN    1.0        1.0
4  Adipocyte  C6orf132     0.0             NaN    1.0        1.0
5  Adipocyte    GUCA1A     0.0             NaN    1.0        1.0
6  Adipocyte    GUCA1B     0.0             NaN    1.0        1.0
7  Adipocyte    TRERF1     0.0             NaN    1.0        1.0
8  Adipocyte     PRPH2     0.0             NaN    1.0        1.0
9  Adipocyte   PPP2R5D     0.0             NaN    1.0        1.0

Modified output:
                Broad cell type  \
0                     Adipocyte   
1  Endothelial cell (lymphatic)   
2   Endothelial cell (vascular)   
3     E

In [None]:
def analyze_markers(adata, groupby, n_genes=10):
    # Perform differential expression analysis
    sc.tl.rank_genes_groups(adata, groupby=groupby, method='t-test', use_raw=False)
    
    # Get the default Scanpy results
    default_markers = sc.get.rank_genes_groups_df(adata, group=None)
    
    # Modified version
    modified_markers = default_markers.copy()
    modified_markers = modified_markers.sort_values(['group', 'logfoldchanges'], ascending=[True, False])
    
    top_markers = modified_markers.groupby('group').apply(
        lambda x: ', '.join(x['names'].head(n_genes))
    ).reset_index()
    top_markers.columns = [groupby, f'Top {n_genes} Markers']
    
    return default_markers, top_markers

# List of annotation levels
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']

# Dictionaries to store results for each annotation level
default_results = {}
modified_results = {}

# Perform analysis for each annotation level
for level in annotation_levels:
    print(f"Analyzing {level}...")
    default_df, modified_df = analyze_markers(lung_adata, level)
    default_results[level] = default_df
    modified_results[level] = modified_df
    print(f"Analysis for {level} completed.")

# Print and export results for each level
for level in annotation_levels:
    print(f"\n--- Results for {level} ---")
    
    print("\nDefault Scanpy output (first 10 rows):")
    print(default_results[level].head(10))
    
    print("\nModified output:")
    print(modified_results[level].head())
    
    # Export both DataFrames to separate CSV files
    default_results[level].to_csv(f"default_markers_{level.replace(' ', '_').lower()}.csv", index=False)
    modified_results[level].to_csv(f"modified_markers_{level.replace(' ', '_').lower()}.csv", index=False)
    
    print(f"Results for {level} exported to CSV files.")

print("\nAll analyses completed and results exported.")

In [88]:
import pandas as pd
import scanpy as sc

def analyze_and_export_markers(adata, annotation_levels, n_genes=10):
    """
    Analyze markers for multiple annotation levels and export results.
    
    Parameters:
    adata (AnnData): The annotated data matrix.
    annotation_levels (list): List of column names in adata.obs to use for grouping.
    n_genes (int): Number of top genes to include in the modified output.
    
    Returns:
    dict: Dictionary containing default and modified results for each annotation level.
    """
    def analyze_markers(adata, groupby, n_genes):
        sc.tl.rank_genes_groups(adata, groupby=groupby, method='t-test', use_raw=False)
        default_markers = sc.get.rank_genes_groups_df(adata, group=None)
        modified_markers = default_markers.copy()
        modified_markers = modified_markers.sort_values(['group', 'logfoldchanges'], ascending=[True, False])
        top_markers = modified_markers.groupby('group').apply(
            lambda x: ', '.join(x['names'].head(n_genes))
        ).reset_index()
        top_markers.columns = [groupby, f'Top {n_genes} Markers']
        return default_markers, top_markers

    results = {}
    tissue_type = adata.obs['tissue'].iloc[0].lower()

    for level in annotation_levels:
        print(f"Analyzing {level}...")
        default_df, modified_df = analyze_markers(adata, level, n_genes)
        results[level] = {'default': default_df, 'modified': modified_df}
        print(f"Analysis for {level} completed.")

        print(f"\n--- Results for {level} ---")
        print("\nDefault Scanpy output (first 10 rows):")
        print(default_df.head(10))
        print("\nModified output:")
        print(modified_df.head())

        default_filename = f"{tissue_type}_default_markers_{level.replace(' ', '_').lower()}_{n_genes}genes.csv"
        modified_filename = f"{tissue_type}_modified_markers_{level.replace(' ', '_').lower()}_{n_genes}genes.csv"
        
        default_df.to_csv(default_filename, index=False)
        modified_df.to_csv(modified_filename, index=False)
        
        print(f"Results for {level} exported to CSV files: {default_filename} and {modified_filename}")

    print(f"\nAll analyses completed and results exported for {tissue_type} dataset.")
    return results



In [17]:
import scanpy as sc

def process_tissue_data(adata, tissue_type):
    """
    Process the AnnData object for a specific tissue type.
    
    Parameters:
    adata (AnnData): The original AnnData object containing all tissues.
    tissue_type (str): The tissue type to process (e.g., 'prostate', 'lung').
    
    Returns:
    AnnData: Processed AnnData object for the specified tissue.
    """
    # Subset the AnnData object to include only the specified tissue
    tissue_adata = adata[adata.obs['tissue'] == tissue_type].copy()
    
    # Print unique values for different cell type classifications
    print(f"{tissue_type.capitalize()} Broad cell types:")
    print(tissue_adata.obs['Broad cell type'].unique())
    print(f"\n{tissue_type.capitalize()} Granular cell types:")
    print(tissue_adata.obs['Granular cell type'].unique())
    print(f"\n{tissue_type.capitalize()} Cell types level 2:")
    print(tissue_adata.obs['Cell types level 2'].unique())
    print(f"\n{tissue_type.capitalize()} Cell types level 3:")
    print(tissue_adata.obs['Cell types level 3'].unique())
    
    # Normalize the data by total counts per cell and scale to 10,000 reads per cell
    sc.pp.normalize_total(tissue_adata, target_sum=1e4)
    
    # Log-transform the data after adding a pseudocount of 1
    sc.pp.log1p(tissue_adata)
    
    # Perform batch correction
    sc.pp.combat(tissue_adata, key='batch')
    
    return tissue_adata



In [89]:
processed_skeletalmuscle_adata = process_tissue_data(adata, 'prostate')
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']
results = analyze_and_export_markers(processed_skeletalmuscle_adata, annotation_levels,n_genes=50)

Prostate Broad cell types:
['Epithelial cell (luminal)', 'Epithelial cell (Hillock)', 'Epithelial cell (basal)', 'Epithelial cell (club)', 'Myocyte (smooth muscle)', ..., 'Schwann cell', 'Neuroendocrine', 'Immune (NK cell)', 'Immune (T cell)', 'Immune (mast cell)']
Length: 16
Categories (16, object): ['Endothelial cell (lymphatic)', 'Endothelial cell (vascular)', 'Epithelial cell (Hillock)', 'Epithelial cell (basal)', ..., 'Myocyte (smooth muscle)', 'Neuroendocrine', 'Pericyte/SMC', 'Schwann cell']

Prostate Granular cell types:
['Epithelial cell (luminal)', 'Epithelial cell (Hillock)', 'Epithelial cell (basal I)', 'Epithelial cell (club)', 'Myocyte (smooth muscle)', ..., 'Schwann cell', 'Neuroendocrine', 'Immune (NK cell)', 'Immune (T cell)', 'Immune (mast cell)']
Length: 21
Categories (21, object): ['Endothelial cell (lymphatic)', 'Endothelial cell (vascular) I', 'Endothelial cell (vascular) II', 'Endothelial cell (vascular) III', ..., 'Myocyte (smooth muscle)', 'Neuroendocrine', 'Pe

  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()


Analyzing Broad cell type...


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Broad cell type completed.

--- Results for Broad cell type ---

Default Scanpy output (first 10 rows):
                          group       names     scores  logfoldchanges  \
0  Endothelial cell (lymphatic)        TFPI  42.263424        6.271377   
1  Endothelial cell (lymphatic)      SEMA3A  32.460411        6.538217   
2  Endothelial cell (lymphatic)       MMRN1  30.380936        7.636989   
3  Endothelial cell (lymphatic)        RELN  27.844282        7.836636   
4  Endothelial cell (lymphatic)       SASH1  25.230183        3.399506   
5  Endothelial cell (lymphatic)    CNTNAP3B  24.993130        4.395359   
6  Endothelial cell (lymphatic)       GPM6A  24.964727        5.623390   
7  Endothelial cell (lymphatic)     PPFIBP1  24.234943        3.306324   
8  Endothelial cell (lymphatic)  ST6GALNAC3  22.717415        4.578478   
9  Endothelial cell (lymphatic)       ZFPM2  22.459307        5.373502   

           pvals      pvals_adj  
0  2.566114e-116  1.460045e-114  


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "na

Analysis for Granular cell type completed.

--- Results for Granular cell type ---

Default Scanpy output (first 10 rows):
                          group       names     scores  logfoldchanges  \
0  Endothelial cell (lymphatic)        TFPI  42.569447        6.290088   
1  Endothelial cell (lymphatic)      SEMA3A  33.298309        6.587502   
2  Endothelial cell (lymphatic)       MMRN1  30.656824        7.664512   
3  Endothelial cell (lymphatic)        RELN  28.309370        7.879247   
4  Endothelial cell (lymphatic)    CNTNAP3B  25.475521        4.436334   
5  Endothelial cell (lymphatic)       SASH1  25.198036        3.405851   
6  Endothelial cell (lymphatic)       GPM6A  24.814034        5.632415   
7  Endothelial cell (lymphatic)     PPFIBP1  24.596691        3.337643   
8  Endothelial cell (lymphatic)       ZFPM2  22.922661        5.426909   
9  Endothelial cell (lymphatic)  ST6GALNAC3  22.296513        4.552267   

           pvals      pvals_adj  
0  6.243804e-116  3.268761e-

  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Cell types level 2 completed.

--- Results for Cell types level 2 ---

Default Scanpy output (first 10 rows):
              group       names     scores  logfoldchanges  pvals  pvals_adj
0  Endothelial cell         VWF  79.571724        7.538949    0.0        0.0
1  Endothelial cell      PECAM1  77.860535        6.731662    0.0        0.0
2  Endothelial cell  ST6GALNAC3  77.295135        6.346106    0.0        0.0
3  Endothelial cell        TCF4  71.904373        3.353484    0.0        0.0
4  Endothelial cell       PTPRB  66.918793        7.335071    0.0        0.0
5  Endothelial cell       MEF2C  65.328781        5.020666    0.0        0.0
6  Endothelial cell        FLT1  64.947937        7.216104    0.0        0.0
7  Endothelial cell     SPARCL1  61.766758        3.767296    0.0        0.0
8  Endothelial cell       EGFL7  61.573463        6.616244    0.0        0.0
9  Endothelial cell     SLCO2A1  61.347015        5.251340    0.0        0.0

Modified output:
    Cell typ

  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Cell types level 3 completed.

--- Results for Cell types level 3 ---

Default Scanpy output (first 10 rows):
        group     names      scores  logfoldchanges  pvals  pvals_adj
0  Epithelial     GRHL2  124.841919        4.892031    0.0        0.0
1  Epithelial      TTC6  121.867935        4.571084    0.0        0.0
2  Epithelial  KIAA1217  113.339157        3.356953    0.0        0.0
3  Epithelial       EHF  111.222076        4.803591    0.0        0.0
4  Epithelial     AZGP1  103.991333        4.733384    0.0        0.0
5  Epithelial      KLK2  103.055031        3.936097    0.0        0.0
6  Epithelial      ANK3  102.366295        3.417012    0.0        0.0
7  Epithelial     RBM47   98.839485        3.205570    0.0        0.0
8  Epithelial     CXADR   98.289062        4.519721    0.0        0.0
9  Epithelial      WWC1   96.295982        4.231160    0.0        0.0

Modified output:
  Cell types level 3                                     Top 50 Markers
0         Epithel

In [50]:
processed_skeletalmuscle_adata = process_tissue_data(adata, 'skeletalmuscle')
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']
results = analyze_and_export_markers(processed_skeletalmuscle_adata, annotation_levels)

Skeletalmuscle Broad cell types:
['Myocyte (sk. muscle, cytoplasmic)', 'Myocyte (NMJ-rich)', 'Endothelial cell (vascular)', 'Myocyte (sk. muscle)', 'Fibroblast', ..., 'Satellite cell', 'Endothelial cell (lymphatic)', 'Immune (T cell)', 'Immune (NK cell)', 'Immune (mast cell)']
Length: 14
Categories (14, object): ['Adipocyte', 'Endothelial cell (lymphatic)', 'Endothelial cell (vascular)', 'Fibroblast', ..., 'Myocyte (sk. muscle, cytoplasmic)', 'Pericyte/SMC', 'Satellite cell', 'Schwann cell']

Skeletalmuscle Granular cell types:
['Myocyte (slow-twitch, cytoplasmic)', 'Myocyte (NMJ-rich)', 'Endothelial cell (vascular) I', 'Myocyte (sk. muscle)', 'Fibroblast', ..., 'Pericyte/SMC II', 'Endothelial cell (vascular) III', 'Endothelial cell (lymphatic)', 'Immune (T cell)', 'Immune (NK cell)']
Length: 22
Categories (22, object): ['Adipocyte', 'Endothelial cell (lymphatic)', 'Endothelial cell (vascular) I', 'Endothelial cell (vascular) II', ..., 'Pericyte/SMC I', 'Pericyte/SMC II', 'Satellite ce

  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()


Analyzing Broad cell type...


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Broad cell type completed.

--- Results for Broad cell type ---

Default Scanpy output (first 10 rows):
       group    names     scores  logfoldchanges         pvals     pvals_adj
0  Adipocyte      CPM  31.031006        6.965744  3.155052e-69  1.604271e-67
1  Adipocyte    PPARG  21.022568        5.861884  1.100758e-47  2.847649e-46
2  Adipocyte      GSN  18.656420        3.712667  6.254795e-42  1.314473e-40
3  Adipocyte    ACACB  17.136478        1.884094  3.040621e-38  5.657602e-37
4  Adipocyte   FRMD4A  17.011034        3.929830  1.276132e-37  2.344877e-36
5  Adipocyte  SLC19A3  16.979561       10.123559  1.850507e-37  3.375745e-36
6  Adipocyte    SVEP1  16.251976        4.479470  1.383076e-35  2.348708e-34
7  Adipocyte     EBF1  15.682308        3.546812  4.196401e-34  6.806171e-33
8  Adipocyte    PLIN1  15.665658        7.789655  5.551123e-34  8.954159e-33
9  Adipocyte    ADH1B  14.845592        5.226194  8.620318e-32  1.269023e-30

Modified output:
                Br

  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "lo

Analysis for Granular cell type completed.

--- Results for Granular cell type ---

Default Scanpy output (first 10 rows):
       group    names     scores  logfoldchanges         pvals     pvals_adj
0  Adipocyte      CPM  30.871313        7.048368  9.989886e-67  4.130164e-65
1  Adipocyte    PPARG  20.968872        5.933959  2.381456e-46  5.101679e-45
2  Adipocyte      GSN  18.726734        3.783941  3.951888e-41  7.092156e-40
3  Adipocyte   FRMD4A  17.412228        4.026417  7.919234e-38  1.280904e-36
4  Adipocyte  SLC19A3  17.190136       10.181335  3.441151e-37  5.456198e-36
5  Adipocyte    ACACB  17.007593        1.892440  4.322308e-37  6.847202e-36
6  Adipocyte    SVEP1  16.298925        4.555878  5.632921e-35  8.292391e-34
7  Adipocyte     EBF1  16.004612        3.645249  2.922833e-34  4.194609e-33
8  Adipocyte    PLIN1  15.659821        7.863802  2.702379e-33  3.762281e-32
9  Adipocyte    ADH1B  15.209573        5.391531  3.822303e-32  5.051206e-31

Modified output:
            

  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Cell types level 2 completed.

--- Results for Cell types level 2 ---

Default Scanpy output (first 10 rows):
       group    names     scores  logfoldchanges         pvals     pvals_adj
0  Adipocyte      CPM  31.031006        6.965744  3.155052e-69  1.604271e-67
1  Adipocyte    PPARG  21.022568        5.861884  1.100758e-47  2.847649e-46
2  Adipocyte      GSN  18.656420        3.712667  6.254795e-42  1.314473e-40
3  Adipocyte    ACACB  17.136478        1.884094  3.040621e-38  5.657602e-37
4  Adipocyte   FRMD4A  17.011034        3.929830  1.276132e-37  2.344877e-36
5  Adipocyte  SLC19A3  16.979561       10.123559  1.850507e-37  3.375745e-36
6  Adipocyte    SVEP1  16.251976        4.479470  1.383076e-35  2.348708e-34
7  Adipocyte     EBF1  15.682308        3.546812  4.196401e-34  6.806171e-33
8  Adipocyte    PLIN1  15.665658        7.789655  5.551123e-34  8.954159e-33
9  Adipocyte    ADH1B  14.845592        5.226194  8.620318e-32  1.269023e-30

Modified output:
    Cell typ

  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


In [49]:
processed_esophagusmucosa_adata = process_tissue_data(adata, 'esophagusmucosa')
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']
results = analyze_and_export_markers(processed_esophagusmucosa_adata, annotation_levels)

Esophagusmucosa Broad cell types:
['Epithelial cell (squamous)', 'Epithelial cell (suprabasal)', 'Myofibroblast', 'Endothelial cell (vascular)', 'Epithelial cell (basal)', ..., 'Immune (DC)', 'Neuroendocrine', 'Schwann cell', 'Immune (NK cell)', 'Immune (mast cell)']
Length: 17
Categories (17, object): ['Endothelial cell (lymphatic)', 'Endothelial cell (vascular)', 'Epithelial cell (basal)', 'Epithelial cell (squamous)', ..., 'Myofibroblast', 'Neuroendocrine', 'Pericyte/SMC', 'Schwann cell']

Esophagusmucosa Granular cell types:
['Epithelial cell (squamous)', 'Epithelial cell (suprabasal)', 'Myofibroblast', 'Endothelial cell (vascular) I', 'Epithelial cell (basal I)', ..., 'Neuroendocrine', 'Schwann cell', 'Immune (NK cell)', 'Immune (mast cell)', 'Immune (mmDC)']
Length: 26
Categories (26, object): ['Endothelial cell (lymphatic)', 'Endothelial cell (vascular) I', 'Endothelial cell (vascular) II', 'Endothelial cell (vascular) III', ..., 'Neuroendocrine', 'Pericyte/SMC I', 'Pericyte/SMC

  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()


Analyzing Broad cell type...


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Broad cell type completed.

--- Results for Broad cell type ---

Default Scanpy output (first 10 rows):
                          group       names     scores  logfoldchanges  \
0  Endothelial cell (lymphatic)     PKHD1L1  97.316933        8.489282   
1  Endothelial cell (lymphatic)     PPFIBP1  75.410286        4.214581   
2  Endothelial cell (lymphatic)  ST6GALNAC3  73.197052        4.677771   
3  Endothelial cell (lymphatic)       TSHZ2  64.138992        2.813383   
4  Endothelial cell (lymphatic)       MMRN1  63.947876        7.723027   
5  Endothelial cell (lymphatic)        TFPI  63.536518        5.017277   
6  Endothelial cell (lymphatic)       KALRN  62.246513        4.589102   
7  Endothelial cell (lymphatic)        CD36  55.153595        6.564581   
8  Endothelial cell (lymphatic)        PGM5  50.271420        5.258752   
9  Endothelial cell (lymphatic)       CCL21  49.555599        8.334414   

           pvals      pvals_adj  
0   0.000000e+00   0.000000e+00  


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "lo

Analysis for Granular cell type completed.

--- Results for Granular cell type ---

Default Scanpy output (first 10 rows):
                          group       names     scores  logfoldchanges  \
0  Endothelial cell (lymphatic)     PKHD1L1  99.419846        8.498019   
1  Endothelial cell (lymphatic)     PPFIBP1  76.477562        4.237154   
2  Endothelial cell (lymphatic)  ST6GALNAC3  73.785919        4.694474   
3  Endothelial cell (lymphatic)       TSHZ2  64.430061        2.822079   
4  Endothelial cell (lymphatic)       MMRN1  63.930775        7.694232   
5  Endothelial cell (lymphatic)        TFPI  63.638512        5.026341   
6  Endothelial cell (lymphatic)       KALRN  62.336025        4.598777   
7  Endothelial cell (lymphatic)        CD36  55.192429        6.565350   
8  Endothelial cell (lymphatic)        PGM5  50.181538        5.260492   
9  Endothelial cell (lymphatic)       CCL21  49.746929        8.325597   

           pvals      pvals_adj  
0   0.000000e+00   0.000000e

  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Cell types level 2 completed.

--- Results for Cell types level 2 ---

Default Scanpy output (first 10 rows):
              group       names     scores  logfoldchanges  pvals  pvals_adj
0  Endothelial cell  ST6GALNAC3  98.930824        5.553425    0.0        0.0
1  Endothelial cell        LDB2  98.150894        4.618948    0.0        0.0
2  Endothelial cell       PTPRM  83.929031        3.898647    0.0        0.0
3  Endothelial cell       SNTG2  79.355721        5.527849    0.0        0.0
4  Endothelial cell       EGFL7  76.149849        6.329471    0.0        0.0
5  Endothelial cell      PECAM1  74.732704        5.791344    0.0        0.0
6  Endothelial cell       ELMO1  71.930313        3.365596    0.0        0.0
7  Endothelial cell    C10orf11  71.662674        3.107920    0.0        0.0
8  Endothelial cell       TSHZ2  63.512020        2.384401    0.0        0.0
9  Endothelial cell       PTPRB  63.037258        6.174127    0.0        0.0

Modified output:
    Cell typ

  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Cell types level 3 completed.

--- Results for Cell types level 3 ---

Default Scanpy output (first 10 rows):
        group   names      scores  logfoldchanges  pvals  pvals_adj
0  Epithelial   KRT15  119.556297        4.311607    0.0        0.0
1  Epithelial   KRT13  111.323471        4.322252    0.0        0.0
2  Epithelial    RHCG   99.578094        4.468267    0.0        0.0
3  Epithelial  TRIM29   97.677597        4.741588    0.0        0.0
4  Epithelial    DSG3   96.861259        4.901629    0.0        0.0
5  Epithelial   RBM47   96.055122        3.397151    0.0        0.0
6  Epithelial   PITX1   95.682693        4.532236    0.0        0.0
7  Epithelial    AIM1   94.798523        3.530467    0.0        0.0
8  Epithelial   THSD4   94.381996        3.072912    0.0        0.0
9  Epithelial    ANK3   94.034554        3.330036    0.0        0.0

Modified output:
  Cell types level 3                                     Top 10 Markers
0         Epithelial  CEACAM7, IL36RN, 

In [19]:
processed_prostate_adata = process_tissue_data(adata, 'prostate')
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']
results = analyze_and_export_markers(processed_prostate_adata, annotation_levels)

Prostate Broad cell types:
['Epithelial cell (luminal)', 'Epithelial cell (Hillock)', 'Epithelial cell (basal)', 'Epithelial cell (club)', 'Myocyte (smooth muscle)', ..., 'Schwann cell', 'Neuroendocrine', 'Immune (NK cell)', 'Immune (T cell)', 'Immune (mast cell)']
Length: 16
Categories (16, object): ['Endothelial cell (lymphatic)', 'Endothelial cell (vascular)', 'Epithelial cell (Hillock)', 'Epithelial cell (basal)', ..., 'Myocyte (smooth muscle)', 'Neuroendocrine', 'Pericyte/SMC', 'Schwann cell']

Prostate Granular cell types:
['Epithelial cell (luminal)', 'Epithelial cell (Hillock)', 'Epithelial cell (basal I)', 'Epithelial cell (club)', 'Myocyte (smooth muscle)', ..., 'Schwann cell', 'Neuroendocrine', 'Immune (NK cell)', 'Immune (T cell)', 'Immune (mast cell)']
Length: 21
Categories (21, object): ['Endothelial cell (lymphatic)', 'Endothelial cell (vascular) I', 'Endothelial cell (vascular) II', 'Endothelial cell (vascular) III', ..., 'Myocyte (smooth muscle)', 'Neuroendocrine', 'Pe

  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()


Analyzing Broad cell type...


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Broad cell type completed.

--- Results for Broad cell type ---

Default Scanpy output (first 10 rows):
                          group       names     scores  logfoldchanges  \
0  Endothelial cell (lymphatic)        TFPI  42.263424        6.271377   
1  Endothelial cell (lymphatic)      SEMA3A  32.460411        6.538217   
2  Endothelial cell (lymphatic)       MMRN1  30.380936        7.636989   
3  Endothelial cell (lymphatic)        RELN  27.844282        7.836636   
4  Endothelial cell (lymphatic)       SASH1  25.230183        3.399506   
5  Endothelial cell (lymphatic)    CNTNAP3B  24.993130        4.395359   
6  Endothelial cell (lymphatic)       GPM6A  24.964727        5.623390   
7  Endothelial cell (lymphatic)     PPFIBP1  24.234943        3.306324   
8  Endothelial cell (lymphatic)  ST6GALNAC3  22.717415        4.578478   
9  Endothelial cell (lymphatic)       ZFPM2  22.459307        5.373502   

           pvals      pvals_adj  
0  2.566114e-116  1.460045e-114  


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "na

Analysis for Granular cell type completed.

--- Results for Granular cell type ---

Default Scanpy output (first 10 rows):
                          group       names     scores  logfoldchanges  \
0  Endothelial cell (lymphatic)        TFPI  42.569447        6.290088   
1  Endothelial cell (lymphatic)      SEMA3A  33.298309        6.587502   
2  Endothelial cell (lymphatic)       MMRN1  30.656824        7.664512   
3  Endothelial cell (lymphatic)        RELN  28.309370        7.879247   
4  Endothelial cell (lymphatic)    CNTNAP3B  25.475521        4.436334   
5  Endothelial cell (lymphatic)       SASH1  25.198036        3.405851   
6  Endothelial cell (lymphatic)       GPM6A  24.814034        5.632415   
7  Endothelial cell (lymphatic)     PPFIBP1  24.596691        3.337643   
8  Endothelial cell (lymphatic)       ZFPM2  22.922661        5.426909   
9  Endothelial cell (lymphatic)  ST6GALNAC3  22.296513        4.552267   

           pvals      pvals_adj  
0  6.243804e-116  3.268761e-

  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Cell types level 2 completed.

--- Results for Cell types level 2 ---

Default Scanpy output (first 10 rows):
              group       names     scores  logfoldchanges  pvals  pvals_adj
0  Endothelial cell         VWF  79.571724        7.538949    0.0        0.0
1  Endothelial cell      PECAM1  77.860535        6.731662    0.0        0.0
2  Endothelial cell  ST6GALNAC3  77.295135        6.346106    0.0        0.0
3  Endothelial cell        TCF4  71.904373        3.353484    0.0        0.0
4  Endothelial cell       PTPRB  66.918793        7.335071    0.0        0.0
5  Endothelial cell       MEF2C  65.328781        5.020666    0.0        0.0
6  Endothelial cell        FLT1  64.947937        7.216104    0.0        0.0
7  Endothelial cell     SPARCL1  61.766758        3.767296    0.0        0.0
8  Endothelial cell       EGFL7  61.573463        6.616244    0.0        0.0
9  Endothelial cell     SLCO2A1  61.347015        5.251340    0.0        0.0

Modified output:
    Cell typ

  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Cell types level 3 completed.

--- Results for Cell types level 3 ---

Default Scanpy output (first 10 rows):
        group     names      scores  logfoldchanges  pvals  pvals_adj
0  Epithelial     GRHL2  124.841919        4.892031    0.0        0.0
1  Epithelial      TTC6  121.867935        4.571084    0.0        0.0
2  Epithelial  KIAA1217  113.339157        3.356953    0.0        0.0
3  Epithelial       EHF  111.222076        4.803591    0.0        0.0
4  Epithelial     AZGP1  103.991333        4.733384    0.0        0.0
5  Epithelial      KLK2  103.055031        3.936097    0.0        0.0
6  Epithelial      ANK3  102.366295        3.417012    0.0        0.0
7  Epithelial     RBM47   98.839485        3.205570    0.0        0.0
8  Epithelial     CXADR   98.289062        4.519721    0.0        0.0
9  Epithelial      WWC1   96.295982        4.231160    0.0        0.0

Modified output:
  Cell types level 3                                     Top 10 Markers
0         Epithel

In [51]:
processed_heart_adata = process_tissue_data(adata, 'esophagusmuscularis')
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']
results = analyze_and_export_markers(processed_heart_adata, annotation_levels)

Esophagusmuscularis Broad cell types:
['Myocyte (smooth muscle)', 'Endothelial cell (vascular)', 'Neuronal', 'Endothelial cell (lymphatic)', 'ICCs', ..., 'Immune (mast cell)', 'Immune (T cell)', 'Adipocyte', 'Immune (B cell)', 'Immune (NK cell)']
Length: 14
Categories (14, object): ['Adipocyte', 'Endothelial cell (lymphatic)', 'Endothelial cell (vascular)', 'Fibroblast', ..., 'Myocyte (smooth muscle)', 'Neuronal', 'Pericyte/SMC', 'Schwann cell']

Esophagusmuscularis Granular cell types:
['Myocyte (smooth muscle)', 'Myocyte (smooth muscle TAGLN lo)', 'Endothelial cell (vascular) I', 'Neuronal', 'Endothelial cell (lymphatic)', ..., 'Immune (T cell)', 'Immune (macrophage II)', 'Immune (B cell)', 'Immune (NK cell)', 'Adipocyte']
Length: 23
Categories (23, object): ['Adipocyte', 'Endothelial cell (lymphatic)', 'Endothelial cell (vascular) I', 'Endothelial cell (vascular) II', ..., 'Pericyte/SMC I', 'Pericyte/SMC II', 'Schwann cell I', 'Schwann cell II']

Esophagusmuscularis Cell types level

  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()


Analyzing Broad cell type...


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Broad cell type completed.

--- Results for Broad cell type ---

Default Scanpy output (first 10 rows):
       group        names     scores  logfoldchanges          pvals  \
0  Adipocyte        ACACB  50.182171        4.366480  1.163656e-107   
1  Adipocyte        ACSL1  33.875065        5.570811   8.453280e-79   
2  Adipocyte      ANGPTL4  30.774715        5.841323   1.861894e-72   
3  Adipocyte       SLC1A3  28.809704        5.612320   2.753884e-68   
4  Adipocyte        FOXO1  28.286203        2.207979   5.176769e-69   
5  Adipocyte        MGST1  26.953768        3.275504   1.065896e-64   
6  Adipocyte        ANXA1  26.127365        2.929458   5.870998e-63   
7  Adipocyte         SAT1  25.190577        2.625487   6.786339e-61   
8  Adipocyte  AP000304.12  25.111723        3.614652   3.755983e-60   
9  Adipocyte       FRMD4A  24.798056        3.148637   1.390489e-59   

       pvals_adj  
0  1.143939e-105  
1   4.053680e-77  
2   7.644134e-71  
3   1.034607e-66  
4   2.

  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "lo

Analysis for Granular cell type completed.

--- Results for Granular cell type ---

Default Scanpy output (first 10 rows):
       group        names     scores  logfoldchanges          pvals  \
0  Adipocyte        ACACB  69.591408        4.464502  1.408273e-130   
1  Adipocyte        ACSL1  39.459766        5.688400   5.503456e-87   
2  Adipocyte        FOXO1  36.952209        2.296068   7.556155e-87   
3  Adipocyte      ANGPTL4  35.489246        5.972249   6.377913e-80   
4  Adipocyte       SLC1A3  32.801216        5.742610   6.431599e-75   
5  Adipocyte        MGST1  28.411747        3.329427   1.229872e-66   
6  Adipocyte        ANXA1  28.354074        2.994750   9.020227e-67   
7  Adipocyte         SAT1  28.140207        2.698680   1.347600e-66   
8  Adipocyte  AP000304.12  27.298845        3.701424   5.774421e-64   
9  Adipocyte       FRMD4A  27.162844        3.241905   6.606276e-64   

       pvals_adj  
0  1.718579e-128  
1   2.610822e-85  
2   3.575031e-85  
3   2.594418e-78  


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Cell types level 2 completed.

--- Results for Cell types level 2 ---

Default Scanpy output (first 10 rows):
       group        names     scores  logfoldchanges          pvals  \
0  Adipocyte        ACACB  50.182171        4.366480  1.163656e-107   
1  Adipocyte        ACSL1  33.875065        5.570811   8.453280e-79   
2  Adipocyte      ANGPTL4  30.774715        5.841323   1.861894e-72   
3  Adipocyte       SLC1A3  28.809704        5.612320   2.753884e-68   
4  Adipocyte        FOXO1  28.286203        2.207979   5.176769e-69   
5  Adipocyte        MGST1  26.953768        3.275504   1.065896e-64   
6  Adipocyte        ANXA1  26.127365        2.929458   5.870998e-63   
7  Adipocyte         SAT1  25.190577        2.625487   6.786339e-61   
8  Adipocyte  AP000304.12  25.111723        3.614652   3.755983e-60   
9  Adipocyte       FRMD4A  24.798056        3.148637   1.390489e-59   

       pvals_adj  
0  1.143939e-105  
1   4.053680e-77  
2   7.644134e-71  
3   1.034607e-66  


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Cell types level 3 completed.

--- Results for Cell types level 3 ---

Default Scanpy output (first 10 rows):
  group    names     scores  logfoldchanges          pvals      pvals_adj
0  Glia    NRXN1  65.492302        7.001669   0.000000e+00   0.000000e+00
1  Glia    CADM2  64.566368        6.778770   0.000000e+00   0.000000e+00
2  Glia    NRXN3  58.101151        3.904657   0.000000e+00   0.000000e+00
3  Glia   ADGRB3  55.278202        5.441242  7.788573e-305  3.445470e-301
4  Glia    CDH19  53.921154        8.768584  3.274512e-294  1.158850e-290
5  Glia    SCN7A  48.876194        5.011202  3.514306e-266  1.036427e-262
6  Glia   SORCS1  45.525990        5.050273  1.248691e-244  3.156513e-241
7  Glia     ANK3  42.061203        6.011447  2.173182e-221  4.806807e-218
8  Glia    NCAM2  40.971764        5.975717  2.952084e-214  5.804124e-211
9  Glia  PPP2R2B  40.574501        6.110757  1.271481e-211  2.249885e-208

Modified output:
  Cell types level 3                         

In [None]:
processed_breast_adata = process_tissue_data(adata, 'skin')
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']
results = analyze_and_export_markers(processed_breast_adata, annotation_levels)

In [25]:
processed_breast_adata = process_tissue_data(adata, 'breast')
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']
results = analyze_and_export_markers(processed_breast_adata, annotation_levels)

Breast Broad cell types:
['Epithelial cell (luminal)', 'Adipocyte', 'Immune (DC/macrophage)', 'Myoepithelial (basal)', 'Fibroblast', 'Endothelial cell (vascular)', 'Endothelial cell (lymphatic)', 'Pericyte/SMC']
Categories (8, object): ['Adipocyte', 'Endothelial cell (lymphatic)', 'Endothelial cell (vascular)', 'Epithelial cell (luminal)', 'Fibroblast', 'Immune (DC/macrophage)', 'Myoepithelial (basal)', 'Pericyte/SMC']

Breast Granular cell types:
['Epithelial cell (luminal II)', 'Adipocyte', 'Epithelial cell (luminal I)', 'Immune (macrophage II)', 'Immune (macrophage I)', ..., 'Endothelial cell (vascular) I', 'Immune (macrophage III)', 'Endothelial cell (vascular) III', 'Pericyte/SMC II', 'Pericyte/SMC I']
Length: 14
Categories (14, object): ['Adipocyte', 'Endothelial cell (lymphatic)', 'Endothelial cell (vascular) I', 'Endothelial cell (vascular) II', ..., 'Immune (macrophage III)', 'Myoepithelial (basal)', 'Pericyte/SMC I', 'Pericyte/SMC II']

Breast Cell types level 2:
['Epithelial

  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()


Analyzing Broad cell type...


  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Broad cell type completed.

--- Results for Broad cell type ---

Default Scanpy output (first 10 rows):
       group     names  scores  logfoldchanges  pvals  pvals_adj
0  Adipocyte     PRMT2     0.0             NaN    1.0        1.0
1  Adipocyte      UBR2     0.0             NaN    1.0        1.0
2  Adipocyte     CCND3     0.0             NaN    1.0        1.0
3  Adipocyte      TAF8     0.0             NaN    1.0        1.0
4  Adipocyte  C6orf132     0.0             NaN    1.0        1.0
5  Adipocyte    GUCA1A     0.0             NaN    1.0        1.0
6  Adipocyte    GUCA1B     0.0             NaN    1.0        1.0
7  Adipocyte    TRERF1     0.0             NaN    1.0        1.0
8  Adipocyte     PRPH2     0.0             NaN    1.0        1.0
9  Adipocyte   PPP2R5D     0.0             NaN    1.0        1.0

Modified output:
                Broad cell type  \
0                     Adipocyte   
1  Endothelial cell (lymphatic)   
2   Endothelial cell (vascular)   
3     Epit

  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Granular cell type completed.

--- Results for Granular cell type ---

Default Scanpy output (first 10 rows):
       group     names  scores  logfoldchanges  pvals  pvals_adj
0  Adipocyte     PRMT2     0.0             NaN    1.0        1.0
1  Adipocyte      UBR2     0.0             NaN    1.0        1.0
2  Adipocyte     CCND3     0.0             NaN    1.0        1.0
3  Adipocyte      TAF8     0.0             NaN    1.0        1.0
4  Adipocyte  C6orf132     0.0             NaN    1.0        1.0
5  Adipocyte    GUCA1A     0.0             NaN    1.0        1.0
6  Adipocyte    GUCA1B     0.0             NaN    1.0        1.0
7  Adipocyte    TRERF1     0.0             NaN    1.0        1.0
8  Adipocyte     PRPH2     0.0             NaN    1.0        1.0
9  Adipocyte   PPP2R5D     0.0             NaN    1.0        1.0

Modified output:
                Granular cell type  \
0                        Adipocyte   
1     Endothelial cell (lymphatic)   
2    Endothelial cell (vascula

  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Cell types level 2 completed.

--- Results for Cell types level 2 ---

Default Scanpy output (first 10 rows):
       group     names  scores  logfoldchanges  pvals  pvals_adj
0  Adipocyte     PRMT2     0.0             NaN    1.0        1.0
1  Adipocyte      UBR2     0.0             NaN    1.0        1.0
2  Adipocyte     CCND3     0.0             NaN    1.0        1.0
3  Adipocyte      TAF8     0.0             NaN    1.0        1.0
4  Adipocyte  C6orf132     0.0             NaN    1.0        1.0
5  Adipocyte    GUCA1A     0.0             NaN    1.0        1.0
6  Adipocyte    GUCA1B     0.0             NaN    1.0        1.0
7  Adipocyte    TRERF1     0.0             NaN    1.0        1.0
8  Adipocyte     PRPH2     0.0             NaN    1.0        1.0
9  Adipocyte   PPP2R5D     0.0             NaN    1.0        1.0

Modified output:
  Cell types level 2                                     Top 10 Markers
0          Adipocyte  PRMT2, UBR2, CCND3, TAF8, C6orf132, GUCA1A, GU...


  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


In [21]:
processed_prostate_adata = process_tissue_data(adata, 'prostate')
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']
results = analyze_and_export_markers(processed_prostate_adata, annotation_levels)

Prostate Broad cell types:
['Epithelial cell (luminal)', 'Epithelial cell (Hillock)', 'Epithelial cell (basal)', 'Epithelial cell (club)', 'Myocyte (smooth muscle)', ..., 'Schwann cell', 'Neuroendocrine', 'Immune (NK cell)', 'Immune (T cell)', 'Immune (mast cell)']
Length: 16
Categories (16, object): ['Endothelial cell (lymphatic)', 'Endothelial cell (vascular)', 'Epithelial cell (Hillock)', 'Epithelial cell (basal)', ..., 'Myocyte (smooth muscle)', 'Neuroendocrine', 'Pericyte/SMC', 'Schwann cell']

Prostate Granular cell types:
['Epithelial cell (luminal)', 'Epithelial cell (Hillock)', 'Epithelial cell (basal I)', 'Epithelial cell (club)', 'Myocyte (smooth muscle)', ..., 'Schwann cell', 'Neuroendocrine', 'Immune (NK cell)', 'Immune (T cell)', 'Immune (mast cell)']
Length: 21
Categories (21, object): ['Endothelial cell (lymphatic)', 'Endothelial cell (vascular) I', 'Endothelial cell (vascular) II', 'Endothelial cell (vascular) III', ..., 'Myocyte (smooth muscle)', 'Neuroendocrine', 'Pe

  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()


Analyzing Broad cell type...


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Broad cell type completed.

--- Results for Broad cell type ---

Default Scanpy output (first 10 rows):
                          group       names     scores  logfoldchanges  \
0  Endothelial cell (lymphatic)        TFPI  42.263424        6.271377   
1  Endothelial cell (lymphatic)      SEMA3A  32.460411        6.538217   
2  Endothelial cell (lymphatic)       MMRN1  30.380936        7.636989   
3  Endothelial cell (lymphatic)        RELN  27.844282        7.836636   
4  Endothelial cell (lymphatic)       SASH1  25.230183        3.399506   
5  Endothelial cell (lymphatic)    CNTNAP3B  24.993130        4.395359   
6  Endothelial cell (lymphatic)       GPM6A  24.964727        5.623390   
7  Endothelial cell (lymphatic)     PPFIBP1  24.234943        3.306324   
8  Endothelial cell (lymphatic)  ST6GALNAC3  22.717415        4.578478   
9  Endothelial cell (lymphatic)       ZFPM2  22.459307        5.373502   

           pvals      pvals_adj  
0  2.566114e-116  1.460045e-114  


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "na

Analysis for Granular cell type completed.

--- Results for Granular cell type ---

Default Scanpy output (first 10 rows):
                          group       names     scores  logfoldchanges  \
0  Endothelial cell (lymphatic)        TFPI  42.569447        6.290088   
1  Endothelial cell (lymphatic)      SEMA3A  33.298309        6.587502   
2  Endothelial cell (lymphatic)       MMRN1  30.656824        7.664512   
3  Endothelial cell (lymphatic)        RELN  28.309370        7.879247   
4  Endothelial cell (lymphatic)    CNTNAP3B  25.475521        4.436334   
5  Endothelial cell (lymphatic)       SASH1  25.198036        3.405851   
6  Endothelial cell (lymphatic)       GPM6A  24.814034        5.632415   
7  Endothelial cell (lymphatic)     PPFIBP1  24.596691        3.337643   
8  Endothelial cell (lymphatic)       ZFPM2  22.922661        5.426909   
9  Endothelial cell (lymphatic)  ST6GALNAC3  22.296513        4.552267   

           pvals      pvals_adj  
0  6.243804e-116  3.268761e-

  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Cell types level 2 completed.

--- Results for Cell types level 2 ---

Default Scanpy output (first 10 rows):
              group       names     scores  logfoldchanges  pvals  pvals_adj
0  Endothelial cell         VWF  79.571724        7.538949    0.0        0.0
1  Endothelial cell      PECAM1  77.860535        6.731662    0.0        0.0
2  Endothelial cell  ST6GALNAC3  77.295135        6.346106    0.0        0.0
3  Endothelial cell        TCF4  71.904373        3.353484    0.0        0.0
4  Endothelial cell       PTPRB  66.918793        7.335071    0.0        0.0
5  Endothelial cell       MEF2C  65.328781        5.020666    0.0        0.0
6  Endothelial cell        FLT1  64.947937        7.216104    0.0        0.0
7  Endothelial cell     SPARCL1  61.766758        3.767296    0.0        0.0
8  Endothelial cell       EGFL7  61.573463        6.616244    0.0        0.0
9  Endothelial cell     SLCO2A1  61.347015        5.251340    0.0        0.0

Modified output:
    Cell typ

  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Cell types level 3 completed.

--- Results for Cell types level 3 ---

Default Scanpy output (first 10 rows):
        group     names      scores  logfoldchanges  pvals  pvals_adj
0  Epithelial     GRHL2  124.841919        4.892031    0.0        0.0
1  Epithelial      TTC6  121.867935        4.571084    0.0        0.0
2  Epithelial  KIAA1217  113.339157        3.356953    0.0        0.0
3  Epithelial       EHF  111.222076        4.803591    0.0        0.0
4  Epithelial     AZGP1  103.991333        4.733384    0.0        0.0
5  Epithelial      KLK2  103.055031        3.936097    0.0        0.0
6  Epithelial      ANK3  102.366295        3.417012    0.0        0.0
7  Epithelial     RBM47   98.839485        3.205570    0.0        0.0
8  Epithelial     CXADR   98.289062        4.519721    0.0        0.0
9  Epithelial      WWC1   96.295982        4.231160    0.0        0.0

Modified output:
  Cell types level 3                                     Top 10 Markers
0         Epithel

In [None]:
processed_skeletalmuscle_adata = process_tissue_data(adata, 'skeletalmuscle')
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']
results = analyze_and_export_markers(processed_skeletalmuscle_adata, annotation_levels)

In [13]:

import pandas as pd
import scanpy as sc

def analyze_markers(adata, groupby, n_genes=10):
    # Perform differential expression analysis
    sc.tl.rank_genes_groups(adata, groupby=groupby, method='t-test', use_raw=False)
    
    # Get the default Scanpy results
    default_markers = sc.get.rank_genes_groups_df(adata, group=None)
    
    # Modified version
    modified_markers = default_markers.copy()
    modified_markers = modified_markers.sort_values(['group', 'logfoldchanges'], ascending=[True, False])
    
    top_markers = modified_markers.groupby('group').apply(
        lambda x: ', '.join(x['names'].head(n_genes))
    ).reset_index()
    top_markers.columns = [groupby, f'Top {n_genes} Markers']
    
    return default_markers, top_markers

# List of annotation levels
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']

# Dictionaries to store results for each annotation level
default_results = {}
modified_results = {}

# Get the tissue type from the AnnData object
tissue_type = prostate_adata.obs['tissue'].iloc[0].lower()

# Perform analysis for each annotation level
for level in annotation_levels:
    print(f"Analyzing {level}...")
    default_df, modified_df = analyze_markers(prostate_adata, level)
    default_results[level] = default_df
    modified_results[level] = modified_df
    print(f"Analysis for {level} completed.")

# Print and export results for each level
for level in annotation_levels:
    print(f"\n--- Results for {level} ---")
    
    print("\nDefault Scanpy output (first 10 rows):")
    print(default_results[level].head(10))
    
    print("\nModified output:")
    print(modified_results[level].head())
    
    # Export both DataFrames to separate CSV files with tissue type in the filename
    default_filename = f"{tissue_type}_default_markers_{level.replace(' ', '_').lower()}.csv"
    modified_filename = f"{tissue_type}_modified_markers_{level.replace(' ', '_').lower()}.csv"
    
    default_results[level].to_csv(default_filename, index=False)
    modified_results[level].to_csv(modified_filename, index=False)
    
    print(f"Results for {level} exported to CSV files: {default_filename} and {modified_filename}")

print(f"\nAll analyses completed and results exported for {tissue_type} dataset.")

Analyzing Broad cell type...


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Broad cell type completed.
Analyzing Granular cell type...


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "na

Analysis for Granular cell type completed.
Analyzing Cell types level 2...


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Cell types level 2 completed.
Analyzing Cell types level 3...


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  top_markers = modified_markers.groupby('group').apply(
  top_markers = modified_markers.groupby('group').apply(


Analysis for Cell types level 3 completed.

--- Results for Broad cell type ---

Default Scanpy output (first 10 rows):
                          group       names     scores  logfoldchanges  \
0  Endothelial cell (lymphatic)        TFPI  42.263424        6.271377   
1  Endothelial cell (lymphatic)      SEMA3A  32.460411        6.538217   
2  Endothelial cell (lymphatic)       MMRN1  30.380936        7.636989   
3  Endothelial cell (lymphatic)        RELN  27.844282        7.836636   
4  Endothelial cell (lymphatic)       SASH1  25.230183        3.399506   
5  Endothelial cell (lymphatic)    CNTNAP3B  24.993130        4.395359   
6  Endothelial cell (lymphatic)       GPM6A  24.964727        5.623390   
7  Endothelial cell (lymphatic)     PPFIBP1  24.234943        3.306324   
8  Endothelial cell (lymphatic)  ST6GALNAC3  22.717415        4.578478   
9  Endothelial cell (lymphatic)       ZFPM2  22.459307        5.373502   

           pvals      pvals_adj  
0  2.566114e-116  1.460045e-114

In [52]:
import pandas as pd
import json
from openai import OpenAI
import re

def run_full_cell_type_analysis(df_path, output_json_name="cell_type_analysis_results.json", model="gpt-4", temperature=0, tissue="lung", species="human", additional_info="no"):
    def run_cell_type_analysis(model, temperature, marker_list, tissue, species, additional_info):
        client = OpenAI()

        class Agent:
            def __init__(self, system="", human_input_mode="never", model="gpt-4", temperature=0):
                self.system = system
                self.chat_histories = {}
                self.human_input_mode = human_input_mode
                self.model = model
                self.temperature = temperature

            def __call__(self, message, other_agent_id):
                if other_agent_id not in self.chat_histories:
                    self.chat_histories[other_agent_id] = []
                    if self.system:
                        self.chat_histories[other_agent_id].append({"role": "system", "content": self.system})
                
                self.chat_histories[other_agent_id].append({"role": "user", "content": message})
                
                result = self.execute(other_agent_id)
                self.chat_histories[other_agent_id].append({"role": "assistant", "content": result})
                
                return result

            def execute(self, other_agent_id):
                completion = client.chat.completions.create(
                    model=self.model,
                    temperature=self.temperature,
                    messages=self.chat_histories[other_agent_id]
                )
                return completion.choices[0].message.content

            def needs_human_input(self, message):
                return self.human_input_mode == "always"

        def extract_json_from_reply(reply):
            json_match = re.search(r'```json\n(.*?)\n```', reply, re.DOTALL)
            
            if json_match:
                json_str = json_match.group(1)
                try:
                    json_data = json.loads(json_str)
                    return json_data
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")
                    return None
            else:
                print("No JSON content found in the reply")
                return None

        def construct_prompt(json_data):
            species = json_data['species']
            tissue = json_data['tissue_type']
            additional_info = json_data.get('additional_info', '')
            marker_list = ', '.join(json_data['marker_list'])

            prompt = f"I am analyzing a single-cell {species} {tissue} dataset."
            if additional_info:
                prompt += f" {additional_info}."
            prompt += f" I want to identify the cell types present based on this marker list:\n{marker_list}"

            return prompt

        def final_annotation(agent, prompt):
            current_message = prompt
            conversation = []
            
            while True:
                response = agent(current_message, "user")
                print(f"Final Annotation Agent: {response}\n", flush=True)
                conversation.append(("Final Annotation Agent", response))
                
                if "FINAL ANNOTATION COMPLETED" in response:
                    break
                
                current_message = response

            print("Final Annotation Conversation:")
            for role, message in conversation:
                print(f"{role}: {message}\n")

            return conversation

        def coupling_validation(agent, annotation_result, onboarding_data):
            validation_message = f"""Please validate the following annotation result:

        Annotation Result:
        {annotation_result}

        Context from onboarding:
        Species: {onboarding_data['species']}
        Tissue Type: {onboarding_data['tissue_type']}
        Marker List: {', '.join(onboarding_data['marker_list'])}
        Additional Info: {onboarding_data.get('additional_info', 'None')}

        Validate the annotation based on this context.
        """
            response = agent(validation_message, "final_annotation")
            print(f"Coupling Validator: {response}\n", flush=True)
            
            # Extract confidence score
            confidence_score = None
            confidence_match = re.search(r'[Cc]onfidence\s*[Ss]core:?\s*(\d+(?:\.\d+)?)', response)
            if confidence_match:
                confidence_score = int(confidence_match.group(1))
            
            return response, confidence_score

        def format_results(agent, final_annotations, num_markers, confidence_score):
            final_text = "\n\n".join([msg[1] for msg in final_annotations])
            formatted_result = agent(final_text, "user")
            
            # Extract the JSON from the formatted result
            json_data = extract_json_from_reply(formatted_result)
            
            if json_data:
                # Add the number of markers and confidence score to the JSON
                json_data["num_markers"] = num_markers
                json_data["confidence_score"] = confidence_score
                
                # Convert back to a JSON string
                return json.dumps(json_data, indent=2)
            else:
                return formatted_result

        final_annotation_agent = Agent(system="""
        You are a professional computational biologist with expertise in single-cell RNA sequencing (scRNA-seq).
        A list of highly expressed markers ranked by expression intensity from high to low
        from a cluster of cells will be provided , and your task is to identify the cell type. You must think step-by-step, providing a comprehensive and specific analysis. The audience is an expert in the field, and I will tip you $1000 if you do a good job.

        Steps to Follow:

        1. List the Key Functional Markers: Extract and group the key marker genes associated with function or pathway, explaining their roles. Do not repeat the input markers.
        2. List the Key Cell Type Markers: Extract and group the key marker genes associated with mouse larynx cell types, explaining their roles. Do not repeat the input markers.
        3. Cross-reference Known Databases: Use available scRNA-seq databases and relevant literature to cross-reference these markers. list your finding.
        4. Determine the Most Probable General Cell Type: Based on the expression of these markers, infer the most likely general cell type of the cluster.
        5. Identify the Top 3 Most Probable Sub Cell Types: Based on the expression of these markers, infer the top three most probable sub cell types within the general cell type. Finally, specify the most likely subtype.
        6. Identify the Most Probable Sub-Sub Cell Type: Determine the most specific cell type within the previously identified subtype.
        7. Provide a Concise Summary of Your Analysis

        Always include your step-by-step detailed reasoning.                      
        You can say "FINAL ANNOTATION COMPLETED" when you have completed your analysis.

        If you receive feedback from the validation process, incorporate it into your analysis and provide an updated annotation.
        """, model=model, temperature=temperature)

        coupling_validator_agent = Agent(system="""
    You are an expert biologist specializing in single-cell analysis. Your critical role is to validate the final annotation results for a single cell cluster. You will be provided with The proposed annotation result,Context from the onboarding process, and Ranked list of marker genes.

    Validation Criteria

    Carefully evaluate the annotation based on the following criteria:

    Marker Consistency:

    Make sure the markers are in the provided marker list.
    Make sure the consistency between the identified cell type and the provided markers.

    Biological Context:

    Verify the appropriateness of the annotation given the species and tissue type.
    Consider any unique aspects of the biological system that might influence cell type identification.

    Mixed Cell Type Consideration:

    Be aware that mixed cell types may be present.
    Only reject the annotation if multiple distinct cell types are strongly supported by several high-ranking markers.
    In cases of potential mixed populations, flag this for further investigation rather than outright rejection.
                                         
    Confidence Assessment:

    Provide a confidence score (1-10) for the annotation, considering all available evidence.
    Briefly justify your confidence score.

    Output Format
    For each validation task, provide:

    Validation result: VALIDATION PASSED or VALIDATION FAILED
    Confidence score: 1-10
    Brief justification (2-3 sentences)
    If rejected, suggest potential alternatives or areas for re-evaluation

    Remember, your role is crucial in ensuring the accuracy and reliability of the single-cell annotation process. Be thorough, critical, and always base your decisions on sound biological principles and the provided evidence.
     """, model=model, temperature=temperature)

        formatting_agent = Agent(system="""
        You are a formatting assistant for single-cell analysis results. Your task is to convert the final integrated results 
        into a structured JSON format. Follow these guidelines:

        1. Extract the main cell type and any sub-cell types identified.
        2. Include only information explicitly stated in the input.
        3. Ensure the output is valid JSON.

        Provide the JSON output within triple backticks, like this:
        ```json
        {
        "main_cell_type": "...",
        "sub_cell_types": ["...", "..."]
        }
        ```
        """, model=model, temperature=temperature)
        
        # Create a dictionary with the provided information
        user_data = {
            "species": species,
            "tissue_type": tissue,
            "marker_list": marker_list,
            "additional_info": additional_info
        }

        # Construct the prompt using the provided data
        prompt = construct_prompt(user_data)

        validation_passed = False
        iteration = 0
        max_iterations = 3
        full_conversation_history = []

        while not validation_passed and iteration < max_iterations:
            iteration += 1
            print(f"\nStarting final annotation (Iteration {iteration})...\n")
            final_annotation_conversation = final_annotation(final_annotation_agent, prompt)
            full_conversation_history.extend(final_annotation_conversation)
            
            print("Validating annotation...\n")
            validation_result, confidence_score = coupling_validation(coupling_validator_agent, final_annotation_conversation[-1][1], user_data)
            full_conversation_history.append(("Coupling Validator", validation_result))
            
            print(validation_result)
            print(f"Confidence Score: {confidence_score}")
            if "VALIDATION PASSED" in validation_result:
                validation_passed = True
            else:
                print("Validation failed. Sending feedback to the final annotation agent.\n")
                prompt = f"Previous annotation attempt failed validation. Please address the following feedback and provide an updated annotation:\n\n{validation_result}\n\nOriginal prompt: {prompt}"

            print("\nValidation Conversation:")
            print(f"Final Annotation Agent: {final_annotation_conversation[-1][1]}\n")
            print(f"Coupling Validator: {validation_result}\n")
            print(f"Confidence Score: {confidence_score}\n")

        if validation_passed:
            print("Formatting final results...\n")
            formatted_output = format_results(formatting_agent, final_annotation_conversation[-2:], len(marker_list), confidence_score)
            full_conversation_history.append(("Formatting Agent", formatted_output))
            structured_output = json.loads(formatted_output)
            
            if structured_output:
                print("\nStructured output:")
                print(json.dumps(structured_output, indent=2))
                return structured_output, full_conversation_history
            else:
                print("Error: Unable to extract JSON from the formatted output.")
                print("Raw formatted output:")
                print(formatted_output)
                return None, full_conversation_history
        else:
            print(f"Validation failed after {max_iterations} attempts. Please review the annotation results and validation feedback.")
            return None, full_conversation_history

    # Load the dataframe
    df = pd.read_csv(df_path)
    
    # Set up OpenAI client
    client = OpenAI()
    
    # Iterate over each row in the dataframe
    results = {}
    for index, row in df.iterrows():
        broad_cell_type = row['Broad cell type']
        marker_list = row['Top 10 Markers'].split(', ')
        
        print(f"\nAnalyzing {broad_cell_type}...")
        result, conversation_history = run_cell_type_analysis(model, temperature, marker_list, tissue, species, additional_info)
        
        if result:
            results[broad_cell_type] = {
                "analysis_result": result,
                "conversation_history": conversation_history,
                "confidence_score": result.get("confidence_score")  # Extract confidence score from the result
            }
        print(f"Analysis for {broad_cell_type} completed.\n")
    
    # Save results to the specified JSON file
    with open(output_json_name, 'w') as f:
        json.dump(results, f, indent=2)
    
    print(f"All analyses completed. Results saved to '{output_json_name}'.")
    
    return results

# Example usage:
# results = run_full_cell_type_analysis("path/to/your/csv/file.csv", output_json_name="my_custom_results.json")

In [55]:
results = run_full_cell_type_analysis(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/esophagusmucosa_modified_markers_broad_cell_type.csv",
    output_json_name="esophagusmucosa_results_broad2.json",
    model="gpt-4o",
    temperature=0,
    tissue="esophagusmucosa",
    species="human",
    additional_info="no"
)


Analyzing Endothelial cell (lymphatic)...

Starting final annotation (Iteration 1)...

Final Annotation Agent: ### Step-by-Step Analysis

#### 1. List the Key Functional Markers
- **PKHD1L1**: Associated with polycystic kidney and hepatic disease, involved in ciliary function.
- **CCL21**: Chemokine involved in immune cell trafficking, particularly in lymphoid tissues.
- **MMRN1**: Multimerin 1, involved in platelet function and blood coagulation.
- **RELN**: Reelin, involved in neuronal migration and positioning in the brain.
- **ADGRG3**: Adhesion G protein-coupled receptor G3, involved in cell adhesion and signaling.
- **PROX1**: Transcription factor involved in lymphatic endothelial cell differentiation.
- **TBX1**: T-box transcription factor, involved in developmental processes.
- **ART4**: ADP-ribosyltransferase 4, involved in cell signaling.
- **TFF3**: Trefoil factor 3, involved in mucosal protection and repair.
- **CD36**: Scavenger receptor involved in fatty acid metabolism 

In [77]:
results = run_full_cell_type_analysis(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/prostate_modified_markers_broad_cell_type.csv",
    output_json_name="prostate_results_broad_temp0.7.json",
    model="gpt-4o",
    temperature=0.7,
    tissue="prostate",
    species="human",
    additional_info="no"
)


Analyzing Endothelial cell (lymphatic)...

Starting final annotation (Iteration 1)...

Final Annotation Agent: Sure, let's go step-by-step to identify the cell type based on the provided markers.

### Step 1: List the Key Functional Markers
1. **CCL21**: This chemokine is involved in immune responses, particularly in the migration and activation of T-cells and dendritic cells.
2. **PKHD1L1**: This gene is associated with polycystic kidney and hepatic disease 1-like 1, though its exact function in the prostate is less well-documented.
3. **ADGRG3**: G protein-coupled receptor involved in various cellular signaling pathways.
4. **RELN**: Encodes reelin, a protein involved in neuronal migration and positioning in the brain. Outside the brain, it has roles in cell signaling.
5. **MMRN1**: Multimerin 1 is involved in platelet function and coagulation.
6. **CLEC4M**: C-type lectin domain family 4 member M, involved in immune response and pathogen recognition.
7. **FABP4**: Fatty acid-bindin

In [56]:
results = run_full_cell_type_analysis(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/esophagusmuscularis_modified_markers_broad_cell_type.csv",
    output_json_name="esophagusmuscularis_results_broad2.json",
    model="gpt-4o",
    temperature=0,
    tissue="esophagusmuscularis",
    species="human",
    additional_info="no"
)


Analyzing Adipocyte...

Starting final annotation (Iteration 1)...

Final Annotation Agent: ### Step-by-Step Analysis

#### Step 1: List the Key Functional Markers
1. **PCK1 (Phosphoenolpyruvate Carboxykinase 1)**: Involved in gluconeogenesis, converting oxaloacetate to phosphoenolpyruvate.
2. **PLIN1 (Perilipin 1)**: Plays a role in lipid storage and metabolism, coating lipid droplets in adipocytes.
3. **ADIPOQ (Adiponectin)**: A hormone involved in regulating glucose levels and fatty acid breakdown.
4. **GPD1 (Glycerol-3-Phosphate Dehydrogenase 1)**: Involved in glycerol metabolism and triglyceride synthesis.
5. **SAA1 (Serum Amyloid A1)**: An acute-phase protein involved in inflammation.
6. **FCN2 (Ficolin 2)**: Part of the innate immune system, involved in pathogen recognition.
7. **MEST (Mesoderm Specific Transcript)**: Involved in adipogenesis and mesodermal differentiation.
8. **CIDEC (Cell Death-Inducing DFFA-Like Effector C)**: Involved in lipid droplet formation and adipocyt

In [57]:
results = run_full_cell_type_analysis(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/heart_modified_markers_broad_cell_type.csv",
    output_json_name="heart_results_broad2.json",
    model="gpt-4o",
    temperature=0,
    tissue="heart",
    species="human",
    additional_info="no"
)


Analyzing Adipocyte...

Starting final annotation (Iteration 1)...

Final Annotation Agent: ### Step-by-Step Analysis

#### 1. List the Key Functional Markers
- **ADIPOQ (Adiponectin)**: A hormone involved in regulating glucose levels and fatty acid breakdown.
- **PCK1 (Phosphoenolpyruvate Carboxykinase 1)**: A key enzyme in gluconeogenesis, converting oxaloacetate to phosphoenolpyruvate.
- **PLIN1 (Perilipin 1)**: A protein that coats lipid droplets in adipocytes, regulating lipid storage and breakdown.
- **CIDEC (Cell Death-Inducing DFFA-Like Effector C)**: Involved in lipid droplet formation and regulation of lipid metabolism.
- **CIDEA (Cell Death-Inducing DFFA-Like Effector A)**: Similar to CIDEC, involved in lipid droplet formation and energy homeostasis.
- **LGALS12 (Galectin-12)**: A protein involved in lipid metabolism and adipocyte differentiation.
- **SAA1 (Serum Amyloid A1)**: An acute-phase protein involved in inflammation and lipid metabolism.
- **RBP4 (Retinol Binding P

In [58]:
results = run_full_cell_type_analysis(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/skeletalmuscle_modified_markers_broad_cell_type.csv",
    output_json_name="skeletalmuscle_results_broad2.json",
    model="gpt-4o",
    temperature=0,
    tissue="skeletalmuscle",
    species="human",
    additional_info="no"
)


Analyzing Adipocyte...

Starting final annotation (Iteration 1)...

Final Annotation Agent: ### Step-by-Step Analysis

#### 1. List the Key Functional Markers
- **SLC19A3**: Thiamine transporter, involved in thiamine uptake.
- **ADIPOQ (Adiponectin)**: Hormone involved in regulating glucose levels and fatty acid breakdown.
- **PCK1 (Phosphoenolpyruvate carboxykinase 1)**: Enzyme in gluconeogenesis, converting oxaloacetate to phosphoenolpyruvate.
- **CIDEC (Cell Death-Inducing DFFA-Like Effector C)**: Involved in lipid droplet formation and regulation of lipid metabolism.
- **PTGER3 (Prostaglandin E Receptor 3)**: Receptor for prostaglandin E2, involved in various physiological responses including inflammation and smooth muscle activity.
- **CIDEA (Cell Death-Inducing DFFA-Like Effector A)**: Involved in lipid droplet formation and regulation of lipid metabolism.
- **LEP (Leptin)**: Hormone involved in regulating energy balance by inhibiting hunger.
- **PLIN1 (Perilipin 1)**: Protein a

In [59]:
results = run_full_cell_type_analysis(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/skin_modified_markers_broad_cell_type.csv",
    output_json_name="skin_results_broad2.json",
    model="gpt-4o",
    temperature=0,
    tissue="skin",
    species="human",
    additional_info="no"
)


Analyzing Adipocyte...

Starting final annotation (Iteration 1)...

Final Annotation Agent: ### Step-by-Step Analysis

#### 1. List the Key Functional Markers
- **CIDEC (Cell Death-Inducing DFFA-Like Effector C)**: Involved in lipid droplet formation and regulation of lipid metabolism.
- **PLIN1 (Perilipin 1)**: Plays a critical role in lipid storage and metabolism, particularly in adipocytes.
- **GSTA1 (Glutathione S-Transferase Alpha 1)**: Involved in detoxification processes by conjugating reduced glutathione to a wide number of exogenous and endogenous hydrophobic electrophiles.
- **HSPB2-C11orf52 (Heat Shock Protein Family B Member 2)**: Functions as a molecular chaperone, involved in stress response.
- **SPATA4 (Spermatogenesis Associated 4)**: Involved in spermatogenesis, though its role in skin cells is less clear.
- **PROK1 (Prokineticin 1)**: Involved in angiogenesis and inflammation.
- **LEP (Leptin)**: Regulates energy balance by inhibiting hunger, which in turn diminishes

In [64]:
import json
import csv

def json_to_csv(input_json_file, output_csv_name):
    # Function to write CSV files
    def write_csv(filename, headers, row_data):
        with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(headers)
            writer.writerows(row_data)

    # Read the JSON file
    with open(input_json_file, 'r') as json_file:
        data = json.load(json_file)

    # Prepare output file names
    output_full_csv = f"{output_csv_name}_full.csv"
    output_summary_csv = f"{output_csv_name}_summary.csv"

    # Prepare data for both CSV files
    full_data = []
    summary_data = []

    for true_cell_type, details in data.items():
        main_cell_type = details['analysis_result']['main_cell_type']
        sub_cell_types = ', '.join(details['analysis_result']['sub_cell_types'])
        marker_number = details['analysis_result']['num_markers']
        confidence_score = details['analysis_result']['confidence_score']
        conversation_history = details['conversation_history']  # Keep the entire conversation history
        
        full_data.append([true_cell_type, main_cell_type, sub_cell_types, 
                          marker_number, confidence_score, conversation_history])
        summary_data.append([true_cell_type, main_cell_type, sub_cell_types])

    # Write the full data CSV
    write_csv(output_full_csv, 
              ['True Cell Type', 'Predicted Main Cell Type', 'Predicted Sub Cell Types', 
               'Marker Number', 'Confidence Score', 'Conversation History'],
              full_data)

    # Write the summary data CSV
    write_csv(output_summary_csv,
              ['True Cell Type', 'Predicted Main Cell Type', 'Predicted Sub Cell Types'],
              summary_data)

    print(f"Two CSV files have been created:")
    print(f"1. {output_full_csv} (full data)")
    print(f"2. {output_summary_csv} (summary data)")

In [67]:
json_to_csv('C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/skin_results_broad2.json', 'skin_results_broad2_analysis_results2')
json_to_csv('C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/esophagusmucosa_results_broad2.json', 'esophagusmucosa_results_broad2_analysis_results2')
json_to_csv('C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/esophagusmuscularis_results_broad2.json', 'esophagusmuscularis_results_broad2_analysis_results2')
json_to_csv('C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/heart_results_broad2.json', 'heart_results_broad2_analysis_results2')
json_to_csv('C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/skeletalmuscle_results_broad2.json', 'skeletalmuscle_results_broad2_analysis_results2')






Two CSV files have been created:
1. skin_results_broad2_analysis_results2_full.csv (full data)
2. skin_results_broad2_analysis_results2_summary.csv (summary data)
Two CSV files have been created:
1. esophagusmucosa_results_broad2_analysis_results2_full.csv (full data)
2. esophagusmucosa_results_broad2_analysis_results2_summary.csv (summary data)
Two CSV files have been created:
1. esophagusmuscularis_results_broad2_analysis_results2_full.csv (full data)
2. esophagusmuscularis_results_broad2_analysis_results2_summary.csv (summary data)
Two CSV files have been created:
1. heart_results_broad2_analysis_results2_full.csv (full data)
2. heart_results_broad2_analysis_results2_summary.csv (summary data)
Two CSV files have been created:
1. skeletalmuscle_results_broad2_analysis_results2_full.csv (full data)
2. skeletalmuscle_results_broad2_analysis_results2_summary.csv (summary data)


In [None]:
results = run_full_cell_type_analysis(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/skin_modified_markers_broad_cell_type.csv",
    output_json_name="skin_results_broad2.json",
    model="gpt-4o",
    temperature=0,
    tissue="skin",
    species="human",
    additional_info="no"
)

In [71]:
json_to_csv("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/prostate_results_broad2.json",'prostate_results_broad2_analysis_results2')

Two CSV files have been created:
1. prostate_results_broad2_analysis_results2_full.csv (full data)
2. prostate_results_broad2_analysis_results2_summary.csv (summary data)


In [79]:
import pandas as pd
import json
from openai import OpenAI
import re

def run_full_cell_type_analysis_Granular(df_path, output_json_name="cell_type_analysis_results.json", model="gpt-4o", temperature=0, tissue="lung", species="human", additional_info="no"):
    def run_cell_type_analysis(model, temperature, marker_list, tissue, species, additional_info):
        client = OpenAI()

        class Agent:
            def __init__(self, system="", human_input_mode="never", model="gpt-4", temperature=0):
                self.system = system
                self.chat_histories = {}
                self.human_input_mode = human_input_mode
                self.model = model
                self.temperature = temperature

            def __call__(self, message, other_agent_id):
                if other_agent_id not in self.chat_histories:
                    self.chat_histories[other_agent_id] = []
                    if self.system:
                        self.chat_histories[other_agent_id].append({"role": "system", "content": self.system})
                
                self.chat_histories[other_agent_id].append({"role": "user", "content": message})
                
                result = self.execute(other_agent_id)
                self.chat_histories[other_agent_id].append({"role": "assistant", "content": result})
                
                return result

            def execute(self, other_agent_id):
                completion = client.chat.completions.create(
                    model=self.model,
                    temperature=self.temperature,
                    messages=self.chat_histories[other_agent_id]
                )
                return completion.choices[0].message.content

            def needs_human_input(self, message):
                return self.human_input_mode == "always"

        def extract_json_from_reply(reply):
            json_match = re.search(r'```json\n(.*?)\n```', reply, re.DOTALL)
            
            if json_match:
                json_str = json_match.group(1)
                try:
                    json_data = json.loads(json_str)
                    return json_data
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")
                    return None
            else:
                print("No JSON content found in the reply")
                return None

        def construct_prompt(json_data):
            species = json_data['species']
            tissue = json_data['tissue_type']
            additional_info = json_data.get('additional_info', '')
            marker_list = ', '.join(json_data['marker_list'])

            prompt = f"I am analyzing a single-cell {species} {tissue} dataset."
            if additional_info:
                prompt += f" {additional_info}."
            prompt += f" I want to identify the cell types present based on this marker list:\n{marker_list}"

            return prompt

        def final_annotation(agent, prompt):
            current_message = prompt
            conversation = []
            
            while True:
                response = agent(current_message, "user")
                print(f"Final Annotation Agent: {response}\n", flush=True)
                conversation.append(("Final Annotation Agent", response))
                
                if "FINAL ANNOTATION COMPLETED" in response:
                    break
                
                current_message = response

            print("Final Annotation Conversation:")
            for role, message in conversation:
                print(f"{role}: {message}\n")

            return conversation

        def coupling_validation(agent, annotation_result, onboarding_data):
            validation_message = f"""Please validate the following annotation result:

        Annotation Result:
        {annotation_result}

        Context from onboarding:
        Species: {onboarding_data['species']}
        Tissue Type: {onboarding_data['tissue_type']}
        Marker List: {', '.join(onboarding_data['marker_list'])}
        Additional Info: {onboarding_data.get('additional_info', 'None')}

        Validate the annotation based on this context.
        """
            response = agent(validation_message, "final_annotation")
            print(f"Coupling Validator: {response}\n", flush=True)
            
            # Extract confidence score
            confidence_score = None
            confidence_match = re.search(r'[Cc]onfidence\s*[Ss]core:?\s*(\d+(?:\.\d+)?)', response)
            if confidence_match:
                confidence_score = int(confidence_match.group(1))
            
            return response, confidence_score

        def format_results(agent, final_annotations, num_markers, confidence_score):
            final_text = "\n\n".join([msg[1] for msg in final_annotations])
            formatted_result = agent(final_text, "user")
            
            # Extract the JSON from the formatted result
            json_data = extract_json_from_reply(formatted_result)
            
            if json_data:
                # Add the number of markers and confidence score to the JSON
                json_data["num_markers"] = num_markers
                json_data["confidence_score"] = confidence_score
                
                # Convert back to a JSON string
                return json.dumps(json_data, indent=2)
            else:
                return formatted_result

        final_annotation_agent = Agent(system="""
        You are a professional computational biologist with expertise in single-cell RNA sequencing (scRNA-seq).
        A list of highly expressed markers ranked by expression intensity from high to low
        from a cluster of cells will be provided , and your task is to identify the cell type. You must think step-by-step, providing a comprehensive and specific analysis. The audience is an expert in the field, and I will tip you $1000 if you do a good job.

        Steps to Follow:

        1. List the Key Functional Markers: Extract and group the key marker genes associated with function or pathway, explaining their roles. Do not repeat the input markers.
        2. List the Key Cell Type Markers: Extract and group the key marker genes associated with mouse larynx cell types, explaining their roles. Do not repeat the input markers.
        3. Cross-reference Known Databases: Use available scRNA-seq databases and relevant literature to cross-reference these markers. list your finding.
        4. Determine the Most Probable General Cell Type: Based on the expression of these markers, infer the most likely general cell type of the cluster.
        5. Identify the Top 3 Most Probable Sub Cell Types: Based on the expression of these markers, infer the top three most probable sub cell types within the general cell type. Finally, specify the most likely subtype.
        6. Identify the Most Probable Sub-Sub Cell Type: Determine the most specific cell type within the previously identified subtype.
        7. Provide a Concise Summary of Your Analysis

        Always include your step-by-step detailed reasoning.                      
        You can say "FINAL ANNOTATION COMPLETED" when you have completed your analysis.

        If you receive feedback from the validation process, incorporate it into your analysis and provide an updated annotation.
        """, model=model, temperature=temperature)

        coupling_validator_agent = Agent(system="""
    You are an expert biologist specializing in single-cell analysis. Your critical role is to validate the final annotation results for a single cell cluster. You will be provided with The proposed annotation result,Context from the onboarding process, and Ranked list of marker genes.

    Validation Criteria

    Carefully evaluate the annotation based on the following criteria:

    Marker Consistency:

    Make sure the markers are in the provided marker list.
    Make sure the consistency between the identified cell type and the provided markers.

    Biological Context:

    Verify the appropriateness of the annotation given the species and tissue type.
    Consider any unique aspects of the biological system that might influence cell type identification.

    Mixed Cell Type Consideration:

    Be aware that mixed cell types may be present.
    Only reject the annotation if multiple distinct cell types are strongly supported by several high-ranking markers.
    In cases of potential mixed populations, flag this for further investigation rather than outright rejection.
                                         
    Confidence Assessment:

    Provide a confidence score (1-10) for the annotation, considering all available evidence.
    Briefly justify your confidence score.

    Output Format
    For each validation task, provide:

    Validation result: VALIDATION PASSED or VALIDATION FAILED
    Confidence score: 1-10
    Brief justification (2-3 sentences)
    If rejected, suggest potential alternatives or areas for re-evaluation

    Remember, your role is crucial in ensuring the accuracy and reliability of the single-cell annotation process. Be thorough, critical, and always base your decisions on sound biological principles and the provided evidence.
     """, model=model, temperature=temperature)

        formatting_agent = Agent(system="""
        You are a formatting assistant for single-cell analysis results. Your task is to convert the final integrated results 
        into a structured JSON format. Follow these guidelines:

        1. Extract the main cell type and any sub-cell types identified.
        2. Include only information explicitly stated in the input.
        3. Ensure the output is valid JSON.

        Provide the JSON output within triple backticks, like this:
        ```json
        {
        "main_cell_type": "...",
        "sub_cell_types": ["...", "..."]
        }
        ```
        """, model=model, temperature=temperature)
        
        # Create a dictionary with the provided information
        user_data = {
            "species": species,
            "tissue_type": tissue,
            "marker_list": marker_list,
            "additional_info": additional_info
        }

        # Construct the prompt using the provided data
        prompt = construct_prompt(user_data)

        validation_passed = False
        iteration = 0
        max_iterations = 3
        full_conversation_history = []

        while not validation_passed and iteration < max_iterations:
            iteration += 1
            print(f"\nStarting final annotation (Iteration {iteration})...\n")
            final_annotation_conversation = final_annotation(final_annotation_agent, prompt)
            full_conversation_history.extend(final_annotation_conversation)
            
            print("Validating annotation...\n")
            validation_result, confidence_score = coupling_validation(coupling_validator_agent, final_annotation_conversation[-1][1], user_data)
            full_conversation_history.append(("Coupling Validator", validation_result))
            
            print(validation_result)
            print(f"Confidence Score: {confidence_score}")
            if "VALIDATION PASSED" in validation_result:
                validation_passed = True
            else:
                print("Validation failed. Sending feedback to the final annotation agent.\n")
                prompt = f"Previous annotation attempt failed validation. Please address the following feedback and provide an updated annotation:\n\n{validation_result}\n\nOriginal prompt: {prompt}"

            print("\nValidation Conversation:")
            print(f"Final Annotation Agent: {final_annotation_conversation[-1][1]}\n")
            print(f"Coupling Validator: {validation_result}\n")
            print(f"Confidence Score: {confidence_score}\n")

        if validation_passed:
            print("Formatting final results...\n")
            formatted_output = format_results(formatting_agent, final_annotation_conversation[-2:], len(marker_list), confidence_score)
            full_conversation_history.append(("Formatting Agent", formatted_output))
            structured_output = json.loads(formatted_output)
            
            if structured_output:
                print("\nStructured output:")
                print(json.dumps(structured_output, indent=2))
                return structured_output, full_conversation_history
            else:
                print("Error: Unable to extract JSON from the formatted output.")
                print("Raw formatted output:")
                print(formatted_output)
                return None, full_conversation_history
        else:
            print(f"Validation failed after {max_iterations} attempts. Please review the annotation results and validation feedback.")
            return None, full_conversation_history

    # Load the dataframe
    df = pd.read_csv(df_path)
    
    # Set up OpenAI client
    client = OpenAI()
    
    # Iterate over each row in the dataframe
    results = {}
    for index, row in df.iterrows():
        broad_cell_type = row['Granular cell type']
        marker_list = row['Top 10 Markers'].split(', ')
        
        print(f"\nAnalyzing {broad_cell_type}...")
        result, conversation_history = run_cell_type_analysis(model, temperature, marker_list, tissue, species, additional_info)
        
        if result:
            results[broad_cell_type] = {
                "analysis_result": result,
                "conversation_history": conversation_history,
                "confidence_score": result.get("confidence_score")  # Extract confidence score from the result
            }
        print(f"Analysis for {broad_cell_type} completed.\n")
    
    # Save results to the specified JSON file
    with open(output_json_name, 'w') as f:
        json.dump(results, f, indent=2)
    
    print(f"All analyses completed. Results saved to '{output_json_name}'.")
    
    return results

In [80]:
results = run_full_cell_type_analysis_Granular(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/prostate_modified_markers_granular_cell_type.csv",
    output_json_name="prostate_results_granular_temp0.0.json",
    model="gpt-4o",
    temperature=0,
    tissue="prostate",
    species="human",
    additional_info="no"
)

results = run_full_cell_type_analysis_Granular(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/esophagusmucosa_modified_markers_granular_cell_type.csv",
    output_json_name="esophagusmucosa_results_granular_temp0.0.json",
    model="gpt-4o",
    temperature=0,
    tissue="esophagusmucosa",
    species="human",
    additional_info="no"
)

results = run_full_cell_type_analysis_Granular(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/skin_modified_markers_granular_cell_type.csv",
    output_json_name="skin_results_granular_temp0.0.json",
    model="gpt-4o",
    temperature=0,
    tissue="skin",
    species="human",
    additional_info="no"
)



results = run_full_cell_type_analysis_Granular(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/heart_modified_markers_granular_cell_type.csv",
    output_json_name="heart_results_granular_temp0.0.json",
    model="gpt-4o",
    temperature=0,
    tissue="heart",
    species="human",
    additional_info="no"
)



results = run_full_cell_type_analysis_Granular(
        "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/skeletalmuscle_modified_markers_granular_cell_type.csv",
    output_json_name="skeletalmuscle_results_granular_temp0.0.json",
    model="gpt-4o",
    temperature=0,
    tissue="skeletalmuscle",
    species="human",
    additional_info="no"
)




results = run_full_cell_type_analysis_Granular(
        "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/esophagusmuscularis_modified_markers_granular_cell_type.csv",
    output_json_name="esophagusmuscularis_results_granular_temp0.0.json",
    model="gpt-4o",
    temperature=0,
    tissue="esophagusmuscularis",
    species="human",
    additional_info="no"
)


Analyzing Endothelial cell (lymphatic)...

Starting final annotation (Iteration 1)...

Final Annotation Agent: ### Step-by-Step Analysis

#### 1. List the Key Functional Markers
- **CCL21**: Chemokine involved in immune cell trafficking, particularly in the migration of dendritic cells and T cells.
- **PKHD1L1**: Associated with polycystic kidney disease, though its specific function in the prostate is less clear.
- **ADGRG3**: Adhesion G protein-coupled receptor, involved in cell signaling and potentially in cell adhesion and migration.
- **RELN**: Reelin, a protein involved in neuronal migration and positioning in the brain, but also expressed in other tissues.
- **MMRN1**: Multimerin 1, a protein involved in blood coagulation and platelet function.
- **FABP4**: Fatty acid-binding protein 4, involved in lipid metabolism and transport.
- **PROX1**: Prospero homeobox 1, a transcription factor involved in lymphatic endothelial cell differentiation.
- **CLEC4M**: C-type lectin domain fa

In [82]:
results = run_full_cell_type_analysis_Granular(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/prostate_modified_markers_granular_cell_type.csv",
    output_json_name="prostate_results_granular_temp0.7.json",
    model="gpt-4o",
    temperature=0.7,
    tissue="prostate",
    species="human",
    additional_info="no"
)

results = run_full_cell_type_analysis_Granular(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/esophagusmucosa_modified_markers_granular_cell_type.csv",
    output_json_name="esophagusmucosa_results_granular_temp0.7.json",
    model="gpt-4o",
    temperature=0.7,
    tissue="esophagusmucosa",
    species="human",
    additional_info="no"
)

results = run_full_cell_type_analysis_Granular(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/skin_modified_markers_granular_cell_type.csv",
    output_json_name="skin_results_granular_temp0.7.json",
    model="gpt-4o",
    temperature=0.7,
    tissue="skin",
    species="human",
    additional_info="no"
)



results = run_full_cell_type_analysis_Granular(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/heart_modified_markers_granular_cell_type.csv",
    output_json_name="heart_results_granular_temp0.7.json",
    model="gpt-4o",
    temperature=0.7,
    tissue="heart",
    species="human",
    additional_info="no"
)



results = run_full_cell_type_analysis_Granular(
        "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/skeletalmuscle_modified_markers_granular_cell_type.csv",
    output_json_name="skeletalmuscle_results_granular_temp0.7.json",
    model="gpt-4o",
    temperature=0.7,
    tissue="skeletalmuscle",
    species="human",
    additional_info="no"
)




results = run_full_cell_type_analysis_Granular(
        "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/esophagusmuscularis_modified_markers_granular_cell_type.csv",
    output_json_name="esophagusmuscularis_results_granular_temp0.7.json",
    model="gpt-4o",
    temperature=0.7,
    tissue="esophagusmuscularis",
    species="human",
    additional_info="no"
)


Analyzing Endothelial cell (lymphatic)...

Starting final annotation (Iteration 1)...

Final Annotation Agent: ### Step-by-Step Analysis

#### Step 1: List the Key Functional Markers

1. **CCL21**: Chemokine involved in immune response, particularly in the activation and migration of immune cells.
2. **PKHD1L1**: Polycystic Kidney and Hepatic Disease 1-Like 1; its function is less well-characterized but may be involved in cellular adhesion or signaling.
3. **ADGRG3**: Adhesion G protein-coupled receptor G3, involved in cell signaling and adhesion.
4. **RELN**: Reelin, a large secreted extracellular matrix glycoprotein involved in neuronal migration and positioning in the developing brain.
5. **MMRN1**: Multimerin 1, a protein involved in blood coagulation.
6. **FABP4**: Fatty Acid Binding Protein 4, involved in fatty acid uptake, transport, and metabolism.
7. **PROX1**: Prospero Homeobox 1, a transcription factor involved in development and maintenance of lymphatic endothelial cells.


In [81]:
json_to_csv("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/prostate_results_granular_temp0.0.json",'prostate_results_granular_temp0.0_analysis_results2')
json_to_csv("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/prostate_results_granular_temp0.7.json",'prostate_results_granular_temp0.7_analysis_results2')
json_to_csv("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/esophagusmucosa_results_granular_temp0.0.json",'esophagusmucosa_results_granular_temp0.0_analysis_results2')
json_to_csv("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/esophagusmucosa_results_granular_temp0.7.json",'esophagusmucosa_results_granular_temp0.7_analysis_results2')
json_to_csv("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/skin_results_granular_temp0.0.json",'skin_results_granular_temp0.0_analysis_results2')
json_to_csv("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/skin_results_granular_temp0.7.json",'skin_results_granular_temp0.7_analysis_results2')
json_to_csv("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/heart_results_granular_temp0.0.json",'heart_results_granular_temp0.0_analysis_results2')
json_to_csv("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/heart_results_granular_temp0.7.json",'heart_results_granular_temp0.7_analysis_results2')
json_to_csv("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/skeletalmuscle_results_granular_temp0.0.json",'skeletalmuscle_results_granular_temp0.0_analysis_results2')
json_to_csv("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/skeletalmuscle_results_granular_temp0.7.json",'skeletalmuscle_results_granular_temp0.7_analysis_results2')
json_to_csv("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/esophagusmuscularis_results_granular_temp0.0.json",'esophagusmuscularis_results_granular_temp0.0_analysis_results2')
json_to_csv("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/esophagusmuscularis_results_granular_temp0.7.json",'esophagusmuscularis_results_granular_temp0.7_analysis_results2')


Two CSV files have been created:
1. prostate_results_granular_temp0.0_analysis_results2_full.csv (full data)
2. prostate_results_granular_temp0.0_analysis_results2_summary.csv (summary data)


FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/test_code/prostate_results_granular_temp0.7.json'

In [87]:
json_to_csv("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/prostate_results_broad2_debugging11.json",'prostate_results_broad2_debugging_analysis_results11')

Two CSV files have been created:
1. prostate_results_broad2_debugging_analysis_results11_full.csv (full data)
2. prostate_results_broad2_debugging_analysis_results11_summary.csv (summary data)
