In [1]:
import scanpy as sc
import pandas as pd
import scanpy as sc
import json
import csv

def process_tissue_data(adata, tissue_type):
    """
    Extract a single tissue from a multi-tissue AnnData object and preprocess it.

    The cells whose ``obs['tissue']`` value equals ``tissue_type`` are copied out,
    the unique labels of four annotation columns are printed for inspection, and
    the copy is then normalized, log-transformed and batch-corrected.

    Parameters:
    adata (AnnData): The original AnnData object containing all tissues.
    tissue_type (str): The tissue type to process (e.g., 'prostate', 'lung').

    Returns:
    AnnData: Processed copy containing only the requested tissue; the input
        object itself is not modified.
    """
    # Work on an independent copy so the preprocessing below never touches `adata`.
    is_target_tissue = adata.obs['tissue'] == tissue_type
    tissue_adata = adata[is_target_tissue].copy()

    # (obs column, printed heading) for every annotation level that is reported.
    annotation_headings = (
        ('Broad cell type', 'Broad cell types'),
        ('Granular cell type', 'Granular cell types'),
        ('Cell types level 2', 'Cell types level 2'),
        ('Cell types level 3', 'Cell types level 3'),
    )
    for position, (column, heading) in enumerate(annotation_headings):
        # Every heading except the first is preceded by a blank line.
        leading_break = "\n" if position else ""
        print(f"{leading_break}{tissue_type.capitalize()} {heading}:")
        print(tissue_adata.obs[column].unique())

    # Scale every cell to 10,000 total counts, then apply log(1 + x).
    sc.pp.normalize_total(tissue_adata, target_sum=1e4)
    sc.pp.log1p(tissue_adata)

    # ComBat batch correction keyed on the 'batch' annotation.
    sc.pp.combat(tissue_adata, key='batch')

    return tissue_adata

In [2]:


def analyze_and_export_markers(adata, annotation_levels, n_genes=10):
    """
    Rank marker genes for each annotation level, print a preview and export CSVs.

    For every level two tables are produced: the unmodified scanpy ranking
    ("default") and a one-row-per-group table whose ``top_markers`` column holds
    the ``n_genes`` genes with the highest log fold change, joined by ", "
    ("modified"). Both tables are written to CSV files in the current working
    directory; the file names start with the lower-cased tissue of the first cell.

    Parameters:
    adata (AnnData): The annotated data matrix.
    annotation_levels (list): List of column names in adata.obs to use for grouping.
    n_genes (int): Number of top genes to include in the modified output.

    Returns:
    dict: ``{level: {'default': DataFrame, 'modified': DataFrame}}`` for each
        annotation level.
    """
    def analyze_markers(adata, groupby, n_genes):
        # t-test ranking of genes for every group of `groupby` (use_raw=False).
        sc.tl.rank_genes_groups(adata, groupby=groupby, method='t-test', use_raw=False)
        default_markers = sc.get.rank_genes_groups_df(adata, group=None)

        # Within each group, order genes from highest to lowest log fold change.
        ranked = default_markers.copy()
        ranked = ranked.sort_values(['group', 'logfoldchanges'], ascending=[True, False])

        def join_top_names(group_rows):
            return ', '.join(group_rows['names'].head(n_genes))

        top_markers = ranked.groupby('group').apply(join_top_names).reset_index()
        top_markers.columns = [groupby, 'top_markers']
        return default_markers, top_markers

    results = {}
    # The tissue of the first cell is used as the prefix of every exported file.
    tissue_type = adata.obs['tissue'].iloc[0].lower()

    for level in annotation_levels:
        print(f"Analyzing {level}...")
        default_df, modified_df = analyze_markers(adata, level, n_genes)
        results[level] = {'default': default_df, 'modified': modified_df}
        print(f"Analysis for {level} completed.")

        print(f"\n--- Results for {level} ---")
        print("\nDefault Scanpy output (first 10 rows):")
        print(default_df.head(10))
        print("\nModified output:")
        print(modified_df.head())

        # File-name friendly form of the level name, e.g. "Broad cell type" -> "broad_cell_type".
        level_slug = level.replace(' ', '_').lower()
        default_filename = f"{tissue_type}_default_markers_{level_slug}_{n_genes}genes.csv"
        modified_filename = f"{tissue_type}_modified_markers_{level_slug}_{n_genes}genes.csv"

        default_df.to_csv(default_filename, index=False)
        modified_df.to_csv(modified_filename, index=False)

        print(f"Results for {level} exported to CSV files: {default_filename} and {modified_filename}")

    print(f"\nAll analyses completed and results exported for {tissue_type} dataset.")
    return results



In [None]:
def split_markers(marker_string):
    """
    Split a marker-gene string into a list of individual marker names.

    Input that contains commas is split on the commas (any whitespace after a
    comma is swallowed). Input without a comma is split on whitespace instead.
    Each marker is stripped of surrounding whitespace and empty entries are dropped.

    Parameters:
    marker_string (str): Marker names separated by ", ", "," or whitespace.
        ``None`` and float NaN (e.g. an empty CSV cell read by pandas) are
        treated as "no markers".

    Returns:
    list: Marker names in their original order; empty when there are none.
    """
    # Imported locally because the notebook's import cell does not import `re`.
    import re

    # Missing values carry no markers; return early instead of raising TypeError.
    is_nan = isinstance(marker_string, float) and marker_string != marker_string
    if marker_string is None or is_nan:
        return []

    # r',\s*' already covers both "a, b" and "a,b", so no separate
    # comma-only fallback is needed.
    markers = re.split(r',\s*', marker_string)

    # No comma present: fall back to whitespace-separated markers.
    if len(markers) == 1:
        markers = marker_string.split()

    # Strip padding and drop empty strings (e.g. from trailing commas).
    return [m.strip() for m in markers if m.strip()]

In [77]:
def check_formatted_output(structured_output):
    """
    Report whether a structured result contains the required annotation keys.

    Parameters:
    structured_output: Container to inspect (typically the dict parsed from the
        formatting agent's JSON reply). ``None`` is accepted and counts as
        invalid, since JSON extraction returns ``None`` when parsing fails.

    Returns:
    bool: True only when both 'main_cell_type' and 'sub_cell_types' are present.
    """
    # A failed JSON extraction yields None; `in` on None would raise TypeError.
    if structured_output is None:
        return False
    required_keys = ('main_cell_type', 'sub_cell_types')
    return all(key in structured_output for key in required_keys)

def rerun_formatting_agent(agent, full_conversation_history):
    """
    Send the whole conversation to the formatting agent and parse its JSON reply.

    Parameters:
    agent: Callable invoked as ``agent(message, "user")`` that returns the reply text.
    full_conversation_history: Iterable of ``(role, message)`` pairs.

    Returns:
    Whatever ``extract_json_from_reply`` returns for the agent's reply.
    """
    # Render each turn as "role: message", with a blank line between turns.
    transcript_entries = []
    for role, message in full_conversation_history:
        transcript_entries.append(f"{role}: {message}")
    transcript = "\n\n".join(transcript_entries)

    reply = agent(transcript, "user")
    # NOTE(review): extract_json_from_reply must be available at module scope here;
    # in the visible cells it is only defined nested inside other functions — confirm.
    return extract_json_from_reply(reply)

In [None]:
def run_cell_type_analysis(model, temperature, marker_list, tissue, species, additional_info):
    """
    Annotate a cell cluster from its marker genes with a three-agent LLM pipeline.

    An annotation agent proposes a cell type, a validator agent checks the
    proposal against the marker list and, only when validation passes, a
    formatting agent converts the accepted annotation into JSON. Annotation and
    validation are repeated up to three times; after each failure the validator's
    feedback is sent back to the annotation agent together with the original task.

    Parameters:
    model (str): Model name passed to the chat-completions API.
    temperature (float): Sampling temperature passed to the API.
    marker_list (list): Ranked marker gene names for the cluster.
    tissue (str): Tissue the dataset comes from.
    species (str): Species the dataset comes from.
    additional_info (str): Optional free-text context; ignored when empty or "no".

    Returns:
    tuple: ``(structured_output, full_conversation_history)``.
        ``structured_output`` is the parsed JSON dict with ``iterations`` added
        when validation passed and the formatter's reply could be parsed,
        ``None`` when validation passed but no JSON dict could be parsed, and
        ``{"iterations": n}`` when validation never passed.
        ``full_conversation_history`` is a list of ``(role, message)`` tuples.
    """
    # Imported locally because the notebook's import cell does not import `re`.
    import re

    # NOTE(review): `OpenAI` must already be in scope (presumably
    # `from openai import OpenAI`); no such import is visible in this section.
    client = OpenAI()

    class Agent:
        """Chat wrapper that keeps one message history per conversation partner."""

        def __init__(self, system="", human_input_mode="never", model="gpt-4", temperature=0):
            self.system = system
            self.chat_histories = {}  # other_agent_id -> list of chat messages
            self.human_input_mode = human_input_mode
            self.model = model
            self.temperature = temperature

        def __call__(self, message, other_agent_id):
            """Append `message` as a user turn, query the model, and return its reply."""
            if other_agent_id not in self.chat_histories:
                # First contact: start the history with the system prompt (if any).
                self.chat_histories[other_agent_id] = []
                if self.system:
                    self.chat_histories[other_agent_id].append({"role": "system", "content": self.system})

            self.chat_histories[other_agent_id].append({"role": "user", "content": message})

            result = self.execute(other_agent_id)
            self.chat_histories[other_agent_id].append({"role": "assistant", "content": result})

            return result

        def execute(self, other_agent_id):
            """Send the stored history for `other_agent_id` to the API and return the reply text."""
            completion = client.chat.completions.create(
                model=self.model,
                temperature=self.temperature,
                messages=self.chat_histories[other_agent_id]
            )
            # The message content may be None; normalise to "" so the substring
            # checks on the reply further down cannot raise TypeError.
            return completion.choices[0].message.content or ""

        def needs_human_input(self, message):
            return self.human_input_mode == "always"

    def extract_json_from_reply(reply):
        """Return the object parsed from the first ```json fenced block, or None."""
        # \s* (instead of a bare newline) also accepts fences that are indented or
        # directly adjacent to the JSON, as in the formatter's own example.
        json_match = re.search(r'```json\s*(.*?)\s*```', reply, re.DOTALL)

        if json_match:
            json_str = json_match.group(1)
            try:
                json_data = json.loads(json_str)
                return json_data
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                return None
        else:
            print("No JSON content found in the reply")
            return None

    def construct_prompt(json_data):
        """Build the initial annotation task prompt from the user-supplied data."""
        species = json_data['species']
        tissue = json_data['tissue_type']
        additional_info = json_data.get('additional_info')
        marker_list = ', '.join(json_data['marker_list'])

        prompt = f"Your task is to annotate a single-cell {species} {tissue} dataset. Please identify the cell type based on this ranked marker list:\n{marker_list}"

        if additional_info and additional_info.lower() != "no":
            prompt += f" below is some additional information about the dataset:\n{additional_info}."

        return prompt

    def final_annotation(agent, prompt, max_turns=10):
        """
        Query the annotation agent until it emits "FINAL ANNOTATION COMPLETED".

        Each unfinished reply is fed back to the agent as the next message. The
        exchange is capped at `max_turns` so a model that never emits the marker
        cannot trigger an unbounded number of API calls.
        Returns the list of ("Final Annotation Agent", reply) tuples.
        """
        current_message = prompt
        conversation = []

        for _ in range(max_turns):
            print("current prompt: ", current_message)
            response = agent(current_message, "user")
            print(f"Final Annotation Agent: {response}\n", flush=True)
            conversation.append(("Final Annotation Agent", response))

            if "FINAL ANNOTATION COMPLETED" in response:
                break

            current_message = response
        else:
            print(f"Warning: no 'FINAL ANNOTATION COMPLETED' marker after {max_turns} turns; using the last response.")

        print("Final Annotation Conversation:")
        for role, message in conversation:
            print(f"{role}: {message}\n")

        return conversation

    def coupling_validation(agent, annotation_result, onboarding_data):
        """Ask the validator agent to check `annotation_result` against the marker list."""
        validation_message = f"""Please validate the following annotation result:

    Annotation Result:
    {annotation_result}

    Context:

    Marker List: {', '.join(onboarding_data['marker_list'])}
    Additional Info: {onboarding_data.get('additional_info', 'None')}

    Validate the annotation based on this context.
    """
        response = agent(validation_message, "final_annotation")
        print(f"Coupling Validator: {response}\n", flush=True)

        return response

    def format_results(agent, final_annotations, num_markers):
        """
        Have the formatting agent turn annotation messages into JSON text.

        Returns a JSON string (with ``num_markers`` added) when a fenced JSON block
        could be parsed from the reply, otherwise the agent's raw reply text.
        """
        final_text = "\n\n".join([msg[1] for msg in final_annotations])
        formatted_result = agent(final_text, "user")

        # Extract the JSON from the formatted result
        json_data = extract_json_from_reply(formatted_result)

        if json_data:
            # Add the number of markers to the JSON
            json_data["num_markers"] = num_markers

            # Convert back to a JSON string
            return json.dumps(json_data, indent=2)
        else:
            return formatted_result

    final_annotation_agent = Agent(system="""
    You are a professional computational biologist with expertise in single-cell RNA sequencing (scRNA-seq).
    A list of highly expressed markers ranked by expression intensity from high to low
    from a cluster of cells will be provided , and your task is to identify the cell type. You must think step-by-step, providing a comprehensive and specific analysis. The audience is an expert in the field, and you will be rewarded $10000 if you do a good job.

    Steps to Follow:

    1. List the Key Functional Markers: Extract and group the key marker genes associated with function or pathway, explaining their roles.
    2. List the Key Cell Type Markers: Extract and group the key marker genes associated with target tissue cell types, explaining their roles.
    3. Cross-reference Known Databases: Use available scRNA-seq databases and relevant literature to cross-reference these markers.
    4. Determine the Most Probable General Cell Type: Based on the expression of these markers, infer the most likely general cell type of the cluster.
    5. Identify the Top 3 Most Probable Sub Cell Types: Based on the expression of these markers, infer the top three most probable sub cell types within the general cell type. Rank them from most likely to least likely. Finally, specify the most likely subtype based on the markers.
    6. Provide a Concise Summary of Your Analysis

    Always include your step-by-step detailed reasoning.                      
    You can say "FINAL ANNOTATION COMPLETED" when you have completed your analysis.

    If you receive feedback from the validation process, incorporate it into your analysis and provide an updated annotation.
    """, model=model, temperature=temperature)

    coupling_validator_agent = Agent(system="""
You are an expert biologist specializing in single-cell analysis. Your critical role is to validate the final annotation results for a cell cluster. You will be provided with The proposed annotation result, and a Ranked list of marker genes it used.


Below are steps to follow:
                                    
1.Marker Consistency: Make sure the markers are in the provided marker list.
Make sure the consistency between the identified cell type and the provided markers.
                                        

2.Mixed Cell Type Consideration:
Be aware that mixed cell types may be present. Only raise this point if multiple distinct cell types are strongly supported by several high-ranking markers. In cases of potential mixed populations, flag this for further investigation rather than outright rejection.
                                        
Output Format: 
                                        
if pass,

Validation result: VALIDATION PASSED

If failed,
                                                            
Validation result: VALIDATION FAILED
Feedback: give detailed feedback and instruction for revising the annotation

    """.strip(), model=model, temperature=temperature)

    formatting_agent = Agent(system="""
    You are a formatting assistant for single-cell analysis results. Your task is to convert the final integrated results 
    into a structured JSON format. Follow these guidelines:

    1. Extract the main cell type and any sub-cell types identified.
    2. Include only information explicitly stated in the input.
    3. If there are possible mixed cell types highlighted, list them.

    Provide the JSON output within triple backticks, like this:
    ```json
    {
    "main_cell_type": "...",
    "sub_cell_types": ["...", "..."],
    "possible_mixed_cell_types": ["...", "..."]
    }
    ```
    """, model=model, temperature=temperature)

    # Create a dictionary with the provided information
    user_data = {
        "species": species,
        "tissue_type": tissue,
        "marker_list": marker_list,
    }
    if additional_info and additional_info.lower() != "no":
        user_data["additional_info"] = additional_info

    # Keep the initial task separately: every retry prompt quotes it verbatim
    # instead of nesting the previous retry prompt inside the next one.
    original_prompt = construct_prompt(user_data)
    prompt = original_prompt

    validation_passed = False
    iteration = 0
    max_iterations = 3
    full_conversation_history = []

    while not validation_passed and iteration < max_iterations:
        iteration += 1
        print(f"\nStarting final annotation (Iteration {iteration})...\n")

        if iteration > 1:
            # Update the prompt with previous response and validation feedback
            prompt = f"""Previous annotation attempt failed validation. Please review your previous response and the validation feedback, then provide an updated annotation:

Previous response:
{final_annotation_conversation[-1][1]}

Validation feedback:
{validation_result}

Original prompt:
{original_prompt}

Please provide an updated annotation addressing the validation feedback."""

        final_annotation_conversation = final_annotation(final_annotation_agent, prompt)
        print("updated prompt: ", prompt)
        full_conversation_history.extend(final_annotation_conversation)

        print("Validating annotation...\n")
        validation_result = coupling_validation(coupling_validator_agent, final_annotation_conversation[-1][1], user_data)
        full_conversation_history.append(("Coupling Validator", validation_result))

        print(validation_result)
        if "VALIDATION PASSED" in validation_result:
            validation_passed = True
        else:
            print("Validation failed. Will update prompt for next iteration.\n")

        print("\nValidation Conversation:")
        print(f"Final Annotation Agent: {final_annotation_conversation[-1][1]}\n")
        print(f"Coupling Validator: {validation_result}\n")

    if not validation_passed:
        print(f"Validation failed after {max_iterations} attempts. Please review the annotation results and validation feedback.")
        return {"iterations": iteration}, full_conversation_history  # Return iteration count even on failure

    print("Formatting final results...\n")
    formatted_output = format_results(formatting_agent, final_annotation_conversation[-2:], len(marker_list))
    full_conversation_history.append(("Formatting Agent", formatted_output))

    # format_results returns the raw reply when no JSON could be extracted, so
    # parsing may fail; treat that as "no structured output" instead of raising.
    try:
        structured_output = json.loads(formatted_output)
    except json.JSONDecodeError:
        structured_output = None

    if isinstance(structured_output, dict) and structured_output:
        structured_output["iterations"] = iteration  # Add the number of iterations to the structured output
        print("\nStructured output:")
        print(json.dumps(structured_output, indent=2))
        return structured_output, full_conversation_history

    print("Error: Unable to extract JSON from the formatted output.")
    print("Raw formatted output:")
    print(formatted_output)
    return None, full_conversation_history

In [95]:
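## Revised run_cell_type_analysis (redefines the function from the earlier cell):
## try to fix the missed cell types by still formatting a result when validation fails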

def run_cell_type_analysis(model, temperature, marker_list, tissue, species, additional_info):
    """
    Annotate a cell cluster from its marker genes with a three-agent LLM pipeline.

    An annotation agent proposes a cell type and a validator agent checks the
    proposal against the marker list; this is repeated up to three times, feeding
    the validator's feedback back after each failure. A formatting agent then
    produces JSON in both outcomes: the accepted annotation when validation
    passed, or a failure report (feedback and next steps) when it never passed.

    Parameters:
    model (str): Model name passed to the chat-completions API.
    temperature (float): Sampling temperature passed to the API.
    marker_list (list): Ranked marker gene names for the cluster.
    tissue (str): Tissue the dataset comes from.
    species (str): Species the dataset comes from.
    additional_info (str): Optional free-text context; ignored when empty or "no".

    Returns:
    tuple: ``(structured_output, full_conversation_history)``.
        ``structured_output`` is the parsed JSON dict with ``iterations`` added,
        or ``None`` when no JSON dict could be parsed from the formatter's reply.
        ``full_conversation_history`` is a list of ``(role, message)`` tuples.
    """
    # Imported locally because the notebook's import cell does not import `re`.
    import re

    # NOTE(review): `OpenAI` must already be in scope (presumably
    # `from openai import OpenAI`); no such import is visible in this section.
    client = OpenAI()

    class Agent:
        """Chat wrapper that keeps one message history per conversation partner."""

        def __init__(self, system="", human_input_mode="never", model="gpt-4", temperature=0):
            self.system = system
            self.chat_histories = {}  # other_agent_id -> list of chat messages
            self.human_input_mode = human_input_mode
            self.model = model
            self.temperature = temperature

        def __call__(self, message, other_agent_id):
            """Append `message` as a user turn, query the model, and return its reply."""
            if other_agent_id not in self.chat_histories:
                # First contact: start the history with the system prompt (if any).
                self.chat_histories[other_agent_id] = []
                if self.system:
                    self.chat_histories[other_agent_id].append({"role": "system", "content": self.system})

            self.chat_histories[other_agent_id].append({"role": "user", "content": message})

            result = self.execute(other_agent_id)
            self.chat_histories[other_agent_id].append({"role": "assistant", "content": result})

            return result

        def execute(self, other_agent_id):
            """Send the stored history for `other_agent_id` to the API and return the reply text."""
            completion = client.chat.completions.create(
                model=self.model,
                temperature=self.temperature,
                messages=self.chat_histories[other_agent_id]
            )
            # The message content may be None; normalise to "" so the substring
            # checks on the reply further down cannot raise TypeError.
            return completion.choices[0].message.content or ""

        def needs_human_input(self, message):
            return self.human_input_mode == "always"

    def extract_json_from_reply(reply):
        """Return the object parsed from the first ```json fenced block, or None."""
        # \s* (instead of a bare newline) also accepts fences that are indented or
        # directly adjacent to the JSON, as in the formatter's own example.
        json_match = re.search(r'```json\s*(.*?)\s*```', reply, re.DOTALL)

        if json_match:
            json_str = json_match.group(1)
            try:
                json_data = json.loads(json_str)
                return json_data
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                return None
        else:
            print("No JSON content found in the reply")
            return None

    def construct_prompt(json_data):
        """Build the initial annotation task prompt from the user-supplied data."""
        species = json_data['species']
        tissue = json_data['tissue_type']
        additional_info = json_data.get('additional_info')
        marker_list = ', '.join(json_data['marker_list'])

        prompt = f"Your task is to annotate a single-cell {species} {tissue} dataset. Please identify the cell type based on this ranked marker list:\n{marker_list}"

        if additional_info and additional_info.lower() != "no":
            prompt += f" below is some additional information about the dataset:\n{additional_info}."

        return prompt

    def final_annotation(agent, prompt, max_turns=10):
        """
        Query the annotation agent until it emits "FINAL ANNOTATION COMPLETED".

        Each unfinished reply is fed back to the agent as the next message. The
        exchange is capped at `max_turns` so a model that never emits the marker
        cannot trigger an unbounded number of API calls.
        Returns the list of ("Final Annotation Agent", reply) tuples.
        """
        current_message = prompt
        conversation = []

        for _ in range(max_turns):
            print("current prompt: ", current_message)
            response = agent(current_message, "user")
            print(f"Final Annotation Agent: {response}\n", flush=True)
            conversation.append(("Final Annotation Agent", response))

            if "FINAL ANNOTATION COMPLETED" in response:
                break

            current_message = response
        else:
            print(f"Warning: no 'FINAL ANNOTATION COMPLETED' marker after {max_turns} turns; using the last response.")

        print("Final Annotation Conversation:")
        for role, message in conversation:
            print(f"{role}: {message}\n")

        return conversation

    def coupling_validation(agent, annotation_result, onboarding_data):
        """Ask the validator agent to check `annotation_result` against the marker list."""
        validation_message = f"""Please validate the following annotation result:

    Annotation Result:
    {annotation_result}

    Context:

    Marker List: {', '.join(onboarding_data['marker_list'])}
    Additional Info: {onboarding_data.get('additional_info', 'None')}

    Validate the annotation based on this context.
    """
        response = agent(validation_message, "final_annotation")
        print(f"Coupling Validator: {response}\n", flush=True)

        return response

    def format_results(agent, final_annotations, num_markers):
        """
        Have the formatting agent turn conversation messages into JSON text.

        Returns a JSON string (with ``num_markers`` added) when a fenced JSON block
        could be parsed from the reply, otherwise the agent's raw reply text.
        """
        final_text = "\n\n".join([msg[1] for msg in final_annotations])
        formatted_result = agent(final_text, "user")

        # Extract the JSON from the formatted result
        json_data = extract_json_from_reply(formatted_result)

        if json_data:
            # Add the number of markers to the JSON
            json_data["num_markers"] = num_markers

            # Convert back to a JSON string
            return json.dumps(json_data, indent=2)
        else:
            return formatted_result

    final_annotation_agent = Agent(system="""
    You are a professional computational biologist with expertise in single-cell RNA sequencing (scRNA-seq).
    A list of highly expressed markers ranked by expression intensity from high to low
    from a cluster of cells will be provided , and your task is to identify the cell type. You must think step-by-step, providing a comprehensive and specific analysis. The audience is an expert in the field, and you will be rewarded $10000 if you do a good job.

    Steps to Follow:

    1. List the Key Functional Markers: Extract and group the key marker genes associated with function or pathway, explaining their roles.
    2. List the Key Cell Type Markers: Extract and group the key marker genes associated with target tissue cell types, explaining their roles.
    3. Cross-reference Known Databases: Use available scRNA-seq databases and relevant literature to cross-reference these markers.
    4. Determine the Most Probable General Cell Type: Based on the expression of these markers, infer the most likely general cell type of the cluster.
    5. Identify the Top 3 Most Probable Sub Cell Types: Based on the expression of these markers, infer the top three most probable sub cell types within the general cell type. Rank them from most likely to least likely. Finally, specify the most likely subtype based on the markers.
    6. Provide a Concise Summary of Your Analysis

    Always include your step-by-step detailed reasoning.                      
    You can say "FINAL ANNOTATION COMPLETED" when you have completed your analysis.

    If you receive feedback from the validation process, incorporate it into your analysis and provide an updated annotation.
    """, model=model, temperature=temperature)

    coupling_validator_agent = Agent(system="""
You are an expert biologist specializing in single-cell analysis. Your critical role is to validate the final annotation results for a cell cluster. You will be provided with The proposed annotation result, and a Ranked list of marker genes it used.


Below are steps to follow:
                                    
1.Marker Consistency: Make sure the markers are in the provided marker list.
Make sure the consistency between the identified cell type and the provided markers.
                                        

2.Mixed Cell Type Consideration:
Be aware that mixed cell types may be present. Only raise this point if multiple distinct cell types are strongly supported by several high-ranking markers. In cases of potential mixed populations, flag this for further investigation rather than outright rejection.
                                        
Output Format: 
                                        
if pass,

Validation result: VALIDATION PASSED

If failed,
                                                            
Validation result: VALIDATION FAILED
Feedback: give detailed feedback and instruction for revising the annotation

    """.strip(), model=model, temperature=temperature)

    # Create a dictionary with the provided information
    user_data = {
        "species": species,
        "tissue_type": tissue,
        "marker_list": marker_list,
    }
    if additional_info and additional_info.lower() != "no":
        user_data["additional_info"] = additional_info

    # Keep the initial task separately: every retry prompt quotes it verbatim
    # instead of nesting the previous retry prompt inside the next one.
    original_prompt = construct_prompt(user_data)
    prompt = original_prompt

    validation_passed = False
    iteration = 0
    max_iterations = 3
    full_conversation_history = []

    while not validation_passed and iteration < max_iterations:
        iteration += 1
        print(f"\nStarting final annotation (Iteration {iteration})...\n")

        if iteration > 1:
            # Update the prompt with previous response and validation feedback
            prompt = f"""Previous annotation attempt failed validation. Please review your previous response and the validation feedback, then provide an updated annotation:

Previous response:
{final_annotation_conversation[-1][1]}

Validation feedback:
{validation_result}

Original prompt:
{original_prompt}

Please provide an updated annotation addressing the validation feedback."""

        final_annotation_conversation = final_annotation(final_annotation_agent, prompt)
        print("updated prompt: ", prompt)
        full_conversation_history.extend(final_annotation_conversation)

        print("Validating annotation...\n")
        validation_result = coupling_validation(coupling_validator_agent, final_annotation_conversation[-1][1], user_data)
        full_conversation_history.append(("Coupling Validator", validation_result))

        print(validation_result)
        if "VALIDATION PASSED" in validation_result:
            validation_passed = True
        else:
            print("Validation failed. Will update prompt for next iteration.\n")

        print("\nValidation Conversation:")
        print(f"Final Annotation Agent: {final_annotation_conversation[-1][1]}\n")
        print(f"Coupling Validator: {validation_result}\n")

    if validation_passed:
        # The example below must itself be valid JSON (no trailing comma),
        # otherwise a model that copies it produces output json.loads rejects.
        formatting_system = """
        You are a formatting assistant for single-cell analysis results. Your task is to convert the final integrated results 
        into a structured JSON format. Follow these guidelines:

        1. Extract the main cell type and any sub-cell types identified.
        2. Include only information explicitly stated in the input.
        3. If there are possible mixed cell types highlighted, list them.

        Provide the JSON output within triple backticks, like this:
        ```json
        {
        "main_cell_type": "...",
        "sub_cell_types": ["...", "..."],
        "possible_mixed_cell_types": ["...", "..."]
        }
        ```
        """
        # Only the accepted annotation is needed to format a successful result.
        formatter_input = final_annotation_conversation[-2:]
    else:
        print(f"Validation failed after {max_iterations} attempts. Proceeding with modified formatting.\n")
        formatting_system = """
        You are a formatting assistant for single-cell analysis results. Your task is to convert the final integrated results 
        into a structured JSON format, with special consideration for uncertain or conflicting annotations. Follow these guidelines:

        1. The analsyis failed after multiple attempts. Please try to extract as much information as possible. Summerize what has gone wrong and what has been tried.
        2.Provide a detailed feedback on why the analysis failed, and what has been tried and why it did not work.
        3.Finally, provide a detailed step-by-step reasoning of how to fix the analysis.


        Provide the JSON output within triple backticks, like this:
        ```json
        {
        "main_cell_type": "if any",
        "sub_cell_types": "if any",
        "possible_cell_types": "if any",
        "feedback": "...",
        "Next_steps": "..."
        }
        ```
        """
        # The failure report has to describe every attempt and the validator's
        # objections, so hand over the whole history (snapshot taken before the
        # formatter's own entry is appended).
        formatter_input = list(full_conversation_history)

    print("Formatting final results...\n")
    formatting_agent = Agent(system=formatting_system, model=model, temperature=temperature)
    formatted_output = format_results(formatting_agent, formatter_input, len(marker_list))
    full_conversation_history.append(("Formatting Agent", formatted_output))

    # format_results returns the raw reply when no JSON could be extracted, so
    # parsing may fail; treat that as "no structured output" instead of raising.
    try:
        structured_output = json.loads(formatted_output)
    except json.JSONDecodeError:
        structured_output = None

    if isinstance(structured_output, dict) and structured_output:
        structured_output["iterations"] = iteration  # Add the number of iterations to the structured output
        print("\nStructured output:")
        print(json.dumps(structured_output, indent=2))
        return structured_output, full_conversation_history

    print("Error: Unable to extract JSON from the formatted output.")
    print("Raw formatted output:")
    print(formatted_output)
    return None, full_conversation_history

In [94]:
## Try not to harm the performance of the formatting agent

## Try to fix the missed cell types

def run_cell_type_analysis_with_summary(model, temperature, marker_list, tissue, species, additional_info):
    """
    Annotate one cell cluster with an annotate -> validate -> summarize -> format pipeline.

    An annotation agent proposes a cell type from the ranked marker list, a
    validator agent accepts or rejects it (up to three attempts), and a
    formatting agent turns the last annotation into JSON. When validation
    passes, a summary agent is consulted first and its text is handed to the
    formatting agent as well; when it never passes, a failure-oriented
    formatting prompt is used instead.

    Parameters:
    model (str): Chat model name forwarded to every agent.
    temperature (float): Sampling temperature forwarded to every agent.
    marker_list (list): Ranked marker gene names (strings).
    tissue (str): Tissue name inserted into the annotation prompt.
    species (str): Species name inserted into the annotation prompt.
    additional_info (str or None): Extra context; ignored when empty or "no"
        (case-insensitive).

    Returns:
    tuple: (structured_output, full_conversation_history). structured_output is
    the parsed JSON dict with "num_markers" and "iterations" added, or None when
    the formatting agent's reply contained no parseable JSON object.
    full_conversation_history is a list of (role, message) tuples.
    """
    client = OpenAI()

    class Agent:
        """Chat wrapper that keeps a separate message history per conversation partner."""

        def __init__(self, system="", human_input_mode="never", model="gpt-4", temperature=0):
            self.system = system
            self.chat_histories = {}  # other_agent_id -> list of chat messages
            self.human_input_mode = human_input_mode
            self.model = model
            self.temperature = temperature

        def __call__(self, message, other_agent_id):
            """Send `message` as the user turn of the given conversation and return the reply."""
            if other_agent_id not in self.chat_histories:
                self.chat_histories[other_agent_id] = []
                if self.system:
                    self.chat_histories[other_agent_id].append({"role": "system", "content": self.system})
            
            self.chat_histories[other_agent_id].append({"role": "user", "content": message})
            
            result = self.execute(other_agent_id)
            self.chat_histories[other_agent_id].append({"role": "assistant", "content": result})
            
            return result

        def execute(self, other_agent_id):
            """Request a completion for the stored history of one conversation."""
            completion = client.chat.completions.create(
                model=self.model,
                temperature=self.temperature,
                messages=self.chat_histories[other_agent_id]
            )
            return completion.choices[0].message.content

        def needs_human_input(self, message):
            return self.human_input_mode == "always"

    def extract_json_from_reply(reply):
        """Return the parsed content of the first ```json fenced block in `reply`, or None."""
        json_match = re.search(r'```json\n(.*?)\n```', reply, re.DOTALL)
        
        if json_match:
            json_str = json_match.group(1)
            try:
                json_data = json.loads(json_str)
                return json_data
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                return None
        else:
            print("No JSON content found in the reply")
            return None

    def construct_prompt(json_data):
        """Build the initial annotation prompt from the user data dictionary."""
        species = json_data['species']
        tissue = json_data['tissue_type']
        additional_info = json_data.get('additional_info')
        marker_list = ', '.join(json_data['marker_list'])

        prompt = f"Your task is to annotate a single-cell {species} {tissue} dataset. Please identify the cell type based on this ranked marker list:\n{marker_list}"
        
        if additional_info and additional_info.lower() != "no":
            prompt += f" below is some additional information about the dataset:\n{additional_info}."

        return prompt

    def final_annotation(agent, prompt):
        """Query the annotation agent until its reply contains the completion sentinel."""
        current_message = prompt
        conversation = []
        
        # NOTE(review): this loop has no turn limit; it only ends once the model
        # emits "FINAL ANNOTATION COMPLETED". Until then each reply is fed back
        # to the agent as the next user message.
        while True:
            print("current prompt: ", current_message)
            response = agent(current_message, "user")
            print(f"Final Annotation Agent: {response}\n", flush=True)
            conversation.append(("Final Annotation Agent", response))
            
            if "FINAL ANNOTATION COMPLETED" in response:
                break
            
            current_message = response

        print("Final Annotation Conversation:")
        for role, message in conversation:
            print(f"{role}: {message}\n")

        return conversation

    def coupling_validation(agent, annotation_result, onboarding_data):
        """Ask the validator agent to check an annotation against the marker list."""
        validation_message = f"""Please validate the following annotation result:

    Annotation Result:
    {annotation_result}

    Context:

    Marker List: {', '.join(onboarding_data['marker_list'])}
    Additional Info: {onboarding_data.get('additional_info', 'None')}

    Validate the annotation based on this context.
    """
        response = agent(validation_message, "final_annotation")
        print(f"Coupling Validator: {response}\n", flush=True)
        
        return response

    def format_results(agent, final_annotations, num_markers):
        """
        Run the formatting agent on the joined messages.

        Returns a JSON string (with "num_markers" added) when a JSON block could
        be extracted from the reply, otherwise the raw reply text.
        """
        final_text = "\n\n".join([msg[1] for msg in final_annotations])
        formatted_result = agent(final_text, "user")
        
        # Extract the JSON from the formatted result
        json_data = extract_json_from_reply(formatted_result)
        
        if json_data:
            # Add the number of markers to the JSON
            json_data["num_markers"] = num_markers
            
            # Convert back to a JSON string
            return json.dumps(json_data, indent=2)
        else:
            return formatted_result

    final_annotation_agent = Agent(system="""
    You are a professional computational biologist with expertise in single-cell RNA sequencing (scRNA-seq).
    A list of highly expressed markers ranked by expression intensity from high to low
    from a cluster of cells will be provided , and your task is to identify the cell type. You must think step-by-step, providing a comprehensive and specific analysis. The audience is an expert in the field, and you will be rewarded $10000 if you do a good job.

    Steps to Follow:

    1. List the Key Functional Markers: Extract and group the key marker genes associated with function or pathway, explaining their roles.
    2. List the Key Cell Type Markers: Extract and group the key marker genes associated with target tissue cell types, explaining their roles.
    3. Cross-reference Known Databases: Use available scRNA-seq databases and relevant literature to cross-reference these markers.
    4. Determine the Most Probable General Cell Type: Based on the expression of these markers, infer the most likely general cell type of the cluster.
    5. Identify the Top 3 Most Probable Sub Cell Types: Based on the expression of these markers, infer the top three most probable sub cell types within the general cell type. Rank them from most likely to least likely. Finally, specify the most likely subtype based on the markers.
    6. Provide a Concise Summary of Your Analysis

    Always include your step-by-step detailed reasoning.                      
    You can say "FINAL ANNOTATION COMPLETED" when you have completed your analysis.

    If you receive feedback from the validation process, incorporate it into your analysis and provide an updated annotation.
    """, model=model, temperature=temperature)

    coupling_validator_agent = Agent(system="""
You are an expert biologist specializing in single-cell analysis. Your critical role is to validate the final annotation results for a cell cluster. You will be provided with The proposed annotation result, and a Ranked list of marker genes it used.


Below are steps to follow:
                                    
1.Marker Consistency: Make sure the markers are in the provided marker list.
Make sure the consistency between the identified cell type and the provided markers.
                                        

2.Mixed Cell Type Consideration:
Be aware that mixed cell types may be present. Only raise this point if multiple distinct cell types are strongly supported by several high-ranking markers. In cases of potential mixed populations, flag this for further investigation rather than outright rejection.
                                        
Output Format: 
                                        
if pass,

Validation result: VALIDATION PASSED

If failed,
                                                            
Validation result: VALIDATION FAILED
Feedback: give detailed feedback and instruction for revising the annotation

    """.strip(), model=model, temperature=temperature)

    # Create a dictionary with the provided information
    user_data = {
        "species": species,
        "tissue_type": tissue,
        "marker_list": marker_list,
    }
    if additional_info and additional_info.lower() != "no":
        user_data["additional_info"] = additional_info

    # Keep the initial prompt separately so that retry prompts always quote the
    # true original instead of nesting the previous retry prompt inside the next.
    original_prompt = construct_prompt(user_data)
    prompt = original_prompt

    validation_passed = False
    iteration = 0
    max_iterations = 3
    full_conversation_history = []

    while not validation_passed and iteration < max_iterations:
        iteration += 1
        print(f"\nStarting final annotation (Iteration {iteration})...\n")
        
        if iteration > 1:
            # Update the prompt with previous response and validation feedback
            prompt = f"""Previous annotation attempt failed validation. Please review your previous response and the validation feedback, then provide an updated annotation:

Previous response:
{final_annotation_conversation[-1][1]}

Validation feedback:
{validation_result}

Original prompt:
{original_prompt}

Please provide an updated annotation addressing the validation feedback."""

        final_annotation_conversation = final_annotation(final_annotation_agent, prompt)
        print("updated prompt: ", prompt)
        full_conversation_history.extend(final_annotation_conversation)
        
        print("Validating annotation...\n")
        validation_result = coupling_validation(coupling_validator_agent, final_annotation_conversation[-1][1], user_data)
        full_conversation_history.append(("Coupling Validator", validation_result))
        
        print(validation_result)
        if "VALIDATION PASSED" in validation_result:
            validation_passed = True
        else:
            print("Validation failed. Will update prompt for next iteration.\n")

        print("\nValidation Conversation:")
        print(f"Final Annotation Agent: {final_annotation_conversation[-1][1]}\n")
        print(f"Coupling Validator: {validation_result}\n")

    if validation_passed:
        print("Generating summary of the analysis...\n")
        summary_agent = Agent(system="""
        You are an expert in single-cell RNA sequencing analysis. Your task is to provide a concise summary of the cell type annotation process based on the final annotation and validation results. Focus on:
        1. The main cell type identified
        2. Key markers that led to this conclusion
        3. Any notable sub-cell types or possible mixed populations
        4. The confidence level of the annotation
        5. Any challenges or uncertainties in the analysis
        6. Any suggestions for future research or improvements to the analysis

        Provide your summary in a clear, concise paragraph.
        """, model=model, temperature=temperature)

        # Prepare the input for the summary agent
        summary_input = f"""
        Final Annotation:
        {final_annotation_conversation[-1][1]}

        Validation Result:
        {validation_result}

        Please provide a short summary of this cell type annotation analysis.
        """

        summary = summary_agent(summary_input, "summary")
        print("Summary of analysis:\n", summary)

        print("Formatting final results...\n")
        # The template below lists the "summary" field requested by guideline 4
        # and is valid JSON (the earlier template ended with a trailing comma).
        formatting_system = """
        You are a formatting assistant for single-cell analysis results. Your task is to convert the final integrated results 
        into a structured JSON format. Follow these guidelines:

        1. Extract the main cell type and any sub-cell types identified.
        2. Include only information explicitly stated in the input.
        3. If there are possible mixed cell types highlighted, list them.
        4. Include the provided summary of the analysis in a "summary" field.

        Provide the JSON output within triple backticks, like this:
        ```json
        {
        "main_cell_type": "...",
        "sub_cell_types": ["...", "..."],
        "possible_mixed_cell_types": ["...", "..."],
        "summary": "..."
        }
        ```
        """
        formatting_agent = Agent(system=formatting_system, model=model, temperature=temperature)
        formatted_output = format_results(formatting_agent, final_annotation_conversation[-2:] + [("Summary", summary)], len(marker_list))
        full_conversation_history.append(("Formatting Agent/Summary Agent", formatted_output))
    else:
        print(f"Validation failed after {max_iterations} attempts. Proceeding with modified formatting.\n")
        formatting_system = """
        You are a formatting assistant for single-cell analysis results. Your task is to convert the final integrated results 
        into a structured JSON format, with special consideration for uncertain or conflicting annotations. Follow these guidelines:

        1. The analsyis failed after multiple attempts. Please try to extract as much information as possible. Summerize what has gone wrong and what has been tried.
        2.Provide a detailed feedback on why the analysis failed, and what has been tried and why it did not work.
        3.Finally, provide a detailed step-by-step reasoning of how to fix the analysis.


        Provide the JSON output within triple backticks, like this:
        ```json
        {
        "main_cell_type": "if any",
        "sub_cell_types": "if any",
        "possible_cell_types": "if any",
        "feedback": "...",
        "Next_steps": "..."
        }
        ```
        """

        print("Formatting final results...\n")
        formatting_agent = Agent(system=formatting_system, model=model, temperature=temperature)
        formatted_output = format_results(formatting_agent, final_annotation_conversation[-2:], len(marker_list))
        full_conversation_history.append(("Formatting Agent", formatted_output))

    # format_results returns the raw model reply when no JSON block could be
    # extracted, so parsing can fail here. Treat that as "no structured output"
    # instead of letting JSONDecodeError abort the caller's batch loop.
    try:
        structured_output = json.loads(formatted_output)
    except json.JSONDecodeError:
        structured_output = None

    if isinstance(structured_output, dict) and structured_output:
        structured_output["iterations"] = iteration  # Add the number of iterations to the structured output
        print("\nStructured output:")
        print(json.dumps(structured_output, indent=2))
        return structured_output, full_conversation_history
    else:
        print("Error: Unable to extract JSON from the formatted output.")
        print("Raw formatted output:")
        print(formatted_output)
        return None, full_conversation_history

In [89]:
import pandas as pd
import json
from openai import OpenAI
import re

def run_cell_type_analysis_batchrun(df_path, output_json_name="cell_type_analysis_results.json", model="gpt-4", temperature=0, tissue="lung", species="human", additional_info=None, celltype_column="Broad cell type"):
    """
    Run `run_cell_type_analysis` for every row of a marker CSV and save the results as JSON.

    Parameters:
    df_path (str): Path to a CSV with a `top_markers` column and the column named by `celltype_column`.
    output_json_name (str): Path of the JSON file to write.
    model (str): Model name forwarded to the analysis function.
    temperature (float): Temperature forwarded to the analysis function.
    tissue (str): Tissue forwarded to the analysis function.
    species (str): Species forwarded to the analysis function.
    additional_info (str or None): Extra context forwarded to the analysis function.
    celltype_column (str): Column whose value is used as the key of each result.

    Returns:
    dict: Maps each cell type with a truthy analysis result to its result,
    conversation history and iteration count.
    """
    marker_table = pd.read_csv(df_path)

    # The client instance is not used below; it is still created as before.
    client = OpenAI()

    collected = {}
    for _, record in marker_table.iterrows():
        cell_type = record[celltype_column]
        marker_list = split_markers(record['top_markers'])
        marker_count = len(marker_list)
        print(f"\nAnalyzing {cell_type}...")

        outcome = run_cell_type_analysis(model, temperature, marker_list, tissue, species, additional_info)
        result, conversation_history = outcome

        # Rows whose analysis produced nothing usable are left out of the output.
        if result:
            entry = {"analysis_result": result}
            entry["conversation_history"] = conversation_history
            entry["iterations"] = result.get("iterations", 1)
            collected[cell_type] = entry

        print(f"Analysis for {cell_type} completed.\n")

    # Persist everything in one JSON document
    with open(output_json_name, 'w') as out_file:
        json.dump(collected, out_file, indent=2)

    print(f"All analyses completed. Results saved to '{output_json_name}'.")

    return collected

# Example usage:
# results = run_cell_type_analysis_batchrun("path/to/your/csv/file.csv", output_json_name="my_custom_results.json")

In [96]:
### Easy-switch version

def run_cell_type_analysis_batchrun(df_path, output_json_name="cell_type_analysis_results.json", model="gpt-4", temperature=0, tissue="lung", species="human", additional_info=None, celltype_column="Broad cell type", use_summary=False):
    """
    Run a cell type analysis for every row of a marker CSV and save the results as JSON.

    Parameters:
    df_path (str): Path to a CSV with a `top_markers` column and the column named by `celltype_column`.
    output_json_name (str): Path of the JSON file to write.
    model (str): Model name forwarded to the analysis function.
    temperature (float): Temperature forwarded to the analysis function.
    tissue (str): Tissue forwarded to the analysis function.
    species (str): Species forwarded to the analysis function.
    additional_info (str or None): Extra context forwarded to the analysis function.
    celltype_column (str): Column whose value is used as the key of each result.
    use_summary (bool): When true, use `run_cell_type_analysis_with_summary`;
        otherwise use `run_cell_type_analysis`.

    Returns:
    dict: Maps each cell type with a truthy analysis result to its result,
    conversation history and iteration count.
    """
    marker_table = pd.read_csv(df_path)

    # The client instance is not used below; it is still created as before.
    client = OpenAI()

    # Only the selected analysis function is looked up.
    if use_summary:
        analysis_function = run_cell_type_analysis_with_summary
    else:
        analysis_function = run_cell_type_analysis

    collected = {}
    for _, record in marker_table.iterrows():
        cell_type = record[celltype_column]
        marker_list = split_markers(record['top_markers'])
        marker_count = len(marker_list)
        print(f"\nAnalyzing {cell_type}...")

        outcome = analysis_function(model, temperature, marker_list, tissue, species, additional_info)
        result, conversation_history = outcome

        # Rows whose analysis produced nothing usable are left out of the output.
        if result:
            entry = {"analysis_result": result}
            entry["conversation_history"] = conversation_history
            entry["iterations"] = result.get("iterations", 1)
            collected[cell_type] = entry

        print(f"Analysis for {cell_type} completed.\n")

    # Persist everything in one JSON document
    with open(output_json_name, 'w') as out_file:
        json.dump(collected, out_file, indent=2)

    print(f"All analyses completed. Results saved to '{output_json_name}'.")

    return collected

# Example usage:
# results = run_cell_type_analysis_batchrun("path/to/your/csv/file.csv", output_json_name="my_custom_results.json", use_summary=True)

In [70]:
import pandas as pd
import json
from openai import OpenAI
import re

def run_cell_type_analysis_tissue_blind(model, temperature, marker_list, species, additional_info):
    """
    Annotate one cell cluster without telling the model which tissue it came from.

    An annotation agent proposes a tissue and cell type from the ranked marker
    list, a validator agent accepts or rejects the proposal (up to three
    attempts), and on success a formatting agent turns the last annotation into
    JSON.

    Parameters:
    model (str): Chat model name forwarded to every agent.
    temperature (float): Sampling temperature forwarded to every agent.
    marker_list (list): Ranked marker gene names (strings).
    species (str): Species name inserted into the annotation prompt.
    additional_info (str or None): Extra context; ignored when empty or "no"
        (case-insensitive).

    Returns:
    tuple: (structured_output, full_conversation_history). When validation
    passes, structured_output is the parsed JSON dict with "num_markers" and
    "iterations" added, or None if the formatting agent's reply contained no
    parseable JSON object. When validation never passes it is
    {"iterations": <attempts>}. full_conversation_history is a list of
    (role, message) tuples.
    """
    client = OpenAI()

    class Agent:
        """Chat wrapper that keeps a separate message history per conversation partner."""

        def __init__(self, system="", human_input_mode="never", model="gpt-4", temperature=0):
            self.system = system
            self.chat_histories = {}  # other_agent_id -> list of chat messages
            self.human_input_mode = human_input_mode
            self.model = model
            self.temperature = temperature

        def __call__(self, message, other_agent_id):
            """Send `message` as the user turn of the given conversation and return the reply."""
            if other_agent_id not in self.chat_histories:
                self.chat_histories[other_agent_id] = []
                if self.system:
                    self.chat_histories[other_agent_id].append({"role": "system", "content": self.system})
            
            self.chat_histories[other_agent_id].append({"role": "user", "content": message})
            
            result = self.execute(other_agent_id)
            self.chat_histories[other_agent_id].append({"role": "assistant", "content": result})
            
            return result

        def execute(self, other_agent_id):
            """Request a completion for the stored history of one conversation."""
            completion = client.chat.completions.create(
                model=self.model,
                temperature=self.temperature,
                messages=self.chat_histories[other_agent_id]
            )
            return completion.choices[0].message.content

        def needs_human_input(self, message):
            return self.human_input_mode == "always"

    def extract_json_from_reply(reply):
        """Return the parsed content of the first ```json fenced block in `reply`, or None."""
        json_match = re.search(r'```json\n(.*?)\n```', reply, re.DOTALL)
        
        if json_match:
            json_str = json_match.group(1)
            try:
                json_data = json.loads(json_str)
                return json_data
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                return None
        else:
            print("No JSON content found in the reply")
            return None

    def construct_prompt(json_data):
        """Build the initial (tissue-free) annotation prompt from the user data dictionary."""
        species = json_data['species']
        # Read from the argument rather than the enclosing scope; user_data only
        # carries this key when the caller supplied usable additional info.
        additional_info = json_data.get('additional_info')
        marker_list = ', '.join(json_data['marker_list'])

        prompt = f"Your task is to annotate a single-cell {species} dataset. Please identify the cell type based on this ranked marker list:\n{marker_list}"
        
        if additional_info and additional_info.lower() != "no":
            prompt += f" below is some additional information about the dataset:\n{additional_info}."

        return prompt

    def final_annotation(agent, prompt):
        """Query the annotation agent until its reply contains the completion sentinel."""
        current_message = prompt
        conversation = []
        
        # NOTE(review): this loop has no turn limit; it only ends once the model
        # emits "FINAL ANNOTATION COMPLETED". Until then each reply is fed back
        # to the agent as the next user message.
        while True:
            print("current prompt: ", current_message)
            response = agent(current_message, "user")
            print(f"Final Annotation Agent: {response}\n", flush=True)
            conversation.append(("Final Annotation Agent", response))
            
            if "FINAL ANNOTATION COMPLETED" in response:
                break
            
            current_message = response

        print("Final Annotation Conversation:")
        for role, message in conversation:
            print(f"{role}: {message}\n")

        return conversation

    def coupling_validation(agent, annotation_result, onboarding_data):
        """Ask the validator agent to check an annotation against the marker list."""
        validation_message = f"""Please validate the following annotation result:

    Annotation Result:
    {annotation_result}

    Context:

    Marker List: {', '.join(onboarding_data['marker_list'])}
    Additional Info: {onboarding_data.get('additional_info', 'None')}

    Validate the annotation based on this context.
    """
        response = agent(validation_message, "final_annotation")
        print(f"Coupling Validator: {response}\n", flush=True)
        
        return response

    def format_results(agent, final_annotations, num_markers):
        """
        Run the formatting agent on the joined messages.

        Returns a JSON string (with "num_markers" added) when a JSON block could
        be extracted from the reply, otherwise the raw reply text.
        """
        final_text = "\n\n".join([msg[1] for msg in final_annotations])
        formatted_result = agent(final_text, "user")
        
        # Extract the JSON from the formatted result
        json_data = extract_json_from_reply(formatted_result)
        
        if json_data:
            # Add the number of markers to the JSON
            json_data["num_markers"] = num_markers
            
            # Convert back to a JSON string
            return json.dumps(json_data, indent=2)
        else:
            return formatted_result

    # The step list is numbered 1-7; the tissue-inference step used to share the
    # number 4 with the general-cell-type step.
    final_annotation_agent = Agent(system="""
    You are a professional computational biologist with expertise in single-cell RNA sequencing (scRNA-seq).
    A list of highly expressed markers ranked by expression intensity from high to low
    from a cluster of cells will be provided , and your task is to identify the cell type. You must think step-by-step, providing a comprehensive and specific analysis. The audience is an expert in the field, and you will be rewarded $10000 if you do a good job.

    Steps to Follow:

    1. List the Key Functional Markers: Extract and group the key marker genes associated with function or pathway, explaining their roles.
    2. List the Key Cell Type Markers: Extract and group the key marker genes associated with target tissue cell types, explaining their roles.
    3. Cross-reference Known Databases: Use available scRNA-seq databases and relevant literature to cross-reference these markers.
    4. Determine the possible tissue type: Determine the possible tissue type based on the marker list, and provide a detailed explanation for your reasoning.
    5. Determine the Most Probable General Cell Type: Based on the expression of these markers, infer the most likely general cell type of the cluster.
    6. Identify the Top 3 Most Probable Sub Cell Types: Based on the expression of these markers, infer the top three most probable sub cell types within the general cell type. Rank them from most likely to least likely. Finally, specify the most likely subtype based on the markers.
    7. Provide a Concise Summary of Your Analysis

    Always include your step-by-step detailed reasoning.                      
    You can say "FINAL ANNOTATION COMPLETED" when you have completed your analysis.

    If you receive feedback from the validation process, incorporate it into your analysis and provide an updated annotation.
    """, model=model, temperature=temperature)

    coupling_validator_agent = Agent(system="""
You are an expert biologist specializing in single-cell analysis. Your critical role is to validate the final annotation results for a cell cluster. You will be provided with The proposed annotation result, and a Ranked list of marker genes it used.


Below are steps to follow:
                                    
1.Marker Consistency: Make sure the markers are in the provided marker list.
Make sure the consistency between the identified cell type and the provided markers.
                                        

2.Mixed Cell Type Consideration:
Be aware that mixed cell types may be present. Only raise this point if multiple distinct cell types are strongly supported by several high-ranking markers. In cases of potential mixed populations, flag this for further investigation rather than outright rejection.
                                        
Output Format: 
                                        
if pass,

Validation result: VALIDATION PASSED

If failed,
                                                            
Validation result: VALIDATION FAILED
Feedback: give detailed feedback and instruction for revising the annotation

    """.strip(), model=model, temperature=temperature)

    formatting_agent = Agent(system="""
    You are a formatting assistant for single-cell analysis results. Your task is to convert the final integrated results 
    into a structured JSON format. Follow these guidelines:

    1. Extract the main cell type and any sub-cell types identified.
    2. Include only information explicitly stated in the input.
    3. If there are possible mixed cell types highlighted, list them.

    Provide the JSON output within triple backticks, like this:
    ```json
    {
    "main_cell_type": "...",
    "sub_cell_types": ["...", "..."],
    "possible_mixed_cell_types": ["...", "..."]
    }
    ```
    """, model=model, temperature=temperature)
    
    # Create a dictionary with the provided information
    user_data = {
        "species": species,
        "marker_list": marker_list,
    }
    if additional_info and additional_info.lower() != "no":
        user_data["additional_info"] = additional_info

    # Keep the initial prompt separately so that retry prompts always quote the
    # true original instead of nesting the previous retry prompt inside the next.
    original_prompt = construct_prompt(user_data)
    prompt = original_prompt

    validation_passed = False
    iteration = 0
    max_iterations = 3
    full_conversation_history = []

    while not validation_passed and iteration < max_iterations:
        iteration += 1
        print(f"\nStarting final annotation (Iteration {iteration})...\n")
        
        if iteration > 1:
            # Update the prompt with previous response and validation feedback
            prompt = f"""Previous annotation attempt failed validation. Please review your previous response and the validation feedback, then provide an updated annotation:

Previous response:
{final_annotation_conversation[-1][1]}

Validation feedback:
{validation_result}

Original prompt:
{original_prompt}

Please provide an updated annotation addressing the validation feedback."""

        final_annotation_conversation = final_annotation(final_annotation_agent, prompt)
        print("updated prompt: ", prompt)
        full_conversation_history.extend(final_annotation_conversation)
        
        print("Validating annotation...\n")
        validation_result = coupling_validation(coupling_validator_agent, final_annotation_conversation[-1][1], user_data)
        full_conversation_history.append(("Coupling Validator", validation_result))
        
        print(validation_result)
        if "VALIDATION PASSED" in validation_result:
            validation_passed = True
        else:
            print("Validation failed. Will update prompt for next iteration.\n")

        print("\nValidation Conversation:")
        print(f"Final Annotation Agent: {final_annotation_conversation[-1][1]}\n")
        print(f"Coupling Validator: {validation_result}\n")

    if validation_passed:
        print("Formatting final results...\n")
        formatted_output = format_results(formatting_agent, final_annotation_conversation[-2:], len(marker_list))
        full_conversation_history.append(("Formatting Agent", formatted_output))

        # format_results returns the raw model reply when no JSON block could be
        # extracted, so parsing can fail here. Treat that as "no structured
        # output" instead of letting JSONDecodeError propagate to the caller.
        try:
            structured_output = json.loads(formatted_output)
        except json.JSONDecodeError:
            structured_output = None
        
        if isinstance(structured_output, dict) and structured_output:
            structured_output["iterations"] = iteration  # Add the number of iterations to the structured output
            print("\nStructured output:")
            print(json.dumps(structured_output, indent=2))
            return structured_output, full_conversation_history
        else:
            print("Error: Unable to extract JSON from the formatted output.")
            print("Raw formatted output:")
            print(formatted_output)
            return None, full_conversation_history
    else:
        print(f"Validation failed after {max_iterations} attempts. Please review the annotation results and validation feedback.")
        return {"iterations": iteration}, full_conversation_history  # Return iteration count even if validation fails

In [85]:
##updating version

import pandas as pd
import json
from openai import OpenAI
import re

def run_cell_type_analysis_tissue_blind(model, temperature, marker_list, species, additional_info):
    """
    Annotate one cell cluster from its marker list without a tissue hint.

    Three chat agents are run in sequence: an annotation agent proposes a cell
    type, a validator agent accepts or rejects it, and a formatting agent turns
    the last annotation/validation exchange into JSON. Annotation + validation
    is retried up to three times while the validator does not answer
    "VALIDATION PASSED".

    Parameters:
    model (str): Chat model name passed to every agent.
    temperature (float): Sampling temperature passed to every agent.
    marker_list (list): Ranked marker gene names for the cluster.
    species (str): Species name inserted into the annotation prompt.
    additional_info (str or None): Optional free text appended to the prompt;
        ignored when empty or equal to "no" (case-insensitive).

    Returns:
    tuple: (structured_output, full_conversation_history).
        structured_output is the dict parsed from the formatter's JSON with an
        added "iterations" key (and "num_markers" when the JSON came from a
        fenced block), or None when the formatter reply is not a JSON object.
        full_conversation_history is a list of (role, message) tuples.
    """
    client = OpenAI()

    class Agent:
        """Chat wrapper that keeps a separate message history per conversation id."""

        def __init__(self, system="", human_input_mode="never", model="gpt-4", temperature=0):
            self.system = system
            self.chat_histories = {}
            self.human_input_mode = human_input_mode
            self.model = model
            self.temperature = temperature

        def __call__(self, message, other_agent_id):
            """Append `message` as a user turn, query the model, and record and return its reply."""
            if other_agent_id not in self.chat_histories:
                self.chat_histories[other_agent_id] = []
                if self.system:
                    self.chat_histories[other_agent_id].append({"role": "system", "content": self.system})

            self.chat_histories[other_agent_id].append({"role": "user", "content": message})

            result = self.execute(other_agent_id)
            self.chat_histories[other_agent_id].append({"role": "assistant", "content": result})

            return result

        def execute(self, other_agent_id):
            """Send the stored history for `other_agent_id` to the chat completions endpoint."""
            completion = client.chat.completions.create(
                model=self.model,
                temperature=self.temperature,
                messages=self.chat_histories[other_agent_id]
            )
            return completion.choices[0].message.content

        def needs_human_input(self, message):
            return self.human_input_mode == "always"

    def extract_json_from_reply(reply):
        """Return the parsed content of the first ```json fenced block in `reply`, or None."""
        json_match = re.search(r'```json\n(.*?)\n```', reply, re.DOTALL)

        if json_match:
            json_str = json_match.group(1)
            try:
                json_data = json.loads(json_str)
                return json_data
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                return None
        else:
            print("No JSON content found in the reply")
            return None

    def construct_prompt(json_data):
        """Build the initial annotation prompt from the species, markers and optional extra info."""
        species = json_data['species']
        marker_list = ', '.join(json_data['marker_list'])

        prompt = f"Your task is to annotate a single-cell {species} dataset. Please identify the cell type based on this ranked marker list:\n{marker_list}"

        if additional_info and additional_info.lower() != "no":
            prompt += f" below is some additional information about the dataset:\n{additional_info}."

        return prompt

    def final_annotation(agent, prompt, max_turns=10):
        """
        Query the annotation agent until it emits "FINAL ANNOTATION COMPLETED".

        Each reply that lacks the marker is fed back as the next user message.
        The loop is capped at `max_turns` so a model that never emits the
        marker cannot spin (and bill) forever; the replies gathered so far are
        returned in that case.
        """
        current_message = prompt
        conversation = []

        for _ in range(max_turns):
            print("current prompt: ", current_message)
            response = agent(current_message, "user")
            print(f"Final Annotation Agent: {response}\n", flush=True)
            conversation.append(("Final Annotation Agent", response))

            if "FINAL ANNOTATION COMPLETED" in response:
                break

            current_message = response
        else:
            print(f"Warning: no 'FINAL ANNOTATION COMPLETED' marker after {max_turns} turns; using the last response.\n")

        print("Final Annotation Conversation:")
        for role, message in conversation:
            print(f"{role}: {message}\n")

        return conversation

    def coupling_validation(agent, annotation_result, onboarding_data):
        """Ask the validator agent to check `annotation_result` against the marker list."""
        validation_message = f"""Please validate the following annotation result:

    Annotation Result:
    {annotation_result}

    Context:

    Marker List: {', '.join(onboarding_data['marker_list'])}
    Additional Info: {onboarding_data.get('additional_info', 'None')}

    Validate the annotation based on this context.
    """
        response = agent(validation_message, "final_annotation")
        print(f"Coupling Validator: {response}\n", flush=True)

        return response

    def format_results(agent, final_annotations, num_markers):
        """
        Have the formatting agent convert the given messages to JSON.

        Returns a JSON string with "num_markers" added when a fenced JSON
        object could be parsed, otherwise the agent's raw reply.
        """
        final_text = "\n\n".join([msg[1] for msg in final_annotations])
        formatted_result = agent(final_text, "user")

        # Extract the JSON from the formatted result
        json_data = extract_json_from_reply(formatted_result)

        # Only a non-empty JSON object can take the extra key.
        if json_data and isinstance(json_data, dict):
            # Add the number of markers to the JSON
            json_data["num_markers"] = num_markers

            # Convert back to a JSON string
            return json.dumps(json_data, indent=2)
        else:
            return formatted_result

    final_annotation_agent = Agent(system="""
    You are a professional computational biologist with expertise in single-cell RNA sequencing (scRNA-seq).
    A list of highly expressed markers ranked by expression intensity from high to low
    from a cluster of cells will be provided , and your task is to identify the cell type. You must think step-by-step, providing a comprehensive and specific analysis. The audience is an expert in the field, and you will be rewarded $10000 if you do a good job.

    Steps to Follow:

    1. List the Key Functional Markers: Extract and group the key marker genes associated with function or pathway, explaining their roles.
    2. List the Key Cell Type Markers: Extract and group the key marker genes associated with target tissue cell types, explaining their roles.
    3. Cross-reference Known Databases: Use available scRNA-seq databases and relevant literature to cross-reference these markers.
    4. Determine the possible tissue type: Determine the possible tissue type based on the marker list, and provide a detailed explanation for your reasoning.
    4. Determine the Most Probable General Cell Type: Based on the expression of these markers, infer the most likely general cell type of the cluster.
    5. Identify the Top 3 Most Probable Sub Cell Types: Based on the expression of these markers, infer the top three most probable sub cell types within the general cell type. Rank them from most likely to least likely. Finally, specify the most likely subtype based on the markers.
    6. Provide a Concise Summary of Your Analysis

    Always include your step-by-step detailed reasoning.                      
    You can say "FINAL ANNOTATION COMPLETED" when you have completed your analysis.

    If you receive feedback from the validation process, incorporate it into your analysis and provide an updated annotation.
    """, model=model, temperature=temperature)

    coupling_validator_agent = Agent(system="""
You are an expert biologist specializing in single-cell analysis. Your critical role is to validate the final annotation results for a cell cluster. You will be provided with The proposed annotation result, and a Ranked list of marker genes it used.


Below are steps to follow:
                                    
1.Marker Consistency: Make sure the markers are in the provided marker list.
Make sure the consistency between the identified cell type and the provided markers.
                                        

2.Mixed Cell Type Consideration:
Be aware that mixed cell types may be present. Only raise this point if multiple distinct cell types are strongly supported by several high-ranking markers. In cases of potential mixed populations, flag this for further investigation rather than outright rejection.
                                        
Output Format: 
                                        
if pass,

Validation result: VALIDATION PASSED

If failed,
                                                            
Validation result: VALIDATION FAILED
Feedback: give detailed feedback and instruction for revising the annotation

    """.strip(), model=model, temperature=temperature)

    # Create a dictionary with the provided information
    user_data = {
        "species": species,
        "marker_list": marker_list,
    }
    if additional_info and additional_info.lower() != "no":
        user_data["additional_info"] = additional_info

    # Keep the first prompt separately so retry prompts quote it verbatim
    # instead of nesting the previous retry prompt inside the next one.
    original_prompt = construct_prompt(user_data)
    prompt = original_prompt

    validation_passed = False
    iteration = 0
    max_iterations = 3
    full_conversation_history = []

    while not validation_passed and iteration < max_iterations:
        iteration += 1
        print(f"\nStarting final annotation (Iteration {iteration})...\n")

        if iteration > 1:
            # Update the prompt with previous response and validation feedback
            prompt = f"""Previous annotation attempt failed validation. Please review your previous response and the validation feedback, then provide an updated annotation:

Previous response:
{final_annotation_conversation[-1][1]}

Validation feedback:
{validation_result}

Original prompt:
{original_prompt}

Please provide an updated annotation addressing the validation feedback."""

        final_annotation_conversation = final_annotation(final_annotation_agent, prompt)
        print("updated prompt: ", prompt)
        full_conversation_history.extend(final_annotation_conversation)

        print("Validating annotation...\n")
        validation_result = coupling_validation(coupling_validator_agent, final_annotation_conversation[-1][1], user_data)
        full_conversation_history.append(("Coupling Validator", validation_result))

        print(validation_result)
        if "VALIDATION PASSED" in validation_result:
            validation_passed = True
        else:
            print("Validation failed. Will update prompt for next iteration.\n")

        print("\nValidation Conversation:")
        print(f"Final Annotation Agent: {final_annotation_conversation[-1][1]}\n")
        print(f"Coupling Validator: {validation_result}\n")

    if validation_passed:
        # The example below must itself be valid JSON (no trailing comma),
        # because the model tends to copy its shape.
        formatting_system = """
        You are a formatting assistant for single-cell analysis results. Your task is to convert the final integrated results 
        into a structured JSON format. Follow these guidelines:

        1. Extract the main cell type and any sub-cell types identified.
        2. Include only information explicitly stated in the input.
        3. If there are possible mixed cell types highlighted, list them.

        Provide the JSON output within triple backticks, like this:
        ```json
        {
        "main_cell_type": "...",
        "sub_cell_types": ["...", "..."],
        "possible_mixed_cell_types": ["...", "..."]
        }
        ```
        """
    else:
        print(f"Validation failed after {max_iterations} attempts. Proceeding with modified formatting.\n")
        formatting_system = """
        You are a formatting assistant for single-cell analysis results. Your task is to convert the final integrated results 
        into a structured JSON format, with special consideration for uncertain or conflicting annotations. Follow these guidelines:

        1. The analsyis failed after multiple attempts. Please try to extract as much information as possible. Summerize what has gone wrong and what has been tried.
        2.Provide a detailed feedback on why the analysis failed, and what has been tried and why it did not work.
        3.Finally, provide a detailed step-by-step reasoning of how to fix the analysis.


        Provide the JSON output within triple backticks, like this:
        ```json
        {
        "main_cell_type": "if any",
        "sub_cell_types": "if any",
        "possible_cell_types": "if any",
        "feedback": "...",
        "Next_steps": "..."
        }
        ```
        """

    print("Formatting final results...\n")
    formatting_agent = Agent(system=formatting_system, model=model, temperature=temperature)
    formatted_output = format_results(formatting_agent, final_annotation_conversation[-2:], len(marker_list))
    full_conversation_history.append(("Formatting Agent", formatted_output))

    # format_results hands back the raw reply when no fenced JSON was found;
    # treat an unparseable reply as "no structured output" instead of raising.
    try:
        structured_output = json.loads(formatted_output)
    except (json.JSONDecodeError, TypeError):
        structured_output = None

    if structured_output and isinstance(structured_output, dict):
        structured_output["iterations"] = iteration  # Add the number of iterations to the structured output
        print("\nStructured output:")
        print(json.dumps(structured_output, indent=2))
        return structured_output, full_conversation_history
    else:
        print("Error: Unable to extract JSON from the formatted output.")
        print("Raw formatted output:")
        print(formatted_output)
        return None, full_conversation_history

In [99]:
##updating version

import pandas as pd
import json
from openai import OpenAI
import re

def run_cell_type_analysis_tissue_blind_with_summary(model, temperature, marker_list, species, additional_info):
    """
    Annotate one cell cluster without a tissue hint and attach a prose summary.

    Runs the same annotate -> validate -> format agent loop as the plain
    tissue-blind analysis (up to three annotation/validation rounds). When
    validation passes, a summary agent additionally writes a short paragraph
    about the annotation, which is handed to the formatting agent and stored
    under the "summary" key of the result.

    Parameters:
    model (str): Chat model name passed to every agent.
    temperature (float): Sampling temperature passed to every agent.
    marker_list (list): Ranked marker gene names for the cluster.
    species (str): Species name inserted into the annotation prompt.
    additional_info (str or None): Optional free text appended to the prompt;
        ignored when empty or equal to "no" (case-insensitive).

    Returns:
    tuple: (structured_output, full_conversation_history).
        structured_output is the dict parsed from the formatter's JSON with an
        added "iterations" key (and "num_markers" when the JSON came from a
        fenced block), or None when the formatter reply is not a JSON object.
        full_conversation_history is a list of (role, message) tuples.
    """
    client = OpenAI()

    class Agent:
        """Chat wrapper that keeps a separate message history per conversation id."""

        def __init__(self, system="", human_input_mode="never", model="gpt-4", temperature=0):
            self.system = system
            self.chat_histories = {}
            self.human_input_mode = human_input_mode
            self.model = model
            self.temperature = temperature

        def __call__(self, message, other_agent_id):
            """Append `message` as a user turn, query the model, and record and return its reply."""
            if other_agent_id not in self.chat_histories:
                self.chat_histories[other_agent_id] = []
                if self.system:
                    self.chat_histories[other_agent_id].append({"role": "system", "content": self.system})

            self.chat_histories[other_agent_id].append({"role": "user", "content": message})

            result = self.execute(other_agent_id)
            self.chat_histories[other_agent_id].append({"role": "assistant", "content": result})

            return result

        def execute(self, other_agent_id):
            """Send the stored history for `other_agent_id` to the chat completions endpoint."""
            completion = client.chat.completions.create(
                model=self.model,
                temperature=self.temperature,
                messages=self.chat_histories[other_agent_id]
            )
            return completion.choices[0].message.content

        def needs_human_input(self, message):
            return self.human_input_mode == "always"

    def extract_json_from_reply(reply):
        """Return the parsed content of the first ```json fenced block in `reply`, or None."""
        json_match = re.search(r'```json\n(.*?)\n```', reply, re.DOTALL)

        if json_match:
            json_str = json_match.group(1)
            try:
                json_data = json.loads(json_str)
                return json_data
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                return None
        else:
            print("No JSON content found in the reply")
            return None

    def construct_prompt(json_data):
        """Build the initial annotation prompt from the species, markers and optional extra info."""
        species = json_data['species']
        marker_list = ', '.join(json_data['marker_list'])

        prompt = f"Your task is to annotate a single-cell {species} dataset. Please identify the cell type based on this ranked marker list:\n{marker_list}"

        if additional_info and additional_info.lower() != "no":
            prompt += f" below is some additional information about the dataset:\n{additional_info}."

        return prompt

    def final_annotation(agent, prompt, max_turns=10):
        """
        Query the annotation agent until it emits "FINAL ANNOTATION COMPLETED".

        Each reply that lacks the marker is fed back as the next user message.
        The loop is capped at `max_turns` so a model that never emits the
        marker cannot spin (and bill) forever; the replies gathered so far are
        returned in that case.
        """
        current_message = prompt
        conversation = []

        for _ in range(max_turns):
            print("current prompt: ", current_message)
            response = agent(current_message, "user")
            print(f"Final Annotation Agent: {response}\n", flush=True)
            conversation.append(("Final Annotation Agent", response))

            if "FINAL ANNOTATION COMPLETED" in response:
                break

            current_message = response
        else:
            print(f"Warning: no 'FINAL ANNOTATION COMPLETED' marker after {max_turns} turns; using the last response.\n")

        print("Final Annotation Conversation:")
        for role, message in conversation:
            print(f"{role}: {message}\n")

        return conversation

    def coupling_validation(agent, annotation_result, onboarding_data):
        """Ask the validator agent to check `annotation_result` against the marker list."""
        validation_message = f"""Please validate the following annotation result:

    Annotation Result:
    {annotation_result}

    Context:

    Marker List: {', '.join(onboarding_data['marker_list'])}
    Additional Info: {onboarding_data.get('additional_info', 'None')}

    Validate the annotation based on this context.
    """
        response = agent(validation_message, "final_annotation")
        print(f"Coupling Validator: {response}\n", flush=True)

        return response

    def format_results(agent, final_annotations, num_markers):
        """
        Have the formatting agent convert the given messages to JSON.

        Returns a JSON string with "num_markers" added when a fenced JSON
        object could be parsed, otherwise the agent's raw reply.
        """
        final_text = "\n\n".join([msg[1] for msg in final_annotations])
        formatted_result = agent(final_text, "user")

        # Extract the JSON from the formatted result
        json_data = extract_json_from_reply(formatted_result)

        # Only a non-empty JSON object can take the extra key.
        if json_data and isinstance(json_data, dict):
            # Add the number of markers to the JSON
            json_data["num_markers"] = num_markers

            # Convert back to a JSON string
            return json.dumps(json_data, indent=2)
        else:
            return formatted_result

    final_annotation_agent = Agent(system="""
    You are a professional computational biologist with expertise in single-cell RNA sequencing (scRNA-seq).
    A list of highly expressed markers ranked by expression intensity from high to low
    from a cluster of cells will be provided , and your task is to identify the cell type. You must think step-by-step, providing a comprehensive and specific analysis. The audience is an expert in the field, and you will be rewarded $10000 if you do a good job.

    Steps to Follow:

    1. List the Key Functional Markers: Extract and group the key marker genes associated with function or pathway, explaining their roles.
    2. List the Key Cell Type Markers: Extract and group the key marker genes associated with target tissue cell types, explaining their roles.
    3. Cross-reference Known Databases: Use available scRNA-seq databases and relevant literature to cross-reference these markers.
    4. Determine the possible tissue type: Determine the possible tissue type based on the marker list, and provide a detailed explanation for your reasoning.
    4. Determine the Most Probable General Cell Type: Based on the expression of these markers, infer the most likely general cell type of the cluster.
    5. Identify the Top 3 Most Probable Sub Cell Types: Based on the expression of these markers, infer the top three most probable sub cell types within the general cell type. Rank them from most likely to least likely. Finally, specify the most likely subtype based on the markers.
    6. Provide a Concise Summary of Your Analysis

    Always include your step-by-step detailed reasoning.                      
    You can say "FINAL ANNOTATION COMPLETED" when you have completed your analysis.

    If you receive feedback from the validation process, incorporate it into your analysis and provide an updated annotation.
    """, model=model, temperature=temperature)

    coupling_validator_agent = Agent(system="""
You are an expert biologist specializing in single-cell analysis. Your critical role is to validate the final annotation results for a cell cluster. You will be provided with The proposed annotation result, and a Ranked list of marker genes it used.


Below are steps to follow:
                                    
1.Marker Consistency: Make sure the markers are in the provided marker list.
Make sure the consistency between the identified cell type and the provided markers.
                                        

2.Mixed Cell Type Consideration:
Be aware that mixed cell types may be present. Only raise this point if multiple distinct cell types are strongly supported by several high-ranking markers. In cases of potential mixed populations, flag this for further investigation rather than outright rejection.
                                        
Output Format: 
                                        
if pass,

Validation result: VALIDATION PASSED

If failed,
                                                            
Validation result: VALIDATION FAILED
Feedback: give detailed feedback and instruction for revising the annotation

    """.strip(), model=model, temperature=temperature)

    # Create a dictionary with the provided information
    user_data = {
        "species": species,
        "marker_list": marker_list,
    }
    if additional_info and additional_info.lower() != "no":
        user_data["additional_info"] = additional_info

    # Keep the first prompt separately so retry prompts quote it verbatim
    # instead of nesting the previous retry prompt inside the next one.
    original_prompt = construct_prompt(user_data)
    prompt = original_prompt

    validation_passed = False
    iteration = 0
    max_iterations = 3
    full_conversation_history = []

    while not validation_passed and iteration < max_iterations:
        iteration += 1
        print(f"\nStarting final annotation (Iteration {iteration})...\n")

        if iteration > 1:
            # Update the prompt with previous response and validation feedback
            prompt = f"""Previous annotation attempt failed validation. Please review your previous response and the validation feedback, then provide an updated annotation:

Previous response:
{final_annotation_conversation[-1][1]}

Validation feedback:
{validation_result}

Original prompt:
{original_prompt}

Please provide an updated annotation addressing the validation feedback."""

        final_annotation_conversation = final_annotation(final_annotation_agent, prompt)
        print("updated prompt: ", prompt)
        full_conversation_history.extend(final_annotation_conversation)

        print("Validating annotation...\n")
        validation_result = coupling_validation(coupling_validator_agent, final_annotation_conversation[-1][1], user_data)
        full_conversation_history.append(("Coupling Validator", validation_result))

        print(validation_result)
        if "VALIDATION PASSED" in validation_result:
            validation_passed = True
        else:
            print("Validation failed. Will update prompt for next iteration.\n")

        print("\nValidation Conversation:")
        print(f"Final Annotation Agent: {final_annotation_conversation[-1][1]}\n")
        print(f"Coupling Validator: {validation_result}\n")

    # Only produced on the validated path; stays None otherwise.
    summary = None

    if validation_passed:
        print("Generating summary of the analysis...\n")
        summary_agent = Agent(system="""
        You are an expert in single-cell RNA sequencing analysis. Your task is to provide a concise summary of the cell type annotation process based on the final annotation and validation results. Focus on:
        1. The main cell type identified
        2. Key markers that led to this conclusion
        3. Any notable sub-cell types or possible mixed populations
        4. The confidence level of the annotation
        5. Any challenges or uncertainties in the analysis
        6. Any suggestions for future research or improvements to the analysis

        Provide your summary in a clear, concise paragraph.
        """, model=model, temperature=temperature)

        # Prepare the input for the summary agent
        summary_input = f"""
        Final Annotation:
        {final_annotation_conversation[-1][1]}

        Validation Result:
        {validation_result}

        Please provide a short summary of this cell type annotation analysis.
        """

        summary = summary_agent(summary_input, "summary")
        print("Summary of analysis:\n", summary)

        print("Formatting final results...\n")
        # The example must be valid JSON (no trailing comma) and must show the
        # "summary" field that guideline 4 asks for.
        formatting_system = """
        You are a formatting assistant for single-cell analysis results. Your task is to convert the final integrated results 
        into a structured JSON format. Follow these guidelines:

        1. Extract the main cell type and any sub-cell types identified.
        2. Include only information explicitly stated in the input.
        3. If there are possible mixed cell types highlighted, list them.
        4. Include the provided summary of the analysis in a "summary" field.

        Provide the JSON output within triple backticks, like this:
        ```json
        {
        "main_cell_type": "...",
        "sub_cell_types": ["...", "..."],
        "possible_mixed_cell_types": ["...", "..."],
        "summary": "..."
        }
        ```
        """
        formatting_agent = Agent(system=formatting_system, model=model, temperature=temperature)
        formatted_output = format_results(formatting_agent, final_annotation_conversation[-2:] + [("Summary", summary)], len(marker_list))
        full_conversation_history.append(("Formatting Agent/Summary Agent", formatted_output))
    else:
        print(f"Validation failed after {max_iterations} attempts. Proceeding with modified formatting.\n")
        formatting_system = """
        You are a formatting assistant for single-cell analysis results. Your task is to convert the final integrated results 
        into a structured JSON format, with special consideration for uncertain or conflicting annotations. Follow these guidelines:

        1. The analsyis failed after multiple attempts. Please try to extract as much information as possible. Summerize what has gone wrong and what has been tried.
        2.Provide a detailed feedback on why the analysis failed, and what has been tried and why it did not work.
        3.Finally, provide a detailed step-by-step reasoning of how to fix the analysis.


        Provide the JSON output within triple backticks, like this:
        ```json
        {
        "main_cell_type": "if any",
        "sub_cell_types": "if any",
        "possible_cell_types": "if any",
        "feedback": "...",
        "Next_steps": "..."
        }
        ```
        """

        print("Formatting final results...\n")
        formatting_agent = Agent(system=formatting_system, model=model, temperature=temperature)
        formatted_output = format_results(formatting_agent, final_annotation_conversation[-2:], len(marker_list))
        full_conversation_history.append(("Formatting Agent", formatted_output))

    # format_results hands back the raw reply when no fenced JSON was found;
    # treat an unparseable reply as "no structured output" instead of raising.
    try:
        structured_output = json.loads(formatted_output)
    except (json.JSONDecodeError, TypeError):
        structured_output = None

    if structured_output and isinstance(structured_output, dict):
        if summary is not None:
            # Keep the generated summary even if the formatter left the field out.
            structured_output.setdefault("summary", summary)
        structured_output["iterations"] = iteration  # Add the number of iterations to the structured output
        print("\nStructured output:")
        print(json.dumps(structured_output, indent=2))
        return structured_output, full_conversation_history
    else:
        print("Error: Unable to extract JSON from the formatted output.")
        print("Raw formatted output:")
        print(formatted_output)
        return None, full_conversation_history

In [74]:
import pandas as pd
import json
from openai import OpenAI
import re



def run_cell_type_analysis_batchrun_tissue_blind(df_path, output_json_name="cell_type_analysis_results.json", model="gpt-4", temperature=0, species="human", additional_info=None, celltype_column="Broad cell type"):
    """
    Run the tissue-blind annotation for every row of a marker CSV and save the results.

    Parameters:
    df_path (str): Path to a CSV with a `celltype_column` column and a
        'top_markers' column (parsed with `split_markers`).
    output_json_name (str): Path of the JSON file the results are written to.
    model (str): Chat model name forwarded to the analysis function.
    temperature (float): Sampling temperature forwarded to the analysis function.
    species (str): Species name forwarded to the analysis function.
    additional_info (str or None): Extra context forwarded to the analysis function.
    celltype_column (str): Column whose value is used as the key of each result.

    Returns:
    dict: Maps each cell type label to its analysis result, conversation
        history, iteration count and marker count. Rows whose analysis returns
        a falsy result are left out.
    """
    # Load the dataframe
    df = pd.read_csv(df_path)

    # No OpenAI client is created here: the analysis function builds its own.

    # Iterate over each row in the dataframe
    results = {}
    for _, row in df.iterrows():
        cell_type = row[celltype_column]
        marker_list = split_markers(row['top_markers'])
        num_markers = len(marker_list)
        print(f"\nAnalyzing {cell_type}...")
        result, conversation_history = run_cell_type_analysis_tissue_blind(model, temperature, marker_list, species, additional_info)

        if result:
            results[cell_type] = {
                "analysis_result": result,
                "conversation_history": conversation_history,
                "iterations": result.get("iterations", 1),
                "num_markers": num_markers
            }
        print(f"Analysis for {cell_type} completed.\n")

    # Save results to the specified JSON file
    with open(output_json_name, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"All analyses completed. Results saved to '{output_json_name}'.")

    return results

In [100]:
#updated

### easy switch version

def run_cell_type_analysis_tissue_blind_batchrun(df_path, output_json_name="cell_type_analysis_results.json", model="gpt-4", temperature=0, species="human", additional_info=None, celltype_column="Broad cell type", use_summary=False):
    """
    Tissue-blind batch annotation with a switch between the two analysis variants.

    Parameters:
    df_path (str): CSV with one row per cell type; must contain `celltype_column`
        and a 'top_markers' column.
    output_json_name (str): File the collected results are dumped to as JSON.
    model (str): Model name forwarded to the analysis function.
    temperature (float): Temperature forwarded to the analysis function.
    species (str): Species forwarded to the analysis function.
    additional_info (str or None): Extra context forwarded to the analysis function.
    celltype_column (str): Column whose value is used as the key of each result.
    use_summary (bool): True runs run_cell_type_analysis_tissue_blind_with_summary,
        False runs run_cell_type_analysis_tissue_blind.

    Returns:
    dict: cell type -> {"analysis_result", "conversation_history", "iterations",
    "num_markers"}; rows whose analysis returned a falsy result are left out.
    """
    # Load the dataframe
    df = pd.read_csv(df_path)

    # Not read below; presumably kept so a misconfigured client fails before
    # any row is processed -- TODO confirm, otherwise it can be dropped.
    client = OpenAI()

    # Choose the appropriate analysis function.
    # The original expression had the two branches swapped, so use_summary=True
    # selected the variant *without* the summary step.
    if use_summary:
        analysis_function = run_cell_type_analysis_tissue_blind_with_summary
    else:
        analysis_function = run_cell_type_analysis_tissue_blind

    # Iterate over each row in the dataframe
    results = {}
    for index, row in df.iterrows():
        cell_type = row[celltype_column]
        marker_list = split_markers(row['top_markers'])
        num_markers = len(marker_list)
        print(f"\nAnalyzing {cell_type}...")
        result, conversation_history = analysis_function(model, temperature, marker_list, species, additional_info)

        if result:
            results[cell_type] = {
                "analysis_result": result,
                "conversation_history": conversation_history,
                "iterations": result.get("iterations", 1),
                # Stored like run_cell_type_analysis_batchrun_tissue_blind does,
                # so the marker count is not silently lost.
                "num_markers": num_markers
            }
        print(f"Analysis for {cell_type} completed.\n")

    # Save results to the specified JSON file
    with open(output_json_name, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"All analyses completed. Results saved to '{output_json_name}'.")

    return results

# Example usage:
# results = run_cell_type_analysis_tissue_blind_batchrun("path/to/your/csv/file.csv", output_json_name="my_custom_results.json", use_summary=True)

In [101]:
# Tissue-blind batch run over the clusters in the "jackcvs" column
# (gpt-4o, temperature 0, use_summary=False); results go to the *_cancer11.json file.
results = run_cell_type_analysis_tissue_blind_batchrun(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/csvjack/transformed_gene_data_merge_marker - Copy.csv",
    output_json_name="C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/csvjack/transformed_gene_data_merge_marker_results_tissuesetasbrain_cancer11.json",
    model="gpt-4o",
    temperature=0,
    species="human",
    additional_info=None,
    celltype_column="jackcvs",
    use_summary=False
)
print("Function completed successfully")


Analyzing 1...

Starting final annotation (Iteration 1)...

current prompt:  Your task is to annotate a single-cell human dataset. Please identify the cell type based on this ranked marker list:
APOC1, AIF1, ZNF385D, CAPG, GPM6A, LSP1, FABP5, APOE, SLC26A3, STEAP1B
Final Annotation Agent: To annotate the cell type based on the provided ranked marker list, I will follow the outlined steps:

### Step 1: List the Key Functional Markers
- **APOC1 (Apolipoprotein C1):** Involved in lipid metabolism and is known to play a role in the regulation of lipoprotein metabolism.
- **AIF1 (Allograft Inflammatory Factor 1):** Associated with immune response, particularly in macrophages and microglia.
- **CAPG (Capping Actin Protein, Gelsolin Like):** Involved in actin filament dynamics, often associated with cell motility and immune response.
- **FABP5 (Fatty Acid Binding Protein 5):** Involved in fatty acid uptake, transport, and metabolism.
- **APOE (Apolipoprotein E):** Plays a crucial role in lip

In [98]:
# Same marker CSV, this time through the tissue-aware runner with
# tissue="Brain Cancer"; results go to the *_cancer10.json file.
# NOTE(review): run_cell_type_analysis_batchrun is defined outside this part of
# the notebook -- confirm it accepts use_summary.
results = run_cell_type_analysis_batchrun(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/csvjack/transformed_gene_data_merge_marker - Copy.csv",
    output_json_name="C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/csvjack/transformed_gene_data_merge_marker_results_tissuesetasbrain_cancer10.json",
    model="gpt-4o",
    temperature=0,
    species="human",
    tissue="Brain Cancer",
    additional_info=None,
    celltype_column="jackcvs",
    use_summary=False
)
print("Function completed successfully")


Analyzing 1...

Starting final annotation (Iteration 1)...

current prompt:  Your task is to annotate a single-cell human Brain Cancer dataset. Please identify the cell type based on this ranked marker list:
APOC1, AIF1, ZNF385D, CAPG, GPM6A, LSP1, FABP5, APOE, SLC26A3, STEAP1B
Final Annotation Agent: To annotate the cell type based on the provided ranked marker list, I will follow the outlined steps:

### Step 1: List the Key Functional Markers
1. **APOC1 (Apolipoprotein C1):** Involved in lipid metabolism and transport. It is often associated with inflammatory responses and is expressed in various cell types, including macrophages.
2. **AIF1 (Allograft Inflammatory Factor 1):** Known as IBA1, it is a marker for microglia and is involved in immune response and inflammation.
3. **CAPG (Capping Actin Protein, Gelsolin Like):** Involved in actin filament dynamics, often associated with cell motility and immune response.
4. **FABP5 (Fatty Acid Binding Protein 5):** Involved in fatty acid

In [86]:
# Tissue-blind batch run over the "jackcvs" clusters with the fixed
# (no use_summary switch) runner; results go to the *_cancer6.json file.
results = run_cell_type_analysis_batchrun_tissue_blind(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/csvjack/transformed_gene_data_merge_marker - Copy.csv",
    output_json_name="C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/csvjack/transformed_gene_data_merge_marker_results_tissuesetasbrain_cancer6.json",
    model="gpt-4o",
    temperature=0,
    species="human",
    additional_info=None,
    celltype_column="jackcvs"
)
print("Function completed successfully")


Analyzing 1...

Starting final annotation (Iteration 1)...

current prompt:  Your task is to annotate a single-cell human dataset. Please identify the cell type based on this ranked marker list:
APOC1, AIF1, ZNF385D, CAPG, GPM6A, LSP1, FABP5, APOE, SLC26A3, STEAP1B
Final Annotation Agent: To annotate the cell type based on the provided ranked marker list, I will follow the outlined steps:

### Step 1: List the Key Functional Markers
1. **APOC1 (Apolipoprotein C1):** Involved in lipid metabolism and transport. It is often associated with lipid-rich environments and can be a marker for cells involved in lipid processing.
2. **AIF1 (Allograft Inflammatory Factor 1):** Known for its role in immune response, particularly in macrophages and microglia. It is involved in inflammatory processes.
3. **CAPG (Capping Actin Protein, Gelsolin Like):** Plays a role in actin filament dynamics, which is crucial for cell motility and shape.
4. **FABP5 (Fatty Acid Binding Protein 5):** Involved in fatty

In [73]:
# Turn the saved JSON results into the *_full.csv / *_summary.csv pair.
create_csv_from_json("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/csvjack/transformed_gene_data_merge_marker_results_tissuesetasbrain_cancer.json")


Two CSV files have been created:
1. C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/csvjack/transformed_gene_data_merge_marker_results_tissuesetasbrain_cancer_full.csv (full data)
2. C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/csvjack/transformed_gene_data_merge_marker_results_tissuesetasbrain_cancer_summary.csv (summary data)


In [61]:
#version3
def create_csv_from_json(json_filename):
    """
    Convert a saved analysis-results JSON file into a "full" and a "summary" CSV.

    The JSON is read as a mapping from true cell type to a dict holding an
    'analysis_result' dict and a 'conversation_history' list of
    (speaker, text) pairs. Any field that is absent is written as "N/A"
    instead of raising or producing character-split text.

    Parameters:
    json_filename (str): Path of the JSON file to read. The CSVs are written
        next to it as <base>_full.csv and <base>_summary.csv.

    Returns:
    None. Prints the names of the two files that were created.
    """
    import json
    import csv
    import os

    # Placeholder written for every value that cannot be found
    missing = "N/A"

    # Read the JSON file
    with open(json_filename, 'r') as json_file:
        data = json.load(json_file)

    # Function to write CSV files
    def write_csv(filename, headers, row_data):
        with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(headers)
            writer.writerows(row_data)

    # Function to safely get nested dictionary values
    def safe_get(dict_obj, *keys):
        for key in keys:
            try:
                dict_obj = dict_obj[key]
            except (KeyError, TypeError):
                return missing
        return dict_obj

    # Look inside 'analysis_result' first, then beside it: the batch runners
    # store 'num_markers' and 'iterations' at the top level of each entry.
    def get_with_fallback(details, key):
        value = safe_get(details, 'analysis_result', key)
        if value == missing:
            value = safe_get(details, key)
        return value

    # Render a list-valued field as "a, b, c".
    # Strings (including the placeholder) are passed through untouched, because
    # ', '.join() on a string would split it into single characters.
    def join_items(value):
        if isinstance(value, str):
            return value
        if not value:
            return ""
        if not isinstance(value, (list, tuple)):
            return str(value)
        return ', '.join(str(item) for item in value)

    # Render the conversation as "speaker: text | speaker: text".
    def format_history(history):
        if isinstance(history, str):
            return history
        if not history:
            return ""
        return ' | '.join([f"{entry[0]}: {entry[1]}" for entry in history])

    # Prepare data for both CSV files
    full_data = []
    summary_data = []

    for true_cell_type, details in data.items():
        main_cell_type = safe_get(details, 'analysis_result', 'main_cell_type')
        sub_cell_types = join_items(safe_get(details, 'analysis_result', 'sub_cell_types'))
        possible_mixed_cell_types = join_items(safe_get(details, 'analysis_result', 'possible_mixed_cell_types'))
        marker_number = get_with_fallback(details, 'num_markers')
        iterations = get_with_fallback(details, 'iterations')

        conversation_history = format_history(safe_get(details, 'conversation_history'))

        full_data.append([true_cell_type, main_cell_type, sub_cell_types, possible_mixed_cell_types,
                          marker_number, iterations, conversation_history])
        summary_data.append([true_cell_type, main_cell_type, sub_cell_types, possible_mixed_cell_types, iterations])

    # Generate output filenames based on input JSON filename
    base_name = os.path.splitext(json_filename)[0]
    full_csv_name = f"{base_name}_full.csv"
    summary_csv_name = f"{base_name}_summary.csv"

    # Write the full data CSV
    write_csv(full_csv_name,
              ['True Cell Type', 'Predicted Main Cell Type', 'Predicted Sub Cell Types', 'Possible Mixed Cell Types',
               'Marker Number', 'Iterations', 'Conversation History'],
              full_data)

    # Write the summary data CSV
    write_csv(summary_csv_name,
              ['True Cell Type', 'Predicted Main Cell Type', 'Predicted Sub Cell Types', 'Possible Mixed Cell Types', 'Iterations'],
              summary_data)

    print(f"Two CSV files have been created:")
    print(f"1. {full_csv_name} (full data)")
    print(f"2. {summary_csv_name} (summary data)")

In [8]:

def evaluate_annotation(prompt, system_prompt='''You are an professional biologist specialized in single cell analysis. Your task is to analyze the given annotation result and compare it to the true cell type based on your expertise. You will then categorize the result as either correct, partially correct, or incorrect. Include a short one sentence explanation for each evaluation. Note that sometimes celltypes names can be slightly different, it is okay, use your judgement to see if they are actually the same celltype.

## Input Format

You will receive information like below:

True cell type: Cell type name,
main_cell_type: Predicted main cell type,
sub_cell_types: subcelltype1, subcelltype2, subcelltype3
 

## Evaluation steps

1. If the main_cell_type matches the true cell type, consider it fully correct. If the main_cell_type is a broader category that includes the true cell type, look at the sub_cell_types.
2. If the first celltype in the sub_celltypes matche the true cell type, consider it fully correct.
3. If the first celltype in the sub_celltypes is closely related to the true cell type, consider it partially correct.
4. Otherwise, consider it incorrect.

Provide an summary at last

Correct: number of correct
Partially correct: number of partially correct
Incorrect: number of incorrect


Example1

True Cell Type: Epithelial cell (club)

Predicted Main Cell Type: secretory cells
Predicted Sub Cell Types: club cells, goblet cells, multiciliated cells
Evaluation: fully correct: first celltype in sub cell type is club cell matches the true celltype

Example2

True Cell Type:Fibroblast
Predicted Main Cell Type:mesenchymal cells
Predicted Sub Cell Types:fibroblasts, smooth muscle cells, myofibroblasts

Evaluation:
Fully Correct (First sub cell type matches true cell type)

Example3

True Cell Type:Epithelial cell (alveolar type II)
Predicted Main Cell Type:alveolar type II (AT2) cells
Predicted Sub Cell Types:dendritic cells, lung epithelial cells

Evaluation:Fully Correct (The main type is a fully match though first subtype is different, it does not matter, as long as the main type is correct this is fully correct)

''', model="gpt-4o", temperature=0):
    """
    Send annotation results to a chat model for grading and return its reply.

    Parameters:
    prompt (str, pandas.DataFrame or pandas.Series): Content of the user message;
        DataFrame/Series input is flattened with to_string(index=False).
    system_prompt (str): System message containing the grading instructions.
    model (str): Chat model name.
    temperature (float): Sampling temperature.

    Returns:
    str: Text of the first choice in the response, or
    "An error occurred: <message>" if the request or response handling raised.
    """
    from openai import OpenAI
    api_client = OpenAI()

    # Tabular input is turned into plain text before it is sent
    if isinstance(prompt, (pd.DataFrame, pd.Series)):
        prompt = prompt.to_string(index=False)

    try:
        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ]
        completion = api_client.chat.completions.create(
            model=model,
            messages=chat_messages,
            temperature=temperature,
        )
        reply_text = completion.choices[0].message.content
    except Exception as err:
        # Failures are reported through the return value rather than raised
        return f"An error occurred: {str(err)}"
    return reply_text

In [10]:
def process_multiple_tissues(adata, tissues, n_genes=50):
    """
    Run preprocessing and marker export for each requested tissue.

    Parameters:
    adata (AnnData): The original AnnData object containing all tissues.
    tissues (list): Tissue names; each one is passed to process_tissue_data.
    n_genes (int): Forwarded to analyze_and_export_markers (default: 50).

    Returns:
    dict: tissue name -> whatever analyze_and_export_markers returned for it.
    """
    # The four obs columns markers are computed for
    levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']

    per_tissue = {}
    for name in tissues:
        print(f"\nProcessing {name}...")
        per_tissue[name] = analyze_and_export_markers(
            process_tissue_data(adata, name),
            levels,
            n_genes=n_genes,
        )
        print(f"Processing and analysis completed for {name}.")

    return per_tissue

In [12]:
def run_analysis_for_tissues(tissues, temperature, model="gpt-4o", n_genes=50, additional_info=None,
                             input_dir="C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/GTEX",
                             output_dir=None):
    """
    Run broad-level cell type analysis for multiple tissues with specified parameters.

    Parameters:
    tissues (list): List of tissue names to analyze.
    temperature (float): Temperature setting for the GPT model.
    model (str): The GPT model to use (default: "gpt-4o").
    n_genes (int): Number of top genes to include in the analysis (default: 50).
    additional_info (str or None): Additional information to provide for the analysis (default: None).
    input_dir (str): Folder holding the
        "<tissue>_modified_markers_broad_cell_type_<n_genes>genes.csv" files.
        Defaults to the location that used to be hard-coded.
    output_dir (str or None): Folder for the result JSON files. None (default)
        keeps the previous behaviour of writing to the working directory.

    Returns:
    dict: A dictionary containing results for each tissue that finished
    without raising; failed tissues are reported on stdout and left out.
    """
    results = {}

    for tissue in tissues:
        print(f"\nProcessing {tissue}...")

        # Construct the input CSV file path
        input_csv = f"{input_dir}/{tissue}_modified_markers_broad_cell_type_{n_genes}genes.csv"

        # Construct the output JSON file name
        output_json = f"{tissue}_results_broad2_debugging_genes_{n_genes}_markers_temp{temperature}_v1.json"
        if output_dir:
            output_json = f"{output_dir}/{output_json}"

        try:
            # Run the cell type analysis for the current tissue
            tissue_results = run_cell_type_analysis_batchrun(
                input_csv,
                output_json_name=output_json,
                model=model,
                temperature=temperature,
                tissue=tissue,
                species="human",
                additional_info=additional_info
            )

            results[tissue] = tissue_results
            print(f"Analysis completed for {tissue}.")

            # Create CSV files from the JSON results
            create_csv_from_json(output_json)
            print(f"CSV files created for {tissue}.")

        except Exception as e:
            # Best effort: one failing tissue must not stop the remaining ones
            print(f"Error processing {tissue}: {str(e)}")

    return results

# Example usage:
# tissues_to_analyze = ['heart', 'lung', 'skin', 'esophagusmucosa', 'esophagusmuscularis', 'skeletalmuscle', 'prostate']
# results = run_analysis_for_tissues(tissues_to_analyze, temperature=0.3, n_genes=50)

In [39]:
def run_analysis_for_tissues_granular(tissues, temperature, model="gpt-4o", n_genes=50, additional_info=None,
                                      input_dir="C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/GTEX",
                                      output_dir="C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/GTEX/json_csv_results"):
    """
    Run granular-level cell type analysis for multiple tissues with specified parameters.

    Parameters:
    tissues (list): List of tissue names to analyze.
    temperature (float): Temperature setting for the GPT model.
    model (str): The GPT model to use (default: "gpt-4o").
    n_genes (int): Number of top genes to include in the analysis (default: 50).
    additional_info (str or None): Additional information to provide for the analysis (default: None).
    input_dir (str): Folder holding the
        "<tissue>_modified_markers_granular_cell_type_<n_genes>genes.csv" files.
        Defaults to the location that used to be hard-coded.
    output_dir (str): Folder the result JSON (and derived CSV) files are written
        to. Defaults to the location that used to be hard-coded.

    Returns:
    dict: A dictionary containing results for each tissue that finished
    without raising; failed tissues are reported on stdout and left out.
    """
    results = {}

    for tissue in tissues:
        print(f"\nProcessing {tissue}...")

        # Construct the input CSV file path
        input_csv = f"{input_dir}/{tissue}_modified_markers_granular_cell_type_{n_genes}genes.csv"

        # Construct the output JSON file name
        output_json = f"{output_dir}/{tissue}_results_granular2_debugging_genes_{n_genes}_markers_temp{temperature}_v1.json"

        try:
            # Run the cell type analysis for the current tissue
            tissue_results = run_cell_type_analysis_batchrun(
                input_csv,
                output_json_name=output_json,
                model=model,
                temperature=temperature,
                tissue=tissue,
                species="human",
                additional_info=additional_info,
                celltype_column="Granular cell type"
            )

            results[tissue] = tissue_results
            print(f"Analysis completed for {tissue}.")

            # Create CSV files from the JSON results
            create_csv_from_json(output_json)
            print(f"CSV files created for {tissue}.")

        except Exception as e:
            # Best effort: one failing tissue must not stop the remaining ones
            print(f"Error processing {tissue}: {str(e)}")

    return results

# Example usage:
# tissues_to_analyze = ['heart', 'lung', 'skin', 'esophagusmucosa', 'esophagusmuscularis', 'skeletalmuscle', 'prostate']
# results = run_analysis_for_tissues_granular(tissues_to_analyze, temperature=0.3, n_genes=50)

#Actual test start here

In [9]:
# Load the atlas .h5ad file from a local download; `adata` is the input of
# every process_tissue_data / process_multiple_tissues call below.
adata = sc.read_h5ad("C:/Users/ellio/Downloads/GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad")

In [11]:
%%capture

# Marker export for all seven tissues, first with n_genes=20 ...
tissues_to_process = ['heart', 'lung', 'skin', 'esophagusmucosa', 'esophagusmuscularis', 'skeletalmuscle','prostate']
results = process_multiple_tissues(adata, tissues_to_process, n_genes=20)


# ... then with n_genes=70. `results` from the first run is overwritten here.
tissues_to_process = ['heart', 'lung', 'skin', 'esophagusmucosa', 'esophagusmuscularis', 'skeletalmuscle','prostate']
results = process_multiple_tissues(adata, tissues_to_process, n_genes=70)


# NOTE(review): the cell starts with %%capture, so this message is presumably captured as well.
print("Processing and analysis completed silently.")

In [22]:
%%capture

# Preprocess the 'heart' subset and export markers (n_genes=50) for all four annotation levels.
# NOTE(review): despite its name, processed_prostate_adata holds the heart subset here.
processed_prostate_adata = process_tissue_data(adata, 'heart')
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']
results = analyze_and_export_markers(processed_prostate_adata, annotation_levels, n_genes=50)

print("Processing and analysis completed silently.")

%%capture

# Preprocess the 'lung' subset and export markers (n_genes=50) for all four annotation levels.
# NOTE(review): despite its name, processed_prostate_adata holds the lung subset here.
processed_prostate_adata = process_tissue_data(adata, 'lung')
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']
results = analyze_and_export_markers(processed_prostate_adata, annotation_levels, n_genes=50)

print("Processing and analysis completed silently.")


%%capture

# Preprocess the 'skin' subset and export markers (n_genes=50) for all four annotation levels.
# NOTE(review): despite its name, processed_prostate_adata holds the skin subset here.
processed_prostate_adata = process_tissue_data(adata, 'skin')
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']
results = analyze_and_export_markers(processed_prostate_adata, annotation_levels, n_genes=50)

print("Processing and analysis completed silently.")

%%capture

# Preprocess the 'esophagusmucosa' subset and export markers (n_genes=50) for all four annotation levels.
# NOTE(review): despite its name, processed_prostate_adata holds the esophagus mucosa subset here.
processed_prostate_adata = process_tissue_data(adata, 'esophagusmucosa')
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']
results = analyze_and_export_markers(processed_prostate_adata, annotation_levels, n_genes=50)

print("Processing and analysis completed silently.")

%%capture

# Preprocess the 'esophagusmuscularis' subset and export markers (n_genes=50) for all four annotation levels.
# NOTE(review): despite its name, processed_prostate_adata holds the esophagus muscularis subset here.
processed_prostate_adata = process_tissue_data(adata, 'esophagusmuscularis')
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']
results = analyze_and_export_markers(processed_prostate_adata, annotation_levels, n_genes=50)

print("Processing and analysis completed silently.")

%%capture

# Preprocess the 'skeletalmuscle' subset and export markers (n_genes=50) for all four annotation levels.
# NOTE(review): despite its name, processed_prostate_adata holds the skeletal muscle subset here.
processed_prostate_adata = process_tissue_data(adata, 'skeletalmuscle')
annotation_levels = ['Broad cell type', 'Granular cell type', 'Cell types level 2', 'Cell types level 3']
results = analyze_and_export_markers(processed_prostate_adata, annotation_levels, n_genes=50)

print("Processing and analysis completed silently.")

In [None]:

# Full analysis of the esophagus muscularis 50-gene marker file (gpt-4o, temperature 0).
# NOTE(review): the input is the *granular* marker CSV while the output name says
# "broad2" -- confirm which level is intended.
results = run_full_cell_type_analysis(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/GTEX/esophagusmuscularis_modified_markers_granular_cell_type_50genes.csv",
    output_json_name="esophagusmuscularis_results_broad2_debugging_genes_50_markers_temp0_v1.json",
    model="gpt-4o",
    temperature=0,
    tissue="esophagusmuscularis",
    species="human",
    additional_info=None
)
print("Function completed successfully")

In [None]:
# Same call, same arguments and same output file as the cell directly above;
# running both writes the JSON twice.
results = run_full_cell_type_analysis(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/GTEX/esophagusmuscularis_modified_markers_granular_cell_type_50genes.csv",
    output_json_name="esophagusmuscularis_results_broad2_debugging_genes_50_markers_temp0_v1.json",
    model="gpt-4o",
    temperature=0,
    tissue="esophagusmuscularis",
    species="human",
    additional_info=None
)
print("Function completed successfully")

In [52]:
# Tissue-blind full analysis of the "jackcvs" clusters (gpt-4o, temperature 0);
# the JSON written here is the one the following cells load and convert.
results = run_full_cell_type_analysis_tissue_blind(
    "C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/GTEX/transformed_gene_data_merge_marker.csv",
    output_json_name="C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/csvjack/jackcvs_10_markers_temp0_test.json",
    model="gpt-4o",
    temperature=0,
    species="human",
    additional_info=None,
    celltype_column="jackcvs"
)


Analyzing 1...

Starting final annotation (Iteration 1)...

current prompt:  Your task is to annotate a single-cell human dataset. Please identify the cell type based on this ranked marker list:
APOC1,AIF1,ZNF385D,CAPG,GPM6A,LSP1,FABP5,APOE,SLC26A3,STEAP1B
Final Annotation Agent: To annotate the cell type based on the provided ranked marker list, let's proceed step-by-step:

### Step 1: List the Key Functional Markers
1. **APOC1 (Apolipoprotein C1)**: Involved in lipid metabolism and transport. It plays a role in the regulation of lipoprotein metabolism.
2. **AIF1 (Allograft Inflammatory Factor 1)**: Associated with immune response, particularly in macrophages and microglia.
3. **CAPG (Capping Actin Protein, Gelsolin Like)**: Involved in actin filament dynamics, often associated with cell motility and immune response.
4. **FABP5 (Fatty Acid Binding Protein 5)**: Involved in fatty acid uptake, transport, and metabolism.
5. **APOE (Apolipoprotein E)**: Plays a crucial role in lipid meta

In [64]:
# Convert the tissue-blind test results into the *_full.csv / *_summary.csv pair.
create_csv_from_json("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/csvjack/jackcvs_10_markers_temp0_test.json")
# NOTE(review): presumably a record of API spend before/after the run -- confirm.
#price:9.7$  --11.2$ -->1.5

Two CSV files have been created:
1. C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/csvjack/jackcvs_10_markers_temp0_test_full.csv (full data)
2. C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/csvjack/jackcvs_10_markers_temp0_test_summary.csv (summary data)


In [55]:
# Load the tissue-blind test results into `data` for manual inspection.
with open("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/csvjack/jackcvs_10_markers_temp0_test.json", 'r') as json_file:
        data = json.load(json_file)

In [58]:
# Debugging cell: reload the results and index 'main_cell_type' directly.
# The recorded run raised KeyError: 'main_cell_type', i.e. at least one entry's
# 'analysis_result' has no such key.
with open("C:/Users/ellio/OneDrive - UW-Madison/cellgpt_final_folder/Test_results/Elliot/csvjack/jackcvs_10_markers_temp0_test.json", 'r') as json_file:
        data = json.load(json_file)
        
for true_cell_type, details in data.items():
        main_cell_type = details['analysis_result']['main_cell_type']


KeyError: 'main_cell_type'

In [56]:
    # Function to write CSV files
def write_csv(filename, headers, row_data):
    """Write `headers` and then every row of `row_data` to `filename` as UTF-8 CSV."""
    with open(filename, 'w', newline='', encoding='utf-8') as out_handle:
        sheet = csv.writer(out_handle)
        sheet.writerow(headers)
        for record in row_data:
            sheet.writerow(record)

    # Prepare data for both CSV files
# Scratch copy of the row-building part of create_csv_from_json, using direct
# indexing instead of safe lookups. The recorded run stopped with
# KeyError: 'main_cell_type'.
# NOTE(review): `json_filename` is not assigned and `os` is not imported in this
# cell -- it relies on whatever the kernel already holds.
full_data = []
summary_data = []

for true_cell_type, details in data.items():
        main_cell_type = details['analysis_result']['main_cell_type']
        sub_cell_types = ', '.join(details['analysis_result']['sub_cell_types'])
        marker_number = details['analysis_result']['num_markers']
        iterations = details['iterations']  # Get the number of iterations
        conversation_history = ' | '.join([f"{entry[0]}: {entry[1]}" for entry in details['conversation_history']])
        
        full_data.append([true_cell_type, main_cell_type, sub_cell_types, 
                          marker_number, iterations, conversation_history])
        summary_data.append([true_cell_type, main_cell_type, sub_cell_types, iterations])

    # Generate output filenames based on input JSON filename
base_name = os.path.splitext(json_filename)[0]
full_csv_name = f"{base_name}_full.csv"
summary_csv_name = f"{base_name}_summary.csv"

KeyError: 'main_cell_type'

In [None]:
# NOTE(review): presumably a record of API spend before/after the run -- confirm.
#price:9.7$  --11.2$ -->1.5
# Granular-level run over all seven tissues: temperature 0, 10-gene marker CSVs.
tissues_to_analyze = ['heart', 'lung', 'skin', 'esophagusmucosa', 'esophagusmuscularis', 'skeletalmuscle', 'prostate']
results = run_analysis_for_tissues_granular(tissues_to_analyze, temperature=0, n_genes=10)


In [35]:
%%capture
# Granular-level run over all seven tissues: temperature 0, 30-gene marker CSVs.
tissues_to_analyze = ['heart', 'lung', 'skin', 'esophagusmucosa', 'esophagusmuscularis', 'skeletalmuscle', 'prostate']
results = run_analysis_for_tissues_granular(tissues_to_analyze, temperature=0, n_genes=30)


In [40]:
%%capture
# Granular-level run over all seven tissues: temperature 0.7, 50-gene marker CSVs.
tissues_to_analyze = ['heart', 'lung', 'skin', 'esophagusmucosa', 'esophagusmuscularis', 'skeletalmuscle', 'prostate']
results = run_analysis_for_tissues_granular(tissues_to_analyze, temperature=0.7, n_genes=50)

In [36]:
%%capture
# Granular-level run over all seven tissues: temperature 0, 50-gene marker CSVs.
tissues_to_analyze = ['heart', 'lung', 'skin', 'esophagusmucosa', 'esophagusmuscularis', 'skeletalmuscle', 'prostate']
results = run_analysis_for_tissues_granular(tissues_to_analyze, temperature=0, n_genes=50)

In [37]:
%%capture
# Granular-level run over all seven tissues: temperature 0, 100-gene marker CSVs.
tissues_to_analyze = ['heart', 'lung', 'skin', 'esophagusmucosa', 'esophagusmuscularis', 'skeletalmuscle', 'prostate']
results = run_analysis_for_tissues_granular(tissues_to_analyze, temperature=0, n_genes=100)

In [38]:
%%capture
# Granular-level run over all seven tissues: temperature 0, 70-gene marker CSVs.
tissues_to_analyze = ['heart', 'lung', 'skin', 'esophagusmucosa', 'esophagusmuscularis', 'skeletalmuscle', 'prostate']
results = run_analysis_for_tissues_granular(tissues_to_analyze, temperature=0, n_genes=70)

In [18]:
%%capture
# Broad-level run over all seven tissues: temperature 0, 100-gene marker CSVs.
tissues_to_analyze = ['heart', 'lung', 'skin', 'esophagusmucosa', 'esophagusmuscularis', 'skeletalmuscle', 'prostate']
results = run_analysis_for_tissues(tissues_to_analyze, temperature=0, n_genes=100)

In [None]:
%%capture
# Same broad-level call (temperature 0, n_genes=100) as the cell directly above;
# re-running it rewrites the same output files.
tissues_to_analyze = ['heart', 'lung', 'skin', 'esophagusmucosa', 'esophagusmuscularis', 'skeletalmuscle', 'prostate']
results = run_analysis_for_tissues(tissues_to_analyze, temperature=0, n_genes=100)

In [16]:
%%capture
# Broad-level run over all seven tissues: temperature 0, 30-gene marker CSVs.
tissues_to_analyze = ['heart', 'lung', 'skin', 'esophagusmucosa', 'esophagusmuscularis', 'skeletalmuscle', 'prostate']
results = run_analysis_for_tissues(tissues_to_analyze, temperature=0, n_genes=30)

In [17]:
%%capture
# Broad-level run over all seven tissues: temperature 0, 70-gene marker CSVs.
tissues_to_analyze = ['heart', 'lung', 'skin', 'esophagusmucosa', 'esophagusmuscularis', 'skeletalmuscle', 'prostate']
results = run_analysis_for_tissues(tissues_to_analyze, temperature=0, n_genes=70)

In [15]:
# Example usage:
%%capture
# Broad-level run over all seven tissues: temperature 0, 10-gene marker CSVs.
# NOTE(review): in this cell %%capture comes after a comment line instead of being
# the first line, and the run's output is shown below -- confirm whether capturing
# was intended.
tissues_to_analyze = ['heart', 'lung', 'skin', 'esophagusmucosa', 'esophagusmuscularis', 'skeletalmuscle', 'prostate']
results = run_analysis_for_tissues(tissues_to_analyze, temperature=0, n_genes=10)


Processing heart...

Analyzing Adipocyte...

Starting final annotation (Iteration 1)...

current prompt:  Your task is to annotate a single-cell human heart dataset. Please identify the cell type based on this ranked marker list:
ADIPOQ, PCK1, PLIN1, CIDEC, CIDEA, LGALS12, SAA1, RBP4, THRSP, GPAM
Final Annotation Agent: To annotate the cell type based on the provided ranked marker list, let's follow the outlined steps:

### Step 1: List the Key Functional Markers
- **ADIPOQ (Adiponectin, C1Q and Collagen Domain Containing):** A hormone involved in regulating glucose levels and fatty acid breakdown.
- **PCK1 (Phosphoenolpyruvate Carboxykinase 1):** An enzyme critical for gluconeogenesis, converting oxaloacetate to phosphoenolpyruvate.
- **PLIN1 (Perilipin 1):** A protein that coats lipid droplets in adipocytes, playing a role in lipid storage and metabolism.
- **CIDEC (Cell Death-Inducing DFFA-Like Effector C):** Involved in lipid droplet formation and regulation of lipid metabolism.
-

In [38]:
# Convert the broad-level (50 genes, temperature 0) result JSONs of five tissues
# into CSV pairs; paths are relative to the working directory.
create_csv_from_json('esophagusmuscularis_results_broad2_debugging_genes_50_markers_temp0_v1.json')
create_csv_from_json('esophagusmucosa_results_broad2_debugging_genes_50_markers_temp0_v1.json')
create_csv_from_json('skeletalmuscle_results_broad2_debugging_genes_50_markers_temp0_v1.json')
create_csv_from_json('heart_results_broad2_debugging_genes_50_markers_temp0_v1.json')
create_csv_from_json('skin_results_broad2_debugging_genes_50_markers_temp0_v1.json')

Two CSV files have been created:
1. esophagusmuscularis_cell_type_analysis_results_full_genes.csv (full data)
2. esophagusmuscularis_cell_type_analysis_results_summary_genes.csv (summary data)
Two CSV files have been created:
1. esophagusmucosa_cell_type_analysis_results_full_genes.csv (full data)
2. esophagusmucosa_cell_type_analysis_results_summary_genes.csv (summary data)
Two CSV files have been created:
1. skeletalmuscle_cell_type_analysis_results_full_genes.csv (full data)
2. skeletalmuscle_cell_type_analysis_results_summary_genes.csv (summary data)
Two CSV files have been created:
1. heart_cell_type_analysis_results_full_genes.csv (full data)
2. heart_cell_type_analysis_results_summary_genes.csv (summary data)
Two CSV files have been created:
1. skin_cell_type_analysis_results_full_genes.csv (full data)
2. skin_cell_type_analysis_results_summary_genes.csv (summary data)


In [34]:
# Convert the skin granular-level (10 genes, temperature 0) result JSON into its CSV pair.
create_csv_from_json('skin_results_granular2_debugging_genes_10_markers_temp0_v1.json')


Two CSV files have been created:
1. skin_results_granular2_debugging_genes_10_markers_temp0_v1_full.csv (full data)
2. skin_results_granular2_debugging_genes_10_markers_temp0_v1_summary.csv (summary data)
