In [37]:
import gzip
import urllib.request
import requests
import json
import polars as pl
import openai

In [170]:
def fetch_geo_soft_http(accession, timeout=60):
    """Download and decompress the family SOFT file of a GEO series.

    Args:
        accession: GEO series accession, e.g. "GSE265975".
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The decompressed SOFT file as text, or None (after printing the HTTP
        status) when the server does not answer with status 200.
    """
    # GEO groups series into range directories named after the accession with
    # its last three digits replaced by "nnn" (GSE12345 -> GSE12nnn,
    # GSE123 -> GSEnnn). Slicing a fixed six characters is only right for
    # six-digit accessions.
    prefix = accession[:max(3, len(accession) - 3)] + "nnn"
    url = f"https://ftp.ncbi.nlm.nih.gov/geo/series/{prefix}/{accession}/soft/{accession}_family.soft.gz"

    # The context manager releases the streamed connection on every path.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Error: HTTP {response.status_code}")
            return None

        # Decode explicitly so the result does not depend on the locale's
        # default encoding; undecodable bytes are replaced instead of raising.
        with gzip.open(response.raw, 'rt', encoding='utf-8', errors='replace') as f:
            return f.read()


def _add_soft_attribute(record, name, value):
    """Store value under name, collecting repeated attributes into a list."""
    if name not in record:
        record[name] = value
    elif isinstance(record[name], list):
        record[name].append(value)
    else:
        # Second occurrence: promote the existing scalar to a list.
        record[name] = [record[name], value]


def parse_soft_to_json(soft_text):
    """Parse SOFT-formatted text into a nested dictionary.

    Lines starting with "^" open a section and lines starting with "!" are
    attributes of the section currently open. Section and attribute names are
    lower-cased. An attribute that occurs once is stored as a string; one that
    occurs several times is stored as a list of strings in file order.

    Args:
        soft_text: Full text of a SOFT file.

    Returns:
        A dict mapping each non-sample section name to a dict of its
        attributes (a later section of the same name replaces the earlier
        one). Every "sample" section becomes one dict in the list stored
        under "samples"; that key is absent when the file has no samples.
        Each section dict holds the section's value under "id".
    """
    data = {}
    sample_list = []  # One dict per ^SAMPLE section, in file order
    current_record = None  # Dict receiving the attributes of the open section

    for line in soft_text.splitlines():
        if line.startswith('^'):
            # A section line without " = " opens the section with an empty id.
            tag, _, value = line.partition(" = ")
            section_name = tag[1:].strip().lower()  # Remove "^" and make lowercase

            current_record = {"id": value}
            if section_name == "sample":
                sample_list.append(current_record)
            else:
                data[section_name] = current_record

        elif line.startswith('!') and current_record is not None:
            tag, separator, value = line.partition(" = ")
            if not separator:
                # Marker lines such as "!sample_table_begin" carry no value;
                # skip them instead of failing to unpack the split.
                continue
            _add_soft_attribute(current_record, tag[1:].strip().lower(), value)

        # Anything else ("#" column descriptions, table rows) is ignored.

    # Add samples to main data if there are any
    if sample_list:
        data["samples"] = sample_list

    return data

def get_series_summary(samples_json):
    """Collect the series-level fields of a parsed SOFT file.

    Args:
        samples_json: Dict returned by parse_soft_to_json; the series
            attributes are read from its "series" entry.

    Returns:
        A dict with the keys title, summary, overall_design, pubmed_id,
        platform_id, platform_taxid and platform_organism. A field missing
        from the file is None. overall_design is always a single string
        (multi-line designs are joined with spaces) or None.
    """
    series_info = samples_json.get('series') or {}

    def lookup(name):
        # The parser keeps the "series_" prefix of every attribute name, so
        # prefer the prefixed key and fall back to the bare one.
        value = series_info.get(f'series_{name}')
        return value if value is not None else series_info.get(name)

    overall_design = lookup('overall_design')
    # A single-line design is stored as a plain string; joining a string would
    # insert a space between every character, so only lists are joined.
    if isinstance(overall_design, list):
        overall_design = ' '.join(overall_design)

    series_summary = {
        'title': lookup('title'),
        'summary': lookup('summary'),
        'overall_design': overall_design,
        'pubmed_id': lookup('pubmed_id'),
        'platform_id': lookup('platform_id'),
        'platform_taxid': lookup('platform_taxid'),
        'platform_organism': lookup('platform_organism'),
    }

    return series_summary

def extract_experimental_details(samples_json):
    """Reduce parsed SOFT samples to their key experimental details.

    Args:
        samples_json: List of sample dicts as produced by parse_soft_to_json
            (the value of its "samples" key).

    Returns:
        One dict per sample, in input order. Possible keys are id, title,
        tissue, organism, molecule, platform, sequencing_instrument,
        library_strategy, library_source, data_processing and
        sample_characteristics; keys whose value is empty or missing are
        dropped. "tissue" is the first raw characteristic string containing
        "tissue". "sample_characteristics" maps the text before the first
        ": " of each characteristic to the text after it; a characteristic
        without ": " maps to an empty string.
    """
    experiment_details = []

    for sample in samples_json:
        characteristics = sample.get('sample_characteristics_ch1') or []
        # The parser stores an attribute that occurs once as a plain string;
        # wrap it so it is not iterated character by character.
        if isinstance(characteristics, str):
            characteristics = [characteristics]

        # Split on the first ": " only, so values that themselves contain
        # ": " are kept whole and entries without it do not raise.
        parsed_characteristics = {}
        for entry in characteristics:
            key, _, value = entry.partition(': ')
            parsed_characteristics[key] = value

        # Extract key experimental details from each sample
        sample_details = {
            'id': sample.get('id'),
            'title': sample.get('sample_title'),
            'tissue': next((char for char in characteristics if 'tissue' in char), None),
            'organism': sample.get('sample_organism_ch1'),
            'molecule': sample.get('sample_molecule_ch1'),
            'platform': sample.get('sample_platform_id'),
            'sequencing_instrument': sample.get('sample_instrument_model'),
            'library_strategy': sample.get('sample_library_strategy'),
            'library_source': sample.get('sample_library_source'),
            'data_processing': sample.get('sample_data_processing', []),
            'sample_characteristics': parsed_characteristics,
        }

        # Drop every falsy value (None, empty string, empty list/dict) so only
        # the details actually present remain.
        sample_details = {k: v for k, v in sample_details.items() if v}
        experiment_details.append(sample_details)

    return experiment_details

def get_geo_summary(accession):
    """Fetch a GEO series and summarise its series and sample metadata.

    Args:
        accession: GEO series accession, e.g. "GSE265975".

    Returns:
        A dict with "series_summary" (result of get_series_summary) and
        "samples" (result of extract_experimental_details; an empty list when
        the file contains no samples).

    Raises:
        ValueError: If the SOFT file could not be downloaded.
    """
    soft_data = fetch_geo_soft_http(accession)
    if soft_data is None:
        # The fetcher signals failure with None; stop here with a clear
        # message instead of letting the parser fail on None.
        raise ValueError(f"Could not download the SOFT file for {accession}")

    soft_json = parse_soft_to_json(soft_data)
    return {
        'series_summary': get_series_summary(soft_json),
        # The parser omits "samples" when the file has none.
        'samples': extract_experimental_details(soft_json.get('samples', []))
    }


In [154]:
# Keep the API key out of the notebook: when api_key is omitted the client
# reads it from the OPENAI_API_KEY environment variable and raises right away
# if it is not set, instead of sending requests with an empty key.
client = openai.Client()
def ask_openai(question, context, model="gpt-4o-mini"):
    """Ask the chat model a question, supplying context as the system message.

    Args:
        question: Sent as the user message (formatted to text).
        context: Sent as the system message (formatted to text).
        model: Name of the chat model to query.

    Returns:
        The message content of the first choice in the completion.
    """
    system_message = {'role': 'system', 'content': f"{context}"}
    user_message = {'role': 'user', 'content': f"{question}"}

    # Uses the module-level `client` created alongside this function.
    completion = client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [206]:
# For each series: ask the model which characteristic key encodes the
# experimental group, list the distinct values of that key, then ask the model
# to describe those groups using the series summary as context.
for accession in ['GSE265975', 'GSE231649', 'GSE255535']:
    geo_sample_info = get_geo_summary(accession)
    samples = geo_sample_info.get("samples") or []
    print(f"Accession: {accession}")

    if not samples:
        # Nothing to inspect; move on instead of failing on samples[0].
        print("No samples found; skipping.")
        print("----\n")
        continue

    # Samples without characteristics have no 'sample_characteristics' key.
    first_characteristics = samples[0].get('sample_characteristics', {})
    group_field = ask_openai(
        "Tell the dictionary key that corresponds to the experimental group or treatment condition of the sample. Return only the key without any quotation marks.",
        f"{first_characteristics}"
    )
    # The model may wrap the key in whitespace or quotes despite the prompt.
    group_field = (group_field or "").strip().strip("'\"")
    print(f"Sample Characteristics: {first_characteristics}")
    print(f"Experimental Group Key: {group_field}")

    # De-duplicate first, then sort, so the order is deterministic. Samples
    # that lack the key are left out; None would break sorting and joining.
    treatment_values = (
        sample.get('sample_characteristics', {}).get(group_field)
        for sample in samples
    )
    unique_treatments = sorted({value for value in treatment_values if value is not None})

    print(f"Unique Treatments: {unique_treatments}")

    series_summary = geo_sample_info.get("series_summary")
    treatment_group_info = ask_openai(
        f"Describe each of the unique treatments in the dataset. The treatments are: {', '.join(unique_treatments)}",
        f"This is the series summary:\n{series_summary}"
    )

    print(treatment_group_info)

    print(pl.DataFrame(samples).head(6))
    print("----\n")

Accession: GSE265975
Sample Characteristics: {'tissue': 'PBMCs', 'cell type': 'PBMCs', 'group': 'HDP-high', 'time': 'd0, Pre-priming'}
Experimental Group Key: group
Unique Treatments: {'HC', 'HDP-high', 'HDP-low'}
In the context of the study summarized, the unique treatments refer to the different groups of individuals being analyzed based on their health status and immune response to the mRNA BNT162b2 vaccine. Here's a description of each treatment:

1. **HC (Healthy Controls)**: This group consists of individuals without any chronic health conditions. They serve as a baseline for comparison to the other groups. The immune response of healthy controls is expected to be robust and effective after vaccination, making them a standard reference point for evaluating the vaccine's effectiveness in more vulnerable populations.

2. **HDP-high (High Responders)**: This group includes hemodialysis patients who respond well to the mRNA BNT162b2 vaccine. They demonstrate a strong humoral immune r