# DEG Pipeline Kickoff ðŸ§¬ðŸ’¥

### ðŸ“¥ Load Dataset with GEOparse


In [1]:
import GEOparse

# Download the dataset (only once; will reuse if already downloaded)
gse = GEOparse.get_GEO(geo="GSE288708", destdir="../data/raw")

# Print basic summary
print(gse)


17-Sep-2025 15:05:38 DEBUG utils - Directory ../data/raw already exists. Skipping.
17-Sep-2025 15:05:38 INFO GEOparse - File already exist: using local version.
17-Sep-2025 15:05:38 INFO GEOparse - Parsing ../data/raw\GSE288708_family.soft.gz: 
17-Sep-2025 15:05:38 DEBUG GEOparse - DATABASE: GeoMiame
17-Sep-2025 15:05:38 DEBUG GEOparse - SERIES: GSE288708
17-Sep-2025 15:05:38 DEBUG GEOparse - PLATFORM: GPL24676
17-Sep-2025 15:05:38 DEBUG GEOparse - SAMPLE: GSM8773456
17-Sep-2025 15:05:38 DEBUG GEOparse - SAMPLE: GSM8773457
17-Sep-2025 15:05:38 DEBUG GEOparse - SAMPLE: GSM8773458
17-Sep-2025 15:05:38 DEBUG GEOparse - SAMPLE: GSM8773459
17-Sep-2025 15:05:38 DEBUG GEOparse - SAMPLE: GSM8773460
17-Sep-2025 15:05:38 DEBUG GEOparse - SAMPLE: GSM8773461
17-Sep-2025 15:05:38 DEBUG GEOparse - SAMPLE: GSM8773462
17-Sep-2025 15:05:38 DEBUG GEOparse - SAMPLE: GSM8773463
17-Sep-2025 15:05:38 DEBUG GEOparse - SAMPLE: GSM8773464
17-Sep-2025 15:05:38 DEBUG GEOparse - SAMPLE: GSM8773465
17-Sep-2025 15:

<SERIES: GSE288708 - 20 SAMPLES, 1 d(s)>


### Explore Sample Metadata

In [2]:
# See what sample IDs exist
list(gse.gsms.keys())[:5]


['GSM8773456', 'GSM8773457', 'GSM8773458', 'GSM8773459', 'GSM8773460']

### Metadata for one sample

In [3]:
### Look at metadata for one sample
gsm = gse.gsms["GSM8773456"]
gsm.metadata

{'title': ['CH1 - Control Hypoxia'],
 'geo_accession': ['GSM8773456'],
 'status': ['Public on Jun 16 2025'],
 'submission_date': ['Feb 04 2025'],
 'last_update_date': ['Jun 16 2025'],
 'type': ['SRA'],
 'channel_count': ['1'],
 'source_name_ch1': ['IMR90'],
 'organism_ch1': ['Homo sapiens'],
 'taxid_ch1': ['9606'],
 'characteristics_ch1': ['cell line: IMR90',
  'cell type: hiPSC cardiomyocytes',
  'treatment: Control Hypoxia'],
 'treatment_protocol_ch1': ['Insulin resistance (IR) was induced in hiPSC-CMs or EHTs using a six-day protocol with two media types: glucose-free IR media (3 days) and glucose-enriched IR media (3 days), both containing palmitic acid, insulin, and supplements. Controls were cultured in maturation media. For some experiments, hiPSC-CMs were exposed to high palmitate media (0.4 mM palmitate) or hypoxia (2% O2 for 16 hours), with controls in normoxia (21% O2).'],
 'growth_protocol_ch1': ['IMR90 human induced pluripotent stem cells (hiPSCs) were differentiated into 

## Parse All Samples into a DataFrame

In [4]:
import os
import pandas as pd

# Base project path
project_dir = r"C:\Users\maira\Downloads\T2D_Drug_Target_Discovery"

# Ensure processed folder exists
processed_dir = os.path.join(project_dir, "data", "processed")
os.makedirs(processed_dir, exist_ok=True)

# Step 3 â€” Parse metadata from titles
sample_data = []

for gsm_name, gsm in gse.gsms.items():
    title = gsm.metadata.get("title", [""])[0].strip()

    # Skip entries with missing titles (to avoid blank row)
    if title == "":
        continue
    
    # Treatment
    if title.startswith("IR"):
        treatment = "Insulin Resistant"
    else:
        treatment = "Control"

    # Oxygen
    if "Hypoxia" in title:
        oxygen = "Hypoxia"
    else:
        oxygen = "Normoxia"

    # Replicate
    replicate = ''.join([c for c in title if c.isdigit()])

    # Group code
    if treatment == "Control" and oxygen == "Normoxia":
        group = "CN"
    elif treatment == "Control" and oxygen == "Hypoxia":
        group = "CH"
    elif treatment == "Insulin Resistant" and oxygen == "Normoxia":
        group = "IRN"
    else:
        group = "IRH"

    sample_data.append({
        "Sample": gsm_name,
        "Title": title,
        "Treatment": treatment,
        "Oxygen": oxygen,
        "Replicate": replicate,
        "Group": group
    })

# Step 4 â€” Save CSV
meta_df = pd.DataFrame(sample_data)
meta_df = meta_df[meta_df["Sample"].notnull()]
csv_path = os.path.join(processed_dir, "sample_metadata.csv")
meta_df.to_csv(csv_path, index=False)

print(f"âœ… Metadata saved to: {csv_path}")
print(meta_df.head())
meta_df.shape


âœ… Metadata saved to: C:\Users\maira\Downloads\T2D_Drug_Target_Discovery\data\processed\sample_metadata.csv
       Sample                  Title Treatment   Oxygen Replicate Group
0  GSM8773456  CH1 - Control Hypoxia   Control  Hypoxia         1    CH
1  GSM8773457  CH2 - Control Hypoxia   Control  Hypoxia         2    CH
2  GSM8773458  CH3 - Control Hypoxia   Control  Hypoxia         3    CH
3  GSM8773459  CH4 - Control Hypoxia   Control  Hypoxia         4    CH
4  GSM8773460  CH5 - Control Hypoxia   Control  Hypoxia         5    CH


(20, 6)