# 🧬 TCGA-PRAD + TCIA Data Correlation Notebook

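This notebook correlates genomic and clinical data for the TCGA-PRAD (The Cancer Genome Atlas Prostate Adenocarcinoma) cohort, downloaded from the Genomic Data Commons (GDC) portal, with imaging metadata from The Cancer Imaging Archive (TCIA). The two sources are joined on a shared, upper-cased patient identifier to support prostate cancer research.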

## Step 1: Load Genomic and Clinical Data from GDC (JSON Format)

In [None]:
import pandas as pd
import json

# Load the GDC cases export (replace with actual file path).
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('gdc_cases.json', encoding='utf-8') as f:
    gdc_data = json.load(f)

# Flatten the parsed JSON into a tabular DataFrame.
# NOTE(review): assumes the file's top level is a list of case records (or a
# single record) that each carry a 'submitter_id' field — confirm against the export.
gdc_df = pd.json_normalize(gdc_data)

# Upper-cased submitter_id serves as the join key with the imaging metadata.
gdc_df['patient_id'] = gdc_df['submitter_id'].str.upper()
gdc_df.head()

## Step 2: Load Imaging Metadata from TCIA (CSV Format)

In [None]:
# Read the TCIA metadata export (replace with actual file path) and attach an
# upper-cased copy of PatientID as the `patient_id` join key.
tcia_df = pd.read_csv('tcia_metadata.csv').assign(
    patient_id=lambda frame: frame['PatientID'].str.upper()
)
tcia_df.head()

## Step 3: Merge Genomic/Clinical and Imaging Data

In [None]:
# Join the genomic/clinical table with the imaging table on the shared key.
# An inner join (the pandas default) keeps only patients present in both sources.
merged_df = gdc_df.merge(tcia_df, how='inner', on='patient_id')

# Persist the combined table without the positional index column.
merged_df.to_csv('merged_patient_data.csv', index=False)
merged_df.head()

## Step 4: Prepare Merged Dataset for Downstream ML (Optional)

In [None]:
# Keep only the columns needed downstream; everything else is dropped.
# NOTE(review): these names must match the merged frame's headers exactly —
# confirm 'age_at_diagnosis' and 'imaging modality' against the input files.
feature_columns = ['patient_id', 'age_at_diagnosis', 'StudyInstanceUID', 'imaging modality']

# .copy() makes `features` an independent frame, so later feature engineering
# on it neither raises SettingWithCopyWarning nor affects merged_df.
features = merged_df[feature_columns].copy()
features.head()