# Extract UK Biobank Fields for Respiratory Disease Research

> Extract and verify field names from the embedded field mapping

- runtime: 10min
- recommended instance: mem1_ssd1_v2_x8
- cost: <£0.10

**No external files needed - all field mappings are embedded in this notebook!**

## Step 1: Initialize dxdata and connect to dataset

In [None]:
import dxdata
import os
import json

# Initialize the dxdata engine; it is reused by the optional extraction step.
engine = dxdata.connect(dialect="hive+pyspark")

# Connect to dataset
project = os.getenv('DX_PROJECT_CONTEXT_ID')
if not project:
    # Without this guard the string concatenation below fails with an opaque
    # "unsupported operand" TypeError because os.getenv returned None.
    raise RuntimeError(
        "DX_PROJECT_CONTEXT_ID is not set - run this notebook inside a DNAnexus project."
    )

# `dx find data` prints one comma-delimited line per matching object; the awk
# filter keeps the 5th column, which this notebook treats as the record ID.
dx_output = os.popen("dx find data --type Dataset --delimiter ',' | awk -F ',' '{print $5}'").read()
record_ids = [line.strip() for line in dx_output.splitlines() if line.strip()]
if not record_ids:
    raise RuntimeError(f"No Dataset record found in project {project}.")
if len(record_ids) > 1:
    # Several matches would otherwise be joined into one newline-separated,
    # invalid ID; use the first and make the choice visible.
    print(f"⚠ Found {len(record_ids)} Dataset records, using the first: {', '.join(record_ids)}")

record = record_ids[0]
DATASET_ID = project + ":" + record
dataset = dxdata.load_dataset(id=DATASET_ID)
pheno = dataset['participant']

print(f"✓ Connected to dataset: {DATASET_ID}")
print(f"✓ Current directory: {os.getcwd()}")

## Step 2: Define field mapping (embedded in notebook)

In [None]:
# Field mapping for respiratory disease research - embedded directly in notebook.
#
# Structure: {category: {UK Biobank field ID (string): human-readable label}}.
# Only the IDs drive the extraction; the labels are descriptive and are written
# unchanged to ukb_field_mapping.json.
#
# IDs are taken from the UK Biobank Data Showcase. Several entries were keyed
# by an ID that belongs to a different measurement than the label describes
# (most visibly the blood panels, which had been numbered as if contiguous);
# those keys now point at the field the label names.
field_mapping = {
    "diagnosis": {
        "41270": "Diagnoses - ICD10 (all hospital diagnoses)",
        "41202": "Diagnoses - main ICD10 (hospital main diagnosis)",
        "41204": "Diagnoses - secondary ICD10 (hospital secondary diagnoses)",
        "40001": "Underlying (primary) cause of death ICD10",
        "40002": "Contributory (secondary) causes of death ICD10",
        "20002": "Non-cancer illness code, self-reported (includes respiratory)"
    },
    "demographics": {
        "31": "Sex",
        "34": "Year of birth",
        "52": "Month of birth",
        "21000": "Ethnic background",
        "21001": "Body mass index (BMI)",
        "50": "Standing height",
        # 23104 is the impedance-derived BMI, not weight.
        "21002": "Weight",
        "738": "Income before tax",
        "6138": "Qualifications"
    },
    "environmental": {
        "1558": "Alcohol intake frequency",
        "20116": "Smoking status",
        # 2867 is the *former*-smoker variant of this question.
        "3436": "Age started smoking in current smokers",
        # 2887 is "Number of cigarettes previously smoked daily".
        "2897": "Age stopped smoking",
        # 22506 is a categorical "Tobacco smoking" item; pack years is 20161.
        "20161": "Tobacco smoking (pack years)",
        "1787": "Maternal smoking around birth",
        "24003": "Nitrogen dioxide air pollution 2010",
        "24004": "Nitrogen oxides air pollution 2010",
        "24005": "Particulate matter air pollution (pm10)",
        "24006": "Particulate matter air pollution (pm2.5)",
        # 24016 is the 2005 nitrogen dioxide estimate.
        "24007": "Particulate matter air pollution (pm2.5) absorbance"
    },
    "blood_routine": {
        "30000": "White blood cell (leukocyte) count",
        "30010": "Red blood cell (erythrocyte) count",
        "30020": "Haemoglobin concentration",
        "30030": "Haematocrit percentage",
        "30040": "Mean corpuscular volume",
        "30050": "Mean corpuscular haemoglobin",
        "30060": "Mean corpuscular haemoglobin concentration",
        "30070": "Red blood cell (erythrocyte) distribution width",
        "30080": "Platelet count",
        # 30090 is platelet crit; distribution width is 30110.
        "30110": "Platelet distribution width",
        "30100": "Mean platelet (thrombocyte) volume",
        # Differential counts run lymphocyte, monocyte, neutrophil,
        # eosinophil, basophil from 30120 to 30160.
        "30140": "Neutrophil count",
        "30120": "Lymphocyte count",
        "30130": "Monocyte count",
        "30150": "Eosinophil count",
        "30160": "Basophil count"
    },
    "liver_function": {
        # The biochemistry IDs are not contiguous for this selection:
        # Apolipoprotein A/B (30630/30640), Calcium (30680), Cystatin C
        # (30720), Oestradiol (30800) and Rheumatoid factor (30820) sit in
        # between and are intentionally not part of this panel.
        "30600": "Albumin",
        "30610": "Alkaline phosphatase",
        "30620": "Alanine aminotransferase",
        "30650": "Aspartate aminotransferase",
        "30660": "Direct bilirubin",
        "30670": "Urea",
        "30690": "Cholesterol",
        "30700": "Creatinine",
        "30710": "C-reactive protein",
        "30730": "Gamma glutamyltransferase",
        "30740": "Glucose",
        "30750": "Glycated haemoglobin (HbA1c)",
        "30760": "HDL cholesterol",
        "30770": "IGF-1",
        "30780": "LDL direct",
        "30790": "Lipoprotein A",
        "30810": "Phosphate",
        "30830": "SHBG",
        "30840": "Total bilirubin",
        "30850": "Testosterone",
        "30860": "Total protein",
        "30870": "Triglycerides"
    },
    "respiratory_assessment": {
        "2316": "Wheeze or whistling in the chest in last year",
        "2335": "Chest pain or discomfort",
        "4717": "Shortness of breath walking on level ground",
        # 3786 and 6152 keep their IDs (both are respiratory questionnaire
        # items, so extraction is unchanged); only the labels are corrected.
        "3786": "Age asthma diagnosed",
        "6152": "Blood clot, DVT, bronchitis, emphysema, asthma, rhinitis, eczema, allergy diagnosed by doctor",
        "20255": "Spirometry QC measure",
        "20256": "FEV1 Z-score",
        "20257": "FVC Z-score",
        "20258": "FEV1/FVC ratio Z-score",
        # 3063/3064 are the per-blow FEV1 and PEF readings; the derived
        # best-measure fields are 20150 and 20151.
        "20150": "FEV1, best measure",
        "20151": "FVC, best measure"
    },
    "psychological": {
        "20002": "Non-cancer illness code, self-reported (includes mental health)",
        "4598": "Ever depressed for a whole week",
        "4631": "Ever unenthusiastic/disinterested for a whole week",
        # NOTE(review): the labels for 5375, 5663 and 4559 could not be
        # confirmed against the Showcase - compare them with the dataset
        # titles printed by the verification step before relying on them.
        "5375": "Depression possibly related to stressful experience",
        "5663": "Anxiety, tension or general nervousness",
        "20126": "Bipolar and major depression status",
        "2090": "Seen doctor (GP) for nerves, anxiety, tension or depression",
        "2100": "Seen a psychiatrist for nerves, anxiety, tension or depression",
        "4559": "Duration of walks",
        "1558": "Alcohol intake frequency"
    },
    "imaging": {
        "20208": "Long axis heart images (Cardiac MRI DICOM)",
        "20205": "ECG datasets",
        "20252": "T1 structural brain images (NIFTI)"
    }
}

# Some IDs (20002, 1558) are listed under two categories, so count distinct IDs.
_unique_field_ids = {fid for cat in field_mapping.values() for fid in cat}

print("✓ Field mapping loaded")
print(f"  Categories: {', '.join(field_mapping)}")
print(f"  Total unique fields: {len(_unique_field_ids)}")

## Step 3: Extract field names in UK Biobank format

In [None]:
# Build the UK Biobank-format column names: every field ID prefixed with 'p'.
# The set comprehension collapses IDs listed under more than one category,
# and sorting gives a stable, reproducible order.
unique_names = {f'p{field_id}' for fields in field_mapping.values() for field_id in fields}
field_names = sorted(unique_names)

print(f"Total unique fields to extract: {len(field_names)}\n")

# Show a short preview rather than the full list.
print("First 20 fields:")
for name in field_names[:20]:
    print(f"  {name}")

remaining = len(field_names) - 20
if remaining > 0:
    print(f"\n  ... and {remaining} more fields")

## Step 4: Verify fields exist in dataset

In [None]:
# Look up every requested column in the participant entity and split the
# names into those that resolve (with title and linkout) and those that do not.
verified_fields = []
missing_fields = []

print("Verifying fields in UK Biobank dataset...\n")

for field_name in field_names:
    try:
        field = pheno.find_field(name=field_name)
        details = {
            'name': field.name,
            'title': field.title,
            'linkout': field.linkout
        }
    except Exception as e:
        # The exception type raised by the lookup is not visible from this
        # notebook, so the catch stays broad - but the reason is printed so a
        # connection or permission failure is not mistaken for an absent field.
        missing_fields.append(field_name)
        print(f"✗ [{field_name}]\tNot found ({type(e).__name__}: {e})")
    else:
        verified_fields.append(details)
        print(f"✓ [{details['name']}]\t{details['title']}")

print(f"\n{'='*80}")
print("SUMMARY:")
print(f"  ✓ Found: {len(verified_fields)} fields")
print(f"  ✗ Missing: {len(missing_fields)} fields")
if missing_fields:
    print(f"\n  Missing fields: {', '.join(missing_fields)}")
print(f"{'='*80}")

## Step 5: Save results to JSON files

In [None]:
# Write the three JSON artefacts into the current working directory.
output_dir = os.getcwd()
print(f"Saving files to: {output_dir}\n")

rule = '=' * 80

# 1. Bare list of the verified column names. indent=0 puts each name on its
#    own line without any leading spaces.
output_file = 'extracted_field_names.json'
verified_names = [entry['name'] for entry in verified_fields]
with open(output_file, 'w') as names_fh:
    json.dump(verified_names, names_fh, indent=0)
print(f"✓ Field names: {output_file}\n  ({len(verified_fields)} fields)")

# 2. Full per-field records (name, title, linkout) collected during verification.
detail_file = 'field_details.json'
with open(detail_file, 'w') as details_fh:
    json.dump(verified_fields, details_fh, indent=2)
print(f"\n✓ Field details: {detail_file}\n  (includes titles and showcase URLs)")

# 3. The category -> {field ID: label} mapping exactly as defined in the notebook.
mapping_file = 'ukb_field_mapping.json'
with open(mapping_file, 'w') as mapping_fh:
    json.dump(field_mapping, mapping_fh, indent=2)
print(f"\n✓ Field mapping: {mapping_file}\n  (organized by category)")

print(f"\n{rule}")
print("All files saved successfully!")
print(rule)

## Step 6 (Optional): Extract actual participant data

**Uncomment and run to extract data. Warning: Takes 10-30 minutes!**

In [None]:
# UNCOMMENT ALL LINES BELOW TO EXTRACT DATA

# print("Preparing to extract data...")

# # Get field objects for every column that passed verification
# field_objects = []
# for field_info in verified_fields:
#     try:
#         field = pheno.find_field(name=field_info['name'])
#         field_objects.append(field)
#     except Exception as e:
#         print(f"Error: {field_info['name']} - {e}")

# # Add participant ID as the first column
# field_objects.insert(0, pheno.find_field(name="eid"))

# print(f"\nExtracting data for {len(field_objects)} fields...")
# print("This will take 10-30 minutes...\n")

# # Extract data. With the Spark engine created in Step 1, retrieve_fields
# # returns a Spark DataFrame, which has no .shape / .to_csv(); convert it to
# # pandas first. NOTE: toPandas() collects the whole table into driver memory.
# df = pheno.retrieve_fields(fields=field_objects, engine=engine).toPandas()

# print(f"\n✓ Extracted: {df.shape[0]:,} participants × {df.shape[1]} fields")
# print(f"\nFirst 5 rows:")
# print(df.head())

# # Save to CSV
# csv_file = 'ukb_respiratory_data.csv'
# df.to_csv(csv_file, index=False)
# print(f"\n✓ Data saved: {csv_file}")
# print(f"  File size: {os.path.getsize(csv_file) / 1024 / 1024:.1f} MB")