# UKB

* **Project:** ADRD-SORL1-Biobanks
* **Version:** Python/3.10
* **Last Updated:** 14-Jun-2025

## Notebook Overview
Create the case and control cohorts, characterize the SORL1 gene, and compute allele frequencies.

# Initialize Notebook

## Import packages

In [None]:
import pyspark
import dxdata
import dxpy
import pandas as pd
from datetime import date, datetime
import os 
import numpy as np
import random
import shutil
import glob
import requests
from functools import reduce

# Create the Spark context and session for this notebook
# (presumably what dxdata.connect() relies on in later cells — confirm).
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)


## Initialize variables

In [None]:
# Gene symbols passed to fetch_gene_info_ensembl(); later cells index the result by "SORL1".
gene_names = ["SORL1"]


## Initialize helper functions

In [None]:
def fetch_gene_info_ensembl(gene_names, species='human', genome_version='GRCh38', timeout=30):
    """Look up genomic coordinates for gene symbols via the Ensembl REST API.

    Each symbol is queried at ``/lookup/symbol/{species}/{gene_name}``. A symbol
    whose lookup fails (network error, non-OK HTTP status, or a response that
    lacks the expected keys) is reported with ``print`` and left out of the
    result, so one bad symbol does not abort the remaining lookups.

    Args:
        gene_names: Iterable of gene symbols to look up.
        species: Species segment of the lookup URL.
        genome_version: Label copied verbatim into each result; it is not sent
            to the server.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        Dict mapping each successfully resolved symbol to a dict with the keys
        ``gene_name``, ``chromosome`` (prefixed with ``chr``), ``start``,
        ``end`` and ``genome_version``.
    """
    gene_info_dict = {}
    server = "https://rest.ensembl.org"
    headers = {"Content-Type": "application/json"}

    for gene_name in gene_names:
        endpoint = f"/lookup/symbol/{species}/{gene_name}"

        try:
            response = requests.get(
                server + endpoint,
                headers=headers,
                params={"expand": "1"},
                timeout=timeout,
            )
        except requests.RequestException as err:
            # Connection problems and timeouts are treated like any other failed lookup.
            print(f"Fetching failed for {gene_name}: {err}")
            continue

        if not response.ok:
            print(f"Fetching failed for {gene_name}")
            continue

        try:
            data = response.json()
            gene_info = {
                "gene_name": data.get("display_name", gene_name),
                "chromosome": f"chr{data['seq_region_name']}",
                "start": int(data["start"]),
                "end": int(data["end"]),
                "genome_version": genome_version,
            }
        except (KeyError, TypeError, ValueError, AttributeError) as err:
            # Body was not JSON, or did not have the expected coordinate fields.
            print(f"Fetching failed for {gene_name}: unexpected response ({err!r})")
            continue

        gene_info_dict[gene_name] = gene_info

    return gene_info_dict


# Fetch cohorts

## Grab the dataset containing participant information

In [None]:
# Find the one Dataset object in the project root whose name matches "app*.dataset" and load it.
dispensed_dataset_id = dxpy.find_one_data_object(typename="Dataset", name="app*.dataset", folder="/", name_mode="glob")["id"]
dataset = dxdata.load_dataset(id=dispensed_dataset_id)
# Entity from which the per-participant fields are retrieved in later cells.
participant = dataset["participant"]


## Retrieve Cases

### Pull down the fields we need 
https://docs.google.com/document/d/1AebkQ-Nxrk63jhsDzZpn5QD-7EK4unsykHVj-saEm3U/edit?usp=sharing

In [None]:
# Field IDs needed to define the case cohorts; the next cell maps each one to a readable name.
field_names = [
    "eid", 
    "p31", 
    "p34", 
    "p21022", 
    "p42018", 
    "p42020", 
    "p42032", 
    "p22009_a1", 
    "p22009_a2", 
    "p22009_a3", 
    "p22009_a4", 
    "p22009_a5", 
    "p40000_i0",
]
# coding_values="replace" presumably returns decoded labels rather than raw codes
# (later cells compare GENETIC_SEX with "Male"/"Female") — confirm.
df_cases = participant.retrieve_fields(names=field_names, coding_values="replace", engine=dxdata.connect())
# Materialize the Spark result as a pandas DataFrame for the filtering below.
df_cases = df_cases.toPandas()


### Rename columns to be human-readable

In [None]:
# Readable column name for each retrieved field ID.
case_column_names = {
    'eid': 'ID',
    'p31': 'GENETIC_SEX',
    'p34': 'BIRTH_YEAR',
    'p21022': 'AGE_OF_RECRUIT',
    'p42018': 'DEM_DATE',
    'p42020': 'AD_DATE',
    'p42032': 'PD_DATE',
    'p22009_a1': 'PC1',
    'p22009_a2': 'PC2',
    'p22009_a3': 'PC3',
    'p22009_a4': 'PC4',
    'p22009_a5': 'PC5',
    'p40000_i0': 'DATE_OF_DEATH',
}
df_cases = df_cases.rename(columns=case_column_names)


### Find participants with AD, PD, and RD

In [None]:
# Diagnosis flags: True where the corresponding report date is present.
has_ad = df_cases['AD_DATE'].notna()
has_pd = df_cases['PD_DATE'].notna()
has_dem = df_cases['DEM_DATE'].notna()


def _case_cohort(mask, date_col):
    """Return an independent cohort table for the rows selected by ``mask``.

    The result keeps the shared demographic/PC columns plus the cohort's own
    diagnosis-date column ``date_col``, and has a numeric ``ID`` column. The
    explicit copy means the ID conversion never writes into a view of
    ``df_cases``.
    """
    cohort = df_cases.loc[mask, [
        "ID",
        "GENETIC_SEX",
        "BIRTH_YEAR",
        "AGE_OF_RECRUIT",
        date_col,
        "PC1",
        "PC2",
        "PC3",
        "PC4",
        "PC5",
        "DATE_OF_DEATH",
    ]].copy()
    cohort["ID"] = pd.to_numeric(cohort["ID"])
    return cohort


# AD: an AD date and no PD date
df_ad = _case_cohort(has_ad & ~has_pd, "AD_DATE")

# PD: a PD date and neither an AD nor a dementia date
df_pd = _case_cohort(has_pd & ~has_ad & ~has_dem, "PD_DATE")

# RD: a dementia date without an AD date, or both an AD and a PD date
df_rd = _case_cohort((has_dem & ~has_ad) | (has_ad & has_pd), "DEM_DATE")


## Retrieve Controls

### Retrieve field names of interest for each participant

In [None]:
# Date G10 first reported (huntington's disease),
# Date G11 first reported (hereditary ataxia), 
# Date G12 first reported (spinal muscular atrophy and related syndromes), 
# Date G13 first reported (systemic atrophies primarily affecting central nervous system in diseases classified elsewhere), 
# Date G14 first reported (postpolio syndrome), 
# Date G20 first reported (parkinson's disease), 
# Date G21 first reported (secondary parkinsonism), 
# Date G22 first reported (parkinsonism in diseases classified elsewhere), 
# Date G23 first reported (other degenerative diseases of basal ganglia), 
# Date G24 first reported (dystonia), 
# Date G25 first reported (other extrapyramidal and movement disorders), 
# Date G30 first reported (alzheimer's disease), 
# Date G31 first reported (other degenerative diseases of nervous system, not elsewhere classified), 
# Date G32 first reported (other degenerative disorders of nervous system in diseases classified elsewhere), 
# Date G35 first reported (multiple sclerosis), 
# Date G36 first reported (other acute disseminated demyelination), 
# Date G37 first reported (other demyelinating diseases of central nervous system), 
# Date G45 first reported (transient cerebral ischaemic attacks and related syndromes), 
# Date G46 first reported (vascular syndromes of brain in cerebrovascular diseases), 
# Date G50 first reported (disorders of trigeminal nerve), 
# Date G52 first reported (disorders of other cranial nerves), 
# Date G53 first reported (cranial nerve disorders in diseases classified elsewhere), 
# Date G54 first reported (nerve root and plexus disorders), 
# Date G55 first reported (nerve root and plexus compressions in diseases classified elsewhere), 
# Date G56 first reported (mononeuropathies of upper limb), 
# Date G57 first reported (mononeuropathies of lower limb), 
# Date G58 first reported (other mononeuropathies), 
# Date G59 first reported (mononeuropathy in diseases classified elsewhere), 
# Date G60 first reported (hereditary and idiopathic neuropathy), 
# Date G61 first reported (inflammatory polyneuropathy), 
# Date G62 first reported (other polyneuropathies), 
# Date G63 first reported (polyneuropathy in diseases classified elsewhere), 
# Date G64 first reported (other disorders of peripheral nervous system), 
# Date G70 first reported (myasthenia gravis and other myoneural disorders), 
# Date G71 first reported (primary disorders of muscles), 
# Date G72 first reported (other myopathies), 
# Date G73 first reported (disorders of myoneural junction and muscle in diseases classified elsewhere), 
# Date G80 first reported (infantile cerebral palsy), 
# Date G81 first reported (hemiplegia), 
# Date G82 first reported (paraplegia and tetraplegia), 
# Date G83 first reported (other paralytic syndromes), 
# Date G90 first reported (disorders of autonomic nervous system),
# Date G91 first reported (hydrocephalus), 
# Date G92 first reported (toxic encephalopathy), 
# Date G93 first reported (other disorders of brain), 
# Date G94 first reported (other disorders of brain in diseases classified elsewhere), 
# Date G96 first reported (other disorders of central nervous system), 
# Date G97 first reported (postprocedural disorders of nervous system, not elsewhere classified),  
# Date G98 first reported (other disorders of nervous system, not elsewhere classified), 
# Date G99 first reported (other disorders of nervous system in diseases classified elsewhere), 
# Date of all cause dementia report, 
# Date of alzheimer's disease report, 
# Date of vascular dementia report, 
# Date of frontotemporal dementia report, 
# Date of motor neurone disease report, 
# Date of all cause parkinsonism report, 
# Date of parkinson's disease report, 
# Date of progressive supranuclear palsy report, 
# Date of multiple system atrophy report, 
# Genetic ethnic grouping, 
# Age at recruitment, 
# Townsend deprivation index at recruitment, 
# Sex, 
# Genetic Principal components | Array 1, 
# Genetic Principal components | Array 2, 
# Genetic Principal components | Array 3, 
# Genetic Principal components | Array 4, 
# Genetic Principal components | Array 5

# Fields retrieved for control selection.
# NOTE(review): the description above lists 50 G-code "first reported" dates, but only 46
# p131xxx fields are requested here; p34, p40000_i0, p20110_* and p20107_* are requested
# but not described. Confirm that the description and the field IDs agree.
field_names = ['eid', 'p131012', 'p131016', 'p131018', 'p131020', 'p131022', 'p131024', 'p131026', 'p131028', 'p131030', 'p131036', 'p131038', 'p131040', 
               'p131042', 'p131046', 'p131056', 'p131058', 'p131062', 'p131066', 'p131068', 'p131070', 'p131074', 'p131076', 'p131078', 'p131080', 'p131082', 
               'p131084', 'p131086', 'p131088', 'p131090', 'p131092', 'p131094', 'p131096', 'p131098', 'p131100', 'p131102', 'p131104', 'p131106', 'p131108', 
               'p131110', 'p131112', 'p131114', 'p131116', 'p131120', 'p131122', 'p131124', 'p131126',  'p42018', 'p42020', 'p42022', 'p42024', 'p42028', 
               'p42030', 'p42032', 'p42034', 'p42036', 'p22006', 'p21022', 'p31', 'p22009_a1', 'p22009_a2', 'p22009_a3', 'p22009_a4', 'p22009_a5', 'p34', 
               'p40000_i0', 'p20110_i0', 'p20110_i1', 'p20110_i2', 'p20110_i3', 'p20107_i0', 'p20107_i1', 'p20107_i2', 'p20107_i3']
df_control = participant.retrieve_fields(names=field_names, coding_values="replace", engine=dxdata.connect())
# Materialize the Spark result as a pandas DataFrame for the filtering below.
df_control = df_control.toPandas()


### Remove participants with any of the listed conditions

In [None]:
# Date fields for the excluded conditions: a control must have no value in any of them.
exclusion_date_cols = [
    'p131012', 'p131016', 'p131018', 'p131020',
    'p131022', 'p131024', 'p131026', 'p131028',
    'p131030', 'p131036', 'p131038', 'p131040',
    'p131042', 'p131046', 'p131056', 'p131058',
    'p131062', 'p131066', 'p131068', 'p131070',
    'p131074', 'p131076', 'p131078', 'p131080',
    'p131082', 'p131084', 'p131086', 'p131088',
    'p131090', 'p131092', 'p131094', 'p131096',
    'p131098', 'p131100', 'p131102', 'p131104',
    'p131106', 'p131108', 'p131110', 'p131112',
    'p131114', 'p131116', 'p131120', 'p131122',
    'p131124', 'p131126', 'p42018', 'p42020',
    'p42022', 'p42024', 'p42028', 'p42030',
    'p42032', 'p42034', 'p42036',
]
# Row-wise: True only when every exclusion date is null.
no_excluded_condition = df_control[exclusion_date_cols].isnull().all(axis=1)
df_control = df_control[no_excluded_condition]


### Remove participants whose parents have AD or PD

In [None]:
# Columns defining all instances of parent illness
parent_illness_cols = ['p20110_i0', 'p20110_i1', 'p20110_i2', 'p20110_i3', 'p20107_i0', 'p20107_i1', 'p20107_i2', 'p20107_i3']

# Parental diagnoses that disqualify a participant from the control cohort
excluded_parent_illnesses = ("Alzheimer's disease/dementia", "Parkinson's disease")

# Work on an independent copy so the column assignments below never target a filtered view
df_control = df_control.copy()

# Convert missing values to empty lists. Tuples and NumPy arrays are accepted as well as lists.
# NOTE(review): whether toPandas() yields lists or arrays for these columns is not visible
# in this notebook; with the original list-only check, array cells would have been blanked — confirm.
for illness_col in parent_illness_cols:
    df_control[illness_col] = df_control[illness_col].apply(
        lambda l: list(l) if isinstance(l, (list, tuple, np.ndarray)) else []
    )

# True for a row whose parent-illness columns never mention an excluded illness
condition = lambda row: all(
    illness not in illnesses
    for illnesses in row[parent_illness_cols]
    for illness in excluded_parent_illnesses
)

# Keep only participants who never reported a parent with AD/dementia or PD
df_control = df_control[df_control.apply(condition, axis=1)]


### Remove participants below the defined age threshold

In [None]:
# Controls must have been at least this old at recruitment (p21022).
min_control_age = 65
df_control = df_control[df_control['p21022'] >= min_control_age]


### Rename columns

In [None]:
# Fields kept for the control cohort (in output order) and their readable names.
control_column_names = {
    'eid': 'ID',
    'p21022': 'AGE_OF_RECRUIT',
    'p31': 'GENETIC_SEX',
    'p22009_a1': 'PC1',
    'p22009_a2': 'PC2',
    'p22009_a3': 'PC3',
    'p22009_a4': 'PC4',
    'p22009_a5': 'PC5',
    'p34': 'BIRTH_YEAR',
    'p22006': 'ETHNICITY',
    'p40000_i0': 'DATE_OF_DEATH',
}
# rename() returns a new frame, so the ID conversion below does not write into a
# column-subset slice of the previous df_control (no in-place edits on a slice).
df_control = df_control[list(control_column_names)].rename(columns=control_column_names)
df_control["ID"] = pd.to_numeric(df_control["ID"])
df_control.info()


# Find ancestry information about each cohort

## Read ancestry label mappings

In [None]:
# Fetch the tab-separated table of predicted ancestry labels from the project and load it;
# later cells use its "IID" and "label" columns.
! dx download data/ukbb_imputed_genotypes_umap_linearsvc_predicted_labels.txt --overwrite
ancestries = pd.read_csv("ukbb_imputed_genotypes_umap_linearsvc_predicted_labels.txt", sep="\t")


## Add labels to cohort dataframes

In [None]:
# Only the join key and the label are needed from the ancestry table.
ancestry_labels = ancestries[["IID", "label"]]


def _add_ancestry_label(cohort_df):
    """Join ``cohort_df`` to the ancestry labels on ID == IID and drop the IID key.

    Uses merge's default join, so participants without a label row are dropped.
    """
    labelled = cohort_df.merge(ancestry_labels, left_on="ID", right_on="IID")
    return labelled.drop("IID", axis=1)


df_control = _add_ancestry_label(df_control)
df_ad = _add_ancestry_label(df_ad)
df_pd = _add_ancestry_label(df_pd)
df_rd = _add_ancestry_label(df_rd)


## Get list of IDs for each cohort

In [None]:
# Plain Python lists of participant IDs, one per cohort.
ids_control, ids_ad, ids_pd, ids_rd = (
    cohort_df["ID"].tolist() for cohort_df in (df_control, df_ad, df_pd, df_rd)
)


# Remove related individuals

### Fetch relatedness data

In [None]:
# Pairwise relatedness table (space-separated); only pairs whose Kinship exceeds the cut-off are kept.
kinship_cutoff = 0.0884
relatedness_path = '../../mnt/project/Bulk/Genotype Results/Genotype calls/ukb_rel.dat'
df_full_related = pd.read_csv(relatedness_path, sep=' ')
df_full_related = df_full_related.loc[df_full_related['Kinship'] > kinship_cutoff]


### Define cohorts to maximize cases included

In [None]:
# All case IDs, used below to decide which member of a related pair is dropped.
ids_case = [*ids_ad, *ids_rd, *ids_pd]
# Everyone under consideration: the three case cohorts followed by the controls.
ids_full_cohort = [*ids_ad, *ids_pd, *ids_rd, *ids_control]


### Keep only rows with both participants in cohorts of interest

In [None]:
# A pair matters only when both of its members belong to one of our cohorts.
id1_in_cohort = df_full_related['ID1'].isin(ids_full_cohort)
id2_in_cohort = df_full_related['ID2'].isin(ids_full_cohort)
df_related_cohort = df_full_related[id1_in_cohort & id2_in_cohort].reset_index(drop=True)


### Maximize the number of cases included

In [None]:
# Pairs listed as (control, case): swap the two ID columns so the control ends up in ID2,
# the column whose members are removed in the next step.
control_then_case = df_related_cohort["ID1"].isin(ids_control) & df_related_cohort["ID2"].isin(ids_case)
df_flipped = df_related_cohort[control_then_case].rename(columns={"ID1": "ID2", "ID2": "ID1"})
df_related_cohort = pd.concat([df_related_cohort[~control_then_case], df_flipped])


### Get set of participants to remove

In [None]:
# One member of every related pair (the ID2 column) is dropped.
ids_to_remove = {*df_related_cohort["ID2"]}
print(f"Removing {len(ids_to_remove)} participants")


### Filter ID lists accordingly

In [None]:
def _without_related(cohort_ids):
    """Return ``cohort_ids`` in the same order, minus anyone in ``ids_to_remove``."""
    return [iid for iid in cohort_ids if iid not in ids_to_remove]


ids_ad = _without_related(ids_ad)
ids_pd = _without_related(ids_pd)
ids_rd = _without_related(ids_rd)
ids_control = _without_related(ids_control)
# Combined list written to ids_pre_VCF.txt further down.
ids_total = ids_ad + ids_pd + ids_rd + ids_control


In [None]:
# Sanity check: cohort sizes, then the number of IDs shared by each pair of cohorts
# (AD-PD, AD-RD, AD-Control, PD-RD, PD-Control, RD-Control).
cohort_id_lists = [ids_ad, ids_pd, ids_rd, ids_control]

for cohort_ids in cohort_id_lists:
    print(len(cohort_ids))
print()

for position, left_ids in enumerate(cohort_id_lists):
    for right_ids in cohort_id_lists[position + 1:]:
        # Set lookup keeps each membership test O(1) instead of scanning a list.
        right_lookup = set(right_ids)
        print(sum(iid in right_lookup for iid in left_ids))


### Save the IDs of each participant to a txt file

In [None]:
# One AD participant ID per line.
with open('ad_ids_pre_VCF.txt', 'w') as id_file:
    id_file.writelines(f"{iid}\n" for iid in ids_ad)
        

In [None]:
# One PD participant ID per line.
with open('pd_ids_pre_VCF.txt', 'w') as id_file:
    id_file.writelines(f"{iid}\n" for iid in ids_pd)


In [None]:
# One RD participant ID per line.
with open('rd_ids_pre_VCF.txt', 'w') as id_file:
    id_file.writelines(f"{iid}\n" for iid in ids_rd)


In [None]:
# One control participant ID per line.
with open('control_ids_pre_VCF.txt', 'w') as id_file:
    id_file.writelines(f"{iid}\n" for iid in ids_control)


In [None]:
# Every cohort's IDs in one file, one per line.
with open('ids_pre_VCF.txt', 'w') as id_file:
    id_file.writelines(f"{iid}\n" for iid in ids_total)


# Filter out participants without WGS data

## Only include participants with WGS data

In [None]:
# Download pvcf_full_ids.txt, then keep from each ID file only the lines that match an entry
# in it (grep: -F fixed strings, -w whole-word matches, -f read patterns from the file).
! dx download /data/pvcf_full_ids.txt --overwrite
! grep -Fwf pvcf_full_ids.txt ids_pre_VCF.txt > filtered_sample_ids.txt
! grep -Fwf pvcf_full_ids.txt ad_ids_pre_VCF.txt > filtered_ad_ids.txt
! grep -Fwf pvcf_full_ids.txt pd_ids_pre_VCF.txt > filtered_pd_ids.txt
! grep -Fwf pvcf_full_ids.txt rd_ids_pre_VCF.txt > filtered_rd_ids.txt
! grep -Fwf pvcf_full_ids.txt control_ids_pre_VCF.txt > filtered_control_ids.txt


In [None]:
def _load_ids(path):
    """Read one integer participant ID per line from ``path``."""
    with open(path, 'r') as handle:
        return [int(raw_line.strip()) for raw_line in handle]


# Replace the in-memory ID lists with the WGS-filtered versions written by grep.
ids_ad = _load_ids('filtered_ad_ids.txt')
ids_rd = _load_ids('filtered_rd_ids.txt')
ids_pd = _load_ids('filtered_pd_ids.txt')
ids_control = _load_ids('filtered_control_ids.txt')


## Restrict each cohort table to participants with WGS data

In [None]:
def _keep_ids(cohort_df, kept_ids):
    """Return the rows of ``cohort_df`` whose ID is listed in ``kept_ids``."""
    return cohort_df[cohort_df["ID"].isin(kept_ids)]


df_ad = _keep_ids(df_ad, ids_ad)
df_pd = _keep_ids(df_pd, ids_pd)
df_rd = _keep_ids(df_rd, ids_rd)
df_control = _keep_ids(df_control, ids_control)


In [None]:
# Captions carry their own padding so the counts line up in the output.
for caption, cohort_ids in (
    ("Number of AD participants:       ", ids_ad),
    ("Number of PD participants:       ", ids_pd),
    ("Number of RD participants:       ", ids_rd),
    ("Number of Control participants:  ", ids_control),
):
    print(f"{caption}{len(cohort_ids)}")


In [None]:
# Upload the WGS-filtered ID lists to /results/sorl1/ under shorter names.
# NOTE(review): dx upload presumably does not replace an existing object at the same path,
# so re-running this cell may leave duplicate files — confirm.
! dx upload filtered_sample_ids.txt --path /results/sorl1/sample_ids.txt
! dx upload filtered_ad_ids.txt --path /results/sorl1/ad_ids.txt
! dx upload filtered_pd_ids.txt --path /results/sorl1/pd_ids.txt
! dx upload filtered_rd_ids.txt --path /results/sorl1/rd_ids.txt
! dx upload filtered_control_ids.txt --path /results/sorl1/control_ids.txt


# Save and print cohort statistics

In [None]:
# Write each cohort table to its own CSV (header row, no index column).
for csv_name, cohort_df in (
    ("Control.csv", df_control),
    ("AD.csv", df_ad),
    ("PD.csv", df_pd),
    ("RD.csv", df_rd),
):
    cohort_df.to_csv(csv_name, header=True, index=False)


In [None]:
# Upload the cohort tables to /results/sorl1/.
! dx upload Control.csv --path /results/sorl1/Control.csv
! dx upload AD.csv --path /results/sorl1/AD.csv
! dx upload PD.csv --path /results/sorl1/PD.csv
! dx upload RD.csv --path /results/sorl1/RD.csv


In [None]:
# Cohorts are always reported in this order: Control, AD, PD, RD.
stat_frames = (df_control, df_ad, df_pd, df_rd)

# Ancestry-label counts, then genetic-sex counts, each followed by a spacer.
for column in ("label", "GENETIC_SEX"):
    for cohort_df in stat_frames:
        print(cohort_df[column].value_counts())
    print("\n")

# Mean +/- standard deviation of recruitment age, males first and then females.
for sex in ("Male", "Female"):
    for cohort_df in stat_frames:
        recruit_ages = cohort_df.loc[cohort_df["GENETIC_SEX"] == sex, "AGE_OF_RECRUIT"]
        print(f'{recruit_ages.mean()} +/- {recruit_ages.std()}')
print("\n")


# Fetch pVCF chunks for SORL1

In [None]:
# Query Ensembl once and reuse the result instead of repeating the HTTP lookup for every value.
gene_info = fetch_gene_info_ensembl(gene_names)
print(gene_info)

sorl1_info = gene_info["SORL1"]
# Convert base-pair coordinates into block indices by integer division by 20000
# (assumes pVCF blocks are 20 kb wide and numbered from the chromosome start — TODO confirm).
start = sorl1_info["start"] // 20000
end = sorl1_info["end"] // 20000 + 1
print(f"Chromosome:    {sorl1_info['chromosome']}")
print(f"Start b-val:   {start}")
print(f"End b-val:     {end}")


In [None]:
%%bash
# Submit one swiss-army-knife job per chr11 pVCF block b6072..b6082. Each job subsets the
# block to the samples listed in sample_ids.txt (bcftools view -S) and writes SORL1_b<N>.vcf.gz.
# NOTE(review): the block range is hard-coded rather than taken from the start/end values
# printed by the previous cell — keep the two in sync.
# NOTE(review): ${projectid} is not assigned in this cell; it has to come from the
# environment, otherwise the destination starts with a bare ":" — confirm.
for b_val in {6072..6082};
do
    dx run swiss-army-knife \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr11/ukb24310_c11_b${b_val}_v1.vcf.gz" \
    -iin="/Bulk/DRAGEN\ WGS/DRAGEN\ population\ level\ WGS\ variants,\ pVCF\ format\ [500k\ release]/chr11/ukb24310_c11_b${b_val}_v1.vcf.gz.tbi" \
    -iin="/results/sorl1/sample_ids.txt" \
    -icmd="bcftools view -O z -S sample_ids.txt ukb24310_c11_b${b_val}_v1.vcf.gz -o SORL1_b${b_val}.vcf.gz" \
    --instance-type mem2_ssd1_v2_x32 \
    --destination "${projectid}:/results/sorl1/01_pvcf_chunks"
done


# Combine pVCF chunks into one file

In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
%%bash

# Concatenate the eleven per-block VCFs (b6072..b6082, in order) into SORL1.vcf.gz with
# bcftools concat. The file list must match the blocks produced by the previous job loop.
dx run swiss-army-knife \
-iin="/results/sorl1/01_pvcf_chunks/SORL1_b6072.vcf.gz" \
-iin="/results/sorl1/01_pvcf_chunks/SORL1_b6073.vcf.gz" \
-iin="/results/sorl1/01_pvcf_chunks/SORL1_b6074.vcf.gz" \
-iin="/results/sorl1/01_pvcf_chunks/SORL1_b6075.vcf.gz" \
-iin="/results/sorl1/01_pvcf_chunks/SORL1_b6076.vcf.gz" \
-iin="/results/sorl1/01_pvcf_chunks/SORL1_b6077.vcf.gz" \
-iin="/results/sorl1/01_pvcf_chunks/SORL1_b6078.vcf.gz" \
-iin="/results/sorl1/01_pvcf_chunks/SORL1_b6079.vcf.gz" \
-iin="/results/sorl1/01_pvcf_chunks/SORL1_b6080.vcf.gz" \
-iin="/results/sorl1/01_pvcf_chunks/SORL1_b6081.vcf.gz" \
-iin="/results/sorl1/01_pvcf_chunks/SORL1_b6082.vcf.gz" \
-icmd="bcftools concat -O z SORL1_b6072.vcf.gz SORL1_b6073.vcf.gz SORL1_b6074.vcf.gz SORL1_b6075.vcf.gz SORL1_b6076.vcf.gz SORL1_b6077.vcf.gz SORL1_b6078.vcf.gz SORL1_b6079.vcf.gz SORL1_b6080.vcf.gz SORL1_b6081.vcf.gz SORL1_b6082.vcf.gz -o SORL1.vcf.gz" \
--instance-type mem2_ssd1_v2_x32 \
--destination "${projectid}:/results/sorl1/02_pvcf_genes"


# Normalize VCF before annotation

## Split multiallelic sites into biallelic records

In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
%%bash

# Split multiallelic records into biallelic ones (bcftools norm -m-both) and write
# the result as biallelic.vcf.
dx run swiss-army-knife \
-iin="/results/sorl1/02_pvcf_genes/SORL1.vcf.gz" \
-icmd="bcftools norm -m-both -o biallelic.vcf SORL1.vcf.gz" \
--instance-type mem2_ssd1_v2_x32 \
--destination "${projectid}:/results/sorl1/03_normalized"


## Left-align and normalize

In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------ PAUSE HERE UNTIL PREVIOUS STEP COMPLETES ------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
%%bash

# Normalize biallelic.vcf against the Homo_sapiens_assembly38 reference FASTA
# (bcftools norm -f) and write normalized.vcf.
dx run swiss-army-knife \
-iin="/results/sorl1/03_normalized/biallelic.vcf" \
-iin="/data/Homo_sapiens_assembly38.fasta" \
-icmd="bcftools norm -f Homo_sapiens_assembly38.fasta -o normalized.vcf biallelic.vcf" \
--instance-type mem2_ssd1_v2_x64 \
--destination "${projectid}:/results/sorl1/03_normalized/"


# Annotation

## Get subset of participant IDs

In [None]:
# Fetch the per-cohort ID lists uploaded earlier, replacing any local copies.
! dx download results/sorl1/ad_ids.txt --overwrite
! dx download results/sorl1/pd_ids.txt --overwrite
! dx download results/sorl1/rd_ids.txt --overwrite
! dx download results/sorl1/control_ids.txt --overwrite


In [None]:
# From here on these names hold file names, not the ID lists used earlier in the notebook.
ids_ad = "ad_ids.txt"
ids_pd = "pd_ids.txt"
ids_rd = "rd_ids.txt"
ids_control = "control_ids.txt"
output_file = "annot_ids.txt"

# Take the first participant ID of each cohort file. All inputs are read before the
# output is opened, so a missing input never leaves a truncated annot_ids.txt behind.
first_ids = []
for cohort_id_file in (ids_ad, ids_pd, ids_rd, ids_control):
    with open(cohort_id_file, "r") as cohort_ids:
        first_ids.append((cohort_id_file, cohort_ids.readline().strip()))

with open(output_file, "w") as out:
    for cohort_id_file, first_id in first_ids:
        if first_id:
            out.write(first_id + "\n")
        else:
            # An empty cohort would otherwise contribute a blank sample name.
            print(f"No participant IDs found in {cohort_id_file}; skipping it")


In [None]:
# Upload the small sample list used to shrink the VCF before annotation.
! dx upload annot_ids.txt --path results/sorl1/annot_ids.txt


## Filter VCFs to only include a few participants

In [None]:
%%bash

# Subset normalized.vcf to the samples listed in annot_ids.txt (bcftools view -S) and
# write the result compressed as filtered.vcf.gz.
dx run swiss-army-knife \
-iin="/results/sorl1/03_normalized/normalized.vcf" \
-iin="/results/sorl1/annot_ids.txt" \
-icmd="bcftools view -O z -S annot_ids.txt normalized.vcf -o filtered.vcf.gz" \
--instance-type mem2_ssd1_v2_x32 \
--destination "${projectid}:/results/sorl1/04_annotated"


## Fetch Annovar libraries and reference genome data

In [None]:
%%capture

# Download and unpack ANNOVAR, then make its Perl scripts executable.
# NOTE(review): the archive is fetched over plain http — confirm this is acceptable.
! wget http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz
! tar -xzf annovar.latest.tar.gz
! chmod a+x ./annovar/*.pl
# Download the hg38 annotation databases named in the table_annovar --protocol list.
! annovar/annotate_variation.pl -downdb -buildver hg38 -webfrom annovar refGene annovar/humandb/
! annovar/annotate_variation.pl -downdb -buildver hg38 -webfrom annovar avsnp150 annovar/humandb/
! annovar/annotate_variation.pl -downdb -buildver hg38 -webfrom annovar clinvar_20221231 annovar/humandb/
! annovar/annotate_variation.pl -downdb -buildver hg38 -webfrom annovar dbnsfp30a annovar/humandb/
! annovar/annotate_variation.pl -downdb -buildver hg38 -webfrom annovar gnomad40_genome annovar/humandb/
# Reference genome files and the sample-subset VCF produced by the previous job.
! dx download data/Homo_sapiens_assembly38.fasta --overwrite
! dx download data/Homo_sapiens_assembly38.fasta.fai --overwrite
! dx download data/Homo_sapiens_assembly38.dict --overwrite
# NOTE(review): unlike the downloads above, this one has no --overwrite.
! dx download results/sorl1/04_annotated/filtered.vcf.gz


## Perform annotation

In [None]:
%%bash

# Annotate filtered.vcf.gz with the five downloaded hg38 databases. --operation gives one
# code per --protocol entry (g for refGene, f for the other four). Output files are
# prefixed var_calling.annovar.
annovar/table_annovar.pl filtered.vcf.gz annovar/humandb/ \
--buildver hg38 \
--thread 96 \
--remove \
--protocol refGene,avsnp150,clinvar_20221231,dbnsfp30a,gnomad40_genome \
--operation g,f,f,f,f \
--nopolish \
--nastring . \
--out var_calling.annovar \
--vcfinput


In [None]:
df_annot = pd.read_csv(f"var_calling.annovar.hg38_multianno.txt", sep = '\t')
df_annot.to_csv(f"annotated.csv", index=False)
! dx upload annotated.csv --path results/sorl1/04_annotated/annotated.csv


# Calculate allele frequencies

## Subset IDs for all cohort-ancestry combinations

In [None]:
# Fetch the saved cohort tables, replacing any local copies.
! dx download results/sorl1/AD.csv --overwrite
! dx download results/sorl1/PD.csv --overwrite
! dx download results/sorl1/RD.csv --overwrite
! dx download results/sorl1/Control.csv --overwrite


In [None]:
# Reload the four cohort tables from their CSV files.
df_ad, df_pd, df_rd, df_control = (
    pd.read_csv(f"{cohort_name}.csv") for cohort_name in ("AD", "PD", "RD", "Control")
)


In [None]:
# Only the participant ID and ancestry label are needed to build the per-ancestry ID files.
id_label_cols = ["ID", "label"]
df_ad = df_ad[id_label_cols]
df_pd = df_pd[id_label_cols]
df_rd = df_rd[id_label_cols]
df_control = df_control[id_label_cols]


In [None]:
for ancestry in df_ad["label"].unique():
    ids = df_ad[df_ad["label"] == ancestry]["ID"]
    with open(f"AD_{ancestry}.txt", 'w') as file:
        for iid in ids:
            file.write(f"{iid}\n")
    !dx upload AD_{ancestry}.txt --path results/sorl1/id_files/AD_{ancestry}.txt

for ancestry in df_pd["label"].unique():
    ids = df_pd[df_pd["label"] == ancestry]["ID"]
    with open(f"PD_{ancestry}.txt", 'w') as file:
        for iid in ids:
            file.write(f"{iid}\n")
    !dx upload PD_{ancestry}.txt --path results/sorl1/id_files/PD_{ancestry}.txt

for ancestry in df_rd["label"].unique():
    ids = df_rd[df_rd["label"] == ancestry]["ID"]
    with open(f"RD_{ancestry}.txt", 'w') as file:
        for iid in ids:
            file.write(f"{iid}\n")
    !dx upload RD_{ancestry}.txt --path results/sorl1/id_files/RD_{ancestry}.txt

for ancestry in df_control["label"].unique():
    ids = df_control[df_control["label"] == ancestry]["ID"]
    with open(f"Control_{ancestry}.txt", 'w') as file:
        for iid in ids:
            file.write(f"{iid}\n")
    !dx upload Control_{ancestry}.txt --path results/sorl1/id_files/Control_{ancestry}.txt


## Get frequencies for each gene-cohort-ancestry combination

In [None]:
%%bash

# Submit one plink2 --freq job per cohort/ancestry pair, restricted to that pair's ID file
# (--keep). Variant IDs are rewritten with --set-all-var-ids; \$r and \$a are escaped so
# this shell passes them through literally to the job command.
# A pair with no uploaded ID file has no matching input for its job.
for cohort in {"AD","PD","RD","Control"};
do
    for ancestry in {"AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"};
    do
        dx run swiss-army-knife \
        -iin="/results/sorl1/03_normalized/normalized.vcf" \
        -iin="/results/sorl1/id_files/${cohort}_${ancestry}.txt" \
        -icmd="plink2 --vcf normalized.vcf --set-all-var-ids 'chr@:#:\$r:\$a' --new-id-max-allele-len 999 --keep ${cohort}_${ancestry}.txt --freq --out ${cohort}_${ancestry}" \
        --instance-type mem1_hdd1_v2_x16 \
        --destination "${projectid}:/results/sorl1/05_frequencies"
    done
done


## Get zygosity info

In [None]:
%%bash

# Submit one plink2 job per cohort/ancestry pair that reads the pair's .afreq file
# (--read-freq) and runs --export A and --het; the --export A output is the .raw file
# downloaded by the next cell.
# NOTE(review): the .afreq inputs come from the frequency jobs above, so those jobs
# presumably have to finish before this cell is run — confirm.
for cohort in {"AD","PD","RD","Control"};
do
    for ancestry in {"AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"};
    do
        dx run swiss-army-knife \
        -iin="/results/sorl1/03_normalized/normalized.vcf" \
        -iin="/results/sorl1/id_files/${cohort}_${ancestry}.txt" \
        -iin="/results/sorl1/05_frequencies/${cohort}_${ancestry}.afreq" \
        -icmd="plink2 --vcf normalized.vcf --set-all-var-ids 'chr@:#:\$r:\$a' --new-id-max-allele-len 999 --keep ${cohort}_${ancestry}.txt --read-freq ${cohort}_${ancestry}.afreq --export A --het --out ${cohort}_${ancestry}" \
        --instance-type mem1_hdd1_v2_x16 \
        --destination "${projectid}:/results/sorl1/05_frequencies"
    done
done


## Find homozygous/heterozygous counts

In [None]:
# Download the per-cohort, per-ancestry .raw genotype tables. Pairs with no file on the
# platform make dx print an error; the loop simply moves on to the next pair.
for cohort in ["AD","PD","RD","Control"]:
    for ancestry in ["AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"]:
        ! dx download /results/sorl1/05_frequencies/{cohort}_{ancestry}.raw


In [None]:
%%bash

# For every cohort/ancestry .raw file, write counts_<cohort>_<ancestry>.tsv containing:
#   line 1: the variant column names (columns 7 onward of the .raw header)
#   line 2: per-variant number of samples with genotype value 2
#   line 3: per-variant number of samples with genotype value 1
#   line 4: per-variant number of samples with genotype value 0
# Other values (for example missing genotypes) are not counted.
for cohort in {"AD","PD","RD","Control"};
do
    for ancestry in {"AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"};
    do
        raw_file="${cohort}_${ancestry}.raw"
        output_file="counts_${cohort}_${ancestry}.tsv"

        # Not every cohort/ancestry pair has data. Skip missing or empty inputs instead of
        # emitting a file of blank lines, and drop any stale output from an earlier run.
        if [ ! -s "$raw_file" ]; then
            rm -f "$output_file"
            echo "Skipping ${cohort}_${ancestry}: $raw_file is missing or empty" >&2
            continue
        fi

        input_file="cut.raw"

        # Header row, genotype columns only (the first six columns are sample metadata).
        grep "FID" "$raw_file" | cut -d$'\t' -f7- > header.tmp

        # Genotype columns of every row. The header row is included, but its cells never
        # equal 0, 1 or 2, so it does not affect the counts.
        cut -d$'\t' -f7- "$raw_file" > "$input_file"

        awk -F'\t' '
        {
            for (i=1; i<=NF; i++) {
                if ($i == 2) count_2[i]++;
                else if ($i == 1) count_1[i]++;
                else if ($i == 0) count_0[i]++;
            }
        }
        END {
            for (i=1; i<=NF; i++) {
                printf("%d", count_2[i]);
                if (i<NF) printf("\t");
            }
            print "";

            for (i=1; i<=NF; i++) {
                printf("%d", count_1[i]);
                if (i<NF) printf("\t");
            }
            print "";

            for (i=1; i<=NF; i++) {
                printf("%d", count_0[i]);
                if (i<NF) printf("\t");
            }
            print "";
        }' "$input_file" > count.tmp

        cat header.tmp count.tmp > "$output_file"

        rm "$input_file" count.tmp header.tmp

        echo "Counts have been appended to $output_file"
    done
done


In [None]:
# Upload each per-ancestry counts table to results/sorl1/06_zygosity,
# keeping the local file name.
for cohort in ["AD","PD","RD","Control"]:
    for ancestry in ["AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"]:
        ! dx upload counts_{cohort}_{ancestry}.tsv --path results/sorl1/06_zygosity/counts_{cohort}_{ancestry}.tsv


In [None]:
for cohort in ["AD","PD","RD","Control"]:
    zyg_cohort = []
    for ancestry in ["AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"]:
        try:
            df = pd.read_csv(f"counts_{cohort}_{ancestry}.tsv", sep="\t")
            variant_ids = df.columns.values
            rename_dict = {}
            for vid in variant_ids:
                rename_dict[vid] = vid.split("_")[0]
            df.rename(rename_dict, axis=1, inplace=True)
            zyg_cohort.append(df)
        except:
            print(f"No data found at counts_{cohort}_{ancestry}.tsv")
    result_zyg = reduce(lambda x, y: x + y, zyg_cohort)
    result_zyg.to_csv(f"{cohort}_Final.csv", index=False)
    ! dx upload {cohort}_Final.csv --path results/sorl1/06_zygosity/{cohort}_Final.csv


In [None]:
ad_zyg = pd.read_csv(f"AD_Final.csv")
pd_zyg = pd.read_csv(f"PD_Final.csv")
rd_zyg = pd.read_csv(f"RD_Final.csv")
control_zyg = pd.read_csv(f"Control_Final.csv")

ad_zyg = ad_zyg.T
pd_zyg = pd_zyg.T
rd_zyg = rd_zyg.T
control_zyg = control_zyg.T

ad_zyg = ad_zyg.reset_index()
pd_zyg = pd_zyg.reset_index()
rd_zyg = rd_zyg.reset_index()
control_zyg = control_zyg.reset_index()

ad_zyg.rename(columns={'index': 'ID', 0: 'AD_Homozygous_Ref', 1: 'AD_Heterozygous', 2: 'AD_Homozygous_Alt'}, inplace=True)
pd_zyg.rename(columns={'index': 'ID', 0: 'PD_Homozygous_Ref', 1: 'PD_Heterozygous', 2: 'PD_Homozygous_Alt'}, inplace=True)
rd_zyg.rename(columns={'index': 'ID', 0: 'RD_Homozygous_Ref', 1: 'RD_Heterozygous', 2: 'RD_Homozygous_Alt'}, inplace=True)
control_zyg.rename(columns={'index': 'ID', 0: 'Control_Homozygous_Ref', 1: 'Control_Heterozygous', 2: 'Control_Homozygous_Alt'}, inplace=True)

final_zyg = ad_zyg.merge(pd_zyg, on="ID")
final_zyg = final_zyg.merge(rd_zyg, on="ID")
final_zyg = final_zyg.merge(control_zyg, on="ID")

final_zyg.to_csv("final_zygosity.csv", index=False)
! dx upload final_zygosity.csv --path results/sorl1/06_zygosity/final_zygosity.csv


# Merge annotations with allele frequency outputs

## Merge frequencies for all ancestries across each cohort

In [None]:
for cohort in ["AD","PD","RD","Control"]:
    ! dx download results/sorl1/05_frequencies/{cohort}*.afreq --overwrite
    freq_files = glob.glob(f"{cohort}*.afreq")

    df = pd.read_csv(freq_files[0], sep="\t")
    df = df[["ID","ALT_FREQS","OBS_CT"]]

    ancestry = freq_files[0].split("_")[-1].split(".")[0]
    df.rename({"ALT_FREQS":f"ALT_FREQS_{ancestry}_{cohort}", "OBS_CT":f"OBS_CT_{ancestry}_{cohort}"}, inplace=True, axis=1)

    for i in range(1, len(freq_files)):
        df_merge = pd.read_csv(freq_files[i], sep="\t")
        df_merge = df_merge[["ID","ALT_FREQS","OBS_CT"]]

        ancestry = freq_files[i].split("_")[-1].split(".")[0]
        df_merge.rename({"ALT_FREQS":f"ALT_FREQS_{ancestry}_{cohort}", "OBS_CT":f"OBS_CT_{ancestry}_{cohort}"}, inplace=True, axis=1)

        df = df.merge(df_merge, on="ID")

    df.to_csv(f"{cohort}_results.csv", index=False)


## Convert back to vcf format for external CADD calculation

In [None]:
# Take the variant IDs from the AD frequency table as the list to score.
ad_results = pd.read_csv("AD_results.csv")
var_ids = ad_results["ID"].tolist()
df_for_cadd = pd.DataFrame(data={"ID": var_ids})


In [None]:
# Split each "chrom:pos:ref:alt" ID into its four fields, replace the ID column
# with a "." placeholder in third position, and write a tab-separated file
# with the columns #CHROM, POS, ID, REF, ALT.
vcf_fields = df_for_cadd["ID"].str.split(':', expand=True)
df_for_cadd[["#CHROM","POS","REF","ALT"]] = vcf_fields
df_for_cadd.drop(columns="ID", inplace=True)
df_for_cadd.insert(2, 'ID', '.')
df_for_cadd.to_csv("for_CADD.vcf", sep="\t", index=False)


In [None]:
# Write a gzip-compressed copy to for_CADD.vcf.gz; -c sends the output to
# stdout, so the uncompressed for_CADD.vcf is left in place.
! gzip -c for_CADD.vcf > for_CADD.vcf.gz


In [None]:
"""
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
---------------- DOWNLOAD OUTPUT FILE AND PASS THROUGH EXTERNAL CADD SCORE CALCULATOR ----------------
------------------------------ (https://cadd.gs.washington.edu/upload) -------------------------------
--------------------------- RENAME RESULT TO "CADD.tsv.gz" AND UPLOAD HERE ---------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
"""

In [None]:
! gzip -d CADD.tsv.gz
! grep -v "##" CADD.tsv > CADD_prelim.tsv
df_cadd = pd.read_csv("CADD_prelim.tsv", sep="\t")
df_cadd["#Chrom"] = "chr" + df_cadd["#Chrom"].astype(str)
df_cadd["ID"] = df_cadd["#Chrom"] + ":" + df_cadd["Pos"].astype(str) + ":" + df_cadd["Ref"] + ":" + df_cadd["Alt"]
df_cadd = df_cadd[["ID","PHRED"]]
df_cadd.rename({"PHRED":"CADD"}, axis=1, inplace=True)
df_cadd.to_csv("CADD_final.csv", index=False)


## Merge CADD scores, frequency files, and annotations

### Reformat annotated file

In [None]:
! dx download results/sorl1/04_annotated/annotated.csv --overwrite
df_annot = pd.read_csv("annotated.csv", low_memory=False)
df_annot.insert(1, "ID", df_annot[["Chr","Start","Ref","Alt"]].astype(str).agg(':'.join, axis=1))
filtered_columns = [col for col in df_annot.columns if "Otherinfo" not in col]
df_annot = df_annot[filtered_columns]


### Reformat frequency files

In [None]:
# Load the per-cohort frequency tables and drop their ID column so they can be
# placed side by side with the annotation table. The control table also
# receives the zygosity counts and the CADD score before its ID is dropped.
df_ad_freq = pd.read_csv("AD_results.csv")
df_ad_freq.drop(columns="ID", inplace=True)
df_pd_freq = pd.read_csv("PD_results.csv")
df_pd_freq.drop(columns="ID", inplace=True)
df_rd_freq = pd.read_csv("RD_results.csv")
df_rd_freq.drop(columns="ID", inplace=True)

df_control_freq = pd.read_csv("Control_results.csv")
n_control_rows = len(df_control_freq)
# Left joins keep every control row in its original order. An inner join would
# silently drop variants missing from final_zyg, and because the ID is removed
# below, any later row-by-row combination with the other tables would then be
# shifted.
df_control_freq = df_control_freq.merge(final_zyg, on="ID", how="left")
df_control_freq = df_control_freq.merge(df_cadd, on="ID", how="left")
if len(df_control_freq) != n_control_rows:
    # Duplicate IDs in final_zyg or df_cadd multiply rows and break alignment.
    raise ValueError(
        f"Control table changed from {n_control_rows} to {len(df_control_freq)} rows "
        "after merging zygosity/CADD data; check for duplicate variant IDs"
    )
df_control_freq.drop(columns="ID", inplace=True)


### Merge annotation and frequency files

In [None]:
df_merged = pd.concat([df_annot,df_ad_freq], axis=1)
df_merged = pd.concat([df_merged,df_pd_freq], axis=1)
df_merged = pd.concat([df_merged,df_rd_freq], axis=1)
df_merged = pd.concat([df_merged,df_control_freq], axis=1)
df_merged.to_csv("merged.csv", index=False)
! dx upload merged.csv --path results/sorl1/07_merged/merged.csv


# Filter Results

In [None]:
! dx download results/sorl1/07_merged/merged.csv


In [None]:
# low_memory=False makes pandas infer each column's dtype from the whole file,
# as is already done when the annotation table is first loaded.
df_merged = pd.read_csv("merged.csv", low_memory=False)


In [None]:
# Render the merged table in the notebook output.
display(df_merged)

## Filter by gene, function, and CADD

In [None]:
# Query Ensembl for each gene in gene_names. The result maps gene name to its
# chromosome ("chr"-prefixed), start, end and genome version; genes whose
# lookup fails are reported by the helper and left out of the dict.
gene_info_dict = fetch_gene_info_ensembl(gene_names)
print(gene_info_dict)


In [None]:
# Keep variants that lie within any gene of interest, are annotated as exonic
# or splicing, and have a CADD score of at least 20.
criteria_list = []
for gene in gene_names:
    # The Ensembl helper omits genes whose lookup failed; skip them rather than
    # failing with a KeyError.
    if gene not in gene_info_dict:
        print(f"No coordinates available for {gene}; skipping")
        continue
    chrnum = gene_info_dict[gene]["chromosome"]
    start = gene_info_dict[gene]["start"]
    end = gene_info_dict[gene]["end"]
    # Gene bounds are inclusive, so variants on the first or last base count.
    # NOTE(review): isin() matches the exact strings only; a combined value
    # such as "exonic;splicing" would be excluded -- confirm whether the
    # annotation file contains such values.
    criteria_list.append((df_merged["Chr"] == chrnum)
                         & (df_merged["Start"] >= start)
                         & (df_merged["Start"] <= end)
                         & (df_merged["Func.refGene"].isin(["exonic", "splicing"]))
                         & (df_merged["CADD"] >= 20)
                        )

# A variant passes when it satisfies the criteria of at least one gene; with no
# usable gene the mask is all False and the result is an empty table.
filter_criteria = pd.Series(False, index=df_merged.index)
for criterion in criteria_list:
    filter_criteria = filter_criteria | criterion

df_filtered = df_merged[filter_criteria]


## Only include variants present in cases

In [None]:
# Collect, per cohort, the ALT_FREQS_<ancestry>_<cohort> columns that exist in
# df_filtered, then keep the variants whose frequency is above zero in at least
# one AD, RD or PD column.
ancestries = ["AAC","AFR","AJ","AMR","CAH","CAS","EAS","EUR","FIN","MDE","SAS"]
available_columns = df_filtered.columns.values
freq_columns = {}
for cohort_label in ("AD", "PD", "RD", "Control"):
    present = []
    for ancestry in ancestries:
        column = f"ALT_FREQS_{ancestry}_{cohort_label}"
        if column in available_columns:
            present.append(column)
    freq_columns[cohort_label] = present

ad_col_names = freq_columns["AD"]
pd_col_names = freq_columns["PD"]
rd_col_names = freq_columns["RD"]
ctrl_col_names = freq_columns["Control"]

case_frequencies = df_filtered[ad_col_names + rd_col_names + pd_col_names]
seen_in_any_case = case_frequencies.gt(0).any(axis=1)
df_filtered_allcases = df_filtered[seen_in_any_case]


## Add column indicating which phenotypes carry each variant

In [None]:
# Label each variant with the case cohorts in which it has a non-zero ALT
# frequency: "AD", "PD", "RD", "AD and PD", "AD and RD", "PD and RD" or "All".
# Work on an explicit copy: df_filtered_allcases is a filtered slice, and
# chained assignment (df["Disease"][mask] = ...) on a slice raises
# SettingWithCopyWarning and has no effect under pandas copy-on-write.
df_filtered_allcases = df_filtered_allcases.copy()

in_ad = (df_filtered_allcases[ad_col_names] > 0).any(axis=1)
in_pd = (df_filtered_allcases[pd_col_names] > 0).any(axis=1)
in_rd = (df_filtered_allcases[rd_col_names] > 0).any(axis=1)

# np.select takes the first matching condition, so the most specific
# combinations are listed first; rows matching nothing get "".
df_filtered_allcases["Disease"] = np.select(
    [
        in_ad & in_pd & in_rd,
        in_ad & in_pd,
        in_ad & in_rd,
        in_pd & in_rd,
        in_ad,
        in_pd,
        in_rd,
    ],
    ["All", "AD and PD", "AD and RD", "PD and RD", "AD", "PD", "RD"],
    default="",
)
display(df_filtered_allcases)
df_filtered_allcases.to_csv("filtered.csv", index=False)


In [None]:
# Upload the filtered, disease-labelled table to results/sorl1/07_merged.
! dx upload filtered.csv --path results/sorl1/07_merged/filtered.csv


# Remove variants present in controls

In [None]:
# Keep only the variants for which both control count columns,
# Control_Heterozygous and Control_Homozygous_Alt, are zero.
no_control_het = df_filtered_allcases["Control_Heterozygous"].eq(0)
no_control_hom_alt = df_filtered_allcases["Control_Homozygous_Alt"].eq(0)
df_filtered_onlycases = df_filtered_allcases[no_control_het & no_control_hom_alt]
df_filtered_onlycases.to_csv("filtered_onlycases.csv", index=False)


In [None]:
# Upload the case-only variant table to results/sorl1/07_merged.
! dx upload filtered_onlycases.csv --path results/sorl1/07_merged/filtered_onlycases.csv
