# AMP DLB

* **Project:** ADRD-SORL1-Biobanks
* **Version:** Python/3.10
* **Last Updated:** 14-Jun-2025

## Notebook Overview

Characterization of SORL1 variants and their allele frequencies in AMP DLB cases and controls, per ancestry.

# Query AMP DLB to check for variants of interest, and allele frequency

## Variables used 
- `${ANCESTRY}` = EUR, AFR, AMR, AAC, AJ, MDE, SAS, CAS, EAS, FIN, CAH


In [None]:
import pandas as pd

In [2]:
# Load the phenotype/covariate table and preview its first rows.
covar_csv = "/${WORK_DIR}/addedPHENO_COVFILE_releasev3_SEPT2022_fromTerra.csv"
qc_covar_DLB2 = pd.read_csv(covar_csv, sep=",")
qc_covar_DLB2.head()

Unnamed: 0,ID,SEX,AGE_BASELINE,AGE_DIAGNOSIS,FAMILY_HISTORY,EDUCATION,LATEST_DX,DX_TYPE,ENROLL_STUDY_ARM,COHORT,PD_PHENO,PD_EXTRA_PHENO,DLB_PHENO,MSA_PHENO,PSP_PHENO,CBD_PHENO,AGE_ANALYSIS
0,BF-1001,1,55,,0.0,1.0,No PD Nor Other Neurological Disorder,,Healthy Control,BIOFIND,1,1,1,1,1,1,55.0
1,BF-1002,2,66,61.0,0.0,1.0,Idiopathic PD,,PD,BIOFIND,2,2,-9,-9,-9,-9,61.0
2,BF-1003,1,61,56.0,0.0,1.0,Idiopathic PD,,PD,BIOFIND,2,2,-9,-9,-9,-9,56.0
3,BF-1004,1,62,55.0,0.0,1.0,Idiopathic PD,,PD,BIOFIND,2,2,-9,-9,-9,-9,55.0
4,BF-1005,2,61,,0.0,1.0,No PD Nor Other Neurological Disorder,,Healthy Control,BIOFIND,1,1,1,1,1,1,61.0


In [None]:
import pandas as pd

# Ancestry labels for which a relatedness exclusion list is looked up.
ancestries = ["AFR", "AJ", "EUR", "MDE", "AAC", "AMR", "FIN", "CAS", "SAS", "EAS", "CAH"]

qc_covar_DLB2 = pd.read_csv("/${WORK_DIR}/addedPHENO_COVFILE_releasev3_SEPT2022_fromTerra.csv", sep=",")

# Union of the IIDs listed in every per-ancestry exclusion file that exists.
all_to_remove = set()
for ancestry in ancestries:
    exclusion_path = f"/${WORK_DIR}/toRemove_1stand2ndDegree_Relateds_{ancestry}_noDups.txt"
    try:
        # Files are read as headerless, tab-separated FID/IID pairs.
        related_ids = pd.read_csv(exclusion_path, sep="\t", header=None, names=["FID", "IID"])
    except FileNotFoundError:
        # A missing list is reported and skipped rather than aborting the cell.
        print(f"⚠️ File not found for ancestry: {ancestry} — skipping.")
        continue
    all_to_remove.update(related_ids["IID"].tolist())
    print(f"Loaded {len(related_ids)} samples to remove from {ancestry}")

# Keep only covariate rows whose ID is not in any exclusion list.
is_excluded = qc_covar_DLB2["ID"].isin(all_to_remove)
filtered_covar = qc_covar_DLB2[~is_excluded]

filtered_covar.to_csv("qc_covar_DLB2_unrelateds_only.csv", sep=",", index=False)

In [9]:
# Replace the in-memory covariate table with the unrelated-only version on disk
# and preview its first rows.
unrelated_csv = "qc_covar_DLB2_unrelateds_only.csv"
qc_covar_DLB2 = pd.read_csv(unrelated_csv, sep=",")
qc_covar_DLB2.head()

Unnamed: 0,ID,SEX,AGE_BASELINE,AGE_DIAGNOSIS,FAMILY_HISTORY,EDUCATION,LATEST_DX,DX_TYPE,ENROLL_STUDY_ARM,COHORT,PD_PHENO,PD_EXTRA_PHENO,DLB_PHENO,MSA_PHENO,PSP_PHENO,CBD_PHENO,AGE_ANALYSIS
0,BF-1001,1,55,,0.0,1.0,No PD Nor Other Neurological Disorder,,Healthy Control,BIOFIND,1,1,1,1,1,1,55.0
1,BF-1002,2,66,61.0,0.0,1.0,Idiopathic PD,,PD,BIOFIND,2,2,-9,-9,-9,-9,61.0
2,BF-1003,1,61,56.0,0.0,1.0,Idiopathic PD,,PD,BIOFIND,2,2,-9,-9,-9,-9,56.0
3,BF-1004,1,62,55.0,0.0,1.0,Idiopathic PD,,PD,BIOFIND,2,2,-9,-9,-9,-9,55.0
4,BF-1005,2,61,,0.0,1.0,No PD Nor Other Neurological Disorder,,Healthy Control,BIOFIND,1,1,1,1,1,1,61.0


In [None]:
# Cases: covariate rows whose DLB_PHENO equals 2.
is_dlb_case = qc_covar_DLB2["DLB_PHENO"] == 2
qc_case_DLB2 = qc_covar_DLB2[is_dlb_case]
qc_case_DLB2.info()

In [11]:
# Write the case IDs, one per line, as the starting point for the PLINK --keep list.
# header=False: this file is a bare ID list, so the "ID" column label is not
# written; otherwise it would be carried into the keep list as a spurious entry.
qc_case_DLB2_plink = qc_case_DLB2[["ID"]]
qc_case_DLB2_plink.to_csv("qc_case_DLB2_plink.txt", sep=",", index=False, header=False)

In [12]:
# Append the first field to each line ($0 followed by $1), turning the one-column
# ID list into a two-column file; it is passed to plink --keep further down.
!awk '{print $0, $1}' qc_case_DLB2_plink.txt > qc_case_DLB2_ID_ID_plink.txt

In [None]:
# Controls: covariate rows whose PD_PHENO equals 1.
# NOTE(review): cases are selected on DLB_PHENO while controls are selected on
# PD_PHENO — confirm the two columns agree for controls in this release.
is_control = qc_covar_DLB2["PD_PHENO"] == 1
qc_control_DLB2 = qc_covar_DLB2[is_control]
qc_control_DLB2.info()

In [16]:
# Write the control IDs, one per line, as the starting point for the PLINK --keep list.
# header=False: this file is a bare ID list, so the "ID" column label is not
# written; otherwise it would be carried into the keep list as a spurious entry.
qc_control_DLB2_plink = qc_control_DLB2[["ID"]]
qc_control_DLB2_plink.to_csv("qc_control_DLB2_plink.txt", sep=",", index=False, header=False)

In [17]:
# Append the first field to each line ($0 followed by $1), turning the one-column
# ID list into a two-column file; it is passed to plink --keep further down.
!awk '{print $0, $1}' qc_control_DLB2_plink.txt > qc_control_DLB2_ID_ID_plink.txt

In [None]:
%%bash
# Subset the ${ANCESTRY} pfile to chr11:121452314-121633763 and write it as a
# binary fileset (--make-bed) named AMPDLB2_Alex_vars_all_${ANCESTRY}.
# NOTE(review): the interval is presumably the SORL1 locus — confirm the build/coordinates.
module load plink
plink2 --pfile /${WORK_DIR}/FILTERED.AMP_PD_${ANCESTRY} \
--chr 11 --from-bp 121452314 --to-bp 121633763  --make-bed --out AMPDLB2_Alex_vars_all_${ANCESTRY}

In [None]:
%%bash
# Restrict the regional fileset to the samples listed in the case keep file and
# write the result with the _cases2 suffix.
module load plink/1.9

plink --bfile AMPDLB2_Alex_vars_all_${ANCESTRY} --keep qc_case_DLB2_ID_ID_plink.txt --make-bed --out AMPDLB2_Alex_vars_all_${ANCESTRY}_cases2

In [None]:
%%bash
# Compute allele frequencies for the case-only fileset; the resulting .frq file
# is picked up by the "Merge files" cell via its *cases2*.frq glob.
module load plink/1.9

plink --bfile AMPDLB2_Alex_vars_all_${ANCESTRY}_cases2 --freq --out AMPDLB2_Alex_vars_all_${ANCESTRY}_cases2

In [None]:
%%bash
# Restrict the regional fileset to the samples listed in the control keep file
# and write the result with the _controls2 suffix.
module load plink/1.9

plink --bfile AMPDLB2_Alex_vars_all_${ANCESTRY} --keep qc_control_DLB2_ID_ID_plink.txt --make-bed --out AMPDLB2_Alex_vars_all_${ANCESTRY}_controls2

In [None]:
%%bash
# Compute allele frequencies for the control-only fileset; the resulting .frq
# file is picked up by the "Merge files" cell via its *controls2*.frq glob.
module load plink/1.9

plink --bfile AMPDLB2_Alex_vars_all_${ANCESTRY}_controls2 --freq --out AMPDLB2_Alex_vars_all_${ANCESTRY}_controls2

## Check Zygosity

In [None]:
%%bash
# Export per-sample additive (0/1/2) genotype codes for the full regional
# fileset (cases and controls together).
module load plink
plink2 --bfile AMPDLB2_Alex_vars_all_${ANCESTRY} --recode A --out AMPDLB2_Alex_vars_all_${ANCESTRY}_recode

In [None]:
%%bash
# Export per-sample additive (0/1/2) genotype codes for the case-only fileset;
# the zygosity-count cell reads the resulting *_cases2_recode.raw file.
module load plink
plink2 --bfile AMPDLB2_Alex_vars_all_${ANCESTRY}_cases2 --recode A --out AMPDLB2_Alex_vars_all_${ANCESTRY}_cases2_recode

In [None]:
%%bash
# Export per-sample additive (0/1/2) genotype codes for the control-only fileset;
# the zygosity-count cell reads the resulting *_controls2_recode.raw file.
module load plink
plink2 --bfile AMPDLB2_Alex_vars_all_${ANCESTRY}_controls2 --recode A --out AMPDLB2_Alex_vars_all_${ANCESTRY}_controls2_recode

## Merge files

In [None]:
import glob
import pandas as pd
import os

# Gather the per-ancestry PLINK .frq outputs for controls and for cases.
control_amp_paths = sorted(glob.glob("AMPDLB2_*controls2*.frq"))
case_amp_paths = sorted(glob.glob("AMPDLB2_*cases2*.frq"))

# Pair the two sorted lists positionally (zip stops at the shorter list).
amp_paths = list(zip(control_amp_paths, case_amp_paths))
print(f"Total paired files: {len(amp_paths)}")

# One two-column table (SNP + labelled MAF) per .frq file, controls first.
df_list = []
for control_path, case_path in amp_paths:
    for group_label, frq_path in (("Controls", control_path), ("Cases", case_path)):
        frq = pd.read_csv(frq_path, sep=r'\s+', engine='python')

        frq["NCHROBS"] = pd.to_numeric(frq["NCHROBS"])
        nchrobs = frq["NCHROBS"].max()
        # The ancestry label is the fifth underscore-separated token of the path.
        ancestry = frq_path.split("_")[4]
        maf_col_name = f"{ancestry} {group_label} MAF (NCHROBS = {nchrobs})"

        maf_table = frq.rename(columns={"MAF": maf_col_name})[["SNP", maf_col_name]]
        display(maf_table.head(3))

        df_list.append(maf_table)

# Report whether each table lists the same SNPs, in the same order, as the first one.
for idx, maf_table in enumerate(df_list):
    print(f"Match with first file [{idx}]:", list(df_list[0]["SNP"]) == list(maf_table["SNP"]))

# Outer-join every table on SNP so no variant is dropped.
merged_df = df_list[0]
for maf_table in df_list[1:]:
    merged_df = merged_df.merge(maf_table, on='SNP', how="outer")

# A variant missing from a table gets frequency 0 for that ancestry/group.
merged_df = merged_df.fillna(0)

display(merged_df.head())

os.makedirs("Merged_AMPDLB2", exist_ok=True)
merged_df.to_csv("Merged_AMPDLB2/AMPDLB2.csv", index=False)


## Count the number of cases and controls

In [52]:
# First pass (NR==FNR) stores the first field of every line of the case keep list;
# second pass prints the lines of the ${ANCESTRY} .samples file whose first field was stored.
! awk 'NR==FNR {ids[$1]; next} $1 in ids' qc_case_DLB2_ID_ID_plink.txt /${WORK_DIR}/FILTERED.AMP_PD_ancestry_${ANCESTRY}.samples > filtered_DLB2_samples2_${ANCESTRY}.txt

In [None]:
# Line/word/byte counts of the matched case list; the line count is the number of matched rows.
! wc filtered_DLB2_samples2_${ANCESTRY}.txt

In [60]:
# First pass (NR==FNR) stores the first field of every line of the control keep list;
# second pass prints the lines of the ${ANCESTRY} .samples file whose first field was stored.
! awk 'NR==FNR {ids[$1]; next} $1 in ids' qc_control_DLB2_ID_ID_plink.txt /${WORK_DIR}/FILTERED.AMP_PD_ancestry_${ANCESTRY}.samples > filtered_samples__DLB2_controls2_${ANCESTRY}.txt

In [None]:
# Line/word/byte counts of the matched control list; the line count is the number of matched rows.
! wc filtered_samples__DLB2_controls2_${ANCESTRY}.txt

## Annotation

In [None]:
%%bash
# NOTE: a %%bash cell runs in its own shell process, so a module loaded here is
# not visible to later cells; any cell that calls ANNOVAR must load it itself.
module load annovar

In [None]:
%%bash
# Annotate the cleaned VCF with ANNOVAR (hg38): refGene gene-based annotation
# plus ClinVar, avsnp151 and dbNSFP filter-based annotations. Missing values are
# written as "." (--nastring) and the output prefix is AMP_DLB_8.vcf.anno.
#
# The module is loaded in this cell because every %%bash cell starts a fresh
# shell: a `module load` issued in a previous cell does not carry over, so
# table_annovar.pl would not be on PATH here.
# NOTE(review): $ANNOVAR_DATA is presumably exported by the annovar module — confirm.
module load annovar

table_annovar.pl AMP_DLB_8.cleaned.vcf $ANNOVAR_DATA/hg38 \
    --buildver hg38 \
    --remove \
    --thread 48 \
    --maxgenethread 48 \
    --protocol refGene,clinvar_20140902,avsnp151,dbnsfp47a \
    --operation g,f,f,f \
    --nopolish \
    --nastring . \
    --out AMP_DLB_8.vcf.anno \
    --vcfinput

## Merge files and apply criteria (exonic and splicing variants, MAC ≥ 2, CADD ≥ 20 or missing, and present only in cases)

In [68]:
import pandas as pd

# Annotation fields carried forward from the ANNOVAR multianno table.
columns_to_keep = [
    "Chr", "Start", "End", "Ref", "Alt",
    "Func.refGene", "Gene.refGene", "GeneDetail.refGene",
    "ExonicFunc.refGene", "AAChange.refGene",
    "clinvar_20140902", "avsnp151", "CADD_phred"
]

# Read the full table, keep only those columns (in that order) and save the
# slimmed-down copy as a tab-separated file.
multianno = pd.read_csv("AMP_DLB_8.vcf.anno.hg38_multianno.txt", sep="\t")
annotation_subset = multianno.loc[:, columns_to_keep]
annotation_subset.to_csv("AMP_DLB_8.filtered.txt", sep="\t", index=False)

In [69]:
import pandas as pd

# Join the annotation subset with the merged per-ancestry frequency table.
freq = pd.read_csv('Merged_AMPDLB2/AMPDLB2.csv')
anno = pd.read_csv('AMP_DLB_8.filtered.txt', sep='\t')

# The join key is the second ':'-separated token of the SNP ID, parsed as an integer.
freq['Start'] = [int(snp_id.split(':')[1]) for snp_id in freq['SNP']]

# NOTE(review): the join uses position only. Variants sharing a Start (e.g.
# multi-allelic sites) would be cross-matched, and rows whose annotated Start
# differs from the ID's position would be dropped by the inner join — verify.
merged = anno.merge(freq, on='Start', how='inner')

merged.to_csv('AMP_DLB2_8_merged_output2.txt', sep='\t', index=False)


In [None]:
import os
import pandas as pd

from functools import reduce

# Build per-ancestry, per-group genotype counts from the additive .raw exports
# and attach them to the annotation/frequency table.
ancestries = ["AFR","AJ","EUR","MDE","AAC","AMR","FIN","CAS","SAS","EAS","CAH"]
zyg_list = []

for ancestry in ancestries:
    for group in ["cases", "controls"]:
        filename = f"AMPDLB2_Alex_vars_all_{ancestry}_{group}2_recode.raw"
        if not os.path.exists(filename):
            print(f"Missing file: {filename}")
            continue

        print(f"Processing: {filename}")
        df = pd.read_csv(filename, sep="\t")
        # Drop the sample-level columns so only the per-variant genotype columns remain.
        df = df.drop(columns=["FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE"])
        # Missing genotypes are coded as 2, so they fall in neither count below.
        df = df.fillna(2).astype(int)

        # Column headers are "<variant ID>_<allele>": strip only the LAST
        # underscore-delimited token so variant IDs that themselves contain
        # underscores are preserved intact.
        snps = [snpid.rsplit("_", 1)[0] for snpid in df.columns]
        # Number of samples with genotype code 0 and code 1, per variant.
        # NOTE(review): treating code 0 as homozygous-alternate assumes the
        # exported dosage counts the reference allele — confirm against the
        # allele suffix in the .raw header.
        hom_alt = (df == 0).sum()
        het = (df == 1).sum()

        df_zyg = pd.DataFrame({
            "SNP": snps,
            f"{ancestry}_{group}_hom_alt_ac": hom_alt.values,
            f"{ancestry}_{group}_het_ac": het.values
        })

        zyg_list.append(df_zyg)

if zyg_list:
    # Outer-join all count tables on SNP, then left-join onto the annotated table.
    merged_zyg = reduce(lambda left, right: pd.merge(left, right, on='SNP', how='outer'), zyg_list)

    merged = pd.read_csv("AMP_DLB2_8_merged_output2.txt", sep="\t")
    final_merged = pd.merge(merged, merged_zyg, on='SNP', how='left')
    final_merged.to_csv("AMP_DLB2_8_final_output2.txt", sep="\t", index=False)
    print("Final merged file written: AMP_DLB2_8_final_output2.txt")
else:
    print("No valid zygosity data found to merge.")


In [None]:
import pandas as pd

# Keep non-synonymous exonic/splicing variants that are observed (frequency > 0)
# in at least one ancestry/group.
df_amp_merged = pd.read_csv("AMP_DLB2_8_final_output2.txt", sep="\t")

# The frequency columns are the ones whose header embeds "NCHROBS".
amp_freq_cols = df_amp_merged.filter(like="NCHROBS").columns

# ANNOVAR labels a variant that is exonic for one gene/transcript and splicing
# for another as "exonic;splicing"; include it so such variants are not lost
# by the exact-match test.
coding_funcs = ["splicing", "exonic", "exonic;splicing"]

criteria_amp_anno = (
    (df_amp_merged["Func.refGene"].isin(coding_funcs)) &
    ~(df_amp_merged["ExonicFunc.refGene"] == "synonymous SNV")
)

df_amp_filtered = df_amp_merged[criteria_amp_anno]

# Require a non-zero frequency in at least one frequency column.
criteria_amp_freq = (df_amp_filtered[amp_freq_cols] > 0.0).any(axis=1)
df_amp_filtered = df_amp_filtered[criteria_amp_freq]

df_amp_filtered.to_csv("AMP_DLB2_8_filtered_coding_output2.txt", sep="\t", index=False)


In [None]:
# Line/word/byte counts of the coding-variant table (line count includes the header row).
! wc AMP_DLB2_8_filtered_coding_output2.txt

In [None]:
import pandas as pd

# Keep the coding variants that have zero frequency in every control column,
# i.e. variants seen only in cases.
df_amp_filtered = pd.read_csv("AMP_DLB2_8_filtered_coding_output2.txt", sep="\t")

# Control frequency columns are identified by the "Controls MAF" fragment in their header.
amp_controls_cols = df_amp_filtered.filter(like="Controls MAF").columns

absent_in_all_controls = df_amp_filtered[amp_controls_cols].eq(0.0).all(axis=1)
df_amp_filtered_casesonly = df_amp_filtered[absent_in_all_controls]

df_amp_filtered_casesonly.to_csv("AMP_DLB2_8__coding_casesonly_output2.txt", sep="\t", index=False)


In [None]:
# Line/word/byte counts of the cases-only table (line count includes the header row).
! wc AMP_DLB2_8__coding_casesonly_output2.txt

In [None]:
import pandas as pd

# Keep cases-only variants with CADD_phred >= 20, plus those with no CADD score.
df_amp_casesonly = pd.read_csv("AMP_DLB2_8__coding_casesonly_output2.txt", sep="\t")

# ANNOVAR was run with "--nastring .", so unscored variants carry "." and the
# column may be read as strings. Coerce to numeric for the comparison: "."
# becomes NaN and is retained by the isna() branch instead of raising a
# str-vs-int TypeError. The stored column itself is left untouched.
cadd_phred = pd.to_numeric(df_amp_casesonly["CADD_phred"], errors="coerce")

criteria_amp_cadd = (cadd_phred >= 20) | (cadd_phred.isna())
df_amp_casesonly_cadd_filtered = df_amp_casesonly[criteria_amp_cadd]

df_amp_casesonly_cadd_filtered.to_csv("AMP_DLB2_8__coding_casesonly_CADD20_output2.txt", sep="\t", index=False)


In [None]:
# Line/word/byte counts of the CADD-filtered table (line count includes the header row).
! wc AMP_DLB2_8__coding_casesonly_CADD20_output2.txt

In [None]:
import pandas as pd

df_amp_casesonly_cadd = pd.read_csv("AMP_DLB2_8__coding_casesonly_CADD20_output2.txt", sep="\t")

criteria_${ANCESTRY}_ac = (df_amp_casesonly_cadd["${ANCESTRY}_cases_het_ac"] >= 2) | (df_amp_casesonly_cadd["${ANCESTRY}_cases_hom_alt_ac"] >= 2)
df_amp_casesonly_cadd_${ANCESTRY}_filtered = df_amp_casesonly_cadd[criteria_${ANCESTRY}_ac]

df_amp_casesonly_cadd_${ANCESTRY}_filtered.to_csv("AMP_DLB2_8__coding_casesonly_CADD20_${ANCESTRY}AC2_output2.txt", sep="\t", index=False)



In [None]:
# Line/word/byte counts of the ${ANCESTRY} allele-count-filtered table (line count includes the header row).
! wc AMP_DLB2_8__coding_casesonly_CADD20_${ANCESTRY}AC2_output2.txt

In [None]:
# Print the full ${ANCESTRY} allele-count-filtered table.
! cat AMP_DLB2_8__coding_casesonly_CADD20_${ANCESTRY}AC2_output2.txt

## Count total variants per ancestry

In [None]:
import pandas as pd

# Count, per ancestry, the variants observed in cases or controls, broken down
# by Func.refGene and (for exonic variants) by ExonicFunc.refGene.
df = pd.read_csv("AMP_DLB2_8_merged_output2.txt", sep="\t")

ancestries = ["AJ", "EUR", "FIN", "MDE"]

summary_tables = []

for ancestry in ancestries:
    # Frequency columns are named "<ancestry> Controls|Cases MAF (NCHROBS = n)";
    # take the first match, or None when the ancestry is not in the table.
    control_col = next((col for col in df.columns if col.startswith(f"{ancestry} Controls MAF")), None)
    case_col = next((col for col in df.columns if col.startswith(f"{ancestry} Cases MAF")), None)

    # Skip ancestries without both columns instead of failing with an IndexError.
    if control_col is None or case_col is None:
        print(f"No case/control MAF columns found for ancestry: {ancestry} — skipping.")
        continue

    # A variant is counted for an ancestry when it has non-zero frequency in either group.
    ancestry_variants = df[(df[control_col] > 0) | (df[case_col] > 0)].copy()

    if not ancestry_variants.empty:

        func_counts = ancestry_variants.groupby("Func.refGene").size().reset_index(name="VariantCount")
        func_counts["Ancestry"] = ancestry

        exonic_variants = ancestry_variants[ancestry_variants["Func.refGene"] == "exonic"]
        exonic_counts = exonic_variants.groupby("ExonicFunc.refGene").size().reset_index(name="VariantCount")
        exonic_counts["Ancestry"] = ancestry
        exonic_counts["Func.refGene"] = "exonic"

        summary_tables.append(func_counts)
        summary_tables.append(exonic_counts)

if summary_tables:
    final_summary = pd.concat(summary_tables, ignore_index=True)
else:
    # pd.concat raises on an empty list; write a header-only table instead so
    # the output file still exists with the usual columns.
    print("No variants found for any ancestry; writing an empty summary.")
    final_summary = pd.DataFrame(columns=["Func.refGene", "VariantCount", "Ancestry", "ExonicFunc.refGene"])

final_summary.to_csv("AMP_DLB2_variant_ancestry_func_and_exonic_type_counts2.txt", sep="\t", index=False)



In [None]:
# Print the per-ancestry variant count summary.
! cat AMP_DLB2_variant_ancestry_func_and_exonic_type_counts2.txt