# AMP PD
* **Project:** ADRD-SORL1-Biobanks
* **Version:** Python/3.10
* **Last Updated:** 14-Jun-2025

## Notebook Overview
Characterization of SORL1 variants in AMP PD: allele frequencies, association analysis, and burden analysis.

# Query AMP PD to check for variants of interest, and allele frequency

## Variables used
- `${ANCESTRY}` = EUR, AFR, AMR, AAC, AJ, MDE, SAS, CAS, EAS, FIN, CAH
- `${WORK_DIR}` = placeholder for the directory prefix used in the input/output paths below

In [None]:
import pandas as pd

In [4]:
# Load the AMP PD covariate/phenotype table and preview its first rows.
qc_covar_PD = pd.read_csv(
    "/${WORK_DIR}/addedPHENO_COVFILE_releasev3_SEPT2022_fromTerra.csv",
    sep=",",
)
qc_covar_PD.head(5)

Unnamed: 0,ID,SEX,AGE_BASELINE,AGE_DIAGNOSIS,FAMILY_HISTORY,EDUCATION,LATEST_DX,DX_TYPE,ENROLL_STUDY_ARM,COHORT,PD_PHENO,PD_EXTRA_PHENO,DLB_PHENO,MSA_PHENO,PSP_PHENO,CBD_PHENO,AGE_ANALYSIS
0,BF-1001,1,55,,0.0,1.0,No PD Nor Other Neurological Disorder,,Healthy Control,BIOFIND,1,1,1,1,1,1,55.0
1,BF-1002,2,66,61.0,0.0,1.0,Idiopathic PD,,PD,BIOFIND,2,2,-9,-9,-9,-9,61.0
2,BF-1003,1,61,56.0,0.0,1.0,Idiopathic PD,,PD,BIOFIND,2,2,-9,-9,-9,-9,56.0
3,BF-1004,1,62,55.0,0.0,1.0,Idiopathic PD,,PD,BIOFIND,2,2,-9,-9,-9,-9,55.0
4,BF-1005,2,61,,0.0,1.0,No PD Nor Other Neurological Disorder,,Healthy Control,BIOFIND,1,1,1,1,1,1,61.0


In [None]:
import pandas as pd

ancestries = ["AFR", "AJ", "EUR", "MDE", "AAC", "AMR", "FIN", "CAS", "SAS", "EAS", "CAH"]

# Full covariate/phenotype table for all samples.
qc_covar_PD = pd.read_csv("/${WORK_DIR}/addedPHENO_COVFILE_releasev3_SEPT2022_fromTerra.csv", sep=",")

# Union of the IIDs listed in every per-ancestry relatedness removal file.
all_to_remove = set()

for ancestry in ancestries:
    remove_file = f"/${WORK_DIR}/toRemove_1stand2ndDegree_Relateds_{ancestry}_noDups.txt"

    try:
        # dtype=str keeps IDs as text, so a file whose IDs happen to look numeric
        # still matches the ID column of the covariate table.
        relateds = pd.read_csv(remove_file, sep="\t", header=None, names=["FID", "IID"], dtype=str)
    except FileNotFoundError:
        # Best effort: an ancestry without a removal list is reported and skipped.
        print(f"⚠️ File not found for ancestry: {ancestry} — skipping.")
        continue

    all_to_remove.update(relateds["IID"])
    print(f"Loaded {len(relateds)} samples to remove from {ancestry}")

# Keep only samples whose ID is not in any removal list.
filtered_covar = qc_covar_PD[~qc_covar_PD['ID'].astype(str).isin(all_to_remove)]

filtered_covar.to_csv("qc_covar_PD_unrelateds_only.csv", sep=",", index=False)


In [5]:
# Reload the unrelated-only covariate table written by the previous cell and preview it.
qc_covar_PD2 = pd.read_csv(
    "qc_covar_PD_unrelateds_only.csv",
    sep=",",
)
qc_covar_PD2.head(5)

Unnamed: 0,ID,SEX,AGE_BASELINE,AGE_DIAGNOSIS,FAMILY_HISTORY,EDUCATION,LATEST_DX,DX_TYPE,ENROLL_STUDY_ARM,COHORT,PD_PHENO,PD_EXTRA_PHENO,DLB_PHENO,MSA_PHENO,PSP_PHENO,CBD_PHENO,AGE_ANALYSIS
0,BF-1001,1,55,,0.0,1.0,No PD Nor Other Neurological Disorder,,Healthy Control,BIOFIND,1,1,1,1,1,1,55.0
1,BF-1002,2,66,61.0,0.0,1.0,Idiopathic PD,,PD,BIOFIND,2,2,-9,-9,-9,-9,61.0
2,BF-1003,1,61,56.0,0.0,1.0,Idiopathic PD,,PD,BIOFIND,2,2,-9,-9,-9,-9,56.0
3,BF-1004,1,62,55.0,0.0,1.0,Idiopathic PD,,PD,BIOFIND,2,2,-9,-9,-9,-9,55.0
4,BF-1005,2,61,,0.0,1.0,No PD Nor Other Neurological Disorder,,Healthy Control,BIOFIND,1,1,1,1,1,1,61.0


In [None]:
# Cases: rows whose PD_PHENO equals 2.
qc_case_PD2 = qc_covar_PD2.loc[qc_covar_PD2["PD_PHENO"].eq(2)]
qc_case_PD2.info()

In [7]:
# One case ID per line. No header row is written: this file is turned into a
# PLINK --keep list in the next cell, where a literal "ID" line would be read
# as a (non-existent) sample.
qc_case_PD2_plink = qc_case_PD2[["ID"]]
qc_case_PD2_plink.to_csv("qc_case_PD2_plink.txt", sep=",", index=False, header=False)

In [8]:
# Append the first field to every line so each row has two columns; the result is used as the --keep list for cases.
!awk '{print $0, $1}' qc_case_PD2_plink.txt > qc_case_PD2_ID_ID_plink.txt

In [None]:
# Controls: rows whose PD_PHENO equals 1.
qc_control_PD2 = qc_covar_PD2.loc[qc_covar_PD2["PD_PHENO"].eq(1)]
qc_control_PD2.info()

In [11]:
# One control ID per line. No header row is written: this file is turned into a
# PLINK --keep list in the next cell, where a literal "ID" line would be read
# as a (non-existent) sample.
qc_control_PD2_plink = qc_control_PD2[["ID"]]
qc_control_PD2_plink.to_csv("qc_control_PD2_plink.txt", sep=",", index=False, header=False)

In [12]:
# Append the first field to every line so each row has two columns; the result is used as the --keep list for controls.
!awk '{print $0, $1}' qc_control_PD2_plink.txt > qc_control_PD2_ID_ID_plink.txt

In [None]:
%%bash
# Extract the chr11:121452314-121633763 window from the per-ancestry pfile
# and write it as a bed/bim/fam fileset.
module load plink

plink2 \
  --pfile /${WORK_DIR}/FILTERED.AMP_PD_${ANCESTRY} \
  --chr 11 \
  --from-bp 121452314 \
  --to-bp 121633763 \
  --make-bed \
  --out AMPPD_Alex_vars_all_${ANCESTRY}

In [None]:
%%bash
# Subset the regional fileset to the samples in the case keep-list.
module load plink/1.9

plink \
  --bfile AMPPD_Alex_vars_all_${ANCESTRY} \
  --keep qc_case_PD2_ID_ID_plink.txt \
  --make-bed \
  --out AMPPD_Alex_vars_all_${ANCESTRY}_cases2

In [None]:
%%bash
# Allele frequencies (--freq) for the case-only fileset.
module load plink/1.9

plink \
  --bfile AMPPD_Alex_vars_all_${ANCESTRY}_cases2 \
  --freq \
  --out AMPPD_Alex_vars_all_${ANCESTRY}_cases2

In [None]:
%%bash
# Subset the regional fileset to the samples in the control keep-list.
module load plink/1.9

plink \
  --bfile AMPPD_Alex_vars_all_${ANCESTRY} \
  --keep qc_control_PD2_ID_ID_plink.txt \
  --make-bed \
  --out AMPPD_Alex_vars_all_${ANCESTRY}_controls2

In [None]:
%%bash
# Allele frequencies (--freq) for the control-only fileset.
module load plink/1.9

plink \
  --bfile AMPPD_Alex_vars_all_${ANCESTRY}_controls2 \
  --freq \
  --out AMPPD_Alex_vars_all_${ANCESTRY}_controls2

## Check Zygosity

In [None]:
%%bash
# Recode the full regional fileset with --recode A (output prefix *_recode).
module load plink

plink2 \
  --bfile AMPPD_Alex_vars_all_${ANCESTRY} \
  --recode A \
  --out AMPPD_Alex_vars_all_${ANCESTRY}_recode

In [None]:
%%bash
# Recode the case-only fileset with --recode A; the *_cases2_recode output is read by the zygosity cell.
module load plink

plink2 \
  --bfile AMPPD_Alex_vars_all_${ANCESTRY}_cases2 \
  --recode A \
  --out AMPPD_Alex_vars_all_${ANCESTRY}_cases2_recode

In [None]:
%%bash
# Recode the control-only fileset with --recode A; the *_controls2_recode output is read by the zygosity cell.
module load plink

plink2 \
  --bfile AMPPD_Alex_vars_all_${ANCESTRY}_controls2 \
  --recode A \
  --out AMPPD_Alex_vars_all_${ANCESTRY}_controls2_recode

In [None]:
## Merge files

In [None]:
import glob
import os

import pandas as pd


def ancestry_from_frq_path(path):
    """Return the ancestry label embedded in an AMPPD ``.frq`` file name.

    File names look like ``AMPPD_Alex_vars_all_<ANCESTRY>_<group>2.frq``, so the
    label is the fifth underscore-separated token of the base name.
    """
    return os.path.basename(path).split("_")[4]


control_amp_paths = sorted(glob.glob("AMPPD_*controls2*.frq"))
case_amp_paths = sorted(glob.glob("AMPPD_*cases2*.frq"))

# Pair control and case files by ancestry label. Zipping the two sorted lists
# positionally would silently pair files of different ancestries as soon as one
# group is missing for some ancestry.
controls_by_ancestry = {ancestry_from_frq_path(p): p for p in control_amp_paths}
cases_by_ancestry = {ancestry_from_frq_path(p): p for p in case_amp_paths}

for unpaired in sorted(set(controls_by_ancestry) ^ set(cases_by_ancestry)):
    print(f"Skipping {unpaired}: matching case or control .frq file is missing")

paired_ancestries = sorted(set(controls_by_ancestry) & set(cases_by_ancestry))
amp_paths = [(controls_by_ancestry[a], cases_by_ancestry[a]) for a in paired_ancestries]
print(f"Total paired files: {len(amp_paths)}")

df_list = []
for control_path, case_path in amp_paths:
    for group, path in (("Controls", control_path), ("Cases", case_path)):
        df = pd.read_csv(path, sep=r'\s+', engine='python')

        # The column header records the largest NCHROBS seen in the file.
        df["NCHROBS"] = pd.to_numeric(df["NCHROBS"])
        nchrobs = df["NCHROBS"].max()
        ancestry = ancestry_from_frq_path(path)
        maf_col_name = f"{ancestry} {group} MAF (NCHROBS = {nchrobs})"

        df = df.rename(columns={"MAF": maf_col_name})[["SNP", maf_col_name]]
        display(df.head(3))

        df_list.append(df)

if df_list:
    # Sanity check: does every file list the same SNPs in the same order?
    reference_snps = list(df_list[0]["SNP"])
    for i, df in enumerate(df_list):
        print(f"Match with first file [{i}]:", reference_snps == list(df["SNP"]))

    merged_df = df_list[0]
    for df in df_list[1:]:
        merged_df = pd.merge(merged_df, df, on='SNP', how="outer")

    # A SNP absent from a file gets frequency 0 there; later cells test these
    # columns with "> 0.0" and "== 0.0".
    merged_df = merged_df.fillna(0)

    display(merged_df.head())

    os.makedirs("Merged_AMPPD", exist_ok=True)
    merged_df.to_csv("Merged_AMPPD/AMPPD2.csv", index=False)
else:
    print("No paired .frq files found; nothing to merge.")


## Count the number of cases and controls

In [60]:
# Keep the rows of the ancestry sample list whose first field appears in the case keep-list.
! awk 'NR==FNR {ids[$1]; next} $1 in ids' qc_case_PD2_ID_ID_plink.txt /${WORK_DIR}/FILTERED.AMP_PD_ancestry_${ANCESTRY}.samples > filtered_samples2_${ANCESTRY}.txt

In [None]:
# Line/word/byte counts of the matched case list.
! wc filtered_samples2_${ANCESTRY}.txt

In [83]:
# Keep the rows of the ancestry sample list whose first field appears in the control keep-list.
! awk 'NR==FNR {ids[$1]; next} $1 in ids' qc_control_PD2_ID_ID_plink.txt /${WORK_DIR}/FILTERED.AMP_PD_ancestry_${ANCESTRY}.samples > filtered_samples_controls2_${ANCESTRY}.txt

In [None]:
# Line/word/byte counts of the matched control list.
! wc filtered_samples_controls2_${ANCESTRY}.txt

## Annotation

In [None]:
%%bash
# NOTE: every %%bash cell runs in its own shell process, so a module loaded
# here is not available to commands in other cells.
module load annovar

In [None]:
%%bash
# Annotate the cleaned VCF with ANNOVAR (hg38) using the refGene, ClinVar,
# avsnp151 and dbnsfp47a protocols.
# The module is loaded in this cell because each %%bash cell runs in its own
# shell process; a load performed in another cell does not carry over.
module load annovar

# NOTE(review): input and output use the prefix AMP_DLB_8, while the following
# cells read AMP_PD_8.vcf.anno.hg38_multianno.txt — confirm which prefix is intended.
table_annovar.pl AMP_DLB_8.cleaned.vcf $ANNOVAR_DATA/hg38 \
    --buildver hg38 \
    --remove \
    --thread 48 \
    --maxgenethread 48 \
    --protocol refGene,clinvar_20140902,avsnp151,dbnsfp47a \
    --operation g,f,f,f \
    --nopolish \
    --nastring . \
    --out AMP_DLB_8.vcf.anno \
    --vcfinput

## Merge files and apply criteria (exonic and splicing variants, MAC ≥ 2, CADD ≥ 20, and present only in cases)

In [56]:
import pandas as pd

# Annotation columns carried into the downstream merge.
columns_to_keep = [
    "Chr", "Start", "End", "Ref", "Alt",
    "Func.refGene", "Gene.refGene", "GeneDetail.refGene",
    "ExonicFunc.refGene", "AAChange.refGene",
    "clinvar_20140902", "avsnp151", "CADD_phred",
]

# Read the ANNOVAR multianno table and keep only the columns listed above.
df = pd.read_csv("AMP_PD_8.vcf.anno.hg38_multianno.txt", sep="\t")
filtered_df = df.loc[:, columns_to_keep]

filtered_df.to_csv("AMP_PD_8.filtered.txt", sep="\t", index=False)


In [111]:
import pandas as pd

freq = pd.read_csv('Merged_AMPPD/AMPPD2.csv')
anno = pd.read_csv('AMP_PD_8.filtered.txt', sep='\t')

# The position is the second ':'-separated token of the SNP identifier.
freq['Start'] = [int(snp_id.split(':')[1]) for snp_id in freq['SNP']]

# Join annotation and frequencies on position only.
# NOTE(review): alleles are not part of the join key, so several variants
# sharing one position would be cross-matched — confirm this cannot occur here.
merged = anno.merge(freq, on='Start', how='inner')

merged.to_csv('AMP_PD_8_merged_output2.txt', sep='\t', index=False)


In [None]:
import os
from functools import reduce

import pandas as pd


def snp_id_from_raw_header(column_name):
    """Strip the trailing ``_<allele>`` suffix from a PLINK ``.raw`` column name.

    Only the last underscore is treated as the separator, so variant IDs that
    themselves contain underscores are returned intact.
    """
    return column_name.rsplit("_", 1)[0]


ancestries = ["AFR","AJ","EUR","MDE","AAC","AMR","FIN","CAS","SAS","EAS","CAH"]
zyg_list = []

for ancestry in ancestries:
    for group in ["cases", "controls"]:
        filename = f"AMPPD_Alex_vars_all_{ancestry}_{group}2_recode.raw"
        if not os.path.exists(filename):
            print(f"Missing file: {filename}")
            continue

        print(f"Processing: {filename}")
        df = pd.read_csv(filename, sep="\t")
        # Drop the sample metadata so only genotype columns remain.
        df.drop(columns=["FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE"], inplace=True)
        # Missing genotypes become 2, so they are counted neither as 0 nor as 1 below.
        df = df.fillna(2).astype(int)

        snps = [snp_id_from_raw_header(snpid) for snpid in df.columns]
        # NOTE(review): treating 0 as homozygous-alternate assumes the .raw file
        # counts the reference allele — confirm against the allele suffix in the headers.
        hom_alt = (df == 0).sum()
        het = (df == 1).sum()

        # Per-variant number of samples with each genotype in this ancestry/group.
        df_zyg = pd.DataFrame({
            "SNP": snps,
            f"{ancestry}_{group}_hom_alt_ac": hom_alt.values,
            f"{ancestry}_{group}_het_ac": het.values
        })

        zyg_list.append(df_zyg)


if zyg_list:
    # Outer-join all per-ancestry/group tables on SNP, then attach them to the
    # annotation + frequency table.
    merged_zyg = reduce(lambda left, right: pd.merge(left, right, on='SNP', how='outer'), zyg_list)

    merged = pd.read_csv("AMP_PD_8_merged_output2.txt", sep="\t")
    final_merged = pd.merge(merged, merged_zyg, on='SNP', how='left')
    final_merged.to_csv("AMP_PD_8_final_output2.txt", sep="\t", index=False)
    print("Final merged file written: AMP_PD_8_final_output2.txt")
else:
    print("No valid zygosity data found to merge.")


In [None]:
import pandas as pd

df_amp_merged = pd.read_csv("AMP_PD_8_final_output2.txt", sep="\t")

# Frequency columns are the ones whose header contains "NCHROBS".
amp_freq_cols = df_amp_merged.filter(like="NCHROBS").columns

# Annotation filter: splicing or exonic, excluding synonymous SNVs.
in_target_region = df_amp_merged["Func.refGene"].isin(["splicing", "exonic"])
is_synonymous = df_amp_merged["ExonicFunc.refGene"] == "synonymous SNV"
criteria_amp_anno = in_target_region & ~is_synonymous

df_amp_filtered = df_amp_merged[criteria_amp_anno]

# Frequency filter: a non-zero value in at least one frequency column.
criteria_amp_freq = df_amp_filtered[amp_freq_cols].gt(0.0).any(axis=1)
df_amp_filtered = df_amp_filtered[criteria_amp_freq]

df_amp_filtered.to_csv("AMP_PD_8_filtered_coding_output2.txt", sep="\t", index=False)


In [None]:
# Line/word/byte counts of the coding-variant table (the line count includes the header row).
! wc AMP_PD_8_filtered_coding_output2.txt

In [None]:
import pandas as pd

df_amp_filtered = pd.read_csv("AMP_PD_8_filtered_coding_output2.txt", sep="\t")

# Control frequency columns are the ones whose header contains "Controls MAF".
amp_controls_cols = df_amp_filtered.filter(like="Controls MAF").columns

# Cases-only variants: frequency exactly 0 in every control column.
absent_in_all_controls = df_amp_filtered[amp_controls_cols].eq(0.0).all(axis=1)
df_amp_filtered_casesonly = df_amp_filtered[absent_in_all_controls]

df_amp_filtered_casesonly.to_csv("AMP_PD_8__coding_casesonly_output2.txt", sep="\t", index=False)


In [None]:
# Line/word/byte counts of the cases-only table (the line count includes the header row).
! wc AMP_PD_8__coding_casesonly_output2.txt

In [None]:
import pandas as pd

df_amp_casesonly = pd.read_csv("AMP_PD_8__coding_casesonly_output2.txt", sep="\t")

# The annotation step writes "." for missing values, which makes pandas load
# CADD_phred as text and breaks a direct ">= 20" comparison. Coerce to numbers
# first; non-numeric entries become NaN and are kept, like other missing scores.
cadd_phred = pd.to_numeric(df_amp_casesonly["CADD_phred"], errors="coerce")

# Keep variants with CADD >= 20 or without a CADD score.
criteria_amp_cadd = (cadd_phred >= 20) | cadd_phred.isna()
df_amp_casesonly_cadd_filtered = df_amp_casesonly[criteria_amp_cadd]

df_amp_casesonly_cadd_filtered.to_csv("AMP_PD_8__coding_casesonly_CADD20_output2.txt", sep="\t", index=False)


In [None]:
# Line/word/byte counts of the CADD-filtered table (the line count includes the header row).
! wc AMP_PD_8__coding_casesonly_CADD20_output2.txt

In [None]:
import pandas as pd

df_amp_casesonly_cadd = pd.read_csv("AMP_PD_8__coding_casesonly_CADD20_output2.txt", sep="\t")

criteria_${ANCESTRY}_ac = (df_amp_casesonly_cadd["${ANCESTRY}_cases_het_ac"] >= 2) | (df_amp_casesonly_cadd["${ANCESTRY}_cases_hom_alt_ac"] >= 2)
df_amp_casesonly_cadd_${ANCESTRY}_filtered = df_amp_casesonly_cadd[criteria_${ANCESTRY}_ac]

df_amp_casesonly_cadd_${ANCESTRY}_filtered.to_csv("AMP_PD_8__coding_casesonly_CADD20_${ANCESTRY}AC2_output2.txt", sep="\t", index=False)



In [None]:
# Line/word/byte counts of the allele-count-filtered table (the line count includes the header row).
! wc AMP_PD_8__coding_casesonly_CADD20_${ANCESTRY}AC2_output2.txt

In [None]:
# Print the final filtered table.
! cat AMP_PD_8__coding_casesonly_CADD20_${ANCESTRY}AC2_output2.txt

## Count total variants per ancestry

In [None]:
import pandas as pd

df = pd.read_csv("AMP_PD_8_merged_output2.txt", sep="\t")

ancestries = ["AAC", "AFR", "AJ", "AMR", "CAH", "CAS", "EAS", "EUR", "FIN", "MDE", "SAS"]

summary_tables = []

for ancestry in ancestries:
    # The MAF column names end with an NCHROBS value, so look them up by prefix.
    control_col = next((col for col in df.columns if col.startswith(f"{ancestry} Controls MAF")), None)
    case_col = next((col for col in df.columns if col.startswith(f"{ancestry} Cases MAF")), None)

    # An ancestry without frequency files has no columns here; skip it instead of failing.
    if control_col is None or case_col is None:
        print(f"Missing MAF columns for ancestry: {ancestry}, skipping.")
        continue

    # Variants observed in this ancestry: non-zero frequency in controls or cases.
    ancestry_variants = df[(df[control_col] > 0) | (df[case_col] > 0)].copy()
    if ancestry_variants.empty:
        continue

    # Counts per functional region.
    func_counts = ancestry_variants.groupby("Func.refGene").size().reset_index(name="VariantCount")
    func_counts["Ancestry"] = ancestry

    # Counts per exonic consequence, for exonic variants only.
    exonic_variants = ancestry_variants[ancestry_variants["Func.refGene"] == "exonic"]
    exonic_counts = exonic_variants.groupby("ExonicFunc.refGene").size().reset_index(name="VariantCount")
    exonic_counts["Ancestry"] = ancestry
    exonic_counts["Func.refGene"] = "exonic"

    summary_tables.append(func_counts)
    summary_tables.append(exonic_counts)

if summary_tables:
    final_summary = pd.concat(summary_tables, ignore_index=True)
    final_summary.to_csv("AMP_PD_variant_ancestry_func_and_exonic_type_counts2.txt", sep="\t", index=False)
else:
    # pd.concat raises on an empty list, so report the empty result explicitly.
    print("No variants found for any ancestry; summary not written.")



In [None]:
# Preview the first lines of the per-ancestry summary table.
! head AMP_PD_variant_ancestry_func_and_exonic_type_counts2.txt

## Association analysis

In [1]:
import numpy as np
import pandas as pd
import sys
from functools import reduce
import argparse

In [None]:
%%bash
# Keep only variants passing --mac 2 and write a new bed/bim/fam fileset.
module load plink

plink2 \
  --bfile AMPPD_Alex_vars_all_${ANCESTRY} \
  --mac 2 \
  --make-bed \
  --out AMPPD_Alex_vars_all_MAC2_${ANCESTRY}

In [None]:
%%bash
# Convert the MAC-filtered fileset to VCF (--recode vcf-iid).
module load plink

plink2 \
  --bfile AMPPD_Alex_vars_all_MAC2_${ANCESTRY} \
  --recode vcf-iid \
  --out AMPPD_Alex_vars_all_MAC2_${ANCESTRY}_recode

In [34]:
# Compress the recoded VCF; gzip replaces the .vcf with a .vcf.gz file.
!gzip AMPPD_Alex_vars_all_MAC2_${ANCESTRY}_recode.vcf

In [5]:
import pandas as pd

# Restrict the full ANNOVAR table to variants annotated as splicing or exonic.
df = pd.read_csv("AMP_PD_8.vcf.anno.hg38_multianno.txt", sep='\t')

in_target_region = df['Func.refGene'].isin(['splicing', 'exonic'])
filtered_df = df.loc[in_target_region]

filtered_df.to_csv("AMP_PD_8_splicing_exonic.txt", sep='\t', index=False)


In [None]:
# Line/word/byte counts of the splicing/exonic table (the line count includes the header row).
! wc AMP_PD_8_splicing_exonic.txt

In [None]:
import gzip
import os

import pandas as pd

filter_df = pd.read_csv("AMP_PD_8_splicing_exonic.txt", sep='\t', usecols=["Chr", "Start", "Ref", "Alt"])

# Drop any "chr" prefix so chromosome names compare equal on both sides.
filter_df['Chr'] = filter_df['Chr'].astype(str).str.replace('chr', '', regex=False)

# Lookup set of (chrom, pos, ref, alt) string tuples for the variants to keep.
# NOTE(review): matching is on exact strings; if the annotation table stores
# indels with a different position/allele convention than the VCF, those
# variants will not match — confirm.
filter_set = set(zip(filter_df['Chr'], filter_df['Start'].astype(str), filter_df['Ref'], filter_df['Alt']))

ancestries = ["EUR", "AFR", "AMR", "EAS", "SAS", "AAC", "AJ", "CAS", "CAH", "FIN", "MDE"]

for ancestry in ancestries:
    input_vcf = f"AMPPD_Alex_vars_all_MAC2_{ancestry}_recode.vcf"
    output_vcf = f"AMPPD_Alex_vars_splicing_exonic_{ancestry}.vcf"

    # The recoded VCF is gzip-compressed by an earlier cell, so accept either
    # the plain or the .gz file, and skip ancestries that have neither.
    if os.path.exists(input_vcf):
        opener, source = open, input_vcf
    elif os.path.exists(input_vcf + ".gz"):
        opener, source = gzip.open, input_vcf + ".gz"
    else:
        print(f"Missing file: {input_vcf}")
        continue

    with opener(source, 'rt') as infile, open(output_vcf, 'w') as outfile:
        for line in infile:
            if line.startswith("#"):
                # Header lines are copied through unchanged.
                outfile.write(line)
                continue

            cols = line.strip().split("\t")
            chrom, pos, ref, alt = cols[0].replace('chr', ''), cols[1], cols[3], cols[4]
            if (chrom, pos, ref, alt) in filter_set:
                outfile.write(line)



In [None]:
# Line/word/byte counts of the filtered VCF (the line count includes the header lines).
! wc AMPPD_Alex_vars_splicing_exonic_${ANCESTRY}.vcf

In [86]:
# Compress with bgzip (BGZF, still readable as gzip) rather than plain gzip, so the
# .vcf.gz can be tabix-indexed for the burden analysis. -k keeps the uncompressed VCF,
# which the later bgzip cells read; -f overwrites an existing .vcf.gz.
!bgzip -f -k AMPPD_Alex_vars_splicing_exonic_${ANCESTRY}.vcf

## Creating pheno file

In [None]:
import pandas as pd
import os

ancestries = ["AFR", "AJ", "EUR", "MDE", "AAC", "AMR", "FIN", "CAS", "SAS", "EAS", "CAH"]

covariate_file = "/${WORK_DIR}/fromTerra/addedPHENO_COVFILE_releasev3_SEPT2022_fromTerra.csv"
cov_df = pd.read_csv(covariate_file)

sample_id_column = 'ID'
pheno_column = 'PD_PHENO'

# For each ancestry: sample list -> drop related samples -> attach PD_PHENO ->
# keep phenotypes 1/2 -> write a FID/IID/PHENO table.
for ancestry in ancestries:
    sample_file = f"/${WORK_DIR}/FILTERED.AMP_PD_ancestry_{ancestry}.samples"
    remove_file = f"/${WORK_DIR}/toRemove_1stand2ndDegree_Relateds_{ancestry}_noDups.txt"
    output_file = f"FILTERED.AMP_PD_ancestry_{ancestry}_PHENO.txt"

    if not os.path.isfile(sample_file):
        print(f"⚠️ Sample file not found: {sample_file}")
        continue

    samples = pd.read_csv(sample_file, sep="\t", header=None, names=["FID", "IID"])

    if os.path.isfile(remove_file):
        to_remove = pd.read_csv(remove_file, sep="\t", header=None, names=["FID", "IID"])
        before = len(samples)
        samples = samples[~samples["IID"].isin(to_remove["IID"])]
        print(f"🧹 {ancestry}: {before - len(samples)} related samples removed.")
    else:
        # Best effort: without a removal list the ancestry is processed unfiltered.
        print(f"⚠️ No related file found for {ancestry}, skipping related filtering.")

    # Left join keeps every sample; samples without a covariate row get a missing
    # phenotype and are dropped by the 1/2 filter below.
    merged = samples.merge(cov_df[[sample_id_column, pheno_column]],
                           left_on="IID", right_on=sample_id_column, how="left")

    filtered = merged[merged[pheno_column].isin([1, 2])]

    # Rename by column name rather than by position.
    final = filtered[["FID", "IID", pheno_column]].rename(columns={pheno_column: "PHENO"})

    final.to_csv(output_file, sep="\t", index=False)

    print(f"✅ Ancestry {ancestry}: {len(final)} samples saved to {output_file}")



In [None]:
import pandas as pd

covar_file = "qc_covar_PD_unrelateds_only.csv"
covar_df = pd.read_csv(covar_file)

# Per-ancestry PCA eigenvector files to attach to the covariates.
pca_files = {
    "AJ": "/${WORK_DIR}/PCA.FILTERED.AMP_PD_AJ.PD.eigenvec",
    "EUR": "/${WORK_DIR}/PCA.FILTERED.AMP_PD_EUR.PD.eigenvec"
}

merged_list = []

for ancestry, pca_path in pca_files.items():
    # sep=r"\s+" replaces the deprecated delim_whitespace=True (same parsing).
    pca_df = pd.read_csv(pca_path, sep=r"\s+")

    if '#FID' in pca_df.columns:
        pca_df.rename(columns={'#FID': 'FID'}, inplace=True)

    # Inner join: only samples present in both the covariates and the PCA file remain.
    merged = covar_df.merge(pca_df, left_on="ID", right_on="IID", how="inner")

    merged["Ancestry"] = ancestry

    merged_list.append(merged)

    print(f"✅ {ancestry}: Matched {len(merged)} samples.")

final_merged = pd.concat(merged_list, ignore_index=True)

# Both FID and IID are set to the sample ID.
final_merged['FID'] = final_merged['ID']
final_merged['IID'] = final_merged['ID']

# Move FID and IID to the front, leaving the other columns in their existing order.
cols = ['FID', 'IID'] + [col for col in final_merged.columns if col not in ['FID', 'IID']]
final_merged = final_merged[cols]

final_merged.to_csv("qc_covar_PD_unrelateds_with_PCs.txt", sep='\t', index=False)



In [None]:
import pandas as pd

# The PC-augmented covariate table is written as a tab-separated
# qc_covar_PD_unrelateds_with_PCs.txt, so read that file with sep="\t".
df = pd.read_csv("qc_covar_PD_unrelateds_with_PCs.txt", sep="\t")

# Number of missing values in each of the two age columns.
missing_counts = df[['AGE_BASELINE', 'AGE_DIAGNOSIS']].isnull().sum()

print("🔍 Missing data summary:")
print(missing_counts)


In [None]:
import pandas as pd

# The PC-augmented covariate table is written as a tab-separated
# qc_covar_PD_unrelateds_with_PCs.txt, so read that file with sep="\t".
df = pd.read_csv("qc_covar_PD_unrelateds_with_PCs.txt", sep="\t")

# AGE = AGE_DIAGNOSIS where PD_PHENO == 2, AGE_BASELINE where PD_PHENO == 1,
# missing for any other phenotype code (vectorised instead of a row-wise apply).
age_if_pheno_1 = df['AGE_BASELINE'].where(df['PD_PHENO'] == 1)
df['AGE'] = df['AGE_DIAGNOSIS'].where(df['PD_PHENO'] == 2, age_if_pheno_1)

df.to_csv("qc_covar_PD_unrelateds_with_PCs_with_AGE.csv", index=False)


In [None]:
import pandas as pd

df = pd.read_csv("qc_covar_PD_unrelateds_with_PCs_with_AGE.csv")

# Number of rows with no value in the derived AGE column.
missing_counts = df[['AGE']].isnull().sum()

# The leading emoji was stored as mis-decoded bytes; it is restored here.
print("🔍 Missing data summary:")
print(missing_counts)

In [None]:
import pandas as pd

# Columns passed on to the association and burden steps: IDs, sex, baseline age, PC1-PC10.
columns_to_keep = ['FID', 'IID', 'SEX', 'AGE_BASELINE', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']

covar_file = "qc_covar_PD_unrelateds_with_PCs.txt"
covar_df = pd.read_csv(covar_file, sep="\t")

# Remove stray whitespace around the header names before selecting columns.
covar_df.columns = [name.strip() for name in covar_df.columns]

filtered_df = covar_df.loc[:, columns_to_keep]

filtered_df.to_csv("qc_covar_PD_unrelateds_filtered.txt", sep='\t', index=False)


In [None]:
%%bash
# Logistic regression (plink2 --glm) per ancestry on the splicing/exonic VCF,
# adjusted for sex, baseline age and PC1-PC10.
module load plink/2.0

ancestries=("EUR")

output_dir="/${WORK_DIR}"

mkdir -p "$output_dir"

for ancestry in "${ancestries[@]}"; do
  echo "Processing ancestry: $ancestry"

  # Resolve the VCF inside the loop so the genotype file always belongs to the
  # same ancestry as the phenotype file, whatever is listed in `ancestries`.
  vcf_file="/${WORK_DIR}/AMPPD_Alex_vars_splicing_exonic_${ancestry}.vcf.gz"

  plink2 \
    --vcf "$vcf_file" \
    --double-id \
    --pheno "FILTERED.AMP_PD_ancestry_${ancestry}_PHENO.txt" \
    --adjust \
    --ci 0.95 \
    --covar "qc_covar_PD_unrelateds_filtered.txt" \
    --covar-name SEX,AGE_BASELINE,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10 \
    --threads 15 \
    --covar-variance-standardize \
    --out "${output_dir}/Logistic_FID_IID_PHENO_case_controls_${ancestry}_AMPPD_chr11_Alex" \
    --glm omit-ref firth-fallback cols=+a1freq,+a1freqcc,+a1count,+totallele,+a1countcc,+totallelecc,+gcountcc,+err \
    --silent
done

In [None]:

import pandas as pd

# Keep only the rows of the EUR result table whose TEST column is "ADD".
file_path = "Logistic_FID_IID_PHENO_case_controls_EUR_AMPPD_chr11_Alex.PHENO.glm.logistic.hybrid"
df = pd.read_csv(file_path, sep="\t")

is_additive_test = df['TEST'].eq('ADD')
filtered_df = df.loc[is_additive_test]

filtered_df.to_csv("Logistic_FID_IID_PHENO_case_controls_EUR_AMPPD_chr11_Alex.PHENO.glm.logistic.hybrid.ADD_only.txt", sep='\t', index=False)



In [None]:
%%bash
# Logistic regression (plink2 --glm) per ancestry on the splicing/exonic VCF,
# adjusted for sex, baseline age and PC1-PC10.
module load plink/2.0

ancestries=("AJ")

output_dir="/${WORK_DIR}"

mkdir -p "$output_dir"

for ancestry in "${ancestries[@]}"; do
  echo "Processing ancestry: $ancestry"

  # Resolve the VCF inside the loop so the genotype file always belongs to the
  # same ancestry as the phenotype file, whatever is listed in `ancestries`.
  vcf_file="/${WORK_DIR}/AMPPD_Alex_vars_splicing_exonic_${ancestry}.vcf.gz"

  plink2 \
    --vcf "$vcf_file" \
    --double-id \
    --pheno "FILTERED.AMP_PD_ancestry_${ancestry}_PHENO.txt" \
    --adjust \
    --ci 0.95 \
    --covar "qc_covar_PD_unrelateds_filtered.txt" \
    --covar-name SEX,AGE_BASELINE,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10 \
    --threads 15 \
    --covar-variance-standardize \
    --out "${output_dir}/Logistic_FID_IID_PHENO_case_controls_${ancestry}_AMPPD_chr11_Alex" \
    --glm omit-ref firth-fallback cols=+a1freq,+a1freqcc,+a1count,+totallele,+a1countcc,+totallelecc,+gcountcc,+err \
    --silent
done

In [None]:
import pandas as pd

# Keep only the rows of the AJ result table whose TEST column is "ADD".
file_path = "Logistic_FID_IID_PHENO_case_controls_AJ_AMPPD_chr11_Alex.PHENO.glm.logistic.hybrid"
df = pd.read_csv(file_path, sep="\t")

is_additive_test = df['TEST'].eq('ADD')
filtered_df = df.loc[is_additive_test]

filtered_df.to_csv("Logistic_FID_IID_PHENO_case_controls_AJ_AMPPD_chr11_Alex.PHENO.glm.logistic.hybrid.ADD_only.txt", sep='\t', index=False)



## Burden analysis

In [None]:
# Download and unpack the rvtests v2.1.0 Linux release archive.
! wget https://github.com/zhanxw/rvtests/releases/download/v2.1.0/rvtests_linux64.tar.gz
! tar -xvzf  rvtests_linux64.tar.gz

In [167]:
# BGZF-compress the EUR VCF for tabix. -k keeps the uncompressed file; -f overwrites an
# existing .vcf.gz (e.g. one produced by plain gzip, which tabix cannot index).
! bgzip -f -k AMPPD_Alex_vars_splicing_exonic_EUR.vcf

In [168]:
# Build the tabix index for the EUR VCF (-f overwrites an existing index).
! tabix -f -p vcf AMPPD_Alex_vars_splicing_exonic_EUR.vcf.gz

In [11]:
# BGZF-compress the AJ VCF for tabix. -k keeps the uncompressed file; -f overwrites an
# existing .vcf.gz (e.g. one produced by plain gzip, which tabix cannot index).
! bgzip -f -k AMPPD_Alex_vars_splicing_exonic_AJ.vcf

In [12]:
# Build the tabix index for the AJ VCF (-f overwrites an existing index).
! tabix -f -p vcf AMPPD_Alex_vars_splicing_exonic_AJ.vcf.gz

In [3]:
import pandas as pd

# Combine the covariates and the EUR phenotypes into the single table given to rvtest.
covar = pd.read_csv("qc_covar_PD_unrelateds_filtered.txt", sep="\t")
pheno = pd.read_csv("FILTERED.AMP_PD_ancestry_EUR_PHENO.txt", sep="\t")

# Inner join: only samples present in both tables are kept.
merged = covar.merge(pheno, on=["FID", "IID"], how="inner")

merged.to_csv("merged_file.txt", sep="\t", index=False)


In [None]:
# Run rvtest with the skat and skato kernels on the EUR VCF, restricted to gene SORL1.
# Phenotype (PHENO) and covariates are both read from merged_file.txt.
!executable/rvtest \
    --inVcf "AMPPD_Alex_vars_splicing_exonic_EUR.vcf.gz" \
    --out "Alex_exonic_splicing_SORL1_Burden_EUR" \
    --numThread 10 \
    --noweb \
    --hide-covar \
    --kernel skat,skato \
    --pheno "merged_file.txt" \
    --pheno-name PHENO \
    --geneFile "/data/CARD_AA/data/AD_Marzieh/Alex/refFlat.txt" \
    --covar "merged_file.txt" \
    --covar-name SEX,AGE_BASELINE,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10 \
    --multipleAllele \
    --gene SORL1

In [6]:
import pandas as pd

# Combine the covariates and the AJ phenotypes into the single table given to rvtest.
covar = pd.read_csv("qc_covar_PD_unrelateds_filtered.txt", sep="\t")
pheno = pd.read_csv("FILTERED.AMP_PD_ancestry_AJ_PHENO.txt", sep="\t")

# Inner join: only samples present in both tables are kept.
merged = covar.merge(pheno, on=["FID", "IID"], how="inner")

merged.to_csv("merged_file_AJ.txt", sep="\t", index=False)


In [None]:
# Run rvtest with the skat and skato kernels on the AJ VCF, restricted to gene SORL1.
# Phenotype (PHENO) and covariates are both read from merged_file_AJ.txt.
!executable/rvtest \
    --inVcf "AMPPD_Alex_vars_splicing_exonic_AJ.vcf.gz" \
    --out "Alex_exonic_splicing_SORL1_Burden_AJ" \
    --numThread 10 \
    --noweb \
    --hide-covar \
    --kernel skat,skato \
    --pheno "merged_file_AJ.txt" \
    --pheno-name PHENO \
    --geneFile "/data/CARD_AA/data/AD_Marzieh/Alex/refFlat.txt" \
    --covar "merged_file_AJ.txt" \
    --covar-name SEX,AGE_BASELINE,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10 \
    --multipleAllele \
    --gene SORL1

## Calculate F_U and F_A (allele frequencies in unaffected and affected samples)

In [None]:
%%bash
# Basic case/control association (plink --assoc) per ancestry on the splicing/exonic VCF.
module load plink/1.9

ancestries=("EUR")

output_dir="/${WORK_DIR}"

mkdir -p "$output_dir"

for ancestry in "${ancestries[@]}"; do
  echo "Processing ancestry: $ancestry"

  # Resolve the VCF inside the loop so the genotype file always belongs to the
  # same ancestry as the phenotype file, whatever is listed in `ancestries`.
  vcf_file="/${WORK_DIR}/AMPPD_Alex_vars_splicing_exonic_${ancestry}.vcf.gz"

  plink \
    --vcf "$vcf_file" \
    --double-id \
    --pheno "FILTERED.AMP_PD_ancestry_${ancestry}_PHENO.txt" \
    --pheno-name PHENO \
    --assoc \
    --adjust \
    --ci 0.95 \
    --allow-no-sex \
    --threads 15 \
    --out "${output_dir}/Asso_FID_IID_PHENO_case_controls_${ancestry}_AMPPD_chr11_Alex" \
    --silent
done

In [None]:
%%bash
# Basic case/control association (plink --assoc) per ancestry on the splicing/exonic VCF.
module load plink/1.9

ancestries=("AJ")

output_dir="/${WORK_DIR}"

mkdir -p "$output_dir"

for ancestry in "${ancestries[@]}"; do
  echo "Processing ancestry: $ancestry"

  # Resolve the VCF inside the loop so the genotype file always belongs to the
  # same ancestry as the phenotype file, whatever is listed in `ancestries`.
  vcf_file="/${WORK_DIR}/AMPPD_Alex_vars_splicing_exonic_${ancestry}.vcf.gz"

  plink \
    --vcf "$vcf_file" \
    --double-id \
    --pheno "FILTERED.AMP_PD_ancestry_${ancestry}_PHENO.txt" \
    --pheno-name PHENO \
    --assoc \
    --adjust \
    --ci 0.95 \
    --allow-no-sex \
    --threads 15 \
    --out "${output_dir}/Asso_FID_IID_PHENO_case_controls_${ancestry}_AMPPD_chr11_Alex" \
    --silent
done