# ADSP

* **Project:** ADRD-SORL1-Biobanks
* **Version:** Python/3.10
* **Last Updated:** 14-Jun-2025

## Notebook Overview
Characterization of SORL1 variants: allele frequencies, association analysis, burden analysis, and clinical data.

# Query ADSP to check for variants of interest, allele frequency, and to calculate missingness

## Variables used 

- `${ANCESTRY}` = EUR, AFR, AMR, AAC, AJ, MDE, SAS, CAS, EAS, FIN, CAH
- `chr${}:Position:A1:A2` = Chromosome number, position, reference and alternative alleles
- `${}` = 1-10

In [None]:
import pandas as pd

In [10]:
%%bash
# Subset chr11 bp 121452314-121633763 (presumably the SORL1 locus) from the
# r4 WGS pfile, keep variants with minor allele count >= 2, and write a
# PLINK 1 bed/bim/fam fileset under the prefix bed_Alex_output.txt.
# The input path is quoted so a WORK_DIR containing spaces cannot split it.
module load plink

plink2 \
  --pfile "/${WORK_DIR}/chr11.compact_filtered.r4.wgs.biallelic" \
  --chr 11 \
  --from-bp 121452314 \
  --to-bp 121633763 \
  --mac 2 \
  --make-bed \
  --out bed_Alex_output.txt


In [None]:
%%bash
# Same region/MAC subset as the bed export, but written as a PLINK 2
# pgen/pvar/psam fileset under the prefix Alex_output.txt.
# The input path is quoted so a WORK_DIR containing spaces cannot split it.
module load plink

plink2 \
  --pfile "/${WORK_DIR}/chr11.compact_filtered.r4.wgs.biallelic" \
  --chr 11 \
  --from-bp 121452314 \
  --to-bp 121633763 \
  --mac 2 \
  --make-pgen \
  --out Alex_output.txt

In [None]:
%%bash
# Recode the region bed fileset with "--recode A" (additive allele counts).
module load plink

plink2 \
  --bfile bed_Alex_output.txt \
  --recode A \
  --out recoded_bed_Alex_output.txt

In [None]:
%%bash
# Report per-sample and per-variant missingness for the region pfile.
# Uses the double-dash flag spelling (--pfile) like every other plink2 call
# in this notebook.
module load plink

plink2 \
  --pfile Alex_output.txt \
  --missing \
  --out Alex_output.txt_missing

In [None]:
%%bash
# Allele-frequency report for the region pfile.
module load plink

plink2 \
  --pfile Alex_output.txt \
  --freq \
  --out Alex_output.txt_freq

In [None]:
# Load the ancestry-specific .psam sample table (tab-separated) and preview it.
# NOTE(review): `${ANCESTRY}` / `${WORK_DIR}` look like template placeholders
# (one ancestry code per run), not literal Python — substitute before running.
${ANCESTRY}= pd.read_csv("${WORK_DIR}/FILTERED.merged_biallelic_${ANCESTRY}.psam", sep = '\t')
${ANCESTRY}.head()

In [None]:
# Keep only the #FID and IID columns and write them as a tab-separated
# sample list (header row included, no index) for a later PLINK --keep.
# NOTE(review): `${ANCESTRY}` looks like a template placeholder — substitute before running.
${ANCESTRY}_keep = ${ANCESTRY} [["#FID", "IID"]]
${ANCESTRY}_keep.to_csv("adsp_${ANCESTRY}_keep.txt", sep="\t", index=False)

In [None]:
%%bash
# Restrict the region fileset to the samples listed in the ancestry keep file.
# Expansions are quoted so the file names stay single arguments.
module load plink/1.9

plink \
  --bfile bed_Alex_output.txt \
  --keep "adsp_${ANCESTRY}_keep.txt" \
  --make-bed \
  --out "bed_Alex_adsp_${ANCESTRY}"

In [None]:
%%bash
# Drop the samples listed in the ancestry's ".related" removal list, producing
# the "_unrelated" fileset. Expansions are quoted so paths stay single arguments.
module load plink/1.9

plink \
  --bfile "bed_Alex_adsp_${ANCESTRY}" \
  --remove "/${WORK_DIR}/REMOVE.FILTERED.merged_biallelic_${ANCESTRY}.related" \
  --make-bed \
  --out "bed_Alex_adsp_${ANCESTRY}_unrelated"

In [None]:
%%bash
# Allele frequencies in the unrelated samples of one ancestry.
# Expansions are quoted so the prefixes stay single arguments.
module load plink/1.9

plink \
  --bfile "bed_Alex_adsp_${ANCESTRY}_unrelated" \
  --freq \
  --out "bed_Alex_adsp_${ANCESTRY}_unrelated_freq"

In [None]:
# Read the tab-separated QC covariate table and show its first rows.
qc_covar_path = "/${WORK_DIR}/covars_for_QC.txt"
qc_covar = pd.read_csv(qc_covar_path, sep="\t")
qc_covar.head()

In [None]:
# Cases are the covariate rows whose PHENO value equals 2.
is_case = qc_covar["PHENO"] == 2
qc_case = qc_covar[is_case]
qc_case.info()

In [None]:
# Write the case FID/IID pairs, tab-separated with a header row, for PLINK --keep.
qc_case_plink = qc_case.loc[:, ["FID", "IID"]]
qc_case_plink.to_csv("qc_case_plink.txt", index=False, sep="\t")

In [None]:
# `qc_control` was referenced here without ever being defined, which raised a
# NameError. Define it the same way `qc_case` is derived from `qc_covar`.
# NOTE(review): controls are assumed to be PHENO == 1 (the PLINK case/control
# coding that pairs with PHENO == 2 for cases) — confirm against covars_for_QC.txt.
qc_control = qc_covar[qc_covar["PHENO"] == 1]

# Write the control FID/IID pairs, tab-separated with a header row, for PLINK --keep.
qc_control_plink = qc_control[["FID", "IID"]]
qc_control_plink.to_csv("qc_control_plink.txt", sep="\t", index=False)

In [None]:
%%bash
# Keep only the QC case samples from the unrelated ancestry fileset.
# Expansions are quoted so the prefixes stay single arguments.
module load plink/1.9

plink \
  --bfile "bed_Alex_adsp_${ANCESTRY}_unrelated" \
  --keep qc_case_plink.txt \
  --make-bed \
  --out "bed_Alex_adsp_${ANCESTRY}_unrelated_cases"

In [None]:
%%bash
# Allele frequencies among the unrelated cases of one ancestry.
# Expansions are quoted so the prefixes stay single arguments.
module load plink/1.9

plink \
  --bfile "bed_Alex_adsp_${ANCESTRY}_unrelated_cases" \
  --freq \
  --out "bed_Alex_adsp_${ANCESTRY}_unrelated_cases_freq"

In [None]:
%%bash
# Keep only the QC control samples from the unrelated ancestry fileset.
# Expansions are quoted so the prefixes stay single arguments.
module load plink/1.9

plink \
  --bfile "bed_Alex_adsp_${ANCESTRY}_unrelated" \
  --keep qc_control_plink.txt \
  --make-bed \
  --out "bed_Alex_adsp_${ANCESTRY}_unrelated_controls"

In [None]:
%%bash
# Allele frequencies among the unrelated controls of one ancestry.
# Expansions are quoted so the prefixes stay single arguments.
module load plink/1.9

plink \
  --bfile "bed_Alex_adsp_${ANCESTRY}_unrelated_controls" \
  --freq \
  --out "bed_Alex_adsp_${ANCESTRY}_unrelated_controls_freq"

In [None]:
%%bash
# Recode the unrelated-case fileset of one ancestry with "--recode A".
# Expansions are quoted so the prefixes stay single arguments; the output
# name (including its double underscore) is unchanged.
module load plink

plink2 \
  --bfile "bed_Alex_adsp_${ANCESTRY}_unrelated_cases" \
  --recode A \
  --out "recoded__${ANCESTRY}_case_Alex_output.txt"

# Association analysis

In [None]:
import numpy as np
import pandas as pd
import sys
from functools import reduce
import argparse
from glob import glob
import subprocess

In [None]:
%%bash
# Export the region bed fileset as a VCF ("vcf-iid" mode) for annotation.
module load plink

plink2 \
  --bfile bed_Alex_output.txt \
  --recode vcf-iid \
  --out Alex_output_MAC2_recode

In [None]:
%%bash
# Load the ANNOVAR environment module.
# NOTE(review): each %%bash cell presumably runs in its own subprocess, so this
# load may not carry over to other cells — confirm on the target cluster.
module load annovar

In [None]:
%%bash
# Annotate the MAC>=2 region VCF with ANNOVAR (hg38): refGene gene-based
# annotation plus four filter-based databases; missing values are written as ".".
# Output prefix: Alex_output_MAC2_recode.vcf.annovar
#
# The module is loaded in this cell because a %%bash cell runs as its own
# subprocess: table_annovar.pl and $ANNOVAR_DATA set up by a `module load`
# in a different cell would not be visible here.
module load annovar

table_annovar.pl Alex_output_MAC2_recode.vcf "${ANNOVAR_DATA}/hg38" \
    --buildver hg38 \
    --remove \
    --thread 48 \
    --maxgenethread 48 \
    --protocol refGene,clinvar_20140902,avsnp151,gnomad_genome,dbnsfp47a \
    --operation g,f,f,f,f \
    --nopolish \
    --nastring . \
    --out Alex_output_MAC2_recode.vcf.annovar \
    --vcfinput

In [None]:
import pandas as pd

# ANNOVAR multianno table in, exonic/splicing subset out (both tab-delimited).
input_file = "Alex_output_MAC2_recode.vcf.annovar.hg38_multianno.txt"
output_file = "Alex_exonic_splicing_variants.txt"

# Read the tab-delimited file
df = pd.read_csv(input_file, sep="\t")

# Func.refGene can carry several ';'-joined terms for one variant (e.g.
# "exonic;splicing"). A whole-string isin(["exonic", "splicing"]) drops such
# rows, so match each ';'-delimited term instead. Terms such as
# "ncRNA_exonic" still do not match because only whole terms are accepted.
is_exonic_or_splicing = (
    df["Func.refGene"]
    .astype(str)
    .str.contains(r"(?:^|;)(?:exonic|splicing)(?:;|$)", regex=True)
    .astype(bool)
)
filtered_df = df[is_exonic_or_splicing]

# Save the result to a new file
filtered_df.to_csv(output_file, sep="\t", index=False)

print(f"Filtered variants saved to {output_file}")


In [None]:
import pandas as pd

# Full ancestry list
ancestries = ["EUR", "AFR", "AMR", "EAS", "SAS", "AAC", "MDE", "AJ", "FIN", "CAS", "CAH"]

# For every ancestry whose phenotype file exists, write "<ancestry>.keep":
# space-separated FID IID pairs with no header, for PLINK --keep.
for ancestry in ancestries:
    pheno_file = f"/${WORK_DIR}/FID_IID_PHENO_{ancestry}.fam"
    try:
        # sep=r"\s+" replaces delim_whitespace=True, which newer pandas
        # releases deprecate; both split on runs of whitespace.
        df = pd.read_csv(pheno_file, sep=r"\s+")
        keep_file = f"{ancestry}.keep"
        df[['FID', 'IID']].to_csv(keep_file, sep=' ', index=False, header=False)
        print(f"Created {keep_file} from {pheno_file}")
    except FileNotFoundError:
        # Missing ancestries are skipped on purpose; later cells check for the keep file.
        print(f"WARNING: {pheno_file} not found, skipping.")


In [None]:
%%bash
# For each ancestry with a keep file: subset the region fileset to that
# ancestry's samples, then keep variants with at least one minor allele in the
# subset (--mac 1), writing Alex_output_MAC2_<ancestry>.{bed,bim,fam}.
# A failed PLINK step is reported and no longer feeds the next step.
module load plink/1.9

base_plink_file="bed_Alex_output.txt"
ancestries=("EUR" "AFR" "AMR" "EAS" "SAS" "AAC" "MDE" "AJ" "FIN" "CAS" "CAH")

for ancestry in "${ancestries[@]}"; do
    keep_file="${ancestry}.keep"
    subset_prefix="temp_${ancestry}"
    final_prefix="Alex_output_MAC2_${ancestry}"

    if [[ ! -f "$keep_file" ]]; then
        echo "WARNING: $keep_file not found, skipping $ancestry."
        continue
    fi

    echo "Processing $ancestry..."

    # Step 1: Subset samples by ancestry
    if ! plink \
            --bfile "$base_plink_file" \
            --keep "$keep_file" \
            --make-bed \
            --out "$subset_prefix"; then
        echo "WARNING: sample subsetting failed for $ancestry, skipping." >&2
        rm -f -- "${subset_prefix}".bed "${subset_prefix}".bim "${subset_prefix}".fam
        continue
    fi

    # Step 2: Filter variants with at least 1 minor allele in subset
    if ! plink \
            --bfile "$subset_prefix" \
            --mac 1 \
            --make-bed \
            --out "$final_prefix"; then
        echo "WARNING: --mac 1 filtering failed for $ancestry." >&2
    fi

    # Remove temporary files (-f: do not complain if a step produced none)
    rm -f -- "${subset_prefix}".bed "${subset_prefix}".bim "${subset_prefix}".fam
done


In [None]:
%%bash
module load plink

# Export every available per-ancestry fileset to VCF ("vcf-iid" mode);
# ancestries without a .bed file are reported and skipped.
for pop in EUR AFR AMR EAS SAS AAC MDE AJ FIN CAS CAH; do
    bfile_prefix="Alex_output_MAC2_${pop}"

    if [[ ! -f "${bfile_prefix}.bed" ]]; then
        echo "WARNING: ${bfile_prefix}.bed not found. Skipping ${pop}."
        continue
    fi

    echo "Running plink2 recode for ${pop}..."
    plink2 --bfile "$bfile_prefix" --recode vcf-iid --out "${bfile_prefix}_recode"
done


In [None]:
import pandas as pd
import os

# List of 11 ancestries
ancestries = ["EUR", "AFR", "AMR", "EAS", "SAS", "AAC", "MDE", "AJ", "FIN", "CAS", "CAH"]

# Input exonic variant list
exonic_variants_file = "Alex_exonic_splicing_variants.txt"

# Load exonic variants. The key columns are compared as strings because every
# VCF field below is read as text.
exonic_variants = pd.read_csv(exonic_variants_file, sep="\t")
for key_col in ("Chr", "Start", "Ref", "Alt"):
    exonic_variants[key_col] = exonic_variants[key_col].astype(str)

# One (chrom, pos, ref, alt) tuple per annotated variant. Matching on the full
# tuple fixes the previous filter, which tested the four columns independently
# and could keep a record whose POS, REF and ALT each matched *different*
# annotated variants.
# NOTE(review): ANNOVAR may report indels with trimmed coordinates/alleles
# ("-" placeholders) that differ from the VCF POS/REF/ALT; such indels would
# not match here — confirm whether they need the original VCF columns.
exonic_keys = set(
    zip(
        exonic_variants["Chr"],
        exonic_variants["Start"],
        exonic_variants["Ref"],
        exonic_variants["Alt"],
    )
)

# Process each ancestry's VCF
for ancestry in ancestries:
    vcf_file = f"Alex_output_MAC2_{ancestry}_recode.vcf"
    output_vcf = f"Alex_exonic_splicing_output_{ancestry}.vcf"

    if not os.path.exists(vcf_file):
        print(f"VCF for {ancestry} not found: {vcf_file}")
        continue

    # Read VCF: '#' lines are kept verbatim as the header, the rest are split on tabs
    vcf_header = []
    vcf_data = []
    with open(vcf_file, "r") as vcf:
        for line in vcf:
            if line.startswith("#"):
                vcf_header.append(line)
            else:
                vcf_data.append(line.strip().split("\t"))

    if not vcf_data:
        print(f"No variant data found in {vcf_file}")
        continue

    # Column positions come from the last header line (the "#CHROM ..." row)
    vcf_columns = vcf_header[-1].strip().split("\t")
    chrom_i, pos_i, ref_i, alt_i = (
        vcf_columns.index(name) for name in ("#CHROM", "POS", "REF", "ALT")
    )

    # Filter VCF: keep records whose full variant key is annotated exonic/splicing
    kept_records = [
        fields
        for fields in vcf_data
        if (fields[chrom_i], fields[pos_i], fields[ref_i], fields[alt_i]) in exonic_keys
    ]

    # Write filtered VCF
    with open(output_vcf, "w") as out_vcf:
        out_vcf.writelines(vcf_header)
        for fields in kept_records:
            out_vcf.write("\t".join(fields) + "\n")

    # The check mark was mis-encoded ("âœ“") in the original message
    print(f"[✓] Filtered VCF written for {ancestry}: {output_vcf}")


In [None]:
%%bash
# gzip each per-ancestry filtered VCF found in the current directory;
# missing files are reported and skipped.
for pop in EUR AFR AMR EAS SAS AAC MDE AJ FIN CAS CAH; do
    vcf_path="Alex_exonic_splicing_output_${pop}.vcf"

    if [[ ! -f "$vcf_path" ]]; then
        echo "WARNING: $vcf_path not found, skipping."
        continue
    fi

    echo "Compressing $vcf_path..."
    gzip "$vcf_path"
done

In [None]:
%%bash
module load plink/2.0

# Results are written into the work directory (created if absent).
output_dir="/${WORK_DIR}"
mkdir -p "$output_dir"

# Run one covariate-adjusted plink2 --glm per ancestry on its filtered VCF.
for ancestry in EUR AFR AMR EAS SAS AAC MDE AJ FIN CAS CAH; do
  echo "Processing ancestry: $ancestry"

  # Arguments are collected in an array so each flag/value stays one word.
  glm_args=(
    --vcf "/${WORK_DIR}/Alex_exonic_splicing_output_${ancestry}.vcf.gz"
    --double-id
    --pheno "/${WORK_DIR}/FID_IID_PHENO_${ancestry}.fam"
    --adjust
    --ci 0.95
    --covar "/${WORK_DIR}/covars_alldata_with999forAGRandRACE_PCA.txt"
    --covar-name SEX,AGE,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
    --threads 15
    --covar-variance-standardize
    --out "${output_dir}/New_Logistic_FID_IID_PHENO_case_controls_${ancestry}_vars_chr11_exonic_splicing_Alex"
    --glm omit-ref firth-fallback cols=+a1freq,+a1freqcc,+a1count,+totallele,+a1countcc,+totallelecc,+gcountcc,+err
    --silent
  )

  plink2 "${glm_args[@]}"
done

In [None]:
import pandas as pd
import os

# Directory holding the per-ancestry GLM result files
dir_path = "/${WORK_DIR}"

# (result file, ancestry code) pairs, built from the ancestry codes in the
# order the files are processed.
ancestry_codes = ["AAC", "AFR", "AMR", "AJ", "EUR", "CAS", "SAS", "MDE", "EAS", "FIN", "CAH"]
files_and_ancestries = [
    (
        f"New_Logistic_FID_IID_PHENO_case_controls_{code}_vars_chr11_exonic_splicing_Alex.PHENO1.glm.logistic.hybrid",
        code,
    )
    for code in ancestry_codes
]

# Add an 'ancestry' column to each result table and save it next to the input
for file_name, ancestry in files_and_ancestries:
    df = pd.read_csv(os.path.join(dir_path, file_name), sep='\t')
    df['ancestry'] = ancestry

    # ".hybrid" in the name is swapped for the tagged-file suffix
    output_file_path = os.path.join(
        dir_path, file_name.replace(".hybrid", "New__with_ancestry.txt")
    )
    df.to_csv(output_file_path, sep='\t', index=False)

    print(f"Updated file saved to {output_file_path}")

In [None]:
import pandas as pd
import os

# Directory holding the ancestry-tagged result files
dir_path = "/${WORK_DIR}"

# Tagged files to stack, in processing order
file_paths = [
    f"New_Logistic_FID_IID_PHENO_case_controls_{code}_vars_chr11_exonic_splicing_Alex.PHENO1.glm.logisticNew__with_ancestry.txt"
    for code in ["AAC", "AFR", "AMR", "AJ", "EUR", "CAS", "SAS", "MDE", "EAS", "FIN", "CAH"]
]

# Read every file and stack the rows under a fresh 0..n-1 index
df_list = [pd.read_csv(os.path.join(dir_path, name), sep='\t') for name in file_paths]
combined_df = pd.concat(df_list, ignore_index=True)

# Write the stacked table back to the work directory
output_file_path = os.path.join(dir_path, "New_Combined_Logistic_chr11_exonic_splicing_Alex_with_ancestry.hybrid")
combined_df.to_csv(output_file_path, sep='\t', index=False)

print(f"Combined data saved to {output_file_path}")


In [None]:
import pandas as pd

# Combined per-ancestry results (tab-separated)
file_path = "/${WORK_DIR}/New_Combined_Logistic_chr11_exonic_splicing_Alex_with_ancestry.hybrid"
df = pd.read_csv(file_path, sep='\t', low_memory=False)

# Keep only the rows whose TEST column is "ADD"
is_add_row = df['TEST'] == 'ADD'
filtered_df = df.loc[is_add_row]

# Write the ADD-only table
output_file_path = "/${WORK_DIR}/New_Filtered_Combined_Logistic_chr11_exonic_splicing_Alex_with_ancestry.hybrid"
filtered_df.to_csv(output_file_path, sep='\t', index=False)

print(f"Filtered data saved to {output_file_path}")


In [None]:
import pandas as pd

# ADD-only combined results (tab-separated)
file_path = "/${WORK_DIR}/New_Filtered_Combined_Logistic_chr11_exonic_splicing_Alex_with_ancestry.hybrid"
df = pd.read_csv(file_path, sep='\t')

# Variant identity, tested allele, p-value, odds ratio with its bounds, ancestry
columns_to_keep = ['#CHROM', 'POS', 'REF', 'ALT', 'A1', 'P', 'OR', 'L95', 'U95', 'ancestry']
filtered_df = df.loc[:, columns_to_keep]

# Write the reduced table
output_file_path = "/${WORK_DIR}/New_Selected_Combined_Logistic_chr11_exonic_splicing_Alex_with_ancestry.hybrid"
filtered_df.to_csv(output_file_path, sep='\t', index=False)

print(f"Data with selected columns saved to {output_file_path}")

In [None]:
import pandas as pd

# Reduced combined results (tab-separated)
file_path = "/${WORK_DIR}/New_Selected_Combined_Logistic_chr11_exonic_splicing_Alex_with_ancestry.hybrid"
df = pd.read_csv(file_path, sep='\t')

# Row order of ancestries inside each variant section
ancestry_order = ['EUR', 'AFR', 'AMR', 'EAS', 'SAS', 'MDE', 'AJ', 'FIN', 'AAC', 'CAS', 'CAH']

# Four-column rows of the report; one section per variant:
#   title row, column-label row, one row per ancestry present, blank row.
combined_data = []

for (chrom, pos, ref, alt), variant_rows in df.groupby(['#CHROM', 'POS', 'REF', 'ALT']):
    combined_data.append([f"{chrom}:{pos} {ref}>{alt}", "", "", ""])
    combined_data.append(["Ancestry", "A1", "P", "OR (L95_U95)"])

    # ancestry -> [A1, P, "OR (L95_U95)"]; a later row for the same ancestry
    # replaces an earlier one.
    stats_by_ancestry = {}
    for _, rec in variant_rows.iterrows():
        stats_by_ancestry[rec['ancestry']] = [
            rec['A1'],
            rec['P'],
            f"{rec['OR']} ({rec['L95']}_{rec['U95']})",
        ]

    combined_data.extend(
        [code] + stats_by_ancestry[code]
        for code in ancestry_order
        if code in stats_by_ancestry
    )

    # Blank separator row between variants
    combined_data.append(["", "", "", ""])

# Write the report without header or index
combined_df = pd.DataFrame(combined_data)

output_file_path = "New_Combined_Variations_Table_chr11_exonic_splicing_Alex.tsv"
combined_df.to_csv(output_file_path, sep='\t', header=False, index=False)

print(f"Combined variations table saved to {output_file_path}")


In [None]:
%%bash
# Unadjusted case/control association (plink --assoc with 95% CI and
# multiple-testing adjustment) per ancestry on its filtered, compressed VCF.
module load plink/1.9

# Define an array of ancestries
ancestries=("EUR" "AFR" "AMR" "EAS" "SAS" "AAC" "MDE" "AJ" "FIN" "CAS" "CAH")

# Define the output directory.
# Fixed: this was "/{WORK_DIR}" (no "$"), a literal directory name rather than
# the work directory every other cell uses; the --pheno path had the same slip.
output_dir="/${WORK_DIR}"

# Create the output directory if it does not exist
mkdir -p "$output_dir"

# Loop over each ancestry
for ancestry in "${ancestries[@]}"; do
  echo "Processing ancestry: $ancestry"

  # Define VCF file inside the loop
  vcf_file="${output_dir}/Alex_exonic_splicing_output_${ancestry}.vcf.gz"

  if [[ -f "$vcf_file" ]]; then
    plink \
      --vcf "$vcf_file" \
      --double-id \
      --pheno "/${WORK_DIR}/logistic_regression/FID_IID_PHENO_${ancestry}.fam" \
      --pheno-name PHENO1 \
      --assoc \
      --ci 0.95 \
      --adjust \
      --allow-no-sex \
      --threads 15 \
      --out "${output_dir}/New_Assoc_FID_IID_PHENO_case_controls_${ancestry}_chr11_exonic_splicing_Alex" \
      --silent
  else
    echo "WARNING: VCF file for $ancestry not found: $vcf_file"
  fi
done


# Burden analysis

In [None]:
# Download the rvtests v2.1.0 Linux release archive and unpack it in the
# current directory.
# NOTE(review): later cells call `executable/rvtest`, presumably provided by
# this archive — confirm the extracted layout.
! wget https://github.com/zhanxw/rvtests/releases/download/v2.1.0/rvtests_linux64.tar.gz
! tar -xvzf  rvtests_linux64.tar.gz

In [None]:
%%bash
# Produce BGZF-compressed copies (<name>.vcf.gz) of the per-ancestry filtered
# VCFs so they can be tabix-indexed.
#
# Two situations are handled:
#   * the plain .vcf exists      -> bgzip it, keeping the .vcf (-k) and
#                                   replacing any existing .vcf.gz (-f);
#   * only a .vcf.gz exists      -> it may have been written by plain gzip
#                                   (which also removes the .vcf), so it is
#                                   re-compressed as BGZF in place.
set -o pipefail

# Define an array of ancestries
ancestries=("EUR" "AFR" "AMR" "EAS" "SAS" "AAC" "MDE" "AJ" "FIN" "CAS" "CAH")

# Directory containing the VCF files
vcf_dir="/${WORK_DIR}"

# Loop through each ancestry
for ancestry in "${ancestries[@]}"; do
    vcf_file="${vcf_dir}/Alex_exonic_splicing_output_${ancestry}.vcf"
    gz_file="${vcf_file}.gz"

    if [[ -f "$vcf_file" ]]; then
        echo "Compressing $vcf_file..."
        bgzip -f -k "$vcf_file"
    elif [[ -f "$gz_file" ]]; then
        echo "Re-compressing $gz_file as BGZF..."
        tmp_file="${gz_file}.bgzf.tmp"
        # Write to a temporary file first so a failure leaves the original intact.
        if gzip -dc -- "$gz_file" | bgzip -c > "$tmp_file"; then
            mv -f -- "$tmp_file" "$gz_file"
        else
            echo "WARNING: could not re-compress $gz_file." >&2
            rm -f -- "$tmp_file"
        fi
    else
        echo "WARNING: File $vcf_file not found. Skipping."
    fi
done

In [None]:
%%bash

# Build (or rebuild, -f) a tabix VCF index for each compressed per-ancestry
# VCF present in the work directory.
vcf_dir="/${WORK_DIR}"

for pop in EUR AFR AMR EAS SAS AAC MDE AJ FIN CAS CAH; do
    gz_path="${vcf_dir}/Alex_exonic_splicing_output_${pop}.vcf.gz"

    if [[ ! -f "$gz_path" ]]; then
        echo "WARNING: File $gz_path not found. Skipping."
        continue
    fi

    echo "Indexing $gz_path..."
    tabix -f -p vcf "$gz_path"
done


In [None]:
# Read the tab-separated covariate/PCA table and display it.
covar_table_path = "/${WORK_DIR}/covars_alldata_with999forAGRandRACE_PCA.txt"
df_covar = pd.read_csv(covar_table_path, sep="\t")
df_covar

In [None]:
# For each ancestry, inner-join the covariates with that ancestry's phenotype
# table on FID/IID and write the result as burden_covar_<ancestry>.txt.
burden_ancestries = ['EUR', 'AFR', 'AMR', 'EAS', 'SAS', 'MDE', 'AJ', 'FIN', 'AAC', 'CAS', 'CAH']
for ancestry in burden_ancestries:
    pheno_path = f"/${WORK_DIR}/logistic_regression/FID_IID_PHENO_{ancestry}.fam"
    burden_path = f"/${WORK_DIR}/burden_covar_{ancestry}.txt"

    df_pheno = pd.read_csv(pheno_path, sep=" ")
    df_covar_ancestry = pd.merge(df_covar, df_pheno, how="inner", on=["FID", "IID"])
    df_covar_ancestry.to_csv(burden_path, sep="\t", index=False)


In [None]:
# Run rvtest once per ancestry with the SKAT and SKAT-O kernels, restricted to
# the gene SORL1 (gene boundaries from refFlat.txt). The same
# burden_covar_<ancestry>.txt file supplies both the phenotype (column PHENO)
# and the covariates (SEX, AGE, PC1-PC10).
# `{ancestry}` is filled in from the Python loop variable by the `!` shell escape.
# NOTE(review): `${WORK_DIR}` looks like a template placeholder — confirm how it
# is meant to be substituted inside a `!` command.
for ancestry in ['EUR', 'AFR', 'AMR', 'EAS', 'SAS', 'MDE', 'AJ', 'FIN', 'AAC', 'CAS', 'CAH']:
    !executable/rvtest \
    --inVcf "/${WORK_DIR}/Alex_exonic_splicing_output_{ancestry}.vcf.gz" \
    --out "/${WORK_DIR}/NewAlex_exonic_splicing_SORL1_Burden_{ancestry}" \
    --numThread 10 \
    --noweb \
    --hide-covar \
    --kernel skat,skato \
    --pheno "/${WORK_DIR}/burden_covar_{ancestry}.txt" \
    --pheno-name PHENO \
    --geneFile "/${WORK_DIR}/refFlat.txt" \
    --covar "/${WORK_DIR}/burden_covar_{ancestry}.txt" \
    --covar-name SEX,AGE,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10 \
    --multipleAllele \
    --gene SORL1

In [None]:
!executable/rvtest \
--inVcf "/${WORK_DIR}/Alex_exonic_splicing_output.vcf.gz" \
--out "Alex-MDE-BURDEN" \
--pheno "/d${WORK_DIR}/burden_covar_MDE.txt" \
--pheno-name PHENO \
--covar "/${WORK_DIR}/burden_covar_MDE.txt" \
--covar-name SEX,AGE,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10 \
--multipleAllele \
--single wald,score \
--numThread 10 \
--noweb \
--hide-covar

In [None]:
# Print the rows of the single-variant score results whose 13th field is
# below 0.05 (presumably the p-value column — confirm against the file header).
! awk '$13 < 0.05' Alex-MDE-BURDEN.SingleScore.assoc

In [None]:
# Same filter, skipping the first line, sorted ascending by field 13
# (general numeric sort) and limited to the first 20 rows.
!awk 'NR > 1 && $13 < 0.05' Alex-MDE-BURDEN.SingleScore.assoc | sort -k13,13g | head -n 20

# Clinical data for 10 identified variants

In [None]:
%%bash
# Extract one variant by ID (--snps) from the chr11 pfile into a bed fileset.
# NOTE(review): `${}` and `Position:A1:A2` look like template placeholders for
# the variant being queried — fill them in before running; `${}` is not valid bash.
module load plink
plink2 -pfile /${WORK_DIR}/chr11.compact_filtered.r4.wgs.biallelic --snps chr${}:Position:A1:A2   --make-bed --out chr11-${}

In [None]:
%%bash
# Recode the single-variant fileset with "--recode A" (writes a .raw table).
# NOTE(review): `${}` looks like a template placeholder — fill it in before running.
module load plink
plink2 --bfile chr11-${}  --recode A --out chr11-${}_recoded

In [None]:
# Keep the .raw rows whose 7th field is 0 or 1 and write them to a new file.
# NOTE(review): field 7 is presumably the variant's allele-count column —
# confirm, and confirm that 0/1 is the intended carrier coding.
!awk '$7 == 0 || $7 == 1' chr11-${}_recoded.raw > chr11-${}_recoded.raw.filtered.raw

In [None]:
# Display the filtered .raw rows.
! cat chr11-${}_recoded.raw.filtered.raw

In [None]:
%%bash
# Restrict the single-variant fileset to the QC case samples.
# NOTE(review): `${}` looks like a template placeholder — fill it in before running.
module load plink/1.9

plink --bfile chr11-${} --keep qc_case_plink.txt --make-bed --out chr11-${}_cases

In [None]:
import pandas as pd

# sep=r'\s+' replaces delim_whitespace=True, which newer pandas releases
# deprecate; both split on runs of whitespace.

# FID/IID (first two columns) of the case-only fileset
fam_file = 'chr11-${}_cases.fam'
fam_df = pd.read_csv(fam_file, sep=r'\s+', header=None, usecols=[0, 1], names=['FID', 'IID'])

# FID/IID (first two columns) of the filtered .raw rows, read without a header
raw_file = 'chr11-${}_recoded.raw.filtered.raw'
raw_df = pd.read_csv(raw_file, sep=r'\s+', header=None, usecols=[0, 1], names=['FID', 'IID'])

# Samples present in both lists
merged_df = pd.merge(fam_df, raw_df, on=['FID', 'IID'], how='inner')

# Space-separated FID IID pairs, no header
merged_df.to_csv('chr11-${}_cases_filtered.fam', sep=' ', header=False, index=False)


In [None]:
# Display the FID/IID pairs shared by the case fileset and the filtered .raw rows.
! cat chr11-${}_cases_filtered.fam

In [None]:
%%writefile Alex_case_sampleids2.txt

In [None]:
import pandas as pd

# QC covariate table (tab-separated)
qc_covar = pd.read_csv("/${WORK_DIR}/covars_for_QC.txt", sep="\t")

# Header-less, whitespace-separated FID IID list of the selected case samples.
# sep=r"\s+" replaces delim_whitespace=True, which newer pandas releases deprecate.
case_sampleids = pd.read_csv("Alex_case_sampleids2.txt", sep=r"\s+", header=None, names=['FID', 'IID'])

# Covariate rows of the selected samples
merged_df = pd.merge(qc_covar, case_sampleids, on=['FID', 'IID'], how='inner')

merged_df.to_csv("qc_covar_Alex_case_sampleids.txt", sep="\t", index=False)

In [None]:
import pandas as pd

# ADNI phenotype table (tab-separated)
clinical_data = pd.read_csv("/${WORK_DIR}/ADNIPhenotypes_DS_2022.08.18_ALL.txt", sep="\t")

# Header-less, whitespace-separated FID IID list of the selected case samples.
# sep=r"\s+" replaces delim_whitespace=True, which newer pandas releases deprecate.
case_sampleids = pd.read_csv("Alex_case_sampleids2.txt", sep=r"\s+", header=None, names=['FID', 'IID'])

# Rows whose first column matches a selected FID
# NOTE(review): assumes the first column holds the subject ID — confirm.
first_column_name = clinical_data.columns[0]

filtered_clinical_data = clinical_data[clinical_data[first_column_name].isin(case_sampleids['FID'])]

filtered_clinical_data.to_csv("Alex_clinical_case_sampleids.txt", sep="\t", index=False)


In [None]:
# Display the ADNI phenotype rows selected for the case samples.
! cat Alex_clinical_case_sampleids.txt

In [None]:
import pandas as pd

# ADSP case/control phenotype table (tab-separated)
clinical_data = pd.read_csv("/${WORK_DIR}/ADSPCaseControlPhenotypes_DS_2022.08.18_ALL.txt", sep="\t", low_memory=False)

# Header-less, whitespace-separated FID IID list of the selected case samples.
# sep=r"\s+" replaces delim_whitespace=True, which newer pandas releases deprecate.
case_sampleids = pd.read_csv("Alex_case_sampleids2.txt", sep=r"\s+", header=None, names=['FID', 'IID'])

# Rows whose first column matches a selected FID
# NOTE(review): assumes the first column holds the subject ID — confirm.
first_column_name = clinical_data.columns[0]

filtered_clinical_data = clinical_data[clinical_data[first_column_name].isin(case_sampleids['FID'])]

filtered_clinical_data.to_csv("Alex_clinical2_case_sampleids.txt", sep="\t", index=False)


In [None]:
# Display the ADSP case/control phenotype rows selected for the case samples.
! cat Alex_clinical2_case_sampleids.txt

In [None]:
import pandas as pd

# ADSP family-based phenotype table (tab-separated, latin1-encoded)
clinical_data = pd.read_csv("/${WORK_DIR}/ADSPFamilyBasedPhenotypes_DS_2022.08.18_ALL.txt", sep="\t", low_memory=False, encoding='latin1')

# Header-less, whitespace-separated FID IID list of the selected case samples.
# sep=r"\s+" replaces delim_whitespace=True, which newer pandas releases deprecate.
case_sampleids = pd.read_csv("Alex_case_sampleids2.txt", sep=r"\s+", header=None, names=['FID', 'IID'])

# Rows whose first column matches a selected FID
# NOTE(review): assumes the first column holds the subject ID — confirm.
first_column_name = clinical_data.columns[0]

filtered_clinical_data = clinical_data[clinical_data[first_column_name].isin(case_sampleids['FID'])]

filtered_clinical_data.to_csv("Alex_clinical3_case_sampleids.txt", sep="\t", index=False)

In [None]:
# Display the ADSP family-based phenotype rows selected for the case samples.
! cat Alex_clinical3_case_sampleids.txt

In [None]:
import pandas as pd

# ADSP-PHC biomarker table (comma-separated, latin1-encoded)
clinical_data = pd.read_csv("/${WORK_DIR}/ADSP-PHC-Biomarker_DS_2022.09.27_ALL.csv", sep=",", low_memory=False, encoding='latin1')

# Header-less, whitespace-separated sample list; only its first column (FID) is used.
# usecols=[0] pins 'FID' to the first column: with a single name and a
# two-column file, pandas would otherwise treat the first column as the index
# and label the *second* column 'FID'.
# sep=r"\s+" replaces delim_whitespace=True, which newer pandas releases deprecate.
case_sampleids = pd.read_csv("Alex_case_sampleids2.txt", sep=r"\s+", header=None, usecols=[0], names=['FID'])

# Rows whose SUBJID matches a selected FID
filtered_clinical_data = clinical_data[clinical_data['SUBJID'].isin(case_sampleids['FID'])]

filtered_clinical_data.to_csv("Alex_clinical4_case_sampleids.txt", sep="\t", index=False)

In [None]:
# Display the biomarker rows selected for the case samples.
! cat Alex_clinical4_case_sampleids.txt

In [None]:
import pandas as pd

# ADSP-PHC cognition table (comma-separated, latin1-encoded)
clinical_data = pd.read_csv("/${WORK_DIR}/phenotype_data/ADSP-PHC-Cognition_DS_2022.09.27_ALL.csv", sep=",", low_memory=False, encoding='latin1')

# Header-less, whitespace-separated sample list; only its first column (FID) is used.
# usecols=[0] pins 'FID' to the first column: with a single name and a
# two-column file, pandas would otherwise treat the first column as the index
# and label the *second* column 'FID'.
# sep=r"\s+" replaces delim_whitespace=True, which newer pandas releases deprecate.
case_sampleids = pd.read_csv("Alex_case_sampleids2.txt", sep=r"\s+", header=None, usecols=[0], names=['FID'])

# Rows whose SUBJID matches a selected FID
filtered_clinical_data = clinical_data[clinical_data['SUBJID'].isin(case_sampleids['FID'])]

filtered_clinical_data.to_csv("Alex_clinical5_case_sampleids.txt", sep="\t", index=False)

In [None]:
# Display the cognition rows selected for the case samples.
! cat Alex_clinical5_case_sampleids.txt

In [None]:
import pandas as pd

# ADSP-PHC neuropathology table (comma-separated, latin1-encoded)
clinical_data = pd.read_csv("/${WORK_DIR}/ADSP-PHC-Neuropath_DS_2022.09.27_ALL.csv", sep=",", low_memory=False, encoding='latin1')

# Header-less, whitespace-separated sample list; only its first column (FID) is used.
# usecols=[0] pins 'FID' to the first column: with a single name and a
# two-column file, pandas would otherwise treat the first column as the index
# and label the *second* column 'FID'.
# sep=r"\s+" replaces delim_whitespace=True, which newer pandas releases deprecate.
case_sampleids = pd.read_csv("Alex_case_sampleids2.txt", sep=r"\s+", header=None, usecols=[0], names=['FID'])

# Rows whose SUBJID matches a selected FID
filtered_clinical_data = clinical_data[clinical_data['SUBJID'].isin(case_sampleids['FID'])]

filtered_clinical_data.to_csv("Alex_clinical6_case_sampleids.txt", sep="\t", index=False)

In [None]:
# Display the neuropathology rows selected for the case samples.
! cat Alex_clinical6_case_sampleids.txt