# LRRK2 p.A419V - Analysis in AMP-PD EUR 

- Project: Multiancestry LRRK2 p.A419V analysis
- Version: Python/3.10.12
- Created: 05-MAY-2025
- Last Update: 12-JUNE-2025

# Description

**1. Create Covariate**

**2. PLINK file preparation**
- Remove related individual
- Keep EUR samples

**3. HWE**

**4 . Check AMP-PD p.A419V MAF**

# Getting started

## Load python libraries

In [5]:
# Import necessary packages
import os
import pandas as pd
import numpy as np
from io import StringIO
from firecloud import api as fapi
from IPython.core.display import display, HTML
import urllib.parse
from google.cloud import bigquery
import sys as sys

# Define function
# Utility routine for printing a shell command before executing it
def shell_do(command):
    print(f'Executing: {command}', file=sys.stderr)
    !$command
    
def shell_return(command):
    print(f'Executing: {command}', file=sys.stderr)
    output = !$command
    return '\n'.join(output)

  from IPython.core.display import display, HTML


# Create covariate

In [4]:
# Load clinical information
pd_case_control_df = pd.read_csv(f'{WORK_DIR}/amp_pd_case_control.csv')

In [5]:
# Keep columns of interest
pd_case_control_latest_df = pd_case_control_df[['participant_id', 'diagnosis_latest', 'case_control_other_latest']].copy()

# Rename Columns
pd_case_control_latest_df.columns = ['ID', 'LATEST_DX', 'CASE_CONTROL']

In [6]:
#Check case/control value counts
print(pd_case_control_latest_df['CASE_CONTROL'].value_counts())

CASE_CONTROL
Control    4363
Case       3608
Other      2937
Name: count, dtype: int64


In [7]:
# Add column for study origin
pd_case_control_latest_df['COHORT']= np.where(pd_case_control_latest_df.ID.str.contains("LB-"), "LBD",
                                    np.where(pd_case_control_latest_df.ID.str.contains("PP-"), "PPMI",
                                    np.where(pd_case_control_latest_df.ID.str.contains("PD-"), "PDBP",
                                    np.where(pd_case_control_latest_df.ID.str.contains("HB-"), "HBS",
                                    np.where(pd_case_control_latest_df.ID.str.contains("LC-"), "LCC",
                                    np.where(pd_case_control_latest_df.ID.str.contains("BF-"), "BIOFIND",
                                    np.where(pd_case_control_latest_df.ID.str.contains("SU-"), "SURE-PD3",
                                    np.where(pd_case_control_latest_df.ID.str.contains("SY-"), "STEADY-PD3", np.nan))))))))

In [8]:
# Drop duplicates
case_con_reduced = pd_case_control_latest_df.copy()
case_con_reduced.drop_duplicates(subset=['ID'], inplace=True)

In [9]:
case_con_reduced.loc[case_con_reduced['CASE_CONTROL'] == "Control", 'CASE_CONTROL'] = 1
case_con_reduced.loc[case_con_reduced['CASE_CONTROL'] == "Case", 'CASE_CONTROL'] = 2
case_con_reduced.loc[case_con_reduced['CASE_CONTROL'] == "Other", 'CASE_CONTROL'] = -9

In [10]:
print(case_con_reduced['CASE_CONTROL'].value_counts())

CASE_CONTROL
1     4363
2     3608
-9    2937
Name: count, dtype: int64


In [11]:
# Load Enrollment.csv
enrollment_df = pd.read_csv(f'{WORK_DIR}/Enrollment.csv')

In [12]:
# Keep columns of interest
enrollment_subset_df = enrollment_df[['participant_id', 'study_arm']].copy()

# Rename columns
enrollment_subset_df.columns = ['ID', 'ENROLL_STUDY_ARM']
enrollment_subset_df.head()

# Drop duplicates
enrollment_subset_df.drop_duplicates(subset=['ID'], keep='first', inplace=True)

In [13]:
# load demographic data
demographics_df = pd.read_csv(f'{WORK_DIR}/Demographics.csv')

In [14]:
# Rename Columns
demographics_df.rename(columns = {'participant_id':'ID'}, inplace = True)
demographics_df.rename(columns = {'age_at_baseline':'BASELINE_AGE'}, inplace = True)
demographics_df.rename(columns = {'race':'RACE'}, inplace = True)
demographics_df.rename(columns = {'ethnicity':'ETHNICITY'}, inplace = True)

In [15]:
# Sort by visit month and Drop Duplicates
demographics_baseline_df = demographics_df \
.sort_values('visit_month', ascending=True) \
.drop_duplicates('ID').sort_index()

In [16]:
# Merge last diagnostic with diagnostic at enrollement
demographics_df_casecon = demographics_df.merge(case_con_reduced, on='ID', how='outer')

In [17]:
demographics_df_casecon['sex'].value_counts()

sex
Male      6039
Female    4869
Name: count, dtype: int64

In [18]:
# Recode the sex into numberic: 1 and 2

demographics_df_casecon.loc[demographics_df_casecon['sex'] == "Male", 'sex'] = 1
demographics_df_casecon.loc[demographics_df_casecon['sex'] == "Female", 'sex'] = 2

In [19]:
demographics_df_casecon.rename(columns = {"ID": "IID", "CASE_CONTROL":"PHENO", "sex": "SEX"}, inplace = True)
demographics_df_casecon = demographics_df_casecon[['IID', 'PHENO', 'SEX', 'RACE','ETHNICITY','BASELINE_AGE', 'LATEST_DX','COHORT']]

In [20]:
# Keep only columns of interest
demographics_df_casecon_toKeep = demographics_df_casecon[['IID', 'PHENO', 'SEX', 'RACE',
                                                          'ETHNICITY','BASELINE_AGE', 'LATEST_DX',
                                                          'COHORT']].copy()

In [21]:
enrollment_subset_df.rename(columns = {"ID" : "IID"}, inplace = True)

In [22]:
# Merge Pheno with demograhic data
enrollment_pheno_df = demographics_df_casecon_toKeep.merge(enrollment_subset_df, on='IID', how='outer')

# Create FID column
enrollment_pheno_df['FID'] = enrollment_pheno_df['IID'].values

# Order columns
reorder_enrollment_pheno_df = enrollment_pheno_df[['FID', 'IID', 'PHENO',
                                                  'SEX', 'RACE','ETHNICITY', 'BASELINE_AGE', 'LATEST_DX',
                                                  'COHORT', 'ENROLL_STUDY_ARM']].copy()

In [23]:
#Check phenotype vs. enrollment study arm value counts
reorder_enrollment_pheno_df.groupby(['PHENO', 'ENROLL_STUDY_ARM']).size().reset_index(name='counts')

Unnamed: 0,PHENO,ENROLL_STUDY_ARM,counts
0,-9,Disease Control,155
1,-9,Genetic Cohort PD,4
2,-9,Genetic Cohort Unaffected,44
3,-9,Genetic Registry Unaffected,16
4,-9,Healthy Control,15
5,-9,LBD,2521
6,-9,PD,27
7,-9,Prodromal,45
8,-9,SWEDD,9
9,1,Disease Control,1


In [24]:
#Remove individuals from the genetic registry cohorts and other enrollment categories
#We only want to keep  controls who were originally enrolled as controls, and PD cases originally enrolled as cases
#Will exclude prodromal, SWEDD and unknown individuals also
#Individuals enrolled as disease control but the latest diagnosis is PD will also be excluded

#Keep only individuals enrolled as Healthy Control/Disease Control or PD.
filtered_enrollment_pheno_df = reorder_enrollment_pheno_df.copy()
filtered_enrollment_pheno_df = filtered_enrollment_pheno_df[filtered_enrollment_pheno_df['ENROLL_STUDY_ARM'].isin(['Disease Control', 
                                                                                                                   'Healthy Control', 
                                                                                                                  'PD'])]
#Now remove individuals with PHENO of -9 (keep only individuals with PHENO of 1 or 2)
filtered_enrollment_pheno_df = filtered_enrollment_pheno_df[filtered_enrollment_pheno_df['PHENO'].isin([1,2])]

#Now remove individuals who were enrolled with the opposite diagnosis, i.e. individuals who were enrolled as controls but have a latest diagnosis of PD
filtered_enrollment_pheno_df = filtered_enrollment_pheno_df[((filtered_enrollment_pheno_df['PHENO'] == 2) & (filtered_enrollment_pheno_df['ENROLL_STUDY_ARM'] == 'PD')) | (filtered_enrollment_pheno_df['PHENO'] == 1)]


#Check value counts again
filtered_enrollment_pheno_df.groupby(['PHENO', 'ENROLL_STUDY_ARM']).size().reset_index(name='counts')

Unnamed: 0,PHENO,ENROLL_STUDY_ARM,counts
0,1,Disease Control,1
1,1,Healthy Control,3359
2,2,PD,2679


In [45]:
#Save file - this is intermediate not final covariate file
#This includes all ancestries
filtered_enrollment_pheno_df.to_csv(f'{WORK_DIR}/COVS_temp.txt', index=False, sep='\t', na_rep='NA')

In [35]:
filtered_enrollment_pheno_df[["FID", "IID", "SEX"]].to_csv(f"{WORK_DIR}/update_sex.txt", sep = "\t", header = True, index = False)
filtered_enrollment_pheno_df[["FID", "IID", "PHENO"]].to_csv(f"{WORK_DIR}/update_pheno.txt", sep = "\t", header = True, index = False)

# PLINK file preparation 



In [44]:
%%bash
WORK_DIR='/home/jupyter/A419V_release9/amppd'
cd $WORK_DIR

# Extract LRRK2 p.A419V only
/home/jupyter/plink2 \
--pfile chr12 \
--chr 12 \
--from-bp 40252984 \
--to-bp 40252984 \
--make-bed \
--out a419v_ampd

PLINK v2.0.0-a.6.9LM 64-bit Intel (29 Jan 2025)    cog-genomics.org/plink/2.0/
(C) 2005-2025 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to a419v_ampd.log.
Options in effect:
  --chr 12
  --from-bp 40252984
  --make-bed
  --out a419v_ampd
  --pfile chr12
  --to-bp 40252984

Start time: Tue Apr 22 03:04:27 2025
52216 MiB RAM detected, ~50112 available; reserving 26108 MiB for main
workspace.
Using up to 8 compute threads.
10418 samples (0 females, 0 males, 10418 ambiguous; 10418 founders) loaded from
chr12.psam.
7282999 variants loaded from chr12.pvar.
Note: No phenotype data present.
1 variant remaining after main filters.
Writing a419v_ampd.fam ... done.
Writing a419v_ampd.bim ... done.
Writing a419v_ampd.bed ... done.
End time: Tue Apr 22 03:04:55 2025


In [45]:
%%bash
WORK_DIR='/home/jupyter/A419V_release9/amppd'
cd $WORK_DIR

/home/jupyter/plink1.9 \
--bfile a419v_ampd \
--recode A \
--out a419v_ampd

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to a419v_ampd.log.
Options in effect:
  --bfile a419v_ampd
  --out a419v_ampd
  --recode A

52216 MB RAM detected; reserving 26108 MB for main workspace.
1 variant loaded from .bim file.
10418 people (0 males, 0 females, 10418 ambiguous) loaded from .fam.
Ambiguous sex IDs written to a419v_ampd.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 10418 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
1 variant and 10418 people pass filters and QC.
Note: No phenotypes present.
--recode A to a419v_ampd.raw ... 1011121314151617181920212223242526272829303132333435363738394041424344454

## Remove related individual

In [45]:
%%bash
WORK_DIR='/home/jupyter/A419V_release9/amppd'
cd $WORK_DIR

wc -l a419v_ampd.fam
wc -l FILTERED.AMP_PD_ancestry_EUR.samples

10418 a419v_ampd.fam
8607 FILTERED.AMP_PD_ancestry_EUR.samples


## Keep EUR samples

In [36]:
%%bash
WORK_DIR='/home/jupyter/A419V_release9/amppd'
cd $WORK_DIR

# Keep only EUR samples
/home/jupyter/plink1.9 \
--bfile a419v_ampd \
--keep FILTERED.AMP_PD_ancestry_EUR.samples \
--update-sex update_sex.txt \
--pheno update_pheno.txt \
--make-bed \
--out a419v_ampd_eur

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to a419v_ampd_eur.log.
Options in effect:
  --bfile a419v_ampd
  --keep FILTERED.AMP_PD_ancestry_EUR.samples
  --make-bed
  --out a419v_ampd_eur
  --pheno update_pheno.txt
  --update-sex update_sex.txt

52216 MB RAM detected; reserving 26108 MB for main workspace.
1 variant loaded from .bim file.
10418 people (0 males, 0 females, 10418 ambiguous) loaded from .fam.
Ambiguous sex IDs written to a419v_ampd_eur.nosex .
5871 phenotype values present after --pheno.
--update-sex: 5871 people updated, 169 IDs not present.
--keep: 8607 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 8607 founders and 0 nonfounders present.
Calculating allele frequencies... 101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707

In [7]:
%%bash
WORK_DIR='/home/jupyter/A419V_release9/amppd'
cd $WORK_DIR

# Remove related individuals
/home/jupyter/plink1.9 \
--bfile a419v_ampd_eur \
--remove toRemove_1stand2ndDegree_Relateds_EUR.txt \
--make-bed \
--out a419v_ampd_eur_no_rel

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to a419v_ampd_eur_no_rel.log.
Options in effect:
  --bfile a419v_ampd_eur
  --make-bed
  --out a419v_ampd_eur_no_rel
  --remove toRemove_1stand2ndDegree_Relateds_EUR.txt

52216 MB RAM detected; reserving 26108 MB for main workspace.
1 variant loaded from .bim file.
8607 people (2844 males, 2330 females, 3433 ambiguous) loaded from .fam.
Ambiguous sex IDs written to a419v_ampd_eur_no_rel.nosex .
5174 phenotype values loaded from .fam.
--remove: 8282 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 8282 founders and 0 nonfounders present.




Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
1 variant and 8282 people pass filters and QC.
Among remaining phenotypes, 2251 are cases and 2835 are controls.  (3196
phenotypes are missing.)
--make-bed to a419v_ampd_eur_no_rel.bed + a419v_ampd_eur_no_rel.bim +
a419v_ampd_eur_no_rel.fam ... 101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899done.


# HWE

In [11]:
%%bash
WORK_DIR='/home/jupyter/A419V_release9/amppd'
cd $WORK_DIR

# Check HWE
/home/jupyter/plink1.9 \
--bfile a419v_ampd_eur_no_rel \
--hardy \
--out a419v_ampd_eur_no_rel

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to a419v_ampd_eur_no_rel.log.
Options in effect:
  --bfile a419v_ampd_eur_no_rel
  --hardy
  --out a419v_ampd_eur_no_rel

52216 MB RAM detected; reserving 26108 MB for main workspace.
1 variant loaded from .bim file.
8282 people (2803 males, 2283 females, 3196 ambiguous) loaded from .fam.
Ambiguous sex IDs written to a419v_ampd_eur_no_rel.nosex .
5086 phenotype values loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 8282 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
--hardy: Writing Hardy-Weinberg report (founders only) to
a419v_ampd_eur_no_rel.hwe ... 10111213141

In [12]:
%%bash
WORK_DIR='/home/jupyter/A419V_release9/amppd'
cd $WORK_DIR

head a419v_ampd_eur_no_rel.hwe

 CHR          SNP     TEST   A1   A2                 GENO   O(HET)   E(HET)            P 
  12   rs34594498      ALL    T    C             0/2/8280 0.0002415 0.0002415            1
  12   rs34594498      AFF    T    C             0/0/2251        0        0            1
  12   rs34594498    UNAFF    T    C             0/1/2834 0.0003527 0.0003527            1


 # Check AMP-PD p.A419V MAF

In [23]:
variant = "rs34594498_C"
results = []
label = "AMP_PD"
recode  = pd.read_csv(f"{WORK_DIR}/a419v_ampd_eur_no_rel.raw", delim_whitespace = True)

cases_data = recode[recode['PHENOTYPE'] == 2]
controls_data = recode[recode['PHENOTYPE'] == 1]
total_cases = cases_data.shape[0]
total_controls = controls_data.shape[0]
    
# Cases
hom_cases = (cases_data[variant] == 2).sum()
het_cases = (cases_data[variant] == 1).sum()
hom_ref_cases = (cases_data[variant] == 0).sum()
missing_cases = total_cases - (hom_cases + het_cases + hom_ref_cases)
freq_cases = (2 * hom_cases + het_cases) / (2 * (total_cases - missing_cases)) if (total_cases - missing_cases) > 0 else None

# Controls
hom_controls = (controls_data[variant] == 2).sum()
het_controls = (controls_data[variant] == 1).sum()
hom_ref_controls = (controls_data[variant] == 0).sum()
missing_controls = total_controls - (hom_controls + het_controls + hom_ref_controls)
freq_controls = (2 * hom_controls + het_controls) / (2 * (total_controls - missing_controls)) if (total_controls - missing_controls) > 0 else None
    
# Collect results
results.append({
        'Ancestry': label,
        'Variant': variant,
        'Hom Cases': hom_cases,
        'Het Cases': het_cases,
        'Hom Ref Cases': hom_ref_cases,
        'Missing Cases': missing_cases,
        'Total Cases': total_cases,
        'Carrier Freq in Cases': freq_cases,
        'Hom Controls': hom_controls,
        'Het Controls': het_controls,
        'Hom Ref Controls': hom_ref_controls,
        'Missing Controls': missing_controls,
        'Total Controls': total_controls,
        'Carrier Freq in Controls': freq_controls
    })

# Convert to DataFrame
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,Ancestry,Variant,Hom Cases,Het Cases,Hom Ref Cases,Missing Cases,Total Cases,Carrier Freq in Cases,Hom Controls,Het Controls,Hom Ref Controls,Missing Controls,Total Controls,Carrier Freq in Controls
0,AMP_PD,rs34594498_C,2251,0,0,0,2251,1.0,2834,1,0,0,2835,0.999824


The variant only found in one control, further association study is not being done 