# LRRK2 p.A419V - Haplotype analysis 

- Project: Multiancestry LRRK2 p.A419V analysis
- Version: Python/3.10.12
- Last Updated: 05-MAY-2025

# Description

**1. Description**
- Getting started
- Load python libraries
- Define function
- Setting up path
- Install R and its packages

**2. Haplotype block size comparison**

**3. Haplotype association study**

**4. R2 calculation**

# Getting started

## Load python libraries

In [1]:
# Use the os package to interact with the environment
import os

# Bring in Pandas for Dataframe functionality
import pandas as pd

# Numpy for basics
import numpy as np

# Use StringIO for working with file contents
from io import StringIO

# Enable IPython to display matplotlib graphs
import matplotlib.pyplot as plt
%matplotlib inline

# Enable interaction with the FireCloud API
from firecloud import api as fapi

# Import the iPython HTML rendering for displaying links to Google Cloud Console
from IPython.core.display import display, HTML

# Import urllib modules for building URLs to Google Cloud Console
import urllib.parse

# BigQuery for querying data
from google.cloud import bigquery

#Import Sys
import sys as sys

  from IPython.core.display import display, HTML


## Define function

In [3]:
# Utility routine for printing a shell command before executing it
def shell_do(command):
    print(f'Executing: {command}', file=sys.stderr)
    !$command
    
def shell_return(command):
    print(f'Executing: {command}', file=sys.stderr)
    output = !$command
    return '\n'.join(output)

# Utility routine for printing a query before executing it
def bq_query(query):
    print(f'Executing: {query}', file=sys.stderr)
    return pd.read_gbq(query, project_id=BILLING_PROJECT_ID, dialect='standard')

# Utility routine for display a message and a link
def display_html_link(description, link_text, url):
    html = f'''
    <p>
    </p>
    <p>
    {description}
    <a target=_blank href="{url}">{link_text}</a>.
    </p>
    '''

    display(HTML(html))

# Utility routines for reading files from Google Cloud Storage
def gcs_read_file(path):
    """Return the contents of a file in GCS"""
    contents = !gsutil -u {BILLING_PROJECT_ID} cat {path}
    return '\n'.join(contents)
    
def gcs_read_csv(path, sep=None):
    """Return a DataFrame from the contents of a delimited file in GCS"""
    return pd.read_csv(StringIO(gcs_read_file(path)), sep=sep, engine='python')

# Utility routine for displaying a message and link to Cloud Console
def link_to_cloud_console_gcs(description, link_text, gcs_path):
    url = '{}?{}'.format(
        os.path.join('https://console.cloud.google.com/storage/browser',
                     gcs_path.replace("gs://","")),
        urllib.parse.urlencode({'userProject': BILLING_PROJECT_ID}))

    display_html_link(description, link_text, url)

## Setting up path

In [2]:
# Set up billing project and data path variables
BILLING_PROJECT_ID = os.environ['GOOGLE_PROJECT']
WORKSPACE_NAMESPACE = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE_NAME = os.environ['WORKSPACE_NAME']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']

WORKSPACE_ATTRIBUTES = fapi.get_workspace(WORKSPACE_NAMESPACE, WORKSPACE_NAME).json().get('workspace',{}).get('attributes',{})

## GP2 v5.0 gs://gp2tier2/release9_18122024/
GP2_TIER1 = 'gs://gp2tier1/release9_18122024'
GP2_RELEASE_PATH = 'gs://gp2tier2/release9_18122024'
GP2_CLINICAL_RELEASE_PATH = f'{GP2_RELEASE_PATH}/clinical_data'
GP2_META_RELEASE_PATH = f'{GP2_RELEASE_PATH}/meta_data'
GP2_SUMSTAT_RELEASE_PATH = f'{GP2_RELEASE_PATH}/summary_statistics'

GP2_RAW_GENO_PATH = f'{GP2_RELEASE_PATH}/raw_genotypes'
GP2_IMPUTED_GENO_PATH = f'{GP2_RELEASE_PATH}/imputed_genotypes'
print('GP2 v5.0')
print(f'Path to GP2 v2.0 Clinical Data: {GP2_CLINICAL_RELEASE_PATH}')
print(f'Path to GP2 v2.0 Raw Genotype Data: {GP2_RAW_GENO_PATH}')
print(f'Path to GP2 v2.0 Imputed Genotype Data: {GP2_IMPUTED_GENO_PATH}')

GP2 v5.0
Path to GP2 v2.0 Clinical Data: gs://gp2tier2/release9_18122024/clinical_data
Path to GP2 v2.0 Raw Genotype Data: gs://gp2tier2/release9_18122024/raw_genotypes
Path to GP2 v2.0 Imputed Genotype Data: gs://gp2tier2/release9_18122024/imputed_genotypes


## Install R and its packages

In [4]:
%%bash
mkdir -p /home/jupyter/A419V_release9/R_packages

In [None]:
# Install and import rpy2
!pip install rpy2
import rpy2.rinterface

In [7]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [8]:
%%R

BILLING_PROJECT_ID  <- Sys.getenv('GOOGLE_PROJECT')
WORKSPACE_NAMESPACE <- Sys.getenv('WORKSPACE_NAMESPACE')
WORKSPACE_NAME      <- Sys.getenv('WORKSPACE_NAME')

In (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  :
  library ‘/usr/lib/R/site-library’ contains no packages


In [9]:
%%R
pack <- "/home/jupyter/A419V_release9/R_packages"

install.packages("dplyr", lib = pack)
install.packages("data.table", lib = pack)
install.packages("arsenal", lib = pack)
install.packages("haplo.stats", lib = pack)

* installing *source* package ‘dplyr’ ...
** package ‘dplyr’ successfully unpacked and MD5 sums checked
** using staged installation
** libs
using C++ compiler: ‘g++ (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0’


g++ -std=gnu++17 -I"/usr/share/R/include" -DNDEBUG       -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2   -c chop.cpp -o chop.o
g++ -std=gnu++17 -I"/usr/share/R/include" -DNDEBUG       -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2   -c filter.cpp -o filter.o
g++ -std=gnu++17 -I"/usr/share/R/include" -DNDEBUG       -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2   -c funs.cpp -o funs.o
g++ -std=gnu++17 -I"/usr/share/R/include" -DNDEBUG       -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2   -c group_by.cpp -o group_by.o
g++ -std=gnu++17 -I"/usr/sha

installing to /home/jupyter/A419V_release9/R_packages/00LOCK-dplyr/00new/dplyr/libs
** R
** data
*** moving datasets to lazyload DB
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
*** copying figures
** building package indices
** installing vignettes
** testing if installed package can be loaded from temporary location
** checking absolute paths in shared objects and dynamic libraries
** testing if installed package can be loaded from final location
** testing if installed package keeps a record of temporary installation path
* DONE (dplyr)
* installing *source* package ‘data.table’ ...
** package ‘data.table’ successfully unpacked and MD5 sums checked
** using staged installation


gcc 9.4.0
zlib 1.2.11 is available ok
* checking if R installation supports OpenMP without any extra hints... yes
gcc -I"/usr/share/R/include" -DNDEBUG      -fopenmp  -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c assign.c -o assign.o


** libs
using C compiler: ‘gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0’


gcc -I"/usr/share/R/include" -DNDEBUG      -fopenmp  -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c between.c -o between.o
gcc -I"/usr/share/R/include" -DNDEBUG      -fopenmp  -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c bmerge.c -o bmerge.o
gcc -I"/usr/share/R/include" -DNDEBUG      -fopenmp  -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c chmatch.c -o chmatch.o
gcc -I"/usr/share/R/include" -DNDEBUG      -fopenmp  -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c cj.c -o cj.o
gcc -I"/usr/share/R/include" -DNDEBUG      -fopenmp  -fp

gcc -I"/usr/share/R/include" -DNDEBUG      -fopenmp  -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c transpose.c -o transpose.o
gcc -I"/usr/share/R/include" -DNDEBUG      -fopenmp  -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c types.c -o types.o
gcc -I"/usr/share/R/include" -DNDEBUG      -fopenmp  -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c uniqlist.c -o uniqlist.o
gcc -I"/usr/share/R/include" -DNDEBUG      -fopenmp  -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c utils.c -o utils.o
gcc -I"/usr/share/R/include" -DNDEBUG      -fo

installing to /home/jupyter/A419V_release9/R_packages/00LOCK-data.table/00new/data.table/libs
** R
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
** building package indices
** installing vignettes
** testing if installed package can be loaded from temporary location
** checking absolute paths in shared objects and dynamic libraries
** testing if installed package can be loaded from final location
** testing if installed package keeps a record of temporary installation path
* DONE (data.table)
* installing *source* package ‘arsenal’ ...
** package ‘arsenal’ successfully unpacked and MD5 sums checked
** using staged installation
** R
** data
*** moving datasets to lazyload DB
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
*** copying figures
** building package indices
** installing vignettes
** testing if installed package can be loaded from temporary location
** testing if installed pac

gcc -I"/usr/share/R/include" -DNDEBUG       -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c groupsum.c -o groupsum.o
gcc -I"/usr/share/R/include" -DNDEBUG       -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c haplo.stats_init.c -o haplo.stats_init.o
gcc -I"/usr/share/R/include" -DNDEBUG       -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c haplo_em_pin.c -o haplo_em_pin.o
gcc -I"/usr/share/R/include" -DNDEBUG       -fpic  -g -O2 -fdebug-prefix-map=/build/r-base-EpRONj/r-base-4.4.2=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c louis_info.c -o louis_info.o
gcc -I"/usr/share/R/include" -DNDEBUG       

installing to /home/jupyter/A419V_release9/R_packages/00LOCK-haplo.stats/00new/haplo.stats/libs
** R
** data
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
** building package indices
** installing vignettes
** testing if installed package can be loaded from temporary location
** checking absolute paths in shared objects and dynamic libraries
** testing if installed package can be loaded from final location
** testing if installed package keeps a record of temporary installation path
* DONE (haplo.stats)


trying URL 'https://cloud.r-project.org/src/contrib/dplyr_1.1.4.tar.gz'
Content type 'application/x-gzip' length 1207521 bytes (1.2 MB)
downloaded 1.2 MB


The downloaded source packages are in
	‘/tmp/Rtmp8Akrfw/downloaded_packages’
trying URL 'https://cloud.r-project.org/src/contrib/data.table_1.17.0.tar.gz'
Content type 'application/x-gzip' length 5833671 bytes (5.6 MB)
downloaded 5.6 MB


The downloaded source packages are in
	‘/tmp/Rtmp8Akrfw/downloaded_packages’
trying URL 'https://cloud.r-project.org/src/contrib/arsenal_3.6.3.tar.gz'
Content type 'application/x-gzip' length 672939 bytes (657 KB)
downloaded 657 KB


The downloaded source packages are in
	‘/tmp/Rtmp8Akrfw/downloaded_packages’
trying URL 'https://cloud.r-project.org/src/contrib/haplo.stats_1.9.7.tar.gz'
Content type 'application/x-gzip' length 2766759 bytes (2.6 MB)
downloaded 2.6 MB


The downloaded source packages are in
	‘/tmp/Rtmp8Akrfw/downloaded_packages’


In [10]:
%%R
pack <- "/home/jupyter/A419V_release9/R_packages"

suppressPackageStartupMessages(library(dplyr, lib.loc = pack))
suppressPackageStartupMessages(library(data.table, lib.loc = pack))
suppressPackageStartupMessages(library(arsenal, lib.loc = pack))
suppressPackageStartupMessages(library(haplo.stats, lib.loc = pack))

In [10]:
%%bash
WORK_DIR=/home/jupyter/A419V_release9
cd $WORK_DIR

ancestry_labels=('AAC' 'AFR' 'AJ' 'AMR' 'CAH' 'CAS' 'EAS' 'EUR' 'FIN' 'MDE' 'SAS')

for label in "${ancestry_labels[@]}"
do

    /home/jupyter/plink1.9 \
    --bfile ${label}/${label}_release9_remove_related_updated \
    --chr 12 \
    --from-bp 40196744 \
    --to-bp 40369285 \
    --missing \
    --make-bed \
    --out ${label}/${label}_release9_remove_related_lrrk2
    
done

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to AAC/AAC_release9_remove_related_lrrk2.log.
Options in effect:
  --bfile AAC/AAC_release9_remove_related_updated
  --chr 12
  --from-bp 40196744
  --make-bed
  --missing
  --out AAC/AAC_release9_remove_related_lrrk2
  --to-bp 40369285

3672 MB RAM detected; reserving 1836 MB for main workspace.
448 out of 1896835 variants loaded from .bim file.
1207 people (498 males, 709 females) loaded from .fam.
1165 phenotype values loaded from .fam.
Using 1 thread.
Before main variant filters, 1207 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.998097.
--missing: Sample missing data report written to


Logging to CAS/CAS_release9_remove_related_lrrk2.log.
Options in effect:
  --bfile CAS/CAS_release9_remove_related_updated
  --chr 12
  --from-bp 40196744
  --make-bed
  --missing
  --out CAS/CAS_release9_remove_related_lrrk2
  --to-bp 40369285

3672 MB RAM detected; reserving 1836 MB for main workspace.
444 out of 1902516 variants loaded from .bim file.
1006 people (461 males, 545 females) loaded from .fam.
990 phenotype values loaded from .fam.
Using 1 thread.
Before main variant filters, 1006 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.998117.
--missing: Sample missing data report written to
CAS/CAS_release9_remove_related_lrrk2.imiss, and variant-based missing data
report written to CAS/CAS_release9_remove_related_lrrk2.lmiss.
444 variants and 100

  --chr 12
  --from-bp 40196744
  --make-bed
  --missing
  --out SAS/SAS_release9_remove_related_lrrk2
  --to-bp 40369285

3672 MB RAM detected; reserving 1836 MB for main workspace.
449 out of 1919616 variants loaded from .bim file.
580 people (375 males, 205 females) loaded from .fam.
553 phenotype values loaded from .fam.
Using 1 thread.
Before main variant filters, 580 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.998629.
--missing: Sample missing data report written to
SAS/SAS_release9_remove_related_lrrk2.imiss, and variant-based missing data
report written to SAS/SAS_release9_remove_related_lrrk2.lmiss.
449 variants and 580 people pass filters and QC.
Among remaining phenotypes, 354 are cases and 199 are controls.  (27 phenotypes
are missing.)
--

In [10]:
ancestry_labels=['AAC', 'AFR', 'AJ', 'AMR', 'CAH', 'CAS', 'EAS', 'EUR', 'FIN', 'MDE', 'SAS']

for anc in ancestry_labels:
    
    bim = pd.read_csv(f"/home/jupyter/A419V_release9/{anc}/{anc}_release9_remove_related_lrrk2.bim", 
                      sep = "\t", names = ["CHR", "RSID", "POS", "BP", "A1", "A2"])
    bim["CHR"] = bim["CHR"].astype(str)
    bim["BP"] = bim["BP"].astype(str)
    bim["RSID"] = bim["CHR"].str.cat(bim["BP"], sep = "_")
    bim["RSID"] = bim["RSID"].str.cat(bim["A2"], sep = "_")
    bim["RSID"] = bim["RSID"].str.cat(bim["A1"], sep = "_")
    
    bim.to_csv(f"/home/jupyter/A419V_release9/{anc}/{anc}_release9_remove_related_lrrk2.bim", 
               sep = "\t", index = False, header = False)

In [17]:
%%bash
WORK_DIR=/home/jupyter/A419V_release9
cd $WORK_DIR

# Remove duplicate
ancestry_labels=('AAC' 'AFR' 'AJ' 'AMR' 'CAH' 'CAS' 'EAS' 'EUR' 'FIN' 'MDE' 'SAS')

for label in "${ancestry_labels[@]}"
do

    /home/jupyter/plink2 \
    --bfile ${label}/${label}_release9_remove_related_lrrk2 \
    --rm-dup force-first \
    --make-bed \
    --out ${label}/${label}_release9_remove_related_lrrk2_nodup
    
done

PLINK v2.0.0-a.6.9LM 64-bit Intel (29 Jan 2025)    cog-genomics.org/plink/2.0/
(C) 2005-2025 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to AAC/AAC_release9_remove_related_lrrk2_nodup.log.
Options in effect:
  --bfile AAC/AAC_release9_remove_related_lrrk2
  --make-bed
  --out AAC/AAC_release9_remove_related_lrrk2_nodup
  --rm-dup force-first

Start time: Sun Apr  6 00:05:04 2025
3672 MiB RAM detected, ~2016 available; reserving 1836 MiB for main workspace.
Using 1 compute thread.
1207 samples (709 females, 498 males; 1207 founders) loaded from
AAC/AAC_release9_remove_related_lrrk2.fam.
448 variants loaded from AAC/AAC_release9_remove_related_lrrk2.bim.
1 binary phenotype loaded (338 cases, 827 controls).
--rm-dup: 65 duplicated IDs, 83 variants removed.
Writing AAC/AAC_release9_remove_related_lrrk2_nodup.fam ... done.
Writing AAC/AAC_release9_remove_related_lrrk2_nodup.bim ... done.
Writing AAC/AAC_release9_remove_related_lrrk2_nodup.bed ... done.
End time:

PLINK v2.0.0-a.6.9LM 64-bit Intel (29 Jan 2025)    cog-genomics.org/plink/2.0/
(C) 2005-2025 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to FIN/FIN_release9_remove_related_lrrk2_nodup.log.
Options in effect:
  --bfile FIN/FIN_release9_remove_related_lrrk2
  --make-bed
  --out FIN/FIN_release9_remove_related_lrrk2_nodup
  --rm-dup force-first

Start time: Sun Apr  6 00:05:04 2025
3672 MiB RAM detected, ~2006 available; reserving 1836 MiB for main workspace.
Using 1 compute thread.
109 samples (66 females, 43 males; 109 founders) loaded from
FIN/FIN_release9_remove_related_lrrk2.fam.
451 variants loaded from FIN/FIN_release9_remove_related_lrrk2.bim.
1 binary phenotype loaded (86 cases, 4 controls).
--rm-dup: 66 duplicated IDs, 84 variants removed.
Writing FIN/FIN_release9_remove_related_lrrk2_nodup.fam ... done.
Writing FIN/FIN_release9_remove_related_lrrk2_nodup.bim ... done.
Writing FIN/FIN_release9_remove_related_lrrk2_nodup.bed ... done.
End time: Sun Ap

# Haplotype block size comparison

In [4]:
%%bash
WORK_DIR=/home/jupyter/A419V_release9
cd $WORK_DIR

ancestry_labels=('AJ' 'CAH' 'CAS' 'EAS' 'EUR')

for label in "${ancestry_labels[@]}"
do

    /home/jupyter/plink1.9 \
    --bfile ${label}/${label}_release9_remove_related_lrrk2_nodup \
    --blocks \
    --blocks-min-maf 0.0001 \
    --out ${label}/${label}_release9_remove_related_lrrk2.blocks

done

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to AJ/AJ_release9_remove_related_lrrk2.blocks.log.
Options in effect:
  --bfile AJ/AJ_release9_remove_related_lrrk2_nodup
  --blocks
  --blocks-min-maf 0.0001
  --out AJ/AJ_release9_remove_related_lrrk2.blocks

52216 MB RAM detected; reserving 26108 MB for main workspace.
365 variants loaded from .bim file.
3081 people (1934 males, 1147 females) loaded from .fam.
2533 phenotype values loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 3081 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.997855.
365 variants and 3081 people pass filters and Q

In [13]:
%%R

# Create a table contain the location of A419V
a419v_loci <- data.table(
    SNP = "A419V" ,
    CHR = 12 ,   
    BP  = 40252984)

In [14]:
%%R

# Initialize an empty table
ALLhap <- data.table()

ALLhap$ancestry <- NA
ALLhap$KB       <- NA
ALLhap$NSNPS    <- NA
ALLhap$SNPS     <- NA

# Extract info from the loci table into variables
thisSnp <- a419v_loci$SNP[1]
thisChr <- a419v_loci$CHR[1]
thisBp  <- a419v_loci$BP[1]

ancestry_labels <- c('AJ', 'CAH', 'CAS', 'EAS', 'EUR')

for (anc in ancestry_labels){
    
    hap     <- fread(paste0("/home/jupyter/A419V_release9/", anc, "/", anc, "_release9_remove_related_lrrk2.blocks.blocks.det"), header =T)
    hap_sub <- subset(hap, CHR == thisChr & BP1 <= thisBp & BP2 >= thisBp)
    
    if(length(hap_sub$KB) > 0) {
    
        tmp <- data.table(
            
            ancestry = anc,
            KB       = hap_sub$KB,
            NSNPS    = hap_sub$NSNPS,
            SNPS     = hap_sub$SNPS
        )
        
        ALLhap <- rbind(ALLhap, tmp)
    
    } 
}

ALLhap

   ancestry     KB NSNPS
     <char>  <num> <int>
1:     <NA>     NA    NA
2:      EAS 14.193     9
                                                                                                                                              SNPS
                                                                                                                                            <char>
1:                                                                                                                                            <NA>
2: 12_40238792_G_A|12_40239262_G_A|12_40239512_A_G|12_40240543_G_A|12_40249666_G_A|12_40251369_G_A|12_40251495_G_A|12_40251828_G_A|12_40252984_G_A


# Haplotype Association study

## Haplotype frequency comparison

In [15]:
%%bash
WORK_DIR=/home/jupyter/A419V_release9
cd $WORK_DIR

ancestry_labels=('AJ' 'CAH' 'CAS' 'EAS' 'EUR')

for label in "${ancestry_labels[@]}"
do

    /home/jupyter/plink1.9 \
    --bfile ${label}/${label}_release9_remove_related_lrrk2_nodup \
    --recode \
    --out ${label}/${label}_release9_remove_related_lrrk2_nodup

done

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to AJ/AJ_release9_remove_related_lrrk2_nodup.log.
Options in effect:
  --bfile AJ/AJ_release9_remove_related_lrrk2_nodup
  --out AJ/AJ_release9_remove_related_lrrk2_nodup
  --recode

52216 MB RAM detected; reserving 26108 MB for main workspace.
365 variants loaded from .bim file.
3081 people (1934 males, 1147 females) loaded from .fam.
2533 phenotype values loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 3081 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.997855.
365 variants and 3081 people pass filters and QC.
Among remaining phenotype

In [16]:
%%R

ancestry_labels <- c('AJ', 'CAH', 'CAS', 'EAS', 'EUR')

for (anc in ancestry_labels){
    
    # Read PED file
    PED <- fread(paste0("/home/jupyter/A419V_release9/", anc, "/", anc, "_release9_remove_related_lrrk2_nodup.ped"))

    # Get Allele name
    FILTERED.LRRK2_1 <- read.table(paste0("/home/jupyter/A419V_release9/", anc, "/", anc, "_release9_remove_related_lrrk2_nodup.map"), header=FALSE)
    FILTERED.LRRK2_2 <- read.table(paste0("/home/jupyter/A419V_release9/", anc, "/", anc, "_release9_remove_related_lrrk2_nodup.map"), header=FALSE)
    FILTERED.LRRK2_1$V2 <- paste0(FILTERED.LRRK2_1$V2, sep="_", "1")
    FILTERED.LRRK2_2$V2 <- paste0(FILTERED.LRRK2_2$V2, sep="_", "2")

    # Set column names
    colnames(FILTERED.LRRK2_1) <- c("CHR", "SNP", "CM", "POS")
    colnames(FILTERED.LRRK2_2) <- c("CHR", "SNP", "CM", "POS")
    FILTERED.LRRK2_2alleles    <- rbind(FILTERED.LRRK2_1, FILTERED.LRRK2_2)

    # Sort file and create geno matrix:
    # The variants should appear in order of genomic postition in the ped file, therefore, we sort the alleles based on position
    LRRK2_pos     <- FILTERED.LRRK2_2alleles[order(FILTERED.LRRK2_2alleles$POS),]
    LRRK2_alleles <- LRRK2_pos[,c("SNP")]
    write.table(LRRK2_alleles, file = paste0("/home/jupyter/A419V_release9/", anc, "/", anc, "_LRRK2_alleles.txt"), quote = F, sep = "\t", row.names = F, col.names = F)

    # Add allele names to the PED file
    colnames(PED) <- c("FID", "IID", "PAT","MAT", "SEX", "PHENO", LRRK2_alleles)
    write.table(as.data.frame(PED), file = paste0("/home/jupyter/A419V_release9/", anc, "/", anc, "_geno_matrix_LRRK2.txt"), quote = F, row.names = FALSE, sep = '\t')

}

### EAS

In [28]:
%%bash
WORK_DIR="/home/jupyter/A419V_release9/"
cd $WORK_DIR

label="EAS"

grep 40252984 ${label}/${label}_release9_remove_related_lrrk2.blocks.blocks | sed 's/ /|/g' | sed -r 's/^.{2}//'

12_40238792_G_A|12_40239262_G_A|12_40239512_A_G|12_40240543_G_A|12_40249666_G_A|12_40251369_G_A|12_40251495_G_A|12_40251828_G_A|12_40252984_G_A


In [75]:
%%R

anc <- "EAS"

Geno_matrix_LRRK2 <- read.delim(paste0("/home/jupyter/A419V_release9/", anc, "/", anc, "_geno_matrix_LRRK2.txt"), check.names = FALSE)

#Colnames:
LRRK2_alleles <- read.table(paste0("/home/jupyter/A419V_release9/", anc, "/", anc, "_LRRK2_alleles.txt"), quote="\"", comment.char="")
H1_a          <- LRRK2_alleles[grepl("12_40238792_G_A|12_40239262_G_A|12_40239512_A_G|12_40240543_G_A|12_40249666_G_A|12_40251369_G_A|12_40251495_G_A|12_40251828_G_A|12_40252984_G_A", LRRK2_alleles$V1), ]
H1_LRRK2      <- Geno_matrix_LRRK2[,c("FID", "IID", "PAT", "MAT", "SEX", "PHENO", H1_a)]

In [76]:
%%R
ncol(H1_LRRK2)

[1] 24


In [77]:
%%R
ncol(H1_LRRK2)
anc <- "EAS"

#Extract only genotype data:
geno          <- data.frame(H1_LRRK2[,c(7:24)], check.names = FALSE)

#Set variables for running the association analyses in haplo.stats
#Label the SNPs: "12_40252984_G_A"

# Get the column name of the haplotype
label <- colnames(H1_LRRK2)[7:24] %>% gsub("_.{1}$", "", .) %>% unique()

#Set binary pheno (0=control, 1=patient):
H1_LRRK2$PHENO_01 <- H1_LRRK2$PHENO-1
y.bin <- 1*(H1_LRRK2$PHENO_01=="1")

In [78]:
%%R

#Non-adjusted:
H1 <- haplo.cc(y=y.bin, geno=geno, locus.label= label, control = haplo.glm.control(haplo.freq.min = 0.01))
print(H1, nlines=10, digits=2)

#Sort the output on p-value:
H1_cc.df <- H1$cc.df
H1_cc.df_sort <- H1_cc.df[order(H1_cc.df$`p-val`),]

-------------------------------------------------------------------------------- 
                            Global Score Statistics                              
-------------------------------------------------------------------------------- 
global-stat = 12, df = 4, p-val = 0.016

-------------------------------------------------------------------------------- 
                         Counts for Cases and Controls                           
-------------------------------------------------------------------------------- 
control    case 
   2454    3192 


   12_40238792_G_A 12_40239262_G_A 12_40239512_A_G 12_40240543_G_A
17               G               G               A               G
21               G               G               G               G
7                G               A               G               G
4                A               G               A               G
1                A               G               A               G
2                A           

In [79]:
%%R

library(knitr)

H1_cc.df_sort %>% kable()



|   |12_40238792_G_A |12_40239262_G_A |12_40239512_A_G |12_40240543_G_A |12_40249666_G_A |12_40251369_G_A |12_40251495_G_A |12_40251828_G_A |12_40252984_G_A |  Hap-Score|     p-val|   pool.hf| control.hf|   case.hf|glm.eff |  OR.lower|        OR| OR.upper|
|:--|:---------------|:---------------|:---------------|:---------------|:---------------|:---------------|:---------------|:---------------|:---------------|----------:|---------:|---------:|----------:|---------:|:-------|---------:|---------:|--------:|
|4  |A               |G               |A               |G               |G               |G               |G               |G               |G               |  3.0628115| 0.0021927| 0.2947496|  0.2797671| 0.3063289|Eff     | 1.0219151| 1.1215528| 1.230905|
|17 |G               |G               |A               |G               |G               |G               |G               |G               |G               | -2.4072061| 0.0160751| 0.3135438|  0.3253539| 0.3045028|Eff     | 0.

#### Visualise the block

In [58]:
with open("/home/jupyter/A419V_release9/EAS/EAS_hap_extract.txt", "w") as f:
    f.write("12_40238792_G_A\n12_40239262_G_A\n12_40239512_A_G\n12_40240543_G_A\n12_40249666_G_A\n12_40251369_G_A\n12_40251495_G_A\n12_40251828_G_A\n12_40252984_G_A")

In [73]:
%%bash
WORK_DIR=/home/jupyter/A419V_release9
cd $WORK_DIR

label="EAS"

/home/jupyter/plink1.9 \
--bfile ${label}/${label}_release9_remove_related_lrrk2_nodup \
--extract ${label}/${label}_hap_extract.txt \
--snps-only just-acgt \
--recode HV \
--out ${label}/${label}_release9_remove_related_lrrk2_nodup

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to EAS/EAS_release9_remove_related_lrrk2_nodup.log.
Options in effect:
  --bfile EAS/EAS_release9_remove_related_lrrk2_nodup
  --extract EAS/EAS_hap_extract.txt
  --out EAS/EAS_release9_remove_related_lrrk2_nodup
  --recode HV
  --snps-only just-acgt

52216 MB RAM detected; reserving 26108 MB for main workspace.
347 out of 350 variants loaded from .bim file.
5646 people (3495 males, 2151 females) loaded from .fam.
5571 phenotype values loaded from .fam.
--extract: 9 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 5646 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 don

In [65]:
%%bash
WORK_DIR=/home/jupyter/A419V_release9
cd $WORK_DIR


ls EAS/EAS_release9_remove_related_lrrk2_nodup.chr*

EAS/EAS_release9_remove_related_lrrk2_nodup.chr-12.info
EAS/EAS_release9_remove_related_lrrk2_nodup.chr-12.ped


In [74]:
WORK_DIR="/home/jupyter/A419V_release9"
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp {WORK_DIR}/EAS/EAS_release9_remove_related_lrrk2_nodup.chr-12.* {WORKSPACE_BUCKET}/A419V_analysis/release9/haplotype/EAS/')

Executing: gsutil -u terra-8cb3be5c -m cp /home/jupyter/A419V_release9/EAS/EAS_release9_remove_related_lrrk2_nodup.chr-12.* gs://fc-e8a73e41-545c-42b1-8720-970cf953ba35/A419V_analysis/release9/haplotype/EAS/


Copying file:///home/jupyter/A419V_release9/EAS/EAS_release9_remove_related_lrrk2_nodup.chr-12.ped [Content-Type=application/octet-stream]...
Copying file:///home/jupyter/A419V_release9/EAS/EAS_release9_remove_related_lrrk2_nodup.chr-12.info [Content-Type=application/x-info]...
/ [2/2 files][336.2 KiB/336.2 KiB] 100% Done                                    
Operation completed over 2 objects/336.2 KiB.                                    


### EUR

In [38]:
%%bash
WORK_DIR="/home/jupyter/A419V_release9/"
cd $WORK_DIR

label="EUR"

cat ${label}/${label}_release9_remove_related_lrrk2.blocks.blocks

* 12_40198826_A_G 12_40199601_A_G 12_40200768_A_G 12_40204350_A_C 12_40207140_A_G 12_40208138_G_A 12_40208422_C_A 12_40209874_C_A 12_40213423_A_T 12_40215078_G_A 12_40215707_A_G 12_40216942_G_A 12_40220260_A_G 12_40221630_A_G
* 12_40225280_A_G 12_40225499_A_G
* 12_40237989_G_A 12_40238792_G_A
* 12_40256700_G_A 12_40257073_T_A 12_40258718_G_A
* 12_40262377_A_G 12_40262593_G_A 12_40263898_C_G 12_40264353_G_A
* 12_40267251_T_A 12_40272118_G_A 12_40273073_G_C 12_40273861_G_A 12_40276652_A_G 12_40277068_A_G 12_40277196_C_A 12_40278187_A_G 12_40279566_A_G 12_40282999_C_A 12_40283045_A_T 12_40283227_C_A 12_40284011_C_A 12_40285467_T_A 12_40285609_G_A 12_40287115_A_G
* 12_40293043_A_G 12_40293294_G_A 12_40293624_G_C 12_40293896_A_G 12_40294893_A_G 12_40295786_G_A
* 12_40309109_G_A 12_40309185_G_A
* 12_40311356_A_T 12_40312802_G_A 12_40313594_C_A 12_40313976_G_A 12_40314059_G_A 12_40317244_A_G 12_40317645_C_A 12_40317832_G_A 12_40320043_C_G 12_40320097_A_G 12_40320099_T_A 12_40320699_C_A 12_403

In [37]:
%%bash
WORK_DIR="/home/jupyter/A419V_release9/"
cd $WORK_DIR

label="EUR"

grep 40252984 ${label}/${label}_release9_remove_related_lrrk2.blocks.blocks | sed 's/ /|/g' | sed -r 's/^.{2}//'

No block contain p.A419V, checking for missingness

In [47]:
%%bash
WORK_DIR=/home/jupyter/A419V_release9
cd $WORK_DIR

ancestry_labels=('EUR')

for label in "${ancestry_labels[@]}"
do

    /home/jupyter/plink1.9 \
    --bfile ${label}/${label}_release9_remove_related_lrrk2_nodup \
    --missing \
    --out ${label}/${label}_release9_remove_related_lrrk2.blocks_test

done

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to EUR/EUR_release9_remove_related_lrrk2.blocks_test.log.
Options in effect:
  --bfile EUR/EUR_release9_remove_related_lrrk2_nodup
  --missing
  --out EUR/EUR_release9_remove_related_lrrk2.blocks_test

52216 MB RAM detected; reserving 26108 MB for main workspace.
332 variants loaded from .bim file.
34703 people (19239 males, 15464 females) loaded from .fam.
20824 phenotype values loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 34703 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.998415.
--missing: Sample missing data report written to
EU

In [51]:
%%bash
WORK_DIR=/home/jupyter/A419V_release9
cd $WORK_DIR

head -1 EUR/EUR_release9_remove_related_lrrk2.blocks_test.lmiss
grep 40252984 EUR/EUR_release9_remove_related_lrrk2.blocks_test.lmiss

 CHR               SNP   N_MISS   N_GENO   F_MISS
  12   12_40252984_G_A       35    34703 0.001009


### AJ

In [52]:
%%bash
WORK_DIR="/home/jupyter/A419V_release9/"
cd $WORK_DIR

label="AJ"

grep 40252984 ${label}/${label}_release9_remove_related_lrrk2.blocks.blocks | sed 's/ /|/g' | sed -r 's/^.{2}//'

In [53]:
%%bash
WORK_DIR="/home/jupyter/A419V_release9/"
cd $WORK_DIR

label="AJ"

cat ${label}/${label}_release9_remove_related_lrrk2.blocks.blocks

* 12_40197810_C_A 12_40198586_A_G 12_40198826_A_G 12_40199601_A_G 12_40200768_A_G 12_40204350_A_C 12_40207140_A_G 12_40208138_G_A
* 12_40208422_C_A 12_40209874_C_A 12_40213423_A_T 12_40213907_A_G 12_40215078_A_G 12_40215707_A_G 12_40216942_G_A 12_40217062_C_G 12_40220260_A_G 12_40220632_G_A 12_40220854_G_A 12_40221630_A_G 12_40222612_A_G 12_40223957_A_G 12_40226614_T_A 12_40227006_G_C 12_40228666_C_A 12_40230508_A_G
* 12_40237989_G_A 12_40238792_G_A 12_40239512_A_G 12_40249666_G_A
* 12_40251828_A_G 12_40252732_G_A
* 12_40257073_T_A 12_40258718_G_A 12_40261071_A_G 12_40262377_A_G 12_40262593_G_A 12_40263898_C_G 12_40264353_G_A
* 12_40267251_T_A 12_40272118_G_A 12_40273073_G_C 12_40273861_G_A 12_40276592_G_A 12_40277196_C_A 12_40278187_A_G 12_40279566_A_G 12_40282999_C_A 12_40283045_A_T 12_40283227_C_A 12_40285467_T_A 12_40285609_G_A 12_40287115_A_G 12_40288195_A_G 12_40288634_G_A 12_40293043_A_G
* 12_40293896_A_G 12_40294893_A_G 12_40295786_G_A 12_40296180_A_G 12_40298346_G_A 12_4029925

No block contain p.A419V

### CAH

In [54]:
%%bash
WORK_DIR="/home/jupyter/A419V_release9/"
cd $WORK_DIR

label="CAH"

grep 40252984 ${label}/${label}_release9_remove_related_lrrk2.blocks.blocks | sed 's/ /|/g' | sed -r 's/^.{2}//'

In [55]:
%%bash
WORK_DIR="/home/jupyter/A419V_release9/"
cd $WORK_DIR

label="CAH"

cat ${label}/${label}_release9_remove_related_lrrk2.blocks.blocks

* 12_40199601_A_G 12_40200768_A_G 12_40204350_A_C 12_40207140_A_G 12_40208138_G_A 12_40208422_C_A 12_40209874_C_A 12_40213423_A_T 12_40213907_A_G 12_40215078_G_A 12_40215707_A_G
* 12_40216942_G_A 12_40217062_C_G
* 12_40220854_G_A 12_40221630_A_G 12_40222612_A_G
* 12_40225280_A_G 12_40225499_A_G
* 12_40227006_G_C 12_40228666_C_A
* 12_40237989_A_G 12_40238792_G_A 12_40239512_A_G 12_40240543_G_A 12_40243579_A_C 12_40247932_A_T 12_40249666_G_A
* 12_40251828_G_A 12_40252732_G_A
* 12_40257073_T_A 12_40258718_G_A 12_40261071_A_G 12_40261443_G_A 12_40262377_A_G 12_40262593_G_A 12_40263898_C_G 12_40264353_G_A
* 12_40272118_G_A 12_40273073_G_C
* 12_40277196_C_A 12_40278187_A_G 12_40279566_A_G 12_40282999_C_A 12_40283045_A_T 12_40283227_C_A 12_40283897_G_A 12_40283933_G_A 12_40284011_C_A 12_40285467_T_A
* 12_40287115_A_G 12_40288195_A_G 12_40288634_G_A 12_40293043_A_G 12_40293294_G_A 12_40293552_A_C 12_40293624_G_C 12_40293896_A_G 12_40294893_A_G 12_40295280_G_A 12_40295786_G_A
* 12_40303632_G_A 

### CAS

In [56]:
%%bash
WORK_DIR="/home/jupyter/A419V_release9/"
cd $WORK_DIR

label="CAS"

grep 40252984 ${label}/${label}_release9_remove_related_lrrk2.blocks.blocks | sed 's/ /|/g' | sed -r 's/^.{2}//'

# Calculate R2

In [23]:
%%bash
WORK_DIR=/home/jupyter/A419V_release9
cd $WORK_DIR

ancestry_labels=('AJ' 'CAH' 'CAS' 'EAS' 'EUR')

for label in "${ancestry_labels[@]}"
do

    wc -l ${label}/${label}_release9_remove_related_lrrk2_nodup_exon.bim
    
done

162 AJ/AJ_release9_remove_related_lrrk2_nodup_exon.bim
162 CAH/CAH_release9_remove_related_lrrk2_nodup_exon.bim
162 CAS/CAS_release9_remove_related_lrrk2_nodup_exon.bim
161 EAS/EAS_release9_remove_related_lrrk2_nodup_exon.bim
160 EUR/EUR_release9_remove_related_lrrk2_nodup_exon.bim


In [32]:
%%bash
WORK_DIR=/home/jupyter/A419V_release9
cd $WORK_DIR

ancestry_labels=('AJ' 'CAH' 'CAS' 'EAS' 'EUR')

for label in "${ancestry_labels[@]}"
do

    /home/jupyter/plink1.9 \
    --bfile ${label}/${label}_release9_remove_related_lrrk2_nodup_exon \
    --ld-snp "12_40252984_G_A" \
    --r2 \
    --ld-window 99999 \
    --ld-window-kb 10000000 \
    --ld-window-r2 0 \
    --out ${label}/${label}_a419v_r2
    
done

PLINK v1.90b6.9 64-bit (4 Mar 2019)            www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to AJ/AJ_a419v_r2.log.
Options in effect:
  --bfile AJ/AJ_release9_remove_related_lrrk2_nodup_exon
  --ld-snp 12_40252984_G_A
  --ld-window 99999
  --ld-window-kb 10000000
  --ld-window-r2 0
  --out AJ/AJ_a419v_r2
  --r2

52216 MB RAM detected; reserving 26108 MB for main workspace.
162 variants loaded from .bim file.
3081 people (1934 males, 1147 females) loaded from .fam.
2533 phenotype values loaded from .fam.
Using up to 8 threads (change this with --threads).
Before main variant filters, 3081 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.999088.
162 variants and 3081 people pass filter

In [33]:
%%bash
WORK_DIR=/home/jupyter/A419V_release9
cd $WORK_DIR

ancestry_labels=('AJ' 'CAH' 'CAS' 'EAS' 'EUR')

for label in "${ancestry_labels[@]}"
do

    wc -l ${label}/${label}_a419v_r2.ld
    
done

23 AJ/AJ_a419v_r2.ld
37 CAH/CAH_a419v_r2.ld
28 CAS/CAS_a419v_r2.ld
63 EAS/EAS_a419v_r2.ld
111 EUR/EUR_a419v_r2.ld


In [51]:
pd.set_option("Display.max_rows", None)

In [52]:
df = pd.DataFrame({"SNP_B":""}, index = [0])

labels=['AJ', 'CAH', 'CAS', 'EAS', 'EUR']

for label in labels:
    
    ld = pd.read_csv(f"/home/jupyter/A419V_release9/{label}/{label}_a419v_r2.ld", delim_whitespace = True)
    ld_red = ld[["SNP_B", "R2"]]
    ld_red.rename(columns = {"R2":f"R2_{label}"}, inplace = True)

    df = pd.merge(df, ld_red, on = "SNP_B", how = "outer")

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ld_red.rename(columns = {"R2":f"R2_{label}"}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ld_red.rename(columns = {"R2":f"R2_{label}"}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ld_red.rename(columns = {"R2":f"R2_{label}"}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

Unnamed: 0,SNP_B,R2_AJ,R2_CAH,R2_CAS,R2_EAS,R2_EUR
0,,,,,,
1,12_40232380_A_C,2.11859e-07,,3.431e-05,,3.0292e-08
2,12_40237989_G_A,1.76537e-05,,,,0.00112706
3,12_40251273_G_A,3.17271e-07,,,,1.69905e-06
4,12_40251369_G_A,1.05964e-07,,,3e-06,1.21459e-07
5,12_40252984_G_A,1.0,1.0,1.0,1.0,1.0
6,12_40263898_C_G,6.13238e-05,0.002827,5.12723e-07,0.000534,0.00010186
7,12_40278187_A_G,9.40887e-05,0.001466,0.00205757,0.001594,6.57644e-05
8,12_40293624_G_C,2.12273e-07,1.4e-05,,,3.33944e-07
9,12_40294893_A_G,0.001132,0.047171,0.06784,0.043004,0.00378768
