# MAPT - AAO regression

## GP2 NBA data release 7

## Project: Exploring MAPT-containing H1 and H2 haplotypes  in Parkinson's Disease across diverse populations (Hackathon project 2023)


Version: Python/3.10.12

Last Updated: 14-MAY-2025

Update Description: Added descriptions to the analysis

Gene coordinates for the region of 17q21.31 (containing MAPT) from the UCSC Browser: chr17:42,800,001-46,800,000 (GRCh38/hg38)

Notebook overview: In this notebook we performed analyses looking at the association between PD age at onset (AAO) and MAPT H1 vs H2 haplotypes


## Description:

* Set things up: load Python libraries, set paths to the GP2 data and define functions
* Install packages
* Copy the files
* Create a covariate file
* Remove related individuals
* Remove 'non-PD cases and -controls' individuals
* Set PD cases to status H2 = 1 (H2/H2 and H1/H2) vs H1 = 2 (H1/H1)
* Check if the AAO is normally distributed
* Test if there is a difference between H1 vs H2 carriers for AAO using the Mann-Whitney test
* Run linear regression analyses for AAO

### 1. Set up things

#### Loading Python libraries and defining functions

In [None]:
# Use the os package to interact with the environment
import os

# Bring in Pandas for Dataframe functionality
import pandas as pd

# Numpy for basics
import numpy as np

# Use StringIO for working with file contents
from io import StringIO

# Enable IPython to display matplotlib graphs
import matplotlib.pyplot as plt
%matplotlib inline

# Enable interaction with the FireCloud API
from firecloud import api as fapi

# Import the iPython HTML rendering for displaying links to Google Cloud Console
from IPython.core.display import display, HTML

# Import urllib modules for building URLs to Google Cloud Console
import urllib.parse

# BigQuery for querying data
from google.cloud import bigquery

#Import Sys
import sys as sys

#Import pathlb
import pathlib

In [None]:
# Set up billing project and data path variables
# These four values are read from environment variables; a missing variable
# raises KeyError here rather than failing later in a gsutil call.
BILLING_PROJECT_ID = os.environ['GOOGLE_PROJECT']
WORKSPACE_NAMESPACE = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE_NAME = os.environ['WORKSPACE_NAME']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']

# Workspace attribute map fetched through the FireCloud API
# (falls back to an empty dict when the 'workspace' or 'attributes' key is absent)
WORKSPACE_ATTRIBUTES = fapi.get_workspace(WORKSPACE_NAMESPACE, WORKSPACE_NAME).json().get('workspace',{}).get('attributes',{})

## Print the information to check we are in the proper release and billing
## This will be different for you, the user, depending on the billing project your workspace is on
print('Billing and Workspace')
print(f'Workspace Name: {WORKSPACE_NAME}')
print(f'Billing Project: {BILLING_PROJECT_ID}')
print(f'Workspace Bucket, where you can upload and download data: {WORKSPACE_BUCKET}')
print('')

## GP2 v7.0
## Explicitly define release v7.0 path
## Every other GP2 path below is derived from GP2_RELEASE_PATH
GP2_RELEASE_PATH = 'gs://gp2tier2/path/to/release/7'
GP2_CLINICAL_RELEASE_PATH = f'{GP2_RELEASE_PATH}/clinical_data'
GP2_RAW_GENO_PATH = f'{GP2_RELEASE_PATH}/raw_genotypes'
GP2_IMPUTED_GENO_PATH = f'{GP2_RELEASE_PATH}/imputed_genotypes'
GP2_META_RELEASE_PATH = f'{GP2_RELEASE_PATH}/meta_data'
GP2_SUMSTAT_RELEASE_PATH = f'{GP2_RELEASE_PATH}/summary_statistics'

# Echo the derived paths so the release in use is visible in the notebook output
print('GP2 v7.0')
print(f'Path to GP2 v7.0 Clinical Data @ `GP2_CLINICAL_RELEASE_PATH`: {GP2_CLINICAL_RELEASE_PATH}')
print(f'Path to GP2 v7.0 Metadata @ `GP2_META_RELEASE_PATH`: {GP2_META_RELEASE_PATH}')
print(f'Path to GP2 v7.0 Raw Genotype Data @ `GP2_RAW_GENO_PATH`: {GP2_RAW_GENO_PATH}')
print(f'Path to GP2 v7.0 Imputed Genotype Data @ `GP2_IMPUTED_GENO_PATH`: {GP2_IMPUTED_GENO_PATH}')
print(f'Path to GP2 v7.0 summary statistics: {GP2_SUMSTAT_RELEASE_PATH}')

In [None]:
# Utility routine for printing a shell command before executing it
def shell_do(command):
    """Echo a shell command to stderr, then run it with IPython's `!` shell escape.

    `command` is expanded through `$command`, so this only works inside an
    IPython/Jupyter session. Nothing is returned.
    """
    print(f'Executing: {command}', file=sys.stderr)
    !$command
    
def shell_return(command):
    """Echo a shell command to stderr, run it, and return its output as one string.

    The lines captured by IPython's `!` shell escape are joined with newlines.
    Only works inside an IPython/Jupyter session.
    """
    print(f'Executing: {command}', file=sys.stderr)
    output = !$command
    return '\n'.join(output)

# Utility routine for printing a query before executing it
def bq_query(query):
    """Log a BigQuery query to stderr and return its result as a DataFrame.

    The query is run with standard SQL and billed to BILLING_PROJECT_ID.
    """
    print(f'Executing: {query}', file=sys.stderr)
    result = pd.read_gbq(
        query,
        project_id=BILLING_PROJECT_ID,
        dialect='standard',
    )
    return result

# Utility routine for displaying a message and a link
def display_html_link(description, link_text, url):
    """Render `description` followed by a hyperlink in the notebook output.

    The link shows `link_text`, points at `url` and opens in a new tab.
    The values are inserted into the HTML as-is (no escaping).
    """
    # Assembled line by line; the joined result is the same markup, including
    # the leading newline and the trailing indented line.
    markup_lines = [
        '',
        '    <p>',
        '    </p>',
        '    <p>',
        f'    {description}',
        f'    <a target=_blank href="{url}">{link_text}</a>.',
        '    </p>',
        '    ',
    ]
    display(HTML('\n'.join(markup_lines)))

# Utility routines for reading files from Google Cloud Storage
def gcs_read_file(path):
    """Return the contents of a file in GCS as a single newline-joined string.

    Runs `gsutil cat` through IPython's `!` shell escape with
    BILLING_PROJECT_ID passed to `-u`, so it only works inside an
    IPython/Jupyter session.
    """
    contents = !gsutil -u {BILLING_PROJECT_ID} cat {path}
    return '\n'.join(contents)
    
def gcs_read_csv(path, sep=None):
    """Read a delimited text file stored in GCS into a DataFrame.

    `sep` is passed straight to pandas; with the default None the python
    parsing engine sniffs the delimiter.
    """
    text_buffer = StringIO(gcs_read_file(path))
    frame = pd.read_csv(text_buffer, sep=sep, engine='python')
    return frame

# Utility routine for displaying a message and link to Cloud Console
def link_to_cloud_console_gcs(description, link_text, gcs_path):
    """Display a link to the Cloud Console storage browser for `gcs_path`.

    The `gs://` scheme is stripped from the path and BILLING_PROJECT_ID is
    appended as the `userProject` query parameter.
    """
    bucket_and_object = gcs_path.replace("gs://","")
    browser_url = os.path.join('https://console.cloud.google.com/storage/browser',
                               bucket_and_object)
    query_string = urllib.parse.urlencode({'userProject': BILLING_PROJECT_ID})

    display_html_link(description, link_text, f'{browser_url}?{query_string}')

In [None]:
%%capture
%%bash

# Install plink 1.9 (skipped when the renamed binary is already present)
cd /home/jupyter/
# The archive's `plink` binary is renamed to `plink1.9` below, so that is the
# name to test for. Testing for `plink` never matches after an install and
# forces a re-download on every run.
if test -e /home/jupyter/plink1.9; then

echo "Plink is already installed in /home/jupyter/"
else
echo "Plink is not installed"
cd /home/jupyter

wget http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20190304.zip 

unzip -o plink_linux_x86_64_20190304.zip
# Rename so the 1.9 binary is addressed as plink1.9 by the later cells
mv plink plink1.9

fi

#### Install plink v1.9 and v2.0

In [None]:
%%capture
%%bash

# Install plink 1.9 (skipped when the renamed binary is already present)
cd /home/jupyter/
# The archive's `plink` binary is renamed to `plink1.9` below, so that is the
# name to test for. Testing for `plink` never matches after an install and
# forces a re-download on every run.
if test -e /home/jupyter/plink1.9; then

echo "Plink is already installed in /home/jupyter/"
else
echo "Plink is not installed"
cd /home/jupyter

wget http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20190304.zip 

unzip -o plink_linux_x86_64_20190304.zip
# Rename so the 1.9 binary is addressed as plink1.9 by the later cells
mv plink plink1.9

fi

In [None]:
%%capture
%%bash

# Install plink 2.0 into /home/jupyter/ unless the binary is already there
cd /home/jupyter/
if [ -e /home/jupyter/plink2 ]; then
    echo "Plink2 is already installed in /home/jupyter/"
else
    echo "Plink2 is not installed"
    # Already in /home/jupyter/ from the cd above: download and unpack in place
    wget http://s3.amazonaws.com/plink2-assets/plink2_linux_x86_64_latest.zip
    unzip -o plink2_linux_x86_64_latest.zip
fi

In [None]:
%%bash

# Give the current user execute permission on the plink 1.9 binary
# (the file renamed to plink1.9 by the install cell)
chmod u+x /home/jupyter/plink1.9

In [None]:
%%bash

# Give the current user execute permission on the plink 2.0 binary
chmod u+x /home/jupyter/plink2

In [None]:
# Create the local working directory used by the cells below
print("Making a working directory")
WORK_DIR = '/home/jupyter/Subhaplotypes/'
# `mkdir -p` succeeds even when the directory already exists
shell_do('mkdir -p ' + WORK_DIR)


In [None]:
# Check directory where GP2 Tier 2 data is
# Lists the imputed-genotype folder of one ancestry (CAH) to see which files it holds
print("List available imputed genotype information in GP2")
shell_do(f'gsutil -u {BILLING_PROJECT_ID} ls {GP2_IMPUTED_GENO_PATH}/CAH')

#### Retrieve the files

In [None]:
# Copy the imputed chr17 files of every ancestry into the working directory.
# The tuple keeps the order in which the copies were originally issued.
for label in ('AFR', 'AAC', 'AMR', 'AJ', 'EUR', 'EAS', 'CAH', 'CAS', 'MDE', 'SAS'):
    source_glob = f'{GP2_IMPUTED_GENO_PATH}/{label}/chr17_{label}_release7*'
    shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {source_glob} {WORK_DIR}')

Also the eigenvec and eigenval files:

In [None]:
# Check the raw-genotype directory of one ancestry (CAH); this is where the
# eigenvec/eigenval files copied in the next cell come from.
# The message now names the raw-genotype path that is actually listed
# (it previously said "imputed").
print("List available raw genotype information in GP2")
shell_do(f'gsutil -u {BILLING_PROJECT_ID} ls {GP2_RAW_GENO_PATH}/CAH')

In [None]:
# Copy each ancestry's eigenval and eigenvec file into the working directory.
# Ancestries and extensions are iterated in the order the copies were originally issued.
for label in ('AFR', 'AAC', 'AMR', 'AJ', 'CAH', 'CAS', 'EUR', 'EAS', 'MDE', 'SAS'):
    for extension in ('eigenval', 'eigenvec'):
        source_file = f'{GP2_RAW_GENO_PATH}/{label}/{label}_release7.{extension}'
        shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {source_file} {WORK_DIR}')

In [None]:
# Select the file that matches with your population
# Lists the related_samples folder of the release metadata
shell_do(f'gsutil -u {BILLING_PROJECT_ID} ls {GP2_META_RELEASE_PATH}/related_samples/')

In [None]:
# Copy the related-samples file of every ancestry into the working directory.
# An ordered list is used instead of a set: set iteration order is arbitrary,
# which made the value of `ancestry` left behind after the loop
# unpredictable between runs.
ancestries = ['AAC', 'AFR', 'AJ', 'AMR', 'CAS', 'EAS', 'EUR', 'FIN', 'MDE', 'SAS', 'CAH']

for ancestry in ancestries:

    shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {GP2_META_RELEASE_PATH}/related_samples/{ancestry}_release7.related {WORK_DIR}')


In [None]:
# Print the related-samples file of one ancestry (CAH) to inspect its layout
!cat /home/jupyter/Subhaplotypes/CAH_release7.related

The IDs are:
ID1: Individual ID for the first individual of the pair
ID2: Individual ID for the second individual of the pair
For each related pair we remove the individual listed under ID1, so that only one person of the pair is excluded

#### Create a covariate file:

In [None]:
# List the clinical-data folder, then copy the release 7 master key into the working directory
shell_do(f'gsutil -u {BILLING_PROJECT_ID} ls {GP2_CLINICAL_RELEASE_PATH}')
shell_do(f'gsutil -u {BILLING_PROJECT_ID} -m cp -r {GP2_CLINICAL_RELEASE_PATH}/master_key_release7_final.csv {WORK_DIR}')

In [None]:
# Load the master key copied above and print its column names, dtypes and non-null counts
clin = pd.read_csv('/home/jupyter/Subhaplotypes/master_key_release7_final.csv')
clin.info()

In [None]:
# Subset the master key to the columns needed for the covariate files and
# rename them to the short names used in the rest of the notebook.
# The rename is chained onto the selection so that `key` is a new DataFrame;
# renaming a slice of `clin` in place triggers pandas' SettingWithCopyWarning.
key = clin[['GP2sampleID', 'baseline_GP2_phenotype_for_qc', 'biological_sex_for_qc',
            'age_at_sample_collection', 'age_of_onset', 'label']].rename(
    columns={'GP2sampleID': 'IID',
             'baseline_GP2_phenotype_for_qc': 'phenotype',
             'biological_sex_for_qc': 'SEX',
             'age_at_sample_collection': 'AGE',
             'age_of_onset': 'AAO'})
key

In [None]:
# Show the distinct phenotype labels; only "PD" and "Control" are mapped to a PHENO code later on
print(key['phenotype'].unique())


In [None]:
 ## Get the PCs
# Preview one eigenvec file to check its layout.
# The ancestry is named explicitly: the `ancestry` variable left over from the
# download loop comes from iterating a set, so it could be any ancestry,
# including 'FIN', for which no eigenvec file was copied.
preview_ancestry = 'CAH'
pcs = pd.read_csv(f'/home/jupyter/Subhaplotypes/{preview_ancestry}_release7.eigenvec', sep='\t')

pcs.head()

In [None]:
# Build one covariate file and one sample-ID list per ancestry.
# An ordered list (not a set) keeps the per-ancestry log output in a reproducible order.
ancestries = ['AAC', 'AFR', 'AJ', 'AMR', 'CAS', 'EAS', 'EUR', 'MDE', 'SAS', 'CAH']

for ancestry in ancestries:

    print(f'WORKING ON: {ancestry}')

    ## Subset to keep ancestry of interest
    # reset_index returns a new frame, so the result has to be assigned
    ancestry_key = key[key['label']==ancestry].copy().reset_index(drop=True)

    # Load information about related individuals in the ancestry analyzed
    related_df = pd.read_csv(f'/home/jupyter/Subhaplotypes/{ancestry}_release7.related')
    print(f'Related individuals: {related_df.shape}')

    # Make a list of just one set of related people
    related_list = list(related_df['IID1'])

    # Remove only the IID1 member of each related pair
    # (.copy() so the column assignments below act on an independent frame)
    ancestry_key = ancestry_key[~ancestry_key["IID"].isin(related_list)].copy()

    # Check size
    print(f'Unrelated individuals: {ancestry_key.shape}')

    # Convert phenotype to binary (1/2)
    ## PD = 2; control = 1 (PLINK convention); any other label becomes <NA>
    pheno_mapping = {"PD": 2, "Control": 1}
    ancestry_key['PHENO'] = ancestry_key['phenotype'].map(pheno_mapping).astype('Int64')

    # Check value counts of pheno (printed: a bare expression inside a loop shows nothing)
    pheno_values = ancestry_key['PHENO'].value_counts(dropna=False)
    print(f'Phenotype value counts:\n{pheno_values.to_string()}')

    ## Get the PCs
    pcs = pd.read_csv(f'/home/jupyter/Subhaplotypes/{ancestry}_release7.eigenvec', sep='\t')

    # Select the sample ID and the first 10 PCs (file columns 2-12) and rename them.
    # read_csv has already consumed the header line, so row 0 is the first
    # sample and must NOT be dropped (the previous `pcs.drop(0)` discarded it).
    selected_columns = ['IID', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']
    pcs = pcs.iloc[:, 1:12].copy()
    pcs.columns = selected_columns

    # Check size
    print(f'PCs: {pcs.shape}')

    # Check value counts of SEX
    sex_og_values = ancestry_key['SEX'].value_counts(dropna=False)
    print(f'Sex value counts - original:\n {sex_og_values.to_string()}')

    # Convert sex to binary (1/2)
    ## Female = 2; Male = 1 (PLINK convention); any other value becomes <NA>
    sex_mapping = {"Female": 2, "Male": 1}
    ancestry_key['SEX'] = ancestry_key['SEX'].map(sex_mapping).astype('Int64')

    # Check value counts of SEX after recoding
    sex_recode_values = ancestry_key['SEX'].value_counts(dropna=False)
    print(f'Sex value counts - recoded:\n{sex_recode_values.to_string()}')

    ## Make covariate file
    # Left merge: every sample with PCs is kept; samples that are absent from
    # ancestry_key (e.g. the removed relatives) get missing covariates.
    df = pd.merge(pcs, ancestry_key, on='IID', how='left')
    print(f'Check columns for covariate file: {df.columns}')

    #Make additional columns - FID, fatid and matid - these are needed for RVtests!!
    #RVtests needs the first 5 columns to be fid, iid, fatid, matid and sex otherwise it does not run correctly
    #Uppercase column name is ok
    #See https://zhanxw.github.io/rvtests/#phenotype-file
    df['FID'] = 0
    df['FATID'] = 0
    df['MATID'] = 0

    ## Clean up and keep columns we need
    final_df = df[['FID','IID', 'FATID', 'MATID', 'SEX', 'AGE','AAO', 'PHENO','PC1', 'PC2', 'PC3', 'PC4', 'PC5','PC6', 'PC7', 'PC8', 'PC9', 'PC10']].copy()

    ##DO NOT replace missing values with -9 as this is misinterpreted by RVtests - needs to be nonnumeric
    #Leave missing values as NA

    #Check number of PD cases missing age
    pd_missAge = final_df[(final_df['PHENO']==2)&(final_df['AGE'].isna())]
    print(f'Number of PD cases missing age: {pd_missAge.shape[0]}')

    #Check number of controls missing age
    control_missAge = final_df[(final_df['PHENO']==1)&(final_df['AGE'].isna())]
    print(f'Number of controls missing age: {control_missAge.shape[0]}')

    ## Make file of sample IDs to keep
    samples_toKeep = final_df[['FID', 'IID']].copy()

    # Define paths for saving files locally
    output_dir = pathlib.Path(f'/home/jupyter/Subhaplotypes/{ancestry}')
    output_dir.mkdir(parents=True, exist_ok=True)

    samplestokeep_path = output_dir / f'{ancestry}.samplestokeep'
    samples_toKeep.to_csv(samplestokeep_path, sep='\t', index=False, header=None)

    # Save the final DataFrame locally
    final_file_path = output_dir / f'{ancestry}_covariates.tsv'
    final_df.to_csv(final_file_path, sep='\t', index=False)

    print(f'Files saved for {ancestry} in {output_dir}\n')


In [None]:
# Load one saved covariate file and print its columns, dtypes and non-null counts
# NOTE(review): the variable is named `EAS` but the file read is the EUR one — confirm which ancestry was intended
EAS = pd.read_csv('/home/jupyter/Subhaplotypes/EUR/EUR_covariates.tsv', sep='\t')
EAS.info()

In [None]:
# Show the first rows of the covariate table loaded in the previous cell
EAS.head()

#### Remove related individuals


In [None]:
%%bash

# Write, for every ancestry, the list of related sample IDs to remove
ancestries=('AAC' 'AFR' 'AJ' 'AMR' 'CAS' 'EAS' 'EUR' 'MDE' 'SAS' 'CAH')

WORK_DIR='/home/jupyter/Subhaplotypes/'
cd $WORK_DIR

for ancestry in "${ancestries[@]}"; do
    echo "Processing ancestry: $ancestry"
    # Skip the header line and keep the 2nd comma-separated column.
    # NOTE(review): the covariate cell filters on the column named IID1;
    # confirm that column 2 of the .related file is that same column.
    tail -n +2 "${ancestry}_release7.related" | cut -d, -f2 > "${ancestry}_related_ids.txt"
done

In [None]:
# Print the extracted related-ID list of one ancestry (CAS) as a spot check
!cat /home/jupyter/Subhaplotypes/CAS_related_ids.txt

In [None]:
%%bash

# Remove the related individuals from each ancestry's chr17 pfile set
WORK_DIR='/home/jupyter/Subhaplotypes/'
cd $WORK_DIR

ancestries=('AAC' 'AFR' 'AJ' 'AMR' 'CAS' 'EAS' 'EUR' 'MDE' 'SAS' 'CAH')

for ancestry in "${ancestries[@]}"; do
    echo "Processing ancestry: $ancestry"
    # --remove drops the samples listed in <ancestry>_related_ids.txt;
    # the result is written as a new pfile set with the _nonrelated suffix
    /home/jupyter/plink2 \
        --pfile chr17_${ancestry}_release7 \
        --remove ${ancestry}_related_ids.txt \
        --make-pgen \
        --out chr17_${ancestry}_release7_nonrelated
done

#### Remove non-PD case control individuals

The prune flag keeps only those samples with a PLINK phenotype of 1 (control) or 2 (case); samples with a missing phenotype are removed

In [None]:

%%bash

# Keep only samples with a case/control phenotype in each ancestry's non-related pfile set
WORK_DIR='/home/jupyter/Subhaplotypes/'
cd $WORK_DIR

ancestries=('AAC' 'AFR' 'AJ' 'AMR' 'CAS' 'EAS' 'EUR' 'MDE' 'SAS' 'CAH')

for ancestry in "${ancestries[@]}"; do
    echo "Processing ancestry: $ancestry"
    # NOTE(review): --prune is documented for PLINK 1.9; confirm that the
    # installed plink2 build accepts it (plink2 documents --require-pheno).
    /home/jupyter/plink2 \
        --pfile chr17_${ancestry}_release7_nonrelated \
        --prune \
        --make-pgen \
        --out chr17_${ancestry}_release7_nonrelated_pdc
done

In [None]:
%%bash

# Extract the single SNP of interest (chr17:45996523) for every ancestry

WORK_DIR='/home/jupyter/Subhaplotypes/'
cd $WORK_DIR

ancestries=('AAC' 'AFR' 'AJ' 'AMR' 'CAS' 'EAS' 'EUR' 'MDE' 'SAS' 'CAH')

for ancestry in "${ancestries[@]}"; do
    echo "Processing ancestry: $ancestry"
    # plink2 was installed and chmod-ed at /home/jupyter/plink2; there is no
    # /home/jupyter/tools/ directory in this notebook.
    # --from-bp/--to-bp are equal, so exactly one position is kept;
    # --mac 20 drops the variant if its minor allele count is below 20;
    # --mind is given without a value, so plink2's default threshold applies.
    /home/jupyter/plink2 \
    --pfile chr17_${ancestry}_release7_nonrelated_pdc \
    --chr 17 \
    --from-bp 45996523  \
    --to-bp 45996523 \
    --mac 20 \
    --mind \
    --make-pgen \
    --out ${ancestry}_haplo_h1h2
done

#### Set PD cases to status H2 = 1 (H2/H2 and H1/H2) vs H1 = 2 (H1/H1)

In [None]:

# Flag every sample in the cleaned covariate file as H1/H1 (2) or not (1)
# NOTE(review): the input `<ancestry>_covariates_cleaned.tsv` is written by the
# "remove rows with missing data" cell that follows this one — presumably that
# cell has to be run first.
# NOTE(review): the `<ancestry>_h1h1_cases.tsv` inputs are not created anywhere
# in this notebook; confirm where they come from.
ancestries = ['AAC', 'AFR', 'AJ', 'AMR', 'CAS', 'EAS', 'EUR', 'MDE', 'SAS', 'CAH']

# Input/output locations
covariates_dir = "/home/jupyter/Subhaplotypes"
h1h1_cases_dir = "/home/jupyter/Team6_haplo"

for ancestry in ancestries:
    print(f"Processing ancestry: {ancestry}")

    covariates_file = f"{covariates_dir}/{ancestry}/{ancestry}_covariates_cleaned.tsv"
    h1h1_cases_file = f"{h1h1_cases_dir}/{ancestry}_h1h1_cases.tsv"
    output_file = f"{covariates_dir}/{ancestry}/{ancestry}_covariates_updated.tsv"

    # An ancestry is skipped as soon as either input is missing
    try:
        covariates = pd.read_csv(covariates_file, sep='\t')
        h1h1_cases = pd.read_csv(h1h1_cases_file, sep='\t')
    except FileNotFoundError:
        print(f"Skipping {ancestry}: One of the files is missing.")
        continue

    # Set membership keeps the per-sample lookup cheap
    h1_iids = set(h1h1_cases['IID'])
    covariates['H1H1_Status'] = [2 if iid in h1_iids else 1 for iid in covariates['IID']]

    covariates.to_csv(output_file, sep='\t', index=False)
    print(f"Updated file saved as: {output_file}")

    # Report how many samples fall in each group
    counts = covariates['H1H1_Status'].value_counts()
    print(f"Counts for {ancestry}:")
    print(counts, "\n")

print("Processing complete!")


In [None]:
# Write a "cleaned" copy of each ancestry's covariate file without incomplete rows
# NOTE(review): dropna() removes a row when ANY column is missing — presumably
# that also removes controls, which have no AAO; confirm this is intended.
ancestries = ['AAC', 'AFR', 'AJ', 'AMR', 'CAS', 'EAS', 'EUR', 'MDE', 'SAS', 'CAH']
base_dir = '/home/jupyter/Subhaplotypes'

for ancestry in ancestries:
    print(f"Processing ancestry: {ancestry}")
    ancestry_dir = os.path.join(base_dir, ancestry)
    input_file = os.path.join(ancestry_dir, f"{ancestry}_covariates.tsv")
    output_file = os.path.join(ancestry_dir, f"{ancestry}_covariates_cleaned.tsv")

    # Nothing to clean when the covariate file was never produced
    if not os.path.exists(input_file):
        print(f"File not found: {input_file}")
        continue

    df = pd.read_csv(input_file, sep='\t')
    df_cleaned = df.dropna()
    df_cleaned.to_csv(output_file, sep='\t', index=False)
    print(f"Cleaned file saved: {output_file}")


#### Check if the AAO is normally distributed

In [None]:
# Check whether AAO is normally distributed within each haplotype group

import scipy.stats as stats

# Group sizes for which the Shapiro-Wilk test is run: 3 <= n < 5000
SHAPIRO_MIN_N = 3
SHAPIRO_MAX_N = 5000


def shapiro_test(data, group_name):
    """Run the Shapiro-Wilk test on one group and print the p-value.

    Args:
        data: the AAO values of the group (anything `len()` and
            `scipy.stats.shapiro` accept).
        group_name: label printed next to the result.

    The message prefix is taken from the module-level `ancestry` variable set
    by the loop below. When the group is outside the supported size range the
    test is skipped and the reason (too small / too large) is printed; the
    previous version reported "too large" in both cases.
    """
    n = len(data)
    if n < SHAPIRO_MIN_N:
        print(f"{ancestry} - {group_name}: Sample size too small for Shapiro-Wilk test (n={n})")
    elif n >= SHAPIRO_MAX_N:
        print(f"{ancestry} - {group_name}: Sample size too large for Shapiro-Wilk test")
    else:
        _, shapiro_p = stats.shapiro(data)
        print(f"{ancestry} - {group_name}: p={shapiro_p:.5f} (p < 0.05 means NOT normal)")


# Define ancestry groups
ancestries = ['AAC', 'AFR', 'AJ', 'AMR', 'CAS', 'EAS', 'EUR', 'MDE', 'SAS', 'CAH']

# Loop through each ancestry group
for ancestry in ancestries:
    file_path = f"/home/jupyter/Subhaplotypes/{ancestry}/{ancestry}_covariates_updated.tsv"

    # Best effort: any problem with one ancestry is reported and the loop continues
    try:
        # Load data
        df = pd.read_csv(file_path, sep='\t')

        # Filter for PHENO = 2 and remove missing AAO values
        df = df[df['PHENO'] == 2].dropna(subset=['AAO'])

        # Split into groups
        group_1 = df[df['H1H1_Status'] == 1]['AAO']
        group_2 = df[df['H1H1_Status'] == 2]['AAO']

        # Run the test for both groups
        print(f"\n### Testing Normality for {ancestry} ###")
        shapiro_test(group_1, "H1H1_Status = 1")
        shapiro_test(group_2, "H1H1_Status = 2")

    except Exception as e:
        print(f"Error processing {ancestry}: {e}")


#### Test if there is a difference between H1 vs H2 carriers for AAO using Mann-Whitney test
We are using the Mann-Whitney test because the AAO data were not normally distributed

In [None]:
# Compare AAO between the two H1H1_Status groups among PD cases (Mann-Whitney U)

from scipy.stats import mannwhitneyu

ancestries = ['AAC', 'AFR', 'AJ', 'AMR', 'CAS', 'EAS', 'EUR', 'MDE', 'SAS', 'CAH']

for ancestry in ancestries:
    file_path = f"/home/jupyter/Subhaplotypes/{ancestry}/{ancestry}_covariates_updated.tsv"

    # Ancestries without an updated covariate file are skipped
    try:
        df = pd.read_csv(file_path, sep='\t')
    except FileNotFoundError:
        print(f"{ancestry} - File not found, skipping.")
        continue

    # PD cases (PHENO == 2) with a recorded AAO
    df = df[df['PHENO'] == 2].dropna(subset=['AAO'])

    # AAO values per status group
    group_1 = df.loc[df['H1H1_Status'] == 1, 'AAO']
    group_2 = df.loc[df['H1H1_Status'] == 2, 'AAO']

    # Each group needs at least two observations
    if min(len(group_1), len(group_2)) <= 1:
        print(f"{ancestry} - Not enough data for statistical test.")
        continue

    u_stat, p_u = mannwhitneyu(group_1, group_2, alternative='two-sided')
    print(f"{ancestry} - Mann-Whitney U test p-value: {p_u:.5f}")

print("Statistical tests completed for all ancestries.")



#### Run linear regression analyses for H1/H2 vs AAO

In [None]:
%%bash

# Linear regression of AAO on the H1/H2 SNP for every ancestry
WORK_DIR='/home/jupyter/Subhaplotypes/'
cd $WORK_DIR

ancestries=('AAC' 'AFR' 'AJ' 'AMR' 'CAS' 'EAS' 'EUR' 'MDE' 'SAS' 'CAH')

for ancestry in "${ancestries[@]}"; do
    echo "Processing ancestry: $ancestry"
    # plink2 was installed and chmod-ed at /home/jupyter/plink2; there is no
    # /home/jupyter/tools/ directory in this notebook.
    # Phenotype (AAO) and covariates (SEX + PC1-PC10) are read from the same
    # cleaned covariate file; hide-covar omits the covariate rows from the output.
    /home/jupyter/plink2 \
    --pfile ${ancestry}_haplo_h1h2 \
    --glm hide-covar --ci 0.95 \
    --covar /home/jupyter/Subhaplotypes/${ancestry}/${ancestry}_covariates_cleaned.tsv \
    --covar-name SEX,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10 \
    --pheno-name AAO \
    --pheno ${ancestry}/${ancestry}_covariates_cleaned.tsv \
    --covar-variance-standardize \
    --out /home/jupyter/Subhaplotypes/${ancestry}_H1_H2_AAO
done


In [None]:
import pandas as pd

# Show the first rows of each ancestry's AAO regression output
BASE_DIR = '/home/jupyter/Subhaplotypes'
ancestries = ['AAC', 'AFR', 'AJ', 'AMR', 'CAS', 'EAS', 'EUR', 'MDE', 'SAS', 'CAH']

for ancestry in ancestries:
    file_path = f"{BASE_DIR}/{ancestry}_H1_H2_AAO.AAO.glm.linear"

    # Ancestries without a result file are reported and skipped
    try:
        df = pd.read_csv(file_path, sep='\t')
    except FileNotFoundError:
        print(f"File not found for ancestry {ancestry}: {file_path}")
        continue

    print(f"Displaying contents of {ancestry} file:")
    # Centre-aligned styled table for nicer rendering in the notebook
    centered_preview = df.head().style.set_properties(**{'text-align': 'center'})
    display(centered_preview)
    print("\n" + "-"*50 + "\n")


Flip the ref and alt allele

In [None]:
%%bash
cd /home/jupyter/Subhaplotypes
# Inspect REF/ALT of the extracted SNP. The extract step writes
# <ancestry>_haplo_h1h2.pvar (no trailing underscore before the extension).
head -20 AAC_haplo_h1h2.pvar

In [None]:
## Prepare alleles.txt, the file passed to plink2 --ref-allele to flip major/minor alleles
# Variant ID and the allele that should become the reference allele
snp = 'chr17:45996523:A:G'
allele = 'G'

# Where the one-line "<variant ID> <allele>" file is written
output_file = '/home/jupyter/Subhaplotypes/alleles.txt'

pathlib.Path(output_file).write_text(f"{snp} {allele}\n")

In [None]:
%%bash

# Force the allele listed in alleles.txt to be the reference allele
WORK_DIR='/home/jupyter/Subhaplotypes/'
cd $WORK_DIR

ancestries=('AAC' 'AFR' 'AJ' 'AMR' 'CAS' 'EAS' 'EUR' 'MDE' 'SAS' 'CAH')

for ancestry in "${ancestries[@]}"; do
    echo "Processing ancestry: $ancestry"
    # Two path fixes: plink2 lives at /home/jupyter/plink2 (not under tools/),
    # and the extract step writes <ancestry>_haplo_h1h2 without a trailing underscore.
    /home/jupyter/plink2 \
    --pfile ${ancestry}_haplo_h1h2 \
    --ref-allele force alleles.txt \
    --make-pgen \
    --out ${ancestry}_flipped_plink_file
done

In [None]:
%%bash

# Print the start of the flipped EUR .pvar file to check REF/ALT after --ref-allele
WORK_DIR='/home/jupyter/Subhaplotypes/'
cd $WORK_DIR
head EUR_flipped_plink_file.pvar

In [None]:
%%bash

# Linear regression of AAO on the H1/H2 SNP, using the allele-flipped files
WORK_DIR='/home/jupyter/Subhaplotypes/'
cd $WORK_DIR

ancestries=('AAC' 'AFR' 'AJ' 'AMR' 'CAS' 'EAS' 'EUR' 'MDE' 'SAS' 'CAH')

for ancestry in "${ancestries[@]}"; do
    echo "Processing ancestry: $ancestry"
    # plink2 was installed and chmod-ed at /home/jupyter/plink2; there is no
    # /home/jupyter/tools/ directory in this notebook.
    # NOTE(review): `a0-ref` is presumably the older spelling of the
    # `omit-ref` modifier; confirm the installed plink2 build still accepts it.
    /home/jupyter/plink2 \
    --pfile ${ancestry}_flipped_plink_file \
    --glm a0-ref hide-covar --ci 0.95 \
    --covar /home/jupyter/Subhaplotypes/${ancestry}/${ancestry}_covariates_cleaned.tsv \
    --covar-name SEX,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10 \
    --pheno-name AAO \
    --pheno ${ancestry}/${ancestry}_covariates_cleaned.tsv \
    --covar-variance-standardize \
    --out /home/jupyter/Subhaplotypes/${ancestry}_H1_H2_AAO_flip
done

In [None]:
import pandas as pd

# Show the first rows of each ancestry's AAO regression output (allele-flipped run)
BASE_DIR = '/home/jupyter/Subhaplotypes'
ancestries = ['AAC', 'AFR', 'AJ', 'AMR', 'CAS', 'EAS', 'EUR', 'MDE', 'SAS', 'CAH']

for ancestry in ancestries:
    file_path = f"{BASE_DIR}/{ancestry}_H1_H2_AAO_flip.AAO.glm.linear"

    # Ancestries without a result file are reported and skipped
    try:
        df = pd.read_csv(file_path, sep='\t')
    except FileNotFoundError:
        print(f"File not found for ancestry {ancestry}: {file_path}")
        continue

    print(f"Displaying contents of {ancestry} file:")
    # Centre-aligned styled table for nicer rendering in the notebook
    centered_preview = df.head().style.set_properties(**{'text-align': 'center'})
    display(centered_preview)
    print("\n" + "-"*50 + "\n")
