# GP2 R10 Imputed Data

* **Project:** ADRD-SORL1-Biobanks
* **Version:** Python/3.10
* **Last Updated:** 21-Aug-2025

## Notebook Overview
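Haplotype analysis of the SORL1 region in the GP2 release 10 imputed genotypes, run separately for each ancestry: build covariate and phenotype files, extract and filter the region with PLINK, phase with Beagle, export Haploview input, then clean, filter and plot the Haploview association results and estimate haplotype blocks for cases and controls.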

### Variable setup

In [2]:
ancestry = "EUR"
ancestry_list = ["AAC", "AFR", "AJ", "AMR","CAH", "CAS", "EUR", "EAS", "FIN", "MDE", "SAS"]
CHROM = 'chr11'
GENE = 'SORL1'
WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
! mkdir -p {WORK_DIR}
DATA_DIR = f'/home/jupyter/workspace'


In [3]:
import pandas as pd

### Make covariate files

In [None]:
cohort_stats = []
for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
    ! mkdir -p {WORK_DIR}
    
    # Let's load the master key
    key = pd.read_csv(f'{DATA_DIR}/gp2_tier2_eu_release10/clinical_data/master_key_release10_final_vwb.csv', header=0)
    print(key.shape)
    key = key[['GP2ID', 'baseline_GP2_phenotype', 'baseline_GP2_phenotype_for_qc', 'biological_sex_for_qc', 'age_at_sample_collection', 'age_of_onset',  "age_at_diagnosis",'race_for_qc','nba', 'wgs', 'nba_label', 'study_type']]
    key.rename(columns = {'GP2ID': 'IID',
                          'baseline_GP2_phenotype':'phenotype',
                                         'biological_sex_for_qc':'SEX', 
                                         'age_at_sample_collection':'AGE', 
                                         'race_for_qc':'RACE',
                                         'age_of_onset':'AAO',
                                     "age_at_diagnosis":"AAD"}, inplace = True)
    
    ## Subset to keep ancestry of interest 
    ancestry_key = key[key['nba_label']==ancestry].copy()
    ancestry_key.reset_index(drop=True)
    
    # Load information about related individuals 
    try:
        related_df = pd.read_csv(f'{DATA_DIR}/gp2_tier2_eu_release10/meta_data/related_samples/{ancestry}_release10_vwb.related')
        print(related_df.shape)
        # Make a list of just one set of related people
        related_list = list(related_df['IID1'])
    
        # remove related individuals
        print(f"Removing {len(related_list)} individuals from {ancestry}.")
    
        ancestry_key = ancestry_key[~ancestry_key["IID"].isin(related_list)]


        ancestry_key
    except:
        print("Warning. No related samples file found. Removing 0 samples.")

    
    pheno_mapping = {"PD": 2, "Control": 1}
    ancestry_key['PHENO'] = ancestry_key['phenotype'].map(pheno_mapping).astype('Int64')
    
    ## Get the PCs
    pcs = pd.read_csv(f'{DATA_DIR}/gp2_tier2_eu_release10/raw_genotypes/{ancestry}/{ancestry}_release10_vwb.eigenvec', sep='\t')
    selected_columns = ['IID', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']
    pcs = pd.DataFrame(data=pcs.iloc[:, 1:7].values, columns=selected_columns)
    
    # Drop the first row (since it's now the column names)
    pcs = pcs.drop(0)
    
    # Reset the index to remove any potential issues
    pcs = pcs.reset_index(drop=True)
    
    # Convert phenotype to binary (1/2)
    ## Assign conditions so female=2 and men=1, and -9 otherwise (matching PLINK convention)
    # Female = 2; Male = 1
    sex_mapping = {"Female": 2, "Male": 1}
    ancestry_key["SEX"] = ancestry_key['SEX'].map(sex_mapping).astype('Int64')
    
    # Check value counts of SEX
    ancestry_key["SEX"].value_counts(dropna=False)
    
    print(f"Dropping {(~ancestry_key['nba']==1).sum()} samples that are not genotyped")
    print(f"Dropping {(ancestry_key['wgs']==1).sum()} samples that have whole genome sequencing available")
    ancestry_key = ancestry_key[(ancestry_key["nba"]==1) & (ancestry_key["wgs"]==0) ]
    
    ## Make covariate file
    df = pd.merge(pcs, ancestry_key, on='IID', how="inner")
    
    print(f"Dropping {ancestry_key.shape[0] - df.shape[0]} samples with no pcs")
    ## Drop lines with missing pheno
    print(f"Dropping {(df['PHENO'].isna()).sum()} samples with missing pheno")
    
    df = df[df['PHENO'].notna()]
    
    df["PHENO"].value_counts(dropna=False)
    
    df["#FID"] = 0
    final_df = df[['#FID','IID', 'SEX', 'AGE', 'PHENO', 'AAO', 'AAD', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']].copy()
    final_df.groupby(['PHENO'])['SEX'].value_counts()
    
    ## Make file of sample IDs to keep 
    samples_toKeep = final_df[['#FID', 'IID']].copy()
    
    samples_toKeep.to_csv(f'{WORK_DIR}/{ancestry}.samplestoKeep', sep = '\t', index=False, header=None)
    
    ## Save your covariate file
    final_df.to_csv(f'{WORK_DIR}/{ancestry}_covariate_file.txt', sep = '\t', index=False, na_rep= "NA", header=True)
    
    cohort_stats.append((ancestry,(final_df["PHENO"] == 2).sum(), (final_df["PHENO"] == 1).sum()))
cohort_stats


In [4]:
# Save the per-ancestry case/control counts as a tab-delimited table.
counts_path = f'/home/jupyter/{GENE}_results/haplotype_files/case_control_cts_imputed.txt'
table_lines = ["Ancestry\tCases\tControls\n"]
table_lines.extend(
    f"{label}\t{n_cases}\t{n_controls}\n"
    for label, n_cases, n_controls in cohort_stats
)
with open(counts_path, 'w') as f:
    f.writelines(table_lines)

### Make Phenotype Files

In [5]:
# Derive a phenotype file from each covariate file by keeping columns 1, 2 and 5
# (#FID, IID and PHENO in the column order the covariate file is written with).
for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
    ! cut -f 1,2,5 {WORK_DIR}/{ancestry}_covariate_file.txt > {WORK_DIR}/{ancestry}_pheno.txt

### Select the SORL1 region (±100 kb), remove related individuals and restrict to cases and controls

In [6]:
# For every ancestry: extract chr11:121352314-121733763 from the imputed pfile,
# keep only the samples listed in <ancestry>.samplestoKeep, attach the PHENO
# column from the phenotype file and write the result as a bed/bim/fam fileset.
for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
    ! mkdir -p {WORK_DIR}
    ! /home/jupyter/tools/plink2 \
    --pfile {DATA_DIR}/gp2_tier2_eu_release10/imputed_genotypes/{ancestry}/{CHROM}_{ancestry}_release10_vwb \
    --chr 11 \
    --from-bp 121352314 \
    --to-bp 121733763 \
    --keep {WORK_DIR}/{ancestry}.samplestoKeep \
    --pheno {WORK_DIR}/{ancestry}_pheno.txt \
    --pheno-name PHENO \
    --make-bed \
    --silent \
    --out {WORK_DIR}/{ancestry}_boundaries_Hap

In [None]:
# Keep only variants whose two alleles (columns 5 and 6 of the .bim) are each a
# single A/C/G/T base, i.e. drop indels and any other allele coding. The IDs of
# the surviving variants (column 2) are written to a list and passed to --extract.
for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
    ! awk 'length($5)==1 && length($6)==1 && $5 ~ /^[ACGT]$/ && $6 ~ /^[ACGT]$/' \
   {WORK_DIR}/{ancestry}_boundaries_Hap.bim | cut -f2 > {WORK_DIR}/{ancestry}_biallelic_snps.txt
    ! /home/jupyter/tools/plink2 --bfile {WORK_DIR}/{ancestry}_boundaries_Hap \
      --extract {WORK_DIR}/{ancestry}_biallelic_snps.txt \
      --make-bed \
      --out {WORK_DIR}/{ancestry}_biallelic_Hap

### Quality Control

In [None]:
# Variant QC on the single-base variant set: apply a minor allele frequency
# threshold of 0.05 (--maf) and a per-variant missingness threshold of 0.05 (--geno).
for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
    ! /home/jupyter/tools/plink2 \
    --bfile {WORK_DIR}/{ancestry}_biallelic_Hap \
    --maf 0.05 \
    --geno 0.05 \
    --make-bed \
    --out {WORK_DIR}/{ancestry}_biallelic_Hap_filtered

### Create VCF files for phasing

In [11]:
# Export the QC-filtered fileset as VCF (the `vcf-iid` modifier of --recode);
# the resulting .vcf is the input of the Beagle phasing step below.
for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
    ! /home/jupyter/tools/plink \
    --bfile {WORK_DIR}/{ancestry}_biallelic_Hap_filtered \
    --silent \
    --recode vcf-iid \
    --out {WORK_DIR}/{ancestry}_biallelic_Hap_filtered

### Phasing Data

In [None]:
# Download the Beagle jar (build 28Jun21.220) and store it as beagle.jar in the
# shared haplotype_files folder, where the phasing cells expect it.
! wget -O /home/jupyter/{GENE}_results/haplotype_files/beagle.jar https://faculty.washington.edu/browning/beagle/beagle.28Jun21.220.jar

In [None]:
# Run Beagle on each ancestry's VCF with a Java heap limit of 8 GB (-Xmx8g).
# Output uses the prefix <ancestry>_biallelic_Hap_filtered_phased; the conversion
# cell further down reads it back as <prefix>.vcf.gz.
for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
    ! java -Xmx8g -jar /home/jupyter/{GENE}_results/haplotype_files/beagle.jar \
      gt={WORK_DIR}/{ancestry}_biallelic_Hap_filtered.vcf \
      out={WORK_DIR}/{ancestry}_biallelic_Hap_filtered_phased

### Create new pheno file with double-ids

In [13]:
# The phased VCFs are loaded with --double-id, so the phenotype file needs its
# #FID column set to the IID to match the sample IDs of the new .fam file.
for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
    pheno_in = f'{WORK_DIR}/{ancestry}_pheno.txt'
    pheno_out = f'{WORK_DIR}/{ancestry}_pheno_doubleid.txt'
    pheno = pd.read_csv(pheno_in, sep="\t")
    # assign() on an existing column replaces its values and keeps its position
    pheno = pheno.assign(**{"#FID": pheno["IID"]})
    pheno.to_csv(pheno_out, sep="\t", index=False)
    

### Convert phased file to PLINK 1.9 format

In [14]:
# Convert each phased VCF back to a bed/bim/fam fileset. --double-id is used for
# the sample IDs, and PHENO is re-attached from the double-id phenotype file
# written in the previous cell.
for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
    ! /home/jupyter/tools/plink \
    --vcf {WORK_DIR}/{ancestry}_biallelic_Hap_filtered_phased.vcf.gz \
    --double-id \
    --pheno {WORK_DIR}/{ancestry}_pheno_doubleid.txt \
    --pheno-name PHENO \
    --allow-no-sex \
    --silent \
    --make-bed \
    --out {WORK_DIR}/{ancestry}_biallelic_Hap_filtered_phased

### Create necessary files (.ped and .info) for Haploview input

In [15]:
# Export each phased fileset with --recodeHV to produce the Haploview input
# files, using the prefix <ancestry>_biallelic_<GENE>_haploview_imputed.
for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
    ! /home/jupyter/tools/plink \
    --bfile {WORK_DIR}/{ancestry}_biallelic_Hap_filtered_phased \
    --recodeHV \
    --silent \
    --allow-no-sex \
    --out {WORK_DIR}/{ancestry}_biallelic_{GENE}_haploview_imputed

### Run the analysis in a GUI environment in Haploview

In [None]:
# Download Haploview.jar into the current working directory (no -O target is given).
! wget https://www.broadinstitute.org/ftp/pub/mpg/haploview/Haploview.jar

### Clean the results

In [None]:
import pandas as pd
import os

ancestries = ['EUR', 'AFR', 'AMR', 'EAS', 'SAS', 'AAC', 'AJ', 'CAH', 'CAS', 'MDE', 'FIN']
output_dir = "GP2/SORL1_haploview_and_det_imputed"
os.makedirs(output_dir, exist_ok=True)

# The case/control frequency column is called "CaseControl_Frequencies" or
# "Case_Control_Frequencies" after header clean-up, depending on whether the raw
# header has a space after the comma. Both spellings are accepted.
FREQ_COLUMN_NAMES = ('CaseControl_Frequencies', 'Case_Control_Frequencies')


def _to_float(text):
    """Convert one cell to float; return NaN when it cannot be parsed."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return float('nan')


def parse_haploview_results(lines):
    """Split the lines of a Haploview results file into header and data rows.

    Parameters
    ----------
    lines : iterable of str
        Lines of the file; the first one is the tab-separated header.

    Returns
    -------
    (header_cols, rows)
        `header_cols` is the header without a leading "Block" column.
        `rows` holds one list per data line: the most recent "Block ..." line
        followed by the tab-separated fields. Lines whose field count differs
        from the header are skipped.
    """
    line_iter = iter(lines)
    header_cols = next(line_iter, "").strip().split('\t')
    if header_cols[0].lower() == "block":
        header_cols = header_cols[1:]

    rows = []
    current_block = None
    for raw_line in line_iter:
        line = raw_line.strip()
        # A line starting with "Block " opens a new block; it carries no data itself
        if line.startswith("Block "):
            current_block = line
            continue
        fields = line.split('\t')
        if len(fields) == len(header_cols):
            rows.append([current_block] + fields)
    return header_cols, rows


def clean_haploview_results(header_cols, rows):
    """Build the cleaned table from the output of `parse_haploview_results`.

    Header names are normalised (spaces to underscores, dots and commas removed),
    Freq / Chi_Square / P_Value are converted to numbers, and the case/control
    frequency column is split into numeric CaseFreq and ControlFreq columns.
    Cells that cannot be parsed become NaN instead of raising.

    Returns a DataFrame restricted to the columns Block, Haplotype, Freq,
    Chi_Square, P_Value, CaseFreq and ControlFreq (those that are present).
    """
    columns = ['Block'] + [
        col.strip().replace(' ', '_').replace('.', '').replace(',', '')
        for col in header_cols
    ]
    table = pd.DataFrame(rows, columns=columns)

    for col in ('Freq', 'Chi_Square', 'P_Value'):
        if col in table.columns:
            table[col] = pd.to_numeric(table[col], errors='coerce')

    freq_col = next((name for name in FREQ_COLUMN_NAMES if name in table.columns), None)
    if freq_col is not None:
        # n=1 plus reindex guarantees exactly two parts per row, so one bad cell
        # cannot break the assignment for the whole table
        parts = (
            table[freq_col]
            .str.strip()
            .str.split(',', n=1, expand=True)
            .reindex(columns=[0, 1])
        )
        table['CaseFreq'] = parts[0].map(_to_float)
        table['ControlFreq'] = parts[1].map(_to_float)

    # Selecting the columns to keep also removes the raw ratio/frequency columns
    cols_to_keep = ['Block', 'Haplotype', 'Freq', 'Chi_Square', 'P_Value', 'CaseFreq', 'ControlFreq']
    return table[[col for col in cols_to_keep if col in table.columns]]


for anc in ancestries:
    file_path = os.path.join(output_dir, f"{anc}_IM_RESULTS")

    if not os.path.exists(file_path):
        print(f"❌ File not found: {file_path}")
        continue

    print(f"✅ Processing {file_path} ...")

    with open(file_path, 'r') as f:
        header_cols, data = parse_haploview_results(f)

    if not data:
        print(f"⚠️ No data found inside {file_path}")
        continue

    df = clean_haploview_results(header_cols, data)

    output_path = os.path.join(output_dir, f"{anc}_IM_cleaned.tsv")
    df.to_csv(output_path, sep='\t', index=False)


### Separate significant results

In [None]:
import pandas as pd
import os

ancestries = ['EUR', 'AFR', 'AMR', 'EAS', 'SAS', 'AAC', 'AJ', 'CAH', 'CAS', 'MDE', 'FIN']
input_dir = "GP2/SORL1_haploview_and_det_imputed"


def add_direction(table):
    """Return a copy of `table` with a 'Direction' column appended.

    The label is '↑ in cases' where CaseFreq > ControlFreq, '↑ in controls'
    where CaseFreq < ControlFreq and 'no difference' otherwise (equal values or
    a missing frequency). The comparison is vectorised, so it also works on a
    table with no rows.
    """
    labelled = table.copy()
    direction = pd.Series('no difference', index=labelled.index, dtype=object)
    direction[labelled['CaseFreq'] > labelled['ControlFreq']] = '↑ in cases'
    direction[labelled['CaseFreq'] < labelled['ControlFreq']] = '↑ in controls'
    labelled['Direction'] = direction
    return labelled


for anc in ancestries:
    input_file = os.path.join(input_dir, f"{anc}_IM_cleaned.tsv")

    if not os.path.exists(input_file):
        print(f"File not found: {input_file}")
        continue

    df = add_direction(pd.read_csv(input_file, sep='\t'))

    # Rows with a missing P_Value compare as False and are therefore excluded
    sig_df = df[df['P_Value'] < 0.05].reset_index(drop=True)

    output_file = os.path.join(input_dir, f"{anc}_IM_significant.tsv")
    sig_df.to_csv(output_file, sep='\t', index=False)

### Visualize data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
from matplotlib.backends.backend_pdf import PdfPages


# For every ancestry with significant haplotypes: compute the case/control fold
# change and its log2, save the table, and draw a one-column heatmap of log2FC
# that is written both as a PNG and as a page of a combined PDF.
warnings.filterwarnings("ignore", category=FutureWarning)

ancestries = ['EUR', 'AFR', 'AMR', 'EAS', 'SAS', 'AAC', 'AJ', 'CAH', 'CAS', 'MDE', 'FIN']
input_dir = "GP2/SORL1_haploview_and_det_imputed"
os.makedirs(input_dir, exist_ok=True)

pdf_output = os.path.join(input_dir, "all_IM_log2fc_heatmaps.pdf")

with PdfPages(pdf_output) as pdf:
    for anc in ancestries:
        input_file = os.path.join(input_dir, f"{anc}_IM_significant.tsv")
        
        if not os.path.exists(input_file):
            print(f"File not found: {input_file}")
            continue

        df = pd.read_csv(input_file, sep='\t')
        
        if df.empty:
            print(f"No significant results for {anc.upper()}")
            continue

        # A ControlFreq of 0 yields an infinite fold change; a CaseFreq of 0
        # yields log2FC = -inf. Both are kept and drawn separately below.
        df['FoldChange'] = df['CaseFreq'] / df['ControlFreq']
        with np.errstate(divide='ignore', invalid='ignore'):
            df['log2FC'] = np.log2(df['FoldChange'])

        df = df.sort_values(by='FoldChange', ascending=False).reset_index(drop=True)

        output_table = os.path.join(input_dir, f"{anc}_IM_significant_with_foldchange.tsv")
        df.to_csv(output_table, sep='\t', index=False)
        print(f"Saved TSV: {output_table}")

        # One row per haplotype, single log2FC column; infinities become NaN in
        # the copy that is handed to seaborn.
        heatmap_df = df.set_index('Haplotype')[['log2FC']].copy()
        finite_vals = heatmap_df.replace([np.inf, -np.inf], np.nan)

        # Skip only when there are neither finite nor infinite values to show
        if finite_vals.dropna().empty and not any(np.isinf(heatmap_df['log2FC'])):
            print(f"Skipped heatmap for {anc.upper()} — nothing to plot")
            continue

        # Column-shaped boolean mask marking the infinite cells
        mask = heatmap_df['log2FC'].isin([np.inf, -np.inf]).values.reshape(-1, 1)

        # Figure width depends on the longest haplotype label, height on the row count
        max_hap_len = max(len(h) for h in heatmap_df.index)
        fig_width = 10 if max_hap_len < 40 else 12 if max_hap_len < 80 else 14
        fig_height = max(3.5, len(heatmap_df) * 0.45)

        fig, ax = plt.subplots(figsize=(fig_width, fig_height))

        sns.heatmap(
            finite_vals,
            annot=True,
            fmt=".2f",
            cmap='coolwarm',
            center=0,
            cbar_kws={'label': 'log2(CaseFreq / ControlFreq)', 'shrink': 0.6},
            mask=mask,
            annot_kws={"color": "black"},
            ax=ax
        )

        # Draw the masked infinite cells by hand: red for +inf, blue for -inf,
        # both labelled with the same '∞' symbol.
        for i, (hap, val) in enumerate(heatmap_df['log2FC'].items()):
            if val == np.inf or val == -np.inf:
                color = 'red' if val == np.inf else 'blue'
                ax.add_patch(plt.Rectangle((0, i), 1, 1, fill=True, color=color))
                ax.text(0.5, i + 0.5, '∞', va='center', ha='center', color='black', fontsize=10, fontweight='bold')

        ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=6)
        ax.set_xticklabels(ax.get_xticklabels(), fontsize=6)
        ax.set_title(f'Log2 Fold Change (Case vs Control): {anc.upper()}', fontsize=10)
        ax.set_ylabel('Haplotype', fontsize=8)
        ax.set_xlabel("")  

        # Restyle the colorbar attached to the first collection on the axes
        colorbar = ax.collections[0].colorbar
        colorbar.ax.tick_params(labelsize=6)
        colorbar.set_label('log2(CaseFreq / ControlFreq)', fontsize=7)

        plt.subplots_adjust(left=0.42, right=0.92, top=0.88, bottom=0.08)

        heatmap_image = os.path.join(input_dir, f"{anc}_IM_log2fc_heatmap.png")
        plt.savefig(heatmap_image, dpi=300, bbox_inches='tight')
        print(f"Saved PNG: {heatmap_image}")

        pdf.savefig(fig, bbox_inches='tight')
        # Close the figure so figures do not accumulate across ancestries
        plt.close()
        print(f"Added to PDF: {anc.upper()}")



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
from matplotlib.backends.backend_pdf import PdfPages

# Same fold-change computation as the previous cell, restricted to haplotypes
# with log2FC > 0 (more frequent in cases). Saves one TSV and one PNG per
# ancestry plus a combined PDF.
warnings.filterwarnings("ignore", category=FutureWarning)

input_dir = "GP2/SORL1_haploview_and_det_imputed"
os.makedirs(input_dir, exist_ok=True)

# Page order of the combined PDF
ordered_ancestries = ['EUR', 'AFR', 'AMR', 'AAC', 'AJ', 'CAS', 'EAS', 'SAS', 'MDE', 'CAH','FIN']

pdf_output = os.path.join(input_dir, "all_IM_case_enriched_heatmaps.pdf")

with PdfPages(pdf_output) as pdf:
    for anc in ordered_ancestries:
        input_file = os.path.join(input_dir, f"{anc}_IM_significant.tsv")

        if not os.path.exists(input_file):
            print(f"File not found: {input_file}")
            continue

        df = pd.read_csv(input_file, sep='\t')

        if df.empty:
            print(f"No significant results for {anc.upper()}")
            continue

        df['FoldChange'] = df['CaseFreq'] / df['ControlFreq']
        with np.errstate(divide='ignore', invalid='ignore'):
            df['log2FC'] = np.log2(df['FoldChange'])

        # log2FC > 0 also keeps +inf (ControlFreq of 0); NaN and -inf rows are excluded
        df_case_enriched = df[df['log2FC'] > 0].copy()

        if df_case_enriched.empty:
            print(f"No case-enriched haplotypes for {anc.upper()}")
            continue

        df_case_enriched = df_case_enriched.sort_values(by='log2FC', ascending=False).reset_index(drop=True)

        tsv_file = os.path.join(input_dir, f"{anc}_IM_case_enriched.tsv")
        df_case_enriched.to_csv(tsv_file, sep='\t', index=False)
        print(f"Saved TSV: {tsv_file}")

        # +inf cells are replaced by NaN for seaborn and masked; they are drawn by hand below
        heatmap_df = df_case_enriched.set_index('Haplotype')[['log2FC']].copy()
        finite_vals = heatmap_df.replace([np.inf], np.nan)
        mask = heatmap_df['log2FC'].isin([np.inf]).values.reshape(-1, 1)

        # Figure width depends on the longest haplotype label, height on the row count
        max_hap_len = max(len(h) for h in heatmap_df.index)
        fig_width = 10 if max_hap_len < 40 else 12 if max_hap_len < 80 else 14
        fig_height = max(2, len(heatmap_df) * 0.5)

        # This cell uses the pyplot current-figure interface: the heatmap is
        # drawn on the figure created here and later plt.* calls act on it.
        plt.figure(figsize=(fig_width, fig_height))
        ax = sns.heatmap(
            finite_vals,
            annot=True,
            fmt=".2f",
            cmap='Reds',
            cbar_kws={'label': 'log2(CaseFreq / ControlFreq)'},
            mask=mask,
            annot_kws={"color": "black"}
        )

        # Fill the masked +inf cells in dark red and label them '+inf'
        for i, (hap, val) in enumerate(heatmap_df['log2FC'].items()):
            if val == np.inf:
                plt.gca().add_patch(plt.Rectangle((0, i), 1, 1, fill=True, color='darkred'))
                plt.text(0.5, i + 0.5, '+inf', va='center', ha='center', color='black')

        plt.yticks(rotation=0, fontsize=7)
        plt.title(f'Case-Enriched Haplotypes: {anc.upper()}')
        plt.ylabel('Haplotype')

        # Use smaller fonts when there are fewer than four rows;
        # ax.figure.axes[-1] is the last axes of the figure (the colorbar added by seaborn).
        if len(heatmap_df) < 4:
            ax.figure.axes[-1].yaxis.label.set_size(8)
            plt.xlabel('', fontsize=8)
            plt.xticks(fontsize=8)
        else:
            plt.xlabel('')
            plt.xticks(fontsize=10)

        plt.subplots_adjust(left=0.45, right=0.95, top=0.85, bottom=0.10)

        png_file = os.path.join(input_dir, f"{anc}_IM_case_enriched_heatmap.png")
        plt.savefig(png_file)
        print(f"Saved PNG: {png_file}")

        # Both calls act on the current figure
        pdf.savefig()
        plt.close()
        print(f"Added to PDF: {anc.upper()}")

## Create Haplotype Blocks

In [None]:
# Input for this section: {WORK_DIR}/{ancestry}_boundaries_Hap (.bed/.bim/.fam),
# the per-ancestry fileset written by the region-extraction step above.
# (The bare path that used to be in this cell is not valid Python and stopped "Run All".)

In [None]:
# Filter the extracted region for the haplotype-block analysis with the same
# thresholds as the QC step (--maf 0.05, --geno 0.05). This starts from
# <ancestry>_boundaries_Hap, i.e. without the single-base allele filter.
for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
    ! /home/jupyter/tools/plink \
    --bfile {WORK_DIR}/{ancestry}_boundaries_Hap \
    --maf 0.05 \
    --geno 0.05 \
    --make-bed \
    --out {WORK_DIR}/{ancestry}_filtered_Hap_blocks

In [None]:
# Split each filtered fileset into a cases-only fileset (--filter-cases) and a
# controls-only fileset (--filter-controls).

for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
    # cases
    ! /home/jupyter/tools/plink \
    --bfile {WORK_DIR}/{ancestry}_filtered_Hap_blocks \
    --make-bed \
    --filter-cases \
    --out {WORK_DIR}/{ancestry}_filtered_Hap_blocks_cases
        
    # controls
    ! /home/jupyter/tools/plink \
    --bfile {WORK_DIR}/{ancestry}_filtered_Hap_blocks \
    --make-bed \
    --filter-controls \
    --out {WORK_DIR}/{ancestry}_filtered_Hap_blocks_controls

In [19]:
# Export the cases-only and controls-only filesets as VCF (`--recode vcf-iid`)
# so that each group can be phased with Beagle in the next cell.
for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
    
    # cases
    ! /home/jupyter/tools/plink \
    --bfile {WORK_DIR}/{ancestry}_filtered_Hap_blocks_cases \
    --silent \
    --recode vcf-iid \
    --out {WORK_DIR}/{ancestry}_filtered_Hap_blocks_cases
        
    # controls
    ! /home/jupyter/tools/plink \
    --bfile {WORK_DIR}/{ancestry}_filtered_Hap_blocks_controls \
    --silent \
    --recode vcf-iid \
    --out {WORK_DIR}/{ancestry}_filtered_Hap_blocks_controls

In [None]:
# Phase cases and controls separately with the Beagle jar downloaded earlier
# (8 GB Java heap each). The next cell reads the outputs as <prefix>.vcf.gz.
for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
    
    # cases
    ! java -Xmx8g -jar /home/jupyter/{GENE}_results/haplotype_files/beagle.jar \
      gt={WORK_DIR}/{ancestry}_filtered_Hap_blocks_cases.vcf \
      out={WORK_DIR}/{ancestry}_filtered_Hap_blocks_cases_phased

    # controls
    ! java -Xmx8g -jar /home/jupyter/{GENE}_results/haplotype_files/beagle.jar \
      gt={WORK_DIR}/{ancestry}_filtered_Hap_blocks_controls.vcf \
      out={WORK_DIR}/{ancestry}_filtered_Hap_blocks_controls_phased

In [22]:
# Convert the phased case and control VCFs back to bed/bim/fam filesets.
# --double-id is used for the sample IDs and PHENO is re-attached from the
# double-id phenotype file created earlier in the notebook.

for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'

    # cases
    ! /home/jupyter/tools/plink \
    --vcf {WORK_DIR}/{ancestry}_filtered_Hap_blocks_cases_phased.vcf.gz \
    --double-id \
    --pheno {WORK_DIR}/{ancestry}_pheno_doubleid.txt \
    --pheno-name PHENO \
    --allow-no-sex \
    --silent \
    --make-bed \
    --out {WORK_DIR}/{ancestry}_filtered_Hap_blocks_cases_phased
        
    # controls
    ! /home/jupyter/tools/plink \
    --vcf {WORK_DIR}/{ancestry}_filtered_Hap_blocks_controls_phased.vcf.gz \
    --double-id \
    --pheno {WORK_DIR}/{ancestry}_pheno_doubleid.txt \
    --pheno-name PHENO \
    --allow-no-sex \
    --silent \
    --make-bed \
    --out {WORK_DIR}/{ancestry}_filtered_Hap_blocks_controls_phased

### Get haplotype blocks

In [None]:
# Estimate haplotype blocks in the phased cases-only fileset of each ancestry.
# `no-pheno-req` is passed to --blocks; output prefix is
# <ancestry>_cases_phased_blocks_imputed.
for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
    ! /home/jupyter/tools/plink \
    --bfile {WORK_DIR}/{ancestry}_filtered_Hap_blocks_cases_phased \
    --blocks no-pheno-req \
    --out {WORK_DIR}/{ancestry}_cases_phased_blocks_imputed

In [None]:
# Estimate haplotype blocks in the phased controls-only fileset of each ancestry.
# `no-pheno-req` is passed to --blocks; output prefix is
# <ancestry>_controls_phased_blocks_imputed.
for ancestry in ancestry_list:
    CHROM = 'chr11'
    GENE = 'SORL1'
    WORK_DIR =  f'/home/jupyter/{GENE}_results/haplotype_files/{ancestry}_IMP'
    ! /home/jupyter/tools/plink \
    --bfile {WORK_DIR}/{ancestry}_filtered_Hap_blocks_controls_phased \
    --blocks no-pheno-req \
    --out {WORK_DIR}/{ancestry}_controls_phased_blocks_imputed