In [None]:
# 1. Install Required Tools (GCTA)
# GCTA is required for conditional analysis (COJO)
import os

# Create tools directory
!mkdir -p ./tools

# Check if GCTA is installed
if not os.path.exists("./tools/gcta64"):
    print("Installing GCTA...")
    # Updated URL for version 1.94.4
    gcta_url = "https://yanglab.westlake.edu.cn/software/gcta/bin/gcta-1.94.4-linux-kernel-3-x86_64.zip"
    zip_name = "gcta-1.94.4-linux-kernel-3-x86_64.zip"
    folder_name = "gcta-1.94.4-linux-kernel-3-x86_64"
    
    # Download
    !wget -O ./tools/{zip_name} {gcta_url}
    
    # Verify download and install
    if os.path.exists(f"./tools/{zip_name}"):
        !unzip -o ./tools/{zip_name} -d ./tools/
        # Move binary to main tools folder
        if os.path.exists(f"./tools/{folder_name}/gcta64"):
            !mv ./tools/{folder_name}/gcta64 ./tools/
            !chmod +x ./tools/gcta64
            # Clean up
            !rm -rf ./tools/{folder_name}
            !rm ./tools/{zip_name}
            print("GCTA installed successfully.")
        else:
             print(f"Error: gcta64 binary not found in {folder_name}")
    else:
        print("Error: GCTA download failed. Please check the URL.")
else:
    print("GCTA already installed.")

GCTA_PATH = os.path.abspath("./tools/gcta64")
print(f"GCTA Path: {GCTA_PATH}")

In [None]:
# 3. Define Paths and Constants
# NOTE(review): `bucket` is not assigned in this cell; it must already be defined
# in the kernel before this cell runs — confirm which earlier cell sets it.

# Inputs
# GWAS summary-statistics Hail Table (opened with hl.read_table when loading the GWAS results)
GWAS_RESULTS_PATH = "gs://fc-aou-datasets-controlled/AllxAll/v1/ht/ACAF/EUR/phenotype_NS_326.1_ACAF_results.ht"
# WGS Hail MatrixTable used as the genotype source for the per-locus PLINK export
WGS_MT_PATH = "gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/acaf_threshold/splitMT/hail.mt"
# Tab-separated ancestry predictions; the `ancestry_pred` and `research_id` columns are read from it
ANCESTRY_PREDS_PATH = "gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv"

# Outputs
# Destination of the locus table produced by the clumping step.
# NOTE(review): the uploaded file is a tab-separated table with a header row
# (written with to_csv), not a headerless BED file, despite the .bed extension.
LOCI_OUTPUT_PATH = f"{bucket}/results/phase2_defined_loci.bed"
# Not referenced by the cells visible here; presumably the destination for
# fine-mapping outputs — confirm.
FINE_MAPPING_DIR = f"{bucket}/results/fine_mapping"

# Constants
# Genome-wide significance threshold applied to the GWAS `p_value` field
P_THRESHOLD = 5e-8
MHC_INTERVAL = "chr6:25000000-35000000" # Broad MHC exclusion as per Analysis_README
FLANK_WINDOW = 500000 # 500kb flank on each side of a lead SNP for initial locus definition (1Mb total width)

In [None]:
# 4. Load EUR Sample IDs (for LD Reference)
print("Loading EUR sample IDs...")
!gsutil -u $GOOGLE_PROJECT cp {ANCESTRY_PREDS_PATH} ./ancestry_temp.tsv
ancestry_df = pd.read_csv("./ancestry_temp.tsv", sep="\t")
eur_ids = set(ancestry_df[ancestry_df['ancestry_pred'] == 'eur']['research_id'].astype(str))
!rm ./ancestry_temp.tsv

print(f"Identified {len(eur_ids)} European samples.")

In [None]:
# 5. Load GWAS Summary Statistics and Filter
# Produces `sig_ht`: rows of the GWAS table with p_value below P_THRESHOLD
# whose locus lies outside MHC_INTERVAL.
print("Loading GWAS summary statistics...")
gwas_ht = hl.read_table(GWAS_RESULTS_PATH)

# Keep genome-wide significant hits only
sig_ht = gwas_ht.filter(gwas_ht.p_value < P_THRESHOLD)

# Drop everything that falls inside the MHC interval
print(f"Excluding MHC region: {MHC_INTERVAL}")
mhc_interval = hl.parse_locus_interval(MHC_INTERVAL)
sig_ht = sig_ht.filter(mhc_interval.contains(sig_ht.locus), keep=False)

# Report how many hits remain (this triggers evaluation of the filters above)
n_sig = sig_ht.count()
print(f"Number of significant SNPs outside MHC: {n_sig}")

In [None]:
# 6. Define Loci (Distance-based Clumping)
# We perform a greedy distance-based clumping to define the initial windows for fine-mapping.

# Collect significant hits to Pandas
df_sig = sig_ht.select('p_value').to_pandas()
df_sig['chrom'] = df_sig['locus'].apply(lambda x: x.contig)
df_sig['pos'] = df_sig['locus'].apply(lambda x: x.position)
df_sig = df_sig.sort_values('p_value')

defined_loci = []

print("Performing distance-based clumping...")
while not df_sig.empty:
    # Take top SNP
    lead_snp = df_sig.iloc[0]
    
    # Define window
    chrom = lead_snp['chrom']
    start = max(1, lead_snp['pos'] - FLANK_WINDOW)
    end = lead_snp['pos'] + FLANK_WINDOW
    
    defined_loci.append({
        'chrom': chrom,
        'start': start,
        'end': end,
        'lead_snp_pos': lead_snp['pos'],
        'lead_snp_p': lead_snp['p_value']
    })
    
    # Remove all SNPs within this window from the pool
    df_sig = df_sig[~((df_sig['chrom'] == chrom) & 
                      (df_sig['pos'] >= start) & 
                      (df_sig['pos'] <= end))]

loci_df = pd.DataFrame(defined_loci)
print(f"Defined {len(loci_df)} independent loci.")

# Save rough loci definitions
loci_df.to_csv("defined_loci.tsv", sep="\t", index=False)
!gsutil cp defined_loci.tsv {LOCI_OUTPUT_PATH}
print(f"Initial loci definitions saved to {LOCI_OUTPUT_PATH}")
loci_df.head()

In [None]:
# 7. Helper Functions for Fine-Mapping Prep

def extract_locus_plink(chrom, start, end, output_prefix, eur_ids_set):
    """
    Extract WGS genotypes for one locus, restricted to the given samples,
    and export them in PLINK format (.bed/.bim/.fam) for GCTA.

    Reads the matrix table at WGS_MT_PATH, keeps the rows inside the closed
    window [start, end] on `chrom` and the columns whose sample ID `s` is in
    `eur_ids_set`, then writes the result with `hl.export_plink`.

    Args:
        chrom: Contig name as used by the matrix table (e.g. "chr1").
        start: First position of the window (inclusive). Values below 1 are
            raised to 1.
        end: Last position of the window (inclusive). Values beyond the end of
            the contig are lowered to the contig length.
        output_prefix: Path prefix passed to `hl.export_plink`.
        eur_ids_set: Set of sample-ID strings to keep.
    """
    # Load WGS MT
    mt = hl.read_matrix_table(WGS_MT_PATH)

    # Parse the interval against the matrix table's own reference genome
    # instead of relying on whatever the session default happens to be.
    rg = mt.locus.dtype.reference_genome

    # Clamp the window to valid coordinates: a lead SNP near a contig end
    # yields lead + flank, which may lie past the last base of the contig.
    lo = max(1, int(start))
    hi = int(end)
    contig_length = rg.lengths.get(chrom)
    if contig_length is not None:
        hi = min(hi, contig_length)
    # An unknown contig is left unclamped so that parsing reports the error.

    # Square brackets make both bounds inclusive. A bare "chrom:start-end"
    # string is right-exclusive in Hail, which would drop position `end` even
    # though the locus windows are defined with inclusive bounds.
    interval = hl.parse_locus_interval(f"[{chrom}:{lo}-{hi}]", reference_genome=rg)

    # Filter to interval AND European samples
    mt_locus = hl.filter_intervals(mt, [interval])
    mt_locus = mt_locus.filter_cols(hl.literal(eur_ids_set).contains(mt_locus.s))

    # Export to PLINK
    # Note: GCTA requires .bed/.bim/.fam
    hl.export_plink(mt_locus, output_prefix, ind_id=mt_locus.s, fam_id=mt_locus.s)

def run_gcta_cojo(bfile, sumstats_file, out_prefix):
    """
    Run GCTA-COJO stepwise selection (--cojo-slct) to identify independent signals.

    Args:
        bfile: PLINK file prefix (without extension) used as the LD reference.
        sumstats_file: Summary-statistics file; it must match the GCTA format
            with columns SNP A1 A2 freq b se p N.
        out_prefix: Output prefix passed to GCTA via --out.

    Returns:
        int: Exit status of the GCTA process (0 on success), or 127 if the
        binary at GCTA_PATH could not be launched. A failure is reported on
        stdout but does not raise, so a per-locus loop can carry on.
    """
    import subprocess  # local import keeps this helper self-contained

    # Pass arguments as a list (no shell) so paths containing spaces or shell
    # metacharacters reach GCTA unchanged.
    args = [
        str(GCTA_PATH),
        "--bfile", str(bfile),
        "--cojo-file", str(sumstats_file),
        "--cojo-slct",
        "--out", str(out_prefix),
    ]
    print(f"Running: {' '.join(args)}")

    try:
        completed = subprocess.run(args, check=False)
    except OSError as exc:
        # Binary missing or not executable; 127 mirrors the shell's "command not found".
        print(f"Error: could not launch GCTA at {GCTA_PATH}: {exc}")
        return 127

    if completed.returncode != 0:
        print(f"Warning: GCTA exited with status {completed.returncode}")
    return completed.returncode

In [None]:
# 8. Run Fine-Mapping (GCTA-COJO) per Locus
# This loop iterates over 'defined_loci', extracts EUR genotypes, and runs GCTA-COJO.

# Create local directory for temporary files
!mkdir -p ./temp_finemap

for i, row in loci_df.iterrows():
    locus_id = f"{row['chrom']}_{row['start']}_{row['end']}"
    print(f"\n--- Processing Locus {i+1}/{len(loci_df)}: {locus_id} ---")
    
    local_prefix = f"./temp_finemap/{locus_id}"
    
    # 1. Extract Genotypes (LD Reference)
    # Checks if PLINK files already exist to avoid re-extracting
    if not os.path.exists(f"{local_prefix}.bed"):
        print("Extracting genotypes...")
        extract_locus_plink(row['chrom'], row['start'], row['end'], local_prefix, eur_ids)
    else:
        print("Genotypes already extracted.")

    # 2. Prepare Sumstats for GCTA
    # We need to export the summary stats for this specific region to a format GCTA accepts
    # This requires filtering the main Hail Table again or subsetting a pandas DF
    # (Simplified placeholder logic below - requires mapping Hail fields to GCTA columns)
    
    # 3. Run GCTA-COJO
    # run_gcta_cojo(local_prefix, f"{local_prefix}.ma", f"{local_prefix}_cojo")
    
    # 4. (Optional) SuSiE would be run here using the LD matrix from the PLINK files
    
    # Cleanup large PLINK files if space is an issue
    # !rm {local_prefix}.bed {local_prefix}.bim {local_prefix}.fam

print("Fine-mapping preparation complete.")