In [None]:
# Date of creation: 2024-10-18
# Author: David Yang
# Purpose: Run Allele Stacker 



In [1]:
# Set up environment

import os
import sys


# Project root directory; every output path quoted in the cells below sits under it.
base_dir = "/gs/gsfs0/shared-lab/greally-lab/David/simple_allele-stacker"
# Directory holding the numbered batch scripts. Later cells expand
# $bash_script_path inside their `!sbatch` commands, so the name must not change.
bash_script_path = f"{base_dir}/bash"

#### STEP 1: Generate segmentation data and prepare outputs for downstream analysis 
- Generate input consensus regions 

In [None]:
# Step 1a: Segment the samples using the following parameters
# - min CpGs = 5
# - max gap = 500bp
# NOTE(review): neither parameter is passed on the command line below, so both
# are presumably set inside 1a_segment_samples.sh -- confirm there.
# Needs `bash_script_path` from the setup cell. sbatch only queues the job, so
# the outputs exist only once the submitted job has finished.


!sbatch $bash_script_path/1a_segment_samples.sh    

# Outputs in: /gs/gsfs0/shared-lab/greally-lab/David/simple_allele-stacker/outputs/segmentation_regions

Submitted batch job 7841642


In [8]:
# Step 1b: Extract the regions by segmentation label
# Needs `bash_script_path` from the setup cell.
# NOTE(review): presumably reads the Step 1a segmentation outputs, so the
# Step 1a job should have completed first -- confirm in the script.
!sbatch $bash_script_path/1b_extract-regions.sh

# Outputs in: /gs/gsfs0/shared-lab/greally-lab/David/simple_allele-stacker/outputs/segmentation_regions/regions_by_label


Submitted batch job 7841667


In [3]:
# Step 1c: Extract the consensus regions
# Data and summary graphs are generated in the consensus_regions directory.
# Needs `bash_script_path` from the setup cell.
!sbatch $bash_script_path/1c_consensus_regions.sh

# Outputs in: /gs/gsfs0/shared-lab/greally-lab/David/simple_allele-stacker/outputs/consensus_regions
# (grouped by parameter settings)


Submitted batch job 7875224


#### STEP 2: Filter consensus regions based on methylated regions

In [37]:
# Step 2a: Filter the consensus regions
# The same script is submitted twice, with H1 and then H2 as its only
# positional argument (presumably the two haplotype labels -- confirm in
# 2a_filter-consensus.sh). The two jobs are queued independently.
# Needs `bash_script_path` from the setup cell.
!sbatch $bash_script_path/2a_filter-consensus.sh H1
!sbatch $bash_script_path/2a_filter-consensus.sh H2

# Outputs in: /gs/gsfs0/shared-lab/greally-lab/David/simple_allele-stacker/outputs/filtered_consensus_regions

Submitted batch job 7861550
Submitted batch job 7861551


#### STEP 3: Stack the alleles for IGV viewing


In [6]:
# Step 3a: Generate allele stacks for IGV of the entire cohort
# Needs `bash_script_path` from the setup cell.
!sbatch $bash_script_path/3a_IGV_all-samples.sh

# Outputs in: /gs/gsfs0/shared-lab/greally-lab/David/simple_allele-stacker/outputs/allele_stacks/

Submitted batch job 7843451


In [5]:
# Step 3b: Create BED files for IGV viewing for the filtered consensus regions
# Needs `bash_script_path` from the setup cell.
# NOTE(review): presumably consumes the Step 2a filtered consensus regions --
# confirm in 3b_consensus-IGV.sh before running out of order.
!sbatch $bash_script_path/3b_consensus-IGV.sh

# Outputs in: /gs/gsfs0/shared-lab/greally-lab/David/simple_allele-stacker/outputs/allele_stacks/filtered_consensus

Submitted batch job 7842985


In [3]:
# Step 3c: Create BED files for IGV viewing each sample's H1 and H2 segments
# Needs `bash_script_path` from the setup cell.
!sbatch $bash_script_path/3c_IGV-segmentation.sh

# Outputs in: /gs/gsfs0/shared-lab/greally-lab/David/simple_allele-stacker/outputs/allele_stacks/all_samples/sample_segmentation_haplotypes

Submitted batch job 7898717


In [None]:
# Map Variants prototype (stand-alone script; the runner cell below invokes it as variant_mapper.py):

#!/usr/bin/env python3

import argparse
from pysam import VariantFile
import pandas as pd
from collections import defaultdict
import os
from pathlib import Path

class SampleVariantMapper:
    """Map VCF variants onto BED regions annotated with per-sample methylation calls.

    For every region in the BED file the mapper fetches the overlapping VCF
    records and writes two kinds of TSV output under ``output_dir``:

    * ``sample_analysis/<sample>_analysis.tsv`` -- one row per region in which
      that sample carries at least one non-reference allele.
    * ``region_summary.tsv`` -- one row per region listing every variant in it.
    """

    def __init__(self, vcf_path, bed_path, output_dir):
        """Open the VCF and make sure the output directories exist.

        Args:
            vcf_path: Path handed to ``VariantFile``. ``fetch`` is used later,
                so the file presumably has to be indexed -- confirm.
            bed_path: Path to the tab-separated region file read by ``parse_bed``.
            output_dir: Directory that receives all output files.
        """
        self.vcf = VariantFile(vcf_path)
        self.bed_path = bed_path
        self.output_dir = Path(output_dir)

        # Create output directories if they don't exist
        self.sample_dir = self.output_dir / 'sample_analysis'
        self.sample_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _split_samples(field):
        """Turn a comma-separated sample field into a set ('.' or empty -> empty set)."""
        field = field.strip()
        if field in ('', '.'):
            return set()
        return {name.strip() for name in field.split(',') if name.strip()}

    @staticmethod
    def _carries_alt(gt):
        """Return True when a GT tuple holds at least one called non-reference allele.

        Missing alleles (``None``, as in ``./.``) are ignored rather than summed,
        so partially or fully missing genotypes never raise.
        """
        return bool(gt) and any(allele is not None and allele > 0 for allele in gt)

    @staticmethod
    def _format_genotype(gt):
        """Render a GT tuple of any ploidy as e.g. ``0|1``; missing alleles become '.'.

        The '|' separator is kept for every genotype for backward compatibility;
        it does not assert that the call is phased.
        """
        return '|'.join('.' if allele is None else str(allele) for allele in gt)

    def parse_bed(self):
        """Parse the BED file with methylation status.

        Columns are read by position: 1-3 are chrom/start/end, column 7
        (index 6) lists the unmethylated samples and column 8 (index 7) the
        methylated samples, comma-separated, with '.' meaning none.
        NOTE(review): the 6/7 column order is taken from the original code --
        confirm it against the file that produces this BED.

        Blank lines and '#' comment lines are skipped. The first remaining line
        is treated as a header only if its start/end columns are not integers,
        so a header-less file no longer loses its first region.

        Returns:
            list[dict]: one dict per region with keys ``chrom``, ``start``,
            ``end``, ``region_id``, ``methylated_samples`` and
            ``unmethylated_samples`` (the last two are sets).

        Raises:
            ValueError: if a data line has fewer than 8 columns or a
                non-integer start/end.
        """
        regions = []
        header_allowed = True  # only the first content line may be a header
        with open(self.bed_path, 'r') as f:
            for line_no, raw in enumerate(f, start=1):
                line = raw.rstrip('\r\n')
                if not line.strip() or line.startswith('#'):
                    continue

                fields = line.split('\t')
                try:
                    start, end = int(fields[1]), int(fields[2])
                except (IndexError, ValueError):
                    if header_allowed:
                        header_allowed = False
                        continue
                    raise ValueError(
                        f"{self.bed_path}: line {line_no}: expected integer start/end in columns 2-3"
                    )
                header_allowed = False

                if len(fields) < 8:
                    raise ValueError(
                        f"{self.bed_path}: line {line_no}: expected at least 8 tab-separated "
                        f"columns, found {len(fields)}"
                    )

                regions.append({
                    'chrom': fields[0],
                    'start': start,
                    'end': end,
                    'region_id': f"{fields[0]}:{fields[1]}-{fields[2]}",
                    'methylated_samples': self._split_samples(fields[7]),
                    'unmethylated_samples': self._split_samples(fields[6])
                })
        return regions

    def get_variant_info(self, variant):
        """Extract relevant variant information.

        Args:
            variant: A VCF record exposing ``id``, ``chrom``, ``pos``, ``ref``
                and ``alts``.

        Returns:
            dict with ``id`` (falls back to ``chrom:pos`` when the record has
            no ID), ``pos``, ``ref``, ``alt`` (all ALT alleles comma-joined,
            '.' when there are none) and ``type`` (classification of the FIRST
            ALT allele only, 'OTHER' when there are none).
        """
        # alts is None for records without an ALT allele; guard before indexing.
        alts = variant.alts or ()
        return {
            'id': variant.id if variant.id else f"{variant.chrom}:{variant.pos}",
            'pos': variant.pos,
            'ref': variant.ref,
            'alt': ','.join(str(a) for a in alts) if alts else '.',
            'type': self.get_variant_type(variant.ref, alts[0]) if alts else 'OTHER'
        }

    def get_variant_type(self, ref, alt):
        """Determine variant type and size from the REF and ALT strings.

        Returns:
            "SNV", "DEL:<n>bp", "INS:<n>bp" or "OTHER". "OTHER" covers
            equal-length multi-base substitutions as well as alleles whose
            string length says nothing about the event: missing, '*',
            symbolic (``<DEL>``) and breakend notation.
        """
        # Length arithmetic is meaningless for non-sequence alleles, e.g.
        # "<DEL>" would otherwise be reported as a 4 bp insertion.
        if not alt or alt in ('.', '*') or alt.startswith('<') or '[' in alt or ']' in alt:
            return "OTHER"

        ref_len = len(ref)
        alt_len = len(alt)

        if ref_len == alt_len == 1:
            return "SNV"
        elif ref_len > alt_len:
            return f"DEL:{ref_len - alt_len}bp"
        elif alt_len > ref_len:
            return f"INS:{alt_len - ref_len}bp"
        return "OTHER"

    def process_sample(self, sample_id, regions):
        """Process all regions for a specific sample.

        Regions in which the sample is listed neither as methylated nor as
        unmethylated are skipped; a sample listed in both sets is reported as
        methylated. Only regions where the sample carries at least one
        non-reference allele produce a row.

        Args:
            sample_id: Sample name as it appears in the BED sample columns and
                in the VCF.
            regions: Region dicts as returned by ``parse_bed``.

        Returns:
            list[dict]: rows with ``region_id``, ``chrom``, ``start``, ``end``,
            ``methylation_status``, ``region_variants``, ``genotypes`` and
            ``group_size`` (number of samples sharing the methylation status).
        """
        sample_data = []

        for region in regions:
            # Determine methylation status
            if sample_id in region['methylated_samples']:
                meth_status = 'methylated'
                group_size = len(region['methylated_samples'])
            elif sample_id in region['unmethylated_samples']:
                meth_status = 'unmethylated'
                group_size = len(region['unmethylated_samples'])
            else:
                continue

            variants_in_region = []
            genotypes = []

            # Get variants in region
            for variant in self.vcf.fetch(region['chrom'], region['start'], region['end']):
                if sample_id not in variant.samples:
                    continue
                gt = variant.samples[sample_id]['GT']
                if self._carries_alt(gt):  # Has at least one alt allele
                    var_info = self.get_variant_info(variant)
                    variants_in_region.append(
                        f"{var_info['id']}:{var_info['pos']}:{var_info['ref']}>{var_info['alt']}"
                    )
                    genotypes.append(self._format_genotype(gt))

            if variants_in_region:
                sample_data.append({
                    'region_id': region['region_id'],
                    'chrom': region['chrom'],
                    'start': region['start'],
                    'end': region['end'],
                    'methylation_status': meth_status,
                    'region_variants': ','.join(variants_in_region),
                    'genotypes': ','.join(genotypes),
                    'group_size': group_size
                })

        return sample_data

    def process_regions(self, regions):
        """Create summary of all regions.

        Returns:
            list[dict]: one row per region (including regions without
            variants) with ``region_id``, ``chrom``, ``start``, ``end``,
            ``total_variants``, the sorted sample lists and
            ``variant_details``.

        NOTE(review): the value written after ``AF=`` is the fraction of VCF
        samples carrying at least one ALT allele (a carrier fraction), not an
        allele frequency. The label is kept so existing outputs stay
        comparable; it is ``nan`` when the VCF has no samples.
        """
        region_data = []

        for region in regions:
            variants_in_region = []

            for variant in self.vcf.fetch(region['chrom'], region['start'], region['end']):
                var_info = self.get_variant_info(variant)
                samples = variant.samples
                n_samples = len(samples)
                carriers = sum(1 for name in samples if self._carries_alt(samples[name]['GT']))
                af = carriers / n_samples if n_samples else float('nan')
                variants_in_region.append(f"{var_info['id']}:{var_info['pos']}:{var_info['ref']}>{var_info['alt']}:AF={af:.3f}")

            region_data.append({
                'region_id': region['region_id'],
                'chrom': region['chrom'],
                'start': region['start'],
                'end': region['end'],
                'total_variants': len(variants_in_region),
                # Sorted so the output does not depend on set iteration order.
                'methylated_samples': ','.join(sorted(region['methylated_samples'])),
                'unmethylated_samples': ','.join(sorted(region['unmethylated_samples'])),
                'variant_details': ','.join(variants_in_region)
            })

        return region_data

    def write_outputs(self, sample_data_dict, region_data):
        """Write all output files.

        Args:
            sample_data_dict: Mapping of sample id -> rows from
                ``process_sample``; samples with no rows get no file.
            region_data: Rows from ``process_regions``, written to
                ``region_summary.tsv`` in ``output_dir``.
        """
        # Write individual sample files
        for sample_id, sample_data in sample_data_dict.items():
            if sample_data:  # Only write if sample has data
                df = pd.DataFrame(sample_data)
                output_file = self.sample_dir / f"{sample_id}_analysis.tsv"
                df.to_csv(output_file, sep='\t', index=False)

        # Write region summary
        region_df = pd.DataFrame(region_data)
        region_df.to_csv(self.output_dir / "region_summary.tsv", sep='\t', index=False)

    def run(self):
        """Main execution method: parse the BED, process samples and regions, write TSVs."""
        regions = self.parse_bed()

        # Collect every sample mentioned in any region
        all_samples = set()
        for region in regions:
            all_samples.update(region['methylated_samples'])
            all_samples.update(region['unmethylated_samples'])

        # Process each sample (sorted for a reproducible processing order)
        sample_data_dict = {}
        for sample_id in sorted(all_samples):
            sample_data_dict[sample_id] = self.process_sample(sample_id, regions)

        # Process regions
        region_data = self.process_regions(regions)

        # Write all outputs
        self.write_outputs(sample_data_dict, region_data)

def main(argv=None):
    """Command-line entry point: parse the arguments and run the mapper.

    Args:
        argv: Optional list of argument strings. When None (the default)
            argparse reads sys.argv[1:], which is the original behaviour.
            Passing a list lets the function be called directly, e.g.
            ``main(['--vcf', v, '--bed', b, '--outdir', o])`` from a notebook
            cell, where sys.argv does not hold these options.

    Raises:
        SystemExit: raised by argparse when a required option is missing.
    """
    parser = argparse.ArgumentParser(description='Map variants to regions with methylation status')
    parser.add_argument('--vcf', required=True, help='Input VCF file path')
    parser.add_argument('--bed', required=True, help='Input BED file path')
    parser.add_argument('--outdir', required=True, help='Output directory path')

    args = parser.parse_args(argv)

    mapper = SampleVariantMapper(args.vcf, args.bed, args.outdir)
    mapper.run()

if __name__ == '__main__':
    main()

In [None]:
#!/bin/bash
# Run the variant-mapper prototype on one VCF/BED pair.
# NOTE(review): this is shell, not Python -- save it as a .sh file, or run it
# in the notebook under a `%%bash` cell magic.

# Stop on the first failing command, unset variable, or failed pipeline stage.
set -euo pipefail

# Set input/output paths (placeholders -- replace before running)
VCF_PATH="/path/to/variants.vcf.gz"
BED_PATH="/path/to/methylation.bed"
OUTPUT_DIR="/path/to/output"

# Run the variant mapper. Expansions are quoted so that paths containing
# spaces or glob characters reach the script as single arguments.
python3 variant_mapper.py \
    --vcf "${VCF_PATH}" \
    --bed "${BED_PATH}" \
    --outdir "${OUTPUT_DIR}"

In [None]:
# TODO: generate figure of number of loci per sample
# map variants and characterize them