In [26]:
import pandas as pd
import numpy as np
import allel
import moments.LD
import scipy.interpolate
import msprime as ms

### Preprocessing of 1000G VCF files
1. Apply strict mask
    - Some sites are not reliably mapped in hg38 and need to be excluded.
    - These regions include: repeat regions, pseudogenes, centromeric, and telomeric regions.
2. Select only biallelic SNP sites
    - A VCF can contain multi-allelic SNPs and structural variants (indels, tandem repeats, etc.).
    - The biallelic SNP sites are the easiest to work with, and most studies focus on them. 



This process can take quite a long time, so I saved the processed VCFs under ~/projects/ctb-sgravel/data/30x1000G_biallelic_strict_masked <br>
The example preprocessing script is provided below and also in the directory's README.md.

In [None]:
%%sh
### Preprocess one chromosome of the 1000G VCF with bcftools (run in bash).
### Replace the variable paths below with your own locations.
chr22_vcf_path='/home/alouette/projects/ctb-sgravel/data/1000G/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz'
strict_mask_1000G='/home/alouette/projects/ctb-sgravel/data/1000G/masks_and_annotations/20160622.allChr.mask.bed'
### Number of threads handed to bcftools through --threads
IntegerT=6
module load bcftools
bcftools --version | head -1
### -i 'FILTER=="PASS"'  keep only records whose FILTER column is PASS
### -Oz                  write the output as a compressed VCF
### -m2 -M2 -v snps      keep only biallelic SNPs (exactly 2 alleles, variant type snps)
### -R <bed>             keep only the regions listed in the BED file (the input VCF must be indexed)
### --threads            parallelization currently only works for compression
### Variable expansions are quoted so that paths containing spaces or glob characters stay intact.
bcftools view -i 'FILTER=="PASS"' -Oz -m2 -M2 -v snps -R "${strict_mask_1000G}" "${chr22_vcf_path}" --threads "${IntegerT}" -o "chr22.strict_masked.vcf.gz"

bcftools manual: http://samtools.github.io/bcftools/bcftools.html#expressions

### Optional: convert the VCF to zarr format for faster retrieval in scikit-allel
In sgkit, there might be better approaches. <br>
I also completed this step and stored the output in ~/projects/ctb-sgravel/data/30x1000G_biallelic_strict_masked/zarrFormat

### Population information retrieval
1000G stored its population information in a txt file: ~/projects/ctb-sgravel/data/1000G/population_info/20130606_g1k_3202_samples_ped_population.txt <br>
This file includes family ID, sample ID, parent ID, sex, population, superpopulation. <br>
There are some family trios in the 1000G data. I currently exclude an individual's information if either of their parents is also in 1000G. <br>


### Formatting the recombination map for pipeline
Several recombination maps exist for the human population. Each is computed using specific populations and with specific techniques.<br>
Two major formats exist: HapMap format and PLINK format. <br>
The main difference is how recombination rate is stored:
- HapMap: Chromosome, Position(bp), Rate(cM/Mb)
- PLINK: Chromosome, Cumulative Map(cM), Position

Recombination rate starts at 0 at the first measured SNP position, instead of starting at the first chromosome position. <br>
The current ~/projects/ctb-sgravel/data/genetic_maps contains many maps in Hg37 format. <br>
I used the HapMapII map in ~/projects/ctb-sgravel/data/genetic_maps/HapMapII_GRCh38, but transformed it into a cumulative map (cM).

### Minimal example pipeline: EDAR region in chr2:107,500,000-110,000,000
EDAR is an identified sweep in the EAS population.

In [None]:
%%sh
### Minimal example: cut the EDAR region out of chr2, then apply the strict mask.
chr2_vcf_path='/home/alouette/projects/ctb-sgravel/data/1000G/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr2.recalibrated_variants.vcf.gz'
strict_mask_1000G='/home/alouette/projects/ctb-sgravel/data/1000G/masks_and_annotations/20160622.allChr.mask.bed'
### Number of threads handed to bcftools through --threads
IntegerT=6
module load bcftools
### Step 1: keep PASS, biallelic SNPs inside chr2:107500000-110000000
### takes 6 min
bcftools view -i 'FILTER=="PASS"' -Oz -m2 -M2 -v snps -r chr2:107500000-110000000 "${chr2_vcf_path}" --threads "${IntegerT}" -o "chr2.EDAR_region.vcf.gz"
### Index the region VCF: the -R lookup in step 2 needs an indexed input
bcftools index --threads "${IntegerT}" "chr2.EDAR_region.vcf.gz"
### Step 2: keep only the sites that fall inside the strict-mask BED regions
### takes 3 min
### Variable expansions are quoted so that paths containing spaces or glob characters stay intact.
bcftools view -Oz -R "${strict_mask_1000G}" "chr2.EDAR_region.vcf.gz" --threads "${IntegerT}" -o "chr2.EDAR_region.strict_mask.vcf.gz"

In [2]:
def read_vcf(vcf_path:str) -> dict:
    """
    Load a VCF with scikit-allel and hand back the result of ``allel.read_vcf`` unchanged.

    The callset is a dict keyed by field name; the one loaded in this notebook
    contains entries such as 'samples', 'calldata/GT', 'variants/ALT' and 'variants/POS'.

    Optimization idea: restrict the fields that are read, or request narrower
    dtypes (e.g. float32), to lower the memory footprint.

    :param vcf_path: Path of the VCF file to read.
    """
    return allel.read_vcf(vcf_path)

In [3]:
vcf_path = "chr2.EDAR_region.strict_mask.vcf.gz"
EDAR_callset = read_vcf(vcf_path)

In [4]:
EDAR_callset

{'samples': array(['HG00096', 'HG00097', 'HG00099', ..., 'NA21142', 'NA21143',
        'NA21144'], dtype=object),
 'calldata/GT': array([[[0, 0],
         [0, 0],
         [0, 0],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]],
 
        [[0, 0],
         [0, 0],
         [0, 0],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]],
 
        [[0, 0],
         [0, 0],
         [0, 0],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]],
 
        ...,
 
        [[0, 0],
         [0, 0],
         [0, 0],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]],
 
        [[0, 0],
         [0, 0],
         [0, 0],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]],
 
        [[0, 0],
         [0, 0],
         [0, 0],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]]], dtype=int8),
 'variants/ALT': array([['G', '', ''],
        ['T', '', ''],
        ['T', '', ''],
        ...,
        ['A', '', ''],
        [

In [44]:
len(EDAR_callset['variants/POS'])

58860

In [46]:
type(EDAR_callset['variants/POS'])

numpy.ndarray

In [8]:
pos_array = EDAR_callset['variants/POS']
snp_array = EDAR_callset['calldata/GT']

In [35]:
def _read_rec_map(rec_map_path: str) -> pd.DataFrame:
    """
    Read the recombination map, normalize between different formats.
    Currently set to read from HapMap format, with a one row header
    Covert the Rate(cM/MB) to cumulative Rate (cM)
    Return a pd.DataFrame
    """
    rec_map_df = pd.read_csv(rec_map_path, sep = "\t", names = ["Chr", "bp", "Rate_cM"], skiprows = 1)
    rec_map_df["diff_bp"] = rec_map_df.bp.diff()[1:].reset_index(drop = True)
    rec_map_df.diff_bp = rec_map_df.diff_bp.astype("Int32")
    rec_map_df["cM"] = rec_map_df.diff_bp * rec_map_df.Rate_cM / 1e6
    rec_map_df["cum_cM"] = rec_map_df.cM.cumsum()
    return rec_map_df

In [44]:
def _msprime_read_HapMap(rec_map_path: str) -> "ms.RateMap":
    """
    Read a HapMap-format recombination map into an msprime RateMap.

    Requires msprime >= 1.0.0.
    The position is taken from column 1 and the rate from column 2 of the file.
    In the returned map every interval has an inclusive left and an exclusive right end.
    The RateMap offers helpers such as get_cumulative_mass() and find_index().

    :param rec_map_path: Path of the HapMap-format recombination map.
    :return: The ms.RateMap built by ms.RateMap.read_hapmap.
    """
    return ms.RateMap.read_hapmap(rec_map_path, position_col=1, rate_col=2)

In [74]:
def _bp_to_cM(ms_RateMap: ms.RateMap) -> np.ndarray:
    """
    Return the function transforming bp to cM.
    """
    bp_to_cM_F = scipy.interpolate.interp1d(ms_RateMap.left, np.nancumsum(rec_map_df.mass)*100, fill_value='extrapolate' )
    return bp_to_cM_F

In [75]:
def _cM_to_bp(ms_RateMap: ms.RateMap) -> np.ndarray:
    """
    Return the function transforming cM to bp.
    """
    cM_to_bp_F = scipy.interpolate.interp1d(np.nancumsum(rec_map_df.mass)*100, ms_RateMap.left, fill_value='extrapolate' )
    return cM_to_bp_F

In [71]:
### _bp_to_cM returns the bp -> cM interpolator, so bind it to the name the next cell calls
bp_to_cM_F = _bp_to_cM(rec_map_df)

In [73]:
bp_to_cM_F(52147)

array(0.00334915)

In [68]:
rec_map_path = "/home/alouette/projects/ctb-sgravel/data/genetic_maps/HapMapII_GRCh38/genetic_map_Hg38_chr10.txt"
rec_map_df2 = _read_rec_map(rec_map_path)
rec_map_df2

Unnamed: 0,Chr,bp,Rate_cM,diff_bp,cM,cum_cM
0,chr10,26829,0.124482,21403,0.002664,0.002664
1,chr10,48232,0.160359,254,0.000041,0.002705
2,chr10,48486,0.158828,1523,0.000242,0.002947
3,chr10,50009,0.158845,2138,0.00034,0.003287
4,chr10,52147,0.158922,394,0.000063,0.003349
...,...,...,...,...,...,...
180750,chr10,133633304,3.279045,1163,0.003814,178.181552
180751,chr10,133634467,1.324084,127371,0.16865,178.350202
180752,chr10,133761838,5.906078,164,0.000969,178.35117
180753,chr10,133762002,3.358621,4366,0.014664,178.365834


In [53]:
rec_map_df.mass

array([           nan, 2.66429381e-05, 4.07311860e-07, ...,
       1.68649945e-03, 9.68596792e-06, 1.46637393e-04])

In [33]:
rec_map_msprime.get_cumulative_mass(133766368)

1.783658341121365

In [60]:
bp_to_cM_F = rec_map_interpolate(rec_map_df)

In [61]:
bp_to_centimorgan(EDAR_callset['variants/POS'], rec_map_df)

array([0.00525299, 0.0052575 , 0.005267  , ..., 2.73506194, 2.73505867,
       2.73505847])

### sgkit window function inspiration 

sgkit has 3 windowing functions: by variant, by position, and by genome.

In [None]:
### sgkit master windowing function (excerpt)
    n_variants = ds.sizes["variants"]
    n_contigs = num_contigs(ds)
    contig_ids = np.arange(n_contigs)
    variant_contig = ds["variant_contig"]
    contig_starts = np.searchsorted(variant_contig.values, contig_ids)
    contig_bounds = np.append(contig_starts, [n_variants], axis=0)  # type: ignore[no-untyped-call]

    contig_window_contigs = []
    contig_window_starts = []
    contig_window_stops = []
    for i, contig in enumerate(get_contigs(ds)):
        starts, stops = windowing_fn(
            contig, contig_bounds[i], contig_bounds[i + 1], *args, **kwargs
        )
        contig_window_starts.append(starts)
        contig_window_stops.append(stops)
        contig_window_contigs.append(np.full_like(starts, i))

    window_contigs = np.concatenate(contig_window_contigs)  # type: ignore[no-untyped-call]
    window_starts = np.concatenate(contig_window_starts)  # type: ignore[no-untyped-call]
    window_stops = np.concatenate(contig_window_stops)  # type: ignore[no-untyped-call]

### sgkit window function, window_by_position
if step is not None and window_start_position is not None:
        raise ValueError("Only one of step or window_start_position may be specified")
    step = step or size
    positions = ds[variant_position].values
    window_start_positions = (
        ds[window_start_position].values if window_start_position is not None else None
    )
    return _window_per_contig(
        ds,
        variant_contig,
        merge,
        _get_windows_by_position,
        size,
        step,
        offset,
        positions,
        window_start_positions,
    )

In [3]:
def _windowing(snp_array: list, window_index_array: list):
    """
    Master window function, based on the input index, give back the subset windows in a list
    This function is the core function and used for other detailed window_by_* functions.
    :param snp_array: The SNP array read by scikit-allel format.
    :param window_index_array: The list of tuple (start, end) index of each windows of the SNP array.
    """
    window_list = [snp_array[index_tuple[0]:index_tuple[1]] for index_tuple in window_index_array]
    return window_list

In [None]:
def scikit_allel_slice_window(pos_tuple_array, pos_array, snp_array):
    """
    Cut the SNP array into windows given as (start, end) genomic positions.

    pos_array must hold, at each index, the position of the SNP stored at the same index of snp_array.
    Each window is located with scikit-allel's SortedIndex.locate_range and wrapped in a GenotypeArray;
    this might change when moving to the sgkit package.

    :param pos_tuple_array: Iterable of (start, end) positions, one pair per window.
    :param pos_array: Positions of the SNPs, passed to allel.SortedIndex.
    :param snp_array: Genotype calls, sliced along the first axis.
    :return: A list of GenotypeArray windows ready to be parsed by moments.
    """
    sorted_index_pos_array = allel.SortedIndex(pos_array)
    ### NOTE: a leftover line that read gt_zarr[loc_region] was removed here; neither
    ### name exists in this function, so it raised NameError, and its result was unused
    pos_slice_list = [sorted_index_pos_array.locate_range(pos_tuple[0], pos_tuple[1]) for pos_tuple in pos_tuple_array]
    snp_slice_list = [allel.GenotypeArray(snp_array[pos_slice]) for pos_slice in pos_slice_list]
    return snp_slice_list
    

In [None]:
def window_by_reombination(snp_array: list, rec_map: "ms.RateMap", rec_start: float, rec_end: float, rec_step: float = 0.04, rec_to_pos_func: scipy.interpolate.interp1d = None) -> list:
    """
    Split the SNP array into windows separated by a fixed recombination distance.

    Require msprime >= 1.0.0 to use RateMap.
    Only the single-window case is implemented so far: when the requested range
    is not longer than one step, the SNP array is returned as it is.

    :param snp_array: The SNP array as read by scikit-allel.
    :param rec_map: The msprime RateMap of the region (not used yet).
    :param rec_start: Start of the range, in recombination distance.
    :param rec_end: End of the range, has to be larger than rec_start.
    :param rec_step: Width of one window, in recombination distance.
    :param rec_to_pos_func: Function converting recombination distance to position (not used yet).
    :return: The original snp_array when the range fits in one step.
    :raises NotImplementedError: when the range spans more than one step.
    """
    ### end has to be larger than start
    assert rec_start < rec_end
    ### if given rec distance is smaller or equal to the step
    ### return the original array without subsetting
    if rec_end - rec_start <= rec_step:
        return snp_array
    ### TODO: convert the rec windows into distance windows
    ### TODO: get index of positions; this needs the SNP positions, which are not an argument yet
    ### TODO: call _windowing(snp_array, window_index_list) with the resulting (start, end) indices
    raise NotImplementedError(
        "window_by_reombination cannot split into several windows yet: "
        "the SNP positions needed to turn recombination windows into array indices are not available"
    )


### correctly spelled alias; the original name is kept so existing calls keep working
window_by_recombination = window_by_reombination
    
    

In [None]:
def window_by_position(rec_start: float, rec_end: float, rec_to_pos_func: scipy.interpolate.interp1d, rec_step: float = 0.04) -> tuple:
    """
    Call this function just to get list of position in corresponding windows.

    The range [rec_start, rec_end] is cut into consecutive windows of rec_step;
    if the range is not a whole number of steps, a last shorter window ends at
    rec_end so the whole range is covered. Every window edge is converted with
    rec_to_pos_func.

    :param rec_start: Start of the range, in recombination distance.
    :param rec_end: End of the range, has to be larger than rec_start.
    :param rec_to_pos_func: Function converting one recombination distance to a position.
    :param rec_step: Width of one window, in recombination distance; has to be positive.
    :return: The start positions and the end positions of the windows, in two lists.
    :raises ValueError: if rec_step is not positive.
    """
    ### end has to be larger than start
    assert rec_start < rec_end
    if rec_step <= 0:
        raise ValueError("rec_step must be positive")
    span = rec_end - rec_start
    if span <= rec_step:
        return [rec_to_pos_func(rec_start)], [rec_to_pos_func(rec_end)]
    ### true division plus a small tolerance: float floor division can drop a
    ### whole window (2.0 // 0.04 gives 49.0 although 50 steps fit)
    window_counts = int(span / rec_step + 1e-9)
    edges = [rec_start + k * rec_step for k in range(window_counts + 1)]
    if rec_end - edges[-1] > 1e-9 * rec_step:
        ### leftover shorter than one step: close it with a final partial window
        edges.append(rec_end)
    else:
        ### snap the last edge onto rec_end to drop accumulated rounding error
        edges[-1] = rec_end
    start_positions = [rec_to_pos_func(edge) for edge in edges[:-1]]
    end_positions = [rec_to_pos_func(edge) for edge in edges[1:]]
    return start_positions, end_positions
    
    

In [None]:
def window_by_counts(rec_start: float, rec_end: float, rec_to_pos_func: scipy.interpolate.interp1d, rec_step: float = 0.04) -> tuple:
    """
    Call this function just to get list of position in corresponding windows.
    Return the start position and end position in two lists.

    Only the single-window case is implemented so far: when the requested range
    is not longer than one step, its two ends are converted with rec_to_pos_func.

    NOTE(review): the name suggests windows holding a fixed number of SNPs, but the
    signature has no SNP-level input and the body was a copy of window_by_position.
    The intended behaviour has to be confirmed before the general case is written.

    :param rec_start: Start of the range, in recombination distance.
    :param rec_end: End of the range, has to be larger than rec_start.
    :param rec_to_pos_func: Function converting one recombination distance to a position.
    :param rec_step: Width of one window, in recombination distance.
    :return: ([start position], [end position]) when the range fits in one step.
    :raises NotImplementedError: when the range spans more than one step.
    """
    ### end has to be larger than start
    assert rec_start < rec_end
    if rec_end - rec_start <= rec_step:
        return [rec_to_pos_func(rec_start)], [rec_to_pos_func(rec_end)]
    ### TODO: call parsing functions once the windowing rule is decided
    raise NotImplementedError("window_by_counts does not handle ranges longer than rec_step yet")
    
    

In [None]:
def window_Dz(SNP_window):
    """
    Placeholder without an implementation: the body is only this docstring,
    so calling the function returns None.

    NOTE(review): presumably meant to compute the Dz statistic for one SNP
    window (moments.LD is imported by this notebook) -- TODO confirm.
    """
    

In [None]:
def subset_Dz_computation(snp_window, max_snp_per_subset = 5000):
    """
    Memory requirement is large when there are too many SNPs in a single calculation.
    Ideally the total number of SNPs should be less than 5,000.
    Subset the Dz calculation into chunks and return the combined computed values.

    Not implemented yet: the body is only this docstring, so calling the
    function currently returns None.
    """
    