In [54]:
import pandas as pd
import numpy as np
import allel
import moments.LD
import scipy.interpolate

### Preprocessing of 1000G VCF files
1. Apply strict mask
    - Some sites are not reliably mapped in hg38 and need to be excluded.
    - These regions include: repeat regions, pseudogenes, centromeric, and telomeric regions.
2. Select only biallelic SNP sites
    - A VCF can contain mult-allelic SNPs and structure variants (indels, tandem repeats, etc)
    - The biallelic SNP sites are the easiest to work with, and most studies focus on them. 



This process can take quite a long time, I saved the processed VCFs under ~/projects/ctb-sgravel/data/30x1000G_biallelic_strict_masked <br>
The example preprocessing script is provided below and also in the directory's README.md.

In [None]:
%%sh
### in bash, preprocess using bcftools
### replace variable path
chr22_vcf_path='/home/alouette/projects/ctb-sgravel/data/1000G/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.recalibrated_variants.vcf.gz'
strict_mask_1000G='/home/alouette/projects/ctb-sgravel/data/1000G/masks_and_annotations/20160622.allChr.mask.bed'
IntegerT=6
module load bcftools
bcftools --version | head -1
### -Oz output compressed VCF format
### -m2 -M2 -v to only view biallellic SNPs
### -R subset VCF from region bed file
### --threads parallelization currently only works for compression
bcftools view -i 'FILTER=="PASS"' -Oz -m2 -M2 -v snps -R ${strict_mask_1000G} ${chr22_vcf_path} --threads ${IntegerT} -o "chr22.strict_masked.vcf.gz"

bcftools manual: http://samtools.github.io/bcftools/bcftools.html#expressions

### Optional: covert the VCF to zarr format for faster retrieval in scikit-allele
In sgkit, there might be better approaches. <br>
I also completed this step and stored the output in ~/projects/ctb-sgravel/data/30x1000G_biallelic_strict_masked/zarrFormat

### Population information retrieval
1000G stored its population information in a txt file: ~/projects/ctb-sgravel/data/1000G/population_info/20130606_g1k_3202_samples_ped_population.txt <br>
This file includes family ID, sample ID, parent ID, sex, population, superpopulation. <br>
There are some family trio in 1000G data. I currently exclude the individuals information if any of their parents are also in 1000G. <br>


### Formatting the recombination map for pipeline
Several recombination maps exist for the human population. Each is computed using specific populations and with specific techniques.<br>
Two major formats exist: HapMap format and PLINK format. <br>
The main difference is how recombination rate is stored:
- HapMap: Chromosome, Position(bp), Rate(cM/Mb)
- PLINK: Chromosome, Cumulative Map(cM), Position

Recombination rate starts at 0 at the first measured SNP position, instead of starting at the first chromosome position. <br>
The current ~/projects/ctb-sgravel/data/genetic_maps contains many maps in Hg37 format. <br>
I used the HapMapII map in ~/projects/ctb-sgravel/data/genetic_maps/HapMapII_GRCh38. But transformed it into cumulative Map.

### Minimal example pipeline: EDAR region in chr2:107,500,000-110,000,000
EDAR is an identified sweep in the EAS population.

In [None]:
%%sh
chr2_vcf_path='/home/alouette/projects/ctb-sgravel/data/1000G/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr2.recalibrated_variants.vcf.gz'
strict_mask_1000G='/home/alouette/projects/ctb-sgravel/data/1000G/masks_and_annotations/20160622.allChr.mask.bed'
IntegerT=6
module load bcftools
### takes 6 min
bcftools view -i 'FILTER=="PASS"' -Oz -m2 -M2 -v snps -r chr2:107500000-110000000 ${chr2_vcf_path} --threads ${IntegerT} -o "chr2.EDAR_region.vcf.gz"
bcftools index --threads ${IntegerT} "chr2.EDAR_region.vcf.gz"
### takes 3 min
bcftools view -Oz -R ${strict_mask_1000G} "chr2.EDAR_region.vcf.gz" --threads ${IntegerT} -o "chr2.EDAR_region.strict_mask.vcf.gz"

In [45]:
def read_vcf(vcf_path:str) -> dict: 
    """
    Read in the VCF using scikit-allel, the genotype information is stored in a dict
    Optimization: could limit the fields read in, can also specify detailed format like float32
    """
    callset = allel.read_vcf(vcf_path)
    return callset

In [42]:
vcf_path = "chr2.EDAR_region.strict_mask.vcf.gz"
EDAR_callset = read_vcf(vcf_path)

In [43]:
EDAR_callset

{'samples': array(['HG00096', 'HG00097', 'HG00099', ..., 'NA21142', 'NA21143',
        'NA21144'], dtype=object),
 'calldata/GT': array([[[0, 0],
         [0, 0],
         [0, 0],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]],
 
        [[0, 0],
         [0, 0],
         [0, 0],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]],
 
        [[0, 0],
         [0, 0],
         [0, 0],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]],
 
        ...,
 
        [[0, 0],
         [0, 0],
         [0, 0],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]],
 
        [[0, 0],
         [0, 0],
         [0, 0],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]],
 
        [[0, 0],
         [0, 0],
         [0, 0],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]]], dtype=int8),
 'variants/ALT': array([['G', '', ''],
        ['T', '', ''],
        ['T', '', ''],
        ...,
        ['A', '', ''],
        [

In [44]:
len(EDAR_callset['variants/POS'])

58860

In [46]:
type(EDAR_callset['variants/POS'])

numpy.ndarray

In [55]:
def read_rec_map(rec_map_path: str, format = "HapMap") -> pd.DataFrame:
    """
    Read the recombination map, normalize between different formats.
    Currently set to read from HapMap format
    Covert the Rate(cM/MB) to cumulative Rate (cM)
    Return a pd.DataFrame
    """
    rec_map_df = pd.read_csv(rec_map_path, sep = "\t")
    rec_map_df.columns = ["Chr", "bp", "Rate_cM"]
    rec_map_df["cum_bp"] = rec_map_df.bp.cumsum()
    return rec_map_df

In [63]:
def bp_to_centimorgan(position_array: np.ndarray, rec_map_df: pd.DataFrame) -> np.ndarray:
    """
    Return a list of centimorgan map distance corresponding to each position bp distance
    """
    bp_to_cM_F = scipy.interpolate.interp1d(rec_map_df.bp, rec_map_df.Rate_cM, fill_value='extrapolate' )
    cM_array = bp_to_cM_F(position_array)
    return cM_array

In [56]:
rec_map_path = "/home/alouette/projects/ctb-sgravel/data/genetic_maps/HapMapII_GRCh38/genetic_map_Hg38_chr2.txt"
rec_map_df = read_rec_map(rec_map_path)
rec_map_df

Unnamed: 0,Chr,bp,Rate_cM
0,chr2,12994,0.339408
1,chr2,15491,0.336057
2,chr2,15672,0.308424
3,chr2,15703,0.273441
4,chr2,16111,0.232868
...,...,...,...
286388,chr2,242106373,0.122849
286389,chr2,242106609,0.133203
286390,chr2,242110180,0.136364
286391,chr2,242119359,0.174241


In [60]:
bp_to_cM_F = rec_map_interpolate(rec_map_df)

In [61]:
bp_to_centimorgan(EDAR_callset['variants/POS'], rec_map_df)

array([0.00525299, 0.0052575 , 0.005267  , ..., 2.73506194, 2.73505867,
       2.73505847])