# Script for starting the phasing analysis in Yale - Penn

Author: Jose Jaime Martinez-Magana

Day: 25 February 2023

This script was developed to start the phasing analysis with SHAPEIT2 in the Yale-Penn 2 cohort, for the empirical testing of Tractor-Mix.

In [None]:
# Interactive setup: request resources, activate the environment, create the
# directory layout used by the scripts in this notebook and copy the input VCFs.
# This analysis uses shapeit2; for detailed documentation follow this link:
# https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.html

# if your cluster uses slurm as job handler, request an interactive session with
# the next command; the remaining commands of this cell are typed inside it
srun --pty --mem=32G -p interactive bash

# load miniconda
module load miniconda
# activate the tractor_mix environment
conda activate tractor_mix

# move to your analysis directory
cd palmer_scratch/genomics/yalepenn/tractor_mix/

# move to the databases directory
cd databases/
# create a directory to store the genotype data, named genotype/
# -p turns mkdir into a no-op when the directory already exists, so this cell
# can be re-run safely
mkdir -p genotype/
cd genotype/

# create the directories for the original, filtered and phased data
mkdir -p original_data/ filtered/ phased/
cd original_data/

# make a copy of Yale Penn; the previously analyzed data is stored in
# /vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/nic_dep/genomic_data/vcfs/annot_rsids/yp2/
# -n (no-clobber) keeps files that are already present, so re-running this line
# does not overwrite your data
cp -n /vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/nic_dep/genomic_data/vcfs/annot_rsids/yp2/GWGO_GWCIDR.1kg_phase3_v5.chr* .
# this could take a while, depending on your sample size


# the helper directories belong inside scripts/ of the analysis directory
# (three levels above original_data/), so move there before creating them
mkdir -p ../../../scripts
cd ../../../scripts
mkdir -p environment local_anc phasing pheno
cd phasing
# the scripts from this notebook should be found in the phasing directory
# this part of the notebook was used for developing the next scripts and
# adjusting the input/output directory structure of the environment

In [None]:
# Filter the VCFs to keep only SNPs with imputation score R2 >= 0.8 and only the
# samples selected for Tractor-Mix, using bcftools.
# Save the text between "start script" and "end of script" as filter_samples.sh.
# The sample list holds one sampleID per line in the form FID_IID and must NOT
# have a header; the list used in this analysis is stored in
# /vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/sample_lists/tractor_mix_samples.list.forvcfs
# start script
#!/bin/bash
#SBATCH --job-name=yale_penn_generate_vcf_subset
#SBATCH --out="slurm-%j.out"
#SBATCH --time=20:00:00
#SBATCH --nodes=1 --ntasks=1 --cpus-per-task=10
#SBATCH --mem-per-cpu=90G
#SBATCH --mail-type=ALL
#SBATCH --partition=bigmem
####################################################################################
# script to stratify the Yale Penn 2 cohort based on our previous selection of samples
# for this analysis we used a threshold of higher than 50% of African Ancestry
# day: 26 February 2023
# analyzer: Jose Jaime Martinez-Magana - jjm262
# cluster: Grace - HPC Yale
####################################################################################
# This script uses the Yale Penn cohort imputed vcfs annotated with rsIDs
####################################################################################
# load conda
module load miniconda
# activate tractor_mix environment
conda activate tractor_mix
# set parameters
# input path for Yale Penn vcfs annotated with rsIDs
vcfip='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/genotype/original_data'
# sample filter
sm_f='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/sample_lists/tractor_mix_samples.list.forvcfs'
# set output path
vcfup='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/genotype/filtered'
# set path to bcftools from our environment
bcftools='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/scripts/environment/extdata/bcftools/bcftools/bcftools'
# use the CPUs granted by slurm; fall back to 1 when run outside of slurm
threads="${SLURM_CPUS_PER_TASK:-1}"
# running script, one autosome at a time
for chr in {1..22}; do
  in_vcf="${vcfip}/GWGO_GWCIDR.1kg_phase3_v5.chr${chr}.dose.rsids.vcf.gz"
  out_vcf="${vcfup}/GWGO_GWCIDR.1kg_phase3_v5.chr${chr}.dose.rsids.filtered.vcf.gz"
  # stop at the first failure: otherwise the job would finish "successfully"
  # with missing or truncated outputs that the next steps would consume
  "${bcftools}" view -i 'R2>=0.8' "${in_vcf}" \
    --output-file "${out_vcf}" \
    --samples-file "${sm_f}" \
    --output-type z \
    --threads "${threads}" \
    || { echo "bcftools view failed for chr${chr}" >&2; exit 1; }
done
# end of script
# run the previous script with the next command, for this your server should have a slurm handler
# if not run your script with your handler
sbatch filter_samples.sh
# notes: slurm-16051489.out failed because the samples file had a header; slurm-16051613.out ran successfully

In [None]:
# Script to remove duplicated records from the filtered vcf files, if needed.
# Save the text between "start script" and "end of script" as nodup_vcf.sh.
# start script
#!/bin/bash
#SBATCH --job-name=yale_penn_remove_dup_vcf
#SBATCH --out="slurm-%j.out"
#SBATCH --time=24:00:00
#SBATCH --nodes=1 --ntasks=1 --cpus-per-task=10
#SBATCH --mem-per-cpu=90G
#SBATCH --mail-type=ALL
#SBATCH --partition=bigmem
####################################################################################
# script to remove duplicates from the filtered Yale Penn 2 vcfs, based on our
# previous selection of samples
# for this analysis we used a threshold of higher than 50% of African Ancestry
# day: 26 February 2023
# analyzer: Jose Jaime Martinez-Magana - jjm262
# cluster: Grace - HPC Yale
####################################################################################
# This script uses the Yale Penn cohort imputed vcfs annotated with rsIDs
####################################################################################
# load conda
module load miniconda
# activate tractor_mix environment
conda activate tractor_mix
# set parameters
# path of the filtered Yale Penn 2 files (used for both input and output)
vcfip='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/genotype/filtered'
# set path to bcftools from our environment
bcftools='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/scripts/environment/extdata/bcftools/bcftools/bcftools'
# use the CPUs granted by slurm; fall back to 1 when run outside of slurm
threads="${SLURM_CPUS_PER_TASK:-1}"
# running script, one autosome at a time
for chr in {1..22}; do
  in_vcf="${vcfip}/GWGO_GWCIDR.1kg_phase3_v5.chr${chr}.dose.rsids.filtered.vcf.gz"
  out_vcf="${vcfip}/GWGO_GWCIDR.1kg_phase3_v5.chr${chr}.dose.rsids.filtered.nodup.vcf.gz"
  # stop at the first failure so the indexing/phasing steps never run on a
  # missing or truncated *.nodup.vcf.gz
  "${bcftools}" norm -d all "${in_vcf}" \
    --output "${out_vcf}" \
    --output-type z \
    --threads "${threads}" \
    || { echo "bcftools norm failed for chr${chr}" >&2; exit 1; }
done
# end of script
# we highly recommend removing duplicates from your data, but if you believe that your data does not require this step,
# you can skip it; in this notebook we continue with the duplicate-free files for further analysis
# run the previous script with the next command, for this your server should have a slurm handler
# if not run your script with your handler
sbatch nodup_vcf.sh

In [None]:
# Script to index the duplicate-free vcf files, if needed.
# Save the text between "start script" and "end of script" as index_sample_vcf.sh.
# start script
#!/bin/bash
#SBATCH --job-name=yale_penn_index_vcfs
#SBATCH --out="slurm-%j.out"
#SBATCH --time=36:00:00
#SBATCH --nodes=1 --ntasks=1 --cpus-per-task=2
#SBATCH --mem-per-cpu=32G
#SBATCH --mail-type=ALL
#SBATCH --partition=week
####################################################################################
# script to index the filtered samples Yale Penn 2 cohort based on our previous selection of samples
# for this analysis we used a threshold of higher than 50% of African Ancestry
# day: 26 February 2023
# analyzer: Jose Jaime Martinez-Magana - jjm262
# cluster: Grace - HPC Yale
####################################################################################
# This script uses the Yale Penn cohort imputed vcfs annotated with rsIDs
####################################################################################
# load conda
module load miniconda
# activate tractor_mix environment
conda activate tractor_mix
# set parameters
# input path for filtered Yale Penn 2 files
vcfip='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/genotype/filtered'
# set path to bcftools from our environment
bcftools='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/scripts/environment/extdata/bcftools/bcftools/bcftools'
# running script
# move to input directory; abort if it is not reachable, because the relative
# file names below are only valid from inside it
cd "${vcfip}" || { echo "cannot cd to ${vcfip}" >&2; exit 1; }
for chr in {1..22}; do
  # stop at the first failure so a missing index is noticed before phasing
  "${bcftools}" index "GWGO_GWCIDR.1kg_phase3_v5.chr${chr}.dose.rsids.filtered.nodup.vcf.gz" \
    || { echo "bcftools index failed for chr${chr}" >&2; exit 1; }
done
# end of script
# run the previous script with the next command, for this your server should have a slurm handler
# if not run your script with your handler
sbatch index_sample_vcf.sh

In [None]:
# Script for checking the input vcf files against the reference panel with
# shapeit2 (-check mode), before phasing.
# Save the text between "start script" and "end script" as check_shapeit2.sh.
# start script
#!/bin/bash
#SBATCH --job-name=yale_penn_check_vcf_file
#SBATCH --out="slurm-%j.out"
#SBATCH --time=22:00:00
#SBATCH --nodes=1 --ntasks=1 --cpus-per-task=4
#SBATCH --mem-per-cpu=250G
#SBATCH --mail-type=ALL
#SBATCH --partition=bigmem
####################################################################################
# script to check the Yale Penn 2 cohort files for tractor mix against the
# reference panel, before phasing and local ancestry
# day: 26 February 2023
# analyzer: Jose Jaime Martinez-Magana - jjm262
# cluster: Grace - HPC Yale
####################################################################################
# This script uses the Yale Penn cohort filtered information
####################################################################################
# load conda
module load miniconda
# activate tractor_mix environment
conda activate tractor_mix
# set input
# set path for input vcf files
vcfip='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/genotype/filtered'
# set path for the check logs (stored next to the future phased files)
vcfop='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/genotype/phased'
# set path input for haplotypes of references
hap_ref='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/shapeit/haplotypes/1000GP_Phase3'
# set path input for genetic maps
map_ref='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/shapeit/genetic_maps'
# use the CPUs granted by slurm; fall back to 1 when run outside of slurm
threads="${SLURM_CPUS_PER_TASK:-1}"
# checking the data with shapeit2, one autosome at a time
# the range needs two dots: {1.22} is not expanded by bash and would run the
# loop once with the literal string "{1.22}" as chromosome
for chr in {1..22}; do
  # NOTE(review): shapeit -check presumably exits non-zero when it finds
  # misaligned sites, so its status is deliberately not treated as fatal and
  # the loop continues with the next chromosome - confirm
  shapeit -check --input-vcf "${vcfip}/GWGO_GWCIDR.1kg_phase3_v5.chr${chr}.dose.rsids.filtered.nodup.vcf.gz" \
    --input-map "${map_ref}/genetic_map_GRCh37_chr${chr}.txt" \
    --input-ref "${hap_ref}/1000GP_Phase3_chr${chr}.hap.gz" "${hap_ref}/1000GP_Phase3_chr${chr}.legend.gz" "${hap_ref}/1000GP_Phase3.sample" \
    --thread "${threads}" \
    --output-log "${vcfop}/GWGO_GWCIDR.1kg_phase3_v5.chr${chr}.dose.test_aligment_check"
done
# end script
# this script will generate a list of positions to be excluded from the vcf file
# run the previous script with the next command, for this your server should have a slurm handler
# if not run your script with your handler
sbatch check_shapeit2.sh

In [None]:
# For phasing we only use the 1000G samples of AFR and EUR ancestry; for this
# reason we need to create a file, named ances_groups.list, that is given to
# shapeit2 so that only those ancestry groups are considered in the phasing.
# The following commands can be run in any bash session.
sample_path='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/shapeit/haplotypes/1000GP_Phase3'
sample_list='1000GP_Phase3.sample'
# we have decided to use only individuals from the following populations
# FIN Finnish Finnish in Finland
# IBS Iberian Iberian populations in Spain
# GBR British British in England and Scotland
# TSI Toscani Toscani in Italy
# YRI Yoruba Yoruba in Ibadan, Nigeria
# GWD Gambian Mandinka Gambian in Western Division, The Gambia - Mandinka
# MSL Mende Mende in Sierra Leone
# ESN Esan Esan in Nigeria
# LWK Luhya Luhya in Webuye, Kenya

#######################################
# Write the sorted, unique population codes selected above.
# Arguments: $1 - whitespace-delimited sample file; column 2 is taken as the
#                 population code
#            $2 - output file, one population code per line
# Returns:   status of the awk | sort pipeline
#######################################
write_ances_groups() {
  local sample_file=$1
  local out_file=$2
  # match column 2 exactly, so a code appearing inside another field
  # (e.g. a sample ID) can never select a line
  awk '$2 ~ /^(FIN|IBS|GBR|TSI|YRI|GWD|MSL|ESN|LWK)$/ { print $2 }' "${sample_file}" \
    | sort -u > "${out_file}"
}

if [[ -r "${sample_path}/${sample_list}" ]]; then
  write_ances_groups "${sample_path}/${sample_list}" "${sample_path}/ances_groups.list"
else
  printf 'sample file not found: %s\n' "${sample_path}/${sample_list}" >&2
fi

In [None]:
# Script for phasing with shapeit2, using the reference panel restricted to the
# ancestry groups listed in ances_groups.list.
# Save the text between "start script" and "end script" as phase_shapeit2.sh.
# start script
#!/bin/bash
#SBATCH --job-name=yale_penn_phasing_shapeit2
#SBATCH --out="slurm-%j.out"
#SBATCH --time=22:00:00
#SBATCH --nodes=1 --ntasks=1 --cpus-per-task=10
#SBATCH --mem-per-cpu=32G
#SBATCH --mail-type=ALL
#SBATCH --partition=bigmem
####################################################################################
# script to phase the Yale Penn 2 cohort for tractor mix, before local ancestry
# day: 26 February 2023
# analyzer: Jose Jaime Martinez-Magana - jjm262
# cluster: Grace - HPC Yale
####################################################################################
# This script uses the Yale Penn cohort filtered information
####################################################################################
# load conda
module load miniconda
# activate tractor_mix environment
conda activate tractor_mix
# set input
# set path for input vcf files
vcfip='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/genotype/filtered'
# set path for phased output files (also holds the logs of the -check step)
vcfop='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/genotype/phased'
# set path input for haplotypes of references
hap_ref='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/shapeit/haplotypes/1000GP_Phase3'
# set path input for genetic maps (the combined_b37 maps sit next to the haplotypes)
map_ref='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/shapeit/haplotypes/1000GP_Phase3'
# set group list, to only include those in the filtering
group_list='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/shapeit/haplotypes/1000GP_Phase3/ances_groups.list'
# use the CPUs granted by slurm; fall back to 1 when run outside of slurm
threads="${SLURM_CPUS_PER_TASK:-1}"
# phasing data with shapeit2, one autosome at a time
for chr in {1..22}; do
  # shapeit -check writes its exclusion list as <output-log>.snp.strand.exclude
  exclude_file="${vcfop}/GWGO_GWCIDR.1kg_phase3_v5.chr${chr}.dose.test_aligment_check.snp.strand.exclude"
  exclude_args=()
  if [[ -s "${exclude_file}" ]]; then
    exclude_args=(--exclude-snp "${exclude_file}")
  else
    # no list for this chromosome: phase without exclusions instead of
    # passing shapeit a path that does not exist
    echo "no exclusion list for chr${chr} (${exclude_file}), phasing without --exclude-snp" >&2
  fi
  # NOTE(review): -O takes an output prefix and shapeit2 presumably writes
  # <prefix>.haps and <prefix>.sample, not a VCF, despite the .vcf.gz suffix
  # used here - confirm before feeding the result to tools expecting a VCF
  shapeit --input-vcf "${vcfip}/GWGO_GWCIDR.1kg_phase3_v5.chr${chr}.dose.rsids.filtered.nodup.vcf.gz" \
    --input-map "${map_ref}/genetic_map_chr${chr}_combined_b37.txt" \
    --input-ref "${hap_ref}/1000GP_Phase3_chr${chr}.hap.gz" "${hap_ref}/1000GP_Phase3_chr${chr}.legend.gz" "${hap_ref}/1000GP_Phase3.sample" \
    --include-grp "${group_list}" \
    "${exclude_args[@]}" \
    --thread "${threads}" \
    -O "${vcfop}/GWGO_GWCIDR.1kg_phase3_v5.chr${chr}.dose.rsids.filtered.nodup.phased.vcf.gz" \
    || { echo "shapeit phasing failed for chr${chr}" >&2; exit 1; }
done
# end script
# run the previous script with the next command, for this your server should have a slurm handler
# if not run your script with your handler
sbatch phase_shapeit2.sh