# Script for starting the phasing analysis in Yale - Penn

Author: Jose Jaime Martinez-Magana

Date: 27 March 2023

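This script prepares chromosome 9 for phasing: it computes per-variant missingness, adds the SNPs with missing data to the SNP exclusion list, and reruns the chromosome 9 phasing job with that list.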

In [None]:
# Updated 27 March 2023.
# Chromosome 9 failed with the following error:
# ERROR: 61 SNPs with high rates of missing data (>10%).  These sites should be removed. You can disable this error with --force (at your own risk).

# We are going to modify the chromosome 9 output.
# Request computational resources (interactive session) on McCleary.
srun --pty --mem=32G -p devel bash

# set parameters
in_p="/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/genotype/filtered"
out_p="/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/genotype/vcf_stats"

# Create the output directory for the VCF statistics.
# -p keeps this step from failing when the directory already exists (re-runs),
# and the absolute path makes it independent of the current directory.
mkdir -p "${out_p}"

# move to the genotype directory
cd /vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/genotype

# load plink
module load PLINK

# Use plink2 to compute per-variant missingness (--missing variant-only) for
# chromosomes 1 to 22. One result set is written per chromosome under ${out_p};
# the chromosome 9 ".vmiss" file is the one used later to build the exclusion list.
# Relies on in_p and out_p from the "set parameters" section of this cell.
# NOTE(review): --threads 20 is requested here, but the interactive srun session
# is started without an explicit CPU count -- confirm the allocation matches.
for chr in {1..22}; do
  plink2 --vcf "${in_p}/GWGO_GWCIDR.1kg_phase3_v5.chr${chr}.dose.rsids.filtered.nodup.vcf.gz" \
    --missing variant-only \
    --threads 20 \
    --out "${out_p}/GWGO_GWCIDR.1kg_phase3_v5.chr${chr}.dose.test_aligment_check.snp.missing"
done

In [None]:
# Merge the SNPs excluded by shapeit with the SNPs that have missing data in
# the VCF. Only chromosome 9 had missing SNPs.
vmiss_f="/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/genotype/vcf_stats/GWGO_GWCIDR.1kg_phase3_v5.chr9.dose.test_aligment_check.snp.missing.vmiss"
phased_p="/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/genotype/phased"
excl_f="GWGO_GWCIDR.1kg_phase3_v5.chr9.dose.test_aligment_check.snp.strand.exclude"

# For every row whose 5th column is greater than 0, print the 2nd column and
# keep only its second ":"-separated field.
# NOTE(review): presumably column 5 is the missing-data fraction and column 2 a
# "chr:pos:..." variant ID, so this yields positions -- confirm against the file.
# NOTE(review): the header row likely passes the "$5 > 0" test as well (its 5th
# field is not numeric), which would explain why "ID" lines are dropped below.
awk '$5 > 0 {print $2}' "${vmiss_f}" \
  | cut -f 2 -d ":" \
  > "${phased_p}/${excl_f}.vmiss"

# Merge with the previous filter from shapeit: concatenate both lists, sort
# numerically and drop any line containing "ID".
cd "${phased_p}/"
cat "${excl_f}" "${excl_f}.vmiss" \
  | sort -n \
  | grep -v "ID" \
  > "${excl_f}.vmiss.all"

In [None]:
# run the phasing for chromosome 9
# move to working directory
cd /vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/scripts/phasing/phasing_array

# Write the job script for phasing chromosome 9.
# The script is created through a here-document so that the commands below are
# NOT executed in the current session; they only run once sbatch starts the job.
# The delimiter is quoted ('EOF') so nothing is expanded while writing the file.
# Note: this overwrites any existing file with the same name.
cat > job_tractor_phasing_chr9_rerun_v03272023.sh <<'EOF'
#!/bin/bash
# script name: job_tractor_phasing_chr9_rerun_v03272023.sh
#SBATCH --output tractor_phasing_chr9_rerun_v03272023-%A_%1a-%N.out
#SBATCH --job-name tractor_phasing_chr9__rerun_v03272023
#SBATCH --mem 130g --time 5-00:00:00 --partition=week --cpus-per-task=36 --mail-type ALL

# set parameters
# root of the project databases
db_p="/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases"
# path of the input vcf file
vcfip="${db_p}/genotype/filtered/GWGO_GWCIDR.1kg_phase3_v5.chr9.dose.rsids.filtered.nodup.vcf.gz"
# directory with the 1000GP_Phase3 map, haplotype, legend, sample and group files
ref_p="${db_p}/shapeit/haplotypes/1000GP_Phase3"
# directory holding the SNP exclusion list and receiving the phased output
phased_p="${db_p}/genotype/phased"

# load miniconda
module load miniconda
# activate environment
conda activate tractor_mix

# run phasing with shapeit
# --exclude-snp uses the merged list (shapeit exclusions + SNPs with missing data).
# The thread count follows the CPUs granted by Slurm and falls back to 36, the
# value requested with --cpus-per-task above.
shapeit --input-vcf "${vcfip}" \
  --input-map "${ref_p}/genetic_map_chr9_combined_b37.txt" \
  --input-ref "${ref_p}/1000GP_Phase3_chr9.hap.gz" "${ref_p}/1000GP_Phase3_chr9.legend.gz" "${ref_p}/1000GP_Phase3.sample" \
  --include-grp "${ref_p}/ances_groups.list" \
  --exclude-snp "${phased_p}/GWGO_GWCIDR.1kg_phase3_v5.chr9.dose.test_aligment_check.snp.strand.exclude.vmiss.all" \
  --thread "${SLURM_CPUS_PER_TASK:-36}" \
  -O "${phased_p}/GWGO_GWCIDR.1kg_phase3_v5.chr9.dose.rsids.filtered.nodup.phased.shapeit"
EOF

# running script
sbatch job_tractor_phasing_chr9_rerun_v03272023.sh