# Script for running the local ancestry with RFMix

Author: Jose Jaime Martinez-Magana

Day: 25 February 2023

This script was developed to run local ancestry estimation for the Tractor-Mix model on the Yale HPC Grace cluster,
using the Yale Penn 2 cohort.

In [None]:
# move to your analysis directory
# (relative path: run this from the directory that contains palmer_scratch)
cd palmer_scratch/genomics/yalepenn/tractor_mix/

# move to database directory
cd databases/

# create directory for storing local_ancestry results
# -p makes this safe to re-run: an already existing directory is not an error,
# so there is no need to remember whether mkdir was run before
mkdir -p local_ancestry

# we will create a sample list for the local ancestry estimation using RFMix
# to use only the following populations
# FIN Finnish Finnish in Finland
# IBS Iberian Iberian populations in Spain
# GBR British British in England and Scotland
# TSI Toscani Toscani in Italy
# YRI Yoruba Yoruba in Ibadan, Nigeria
# GWD Gambian Mandinka Gambian in Western Division, The Gambia - Mandinka
# MSL Mende Mende in Sierra Leone
# ESN Esan Esan in Nigeria
# LWK Luhya Luhya in Webuye, Kenya

# we have previously downloaded the 1000 Genomes database to the following path
# /gpfs/gibbs/pi/montalvo-ortiz/reference_panel/1kg/hg37/r20130502
# to generate the sample list, run the following commands in any bash shell
# directory and file name of the 1000 Genomes pedigree (.ped) file
sample_path='/gpfs/gibbs/pi/montalvo-ortiz/reference_panel/1kg/hg37/r20130502'
sample_list='integrated_call_samples_v3.20200731.ALL.ped'
# set output datapath
sample_list_p='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/sample_lists/integrated_call_samples_v3.20200731.ALL.ped.forrfmix'
# make sure the output directory exists, otherwise the redirection below fails
mkdir -p "${sample_list_p%/*}"
# extract samples
# - keep only lines that contain "unrel"
# - keep only lines whose population code (tab-separated column 7) is one of the
#   nine reference populations; matching on the column avoids hits elsewhere on the line
# - print the individual ID (column 2) and the super-population label (EUR or AFR)
awk -F'\t' -v OFS='\t' '
  BEGIN {
    n_eur = split("FIN IBS GBR TSI", eur, " ")
    n_afr = split("YRI GWD MSL ESN LWK", afr, " ")
    for (i = 1; i <= n_eur; i++) superpop[eur[i]] = "EUR"
    for (i = 1; i <= n_afr; i++) superpop[afr[i]] = "AFR"
  }
  /unrel/ && ($7 in superpop) { print $2, superpop[$7] }
' "${sample_path}/${sample_list}" > "${sample_list_p}"
# add the header manually to the file (interactive step)
vim "${sample_list_p}"
# IndividualID    Superpopulation


In [None]:
# adding a script for local ancestry estimation with RFMix, named local_ancestry_rfmix2.sh
# copy everything from the #!/bin/bash line down to "# end script" into that file
# (the #!/bin/bash line must be the first line of the file)
# start script
#!/bin/bash
#SBATCH --job-name=yale_penn_local_ancestry_rfmix
#SBATCH --out="slurm-%j.out"
#SBATCH --time=22:00:00
#SBATCH --nodes=1 --ntasks=1 --cpus-per-task=4
#SBATCH --mem-per-cpu=350G
#SBATCH --mail-type=ALL
#SBATCH --partition=bigmem
####################################################################################
# script to run local ancestry (RFMix) on the phased Yale Penn 2 cohort for tractor mix
# day: 26 February 2023
# analyzer: Jose Jaime Martinez-Magana - jjm262
# cluster: Grace - HPC Yale
####################################################################################
# This script uses the Yale Penn cohort filtered information
# NOTE(review): --mem-per-cpu=350G together with --cpus-per-task=4 asks for
# 4 x 350G in total - confirm this is the intended memory request
####################################################################################
# load conda
module load miniconda
# activate tractor_mix environment
conda activate tractor_mix
# set input
# set path for input (phased) vcf files
vcfip='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/genotype/phased'
# set path for local ancestry output files
vcfop='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/local_ancestry'
# set path input for haplotypes of references
r_vcf='/gpfs/gibbs/pi/montalvo-ortiz/reference_panel/1kg/hg37/r20130502'
# set path input for genetic maps
map_ref='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/shapeit/genetic_maps'
# set group list, to only include those in the filtering
sample_m='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/sample_lists/integrated_call_samples_v3.20200731.ALL.ped.forrfmix'
# run RFMix one autosome at a time
# a failing chromosome does not stop the remaining ones, but it is reported on
# stderr and makes the job exit non-zero so the failure is visible in SLURM
failed=0
for chr in {1..22}
do
  rfmix -f "${vcfip}/GWGO_GWCIDR.1kg_phase3_v5.chr${chr}.dose.rsids.filtered.nodup.phased.vcf.gz" \
    -r "${r_vcf}/ALL.chr${chr}.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz" \
    -m "${sample_m}" \
    --chromosome="${chr}" \
    -o "${vcfop}/GWGO_GWCIDR.1kg_phase3_v5.chr${chr}.dose.rsids.filtered.nodup.phased.localancestry" \
    -g "${map_ref}/genetic_map_GRCh37_chr${chr}.txt" \
    --n-threads="${SLURM_CPUS_PER_TASK:-4}" \
    || { echo "rfmix failed for chromosome ${chr}" >&2; failed=1; }
done
exit "${failed}"
# end script
# submit the script local_ancestry_rfmix2.sh to the SLURM scheduler with the next command
# if your server uses a different job scheduler, submit the script with that scheduler instead
sbatch local_ancestry_rfmix2.sh

In [None]:
# modification day: 27 march 2023
# modification for running RFMix in arrays in the servers

# first we need to create a file with the list of jobs to submit
# the job lines are built from the following paths
# set path for the phased input vcf files
vcfip='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/genotype/phased_vcfs'
# set path for local ancestry output files
vcfop='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/databases/local_ancestry'
# set path input for haplotypes of references
r_vcf='/gpfs/gibbs/pi/montalvo-ortiz/reference_panel/1kg/hg37/r20130502'
# set path input for genetic maps
# this map file has to have removed the chr code from all chromosomes
map_ref='/gpfs/gibbs/project/montalvo-ortiz/jjm262/analysis/genomic/databases/reference/hapmap/genetic_map/hg37'
# set group list, to only include those in the filtering
sample_m='/vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/sample_lists/integrated_call_samples_v3.20200731.ALL.ped.forrfmix'

# making directory to store local ancestry data
# -p: creates missing parent directories and is not an error if the directory already exists
mkdir -p "${vcfop}"

# testing script
# test run of RFMix on a single chromosome before building the job lists
module load miniconda
conda activate tractor_mix
# chromosome used for the test; change it here only, every path below follows it
test_chr=3
# testing conversion
# uses the vcfip, r_vcf, sample_m, vcfop and map_ref paths set earlier in this notebook
rfmix -f "${vcfip}/GWGO_GWCIDR.1kg_phase3_v5.chr${test_chr}.dose.rsids.filtered.nodup.phased.shapeit.vcf" \
  -r "${r_vcf}/ALL.chr${test_chr}.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz" \
  -m "${sample_m}" \
  --chromosome="${test_chr}" \
  -o "${vcfop}/GWGO_GWCIDR.1kg_phase3_v5.chr${test_chr}.dose.rsids.filtered.nodup.phased.localancestry" \
  -g "${map_ref}/genetic_map_GRCh37_chr${test_chr}_red.txt" \
  --n-threads=36
# code runned

# change into the directory that holds the local ancestry scripts
cd /vast/palmer/scratch/montalvo-ortiz/jjm262/genomics/yalepenn/tractor_mix/scripts/local_anc
# base rfmix command, one argument per line
# it reads ${chr} and $SLURM_CPUS_PER_TASK, neither of which is assigned in this cell before this point
# NOTE(review): presumably this is the template for the per-chromosome job lines - confirm before executing it as is
rfmix \
-f ${vcfip}/GWGO_GWCIDR.1kg_phase3_v5.chr"${chr}".dose.rsids.filtered.nodup.phased.shapeit.vcf \
-r ${r_vcf}/ALL.chr"${chr}".phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz \
-m ${sample_m} \
--chromosome="${chr}" \
-o ${vcfop}/GWGO_GWCIDR.1kg_phase3_v5.chr"${chr}".dose.rsids.filtered.nodup.phased.localancestry \
-g ${map_ref}/genetic_map_GRCh37_chr"${chr}"_red.txt \
--n-threads=$SLURM_CPUS_PER_TASK

# creating the job lists for the chromosomes

#######################################
# Print one job line per chromosome: load conda, activate the tractor_mix
# environment and run rfmix with 30 threads on that chromosome.
# Globals:   vcfip, r_vcf, sample_m, vcfop, map_ref (read)
# Arguments: $1 - first chromosome, $2 - last chromosome (inclusive)
# Outputs:   the job lines on stdout, one per chromosome
#######################################
rfmix_job_lines() {
  local first=$1
  local last=$2
  local chr
  for ((chr = first; chr <= last; chr++)); do
    printf '%s\n' "module load miniconda;conda activate tractor_mix;rfmix -f ${vcfip}/GWGO_GWCIDR.1kg_phase3_v5.chr${chr}.dose.rsids.filtered.nodup.phased.shapeit.vcf -r ${r_vcf}/ALL.chr${chr}.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz -m ${sample_m} --chromosome=${chr} -o ${vcfop}/GWGO_GWCIDR.1kg_phase3_v5.chr${chr}.dose.rsids.filtered.nodup.phased.localancestry -g ${map_ref}/genetic_map_GRCh37_chr${chr}_red.txt --n-threads=30"
  done
}

# chr1 - chr8
rfmix_job_lines 1 8 > job_list_tractormix_rfmix_chr1_chr8_v03272023.txt
# chr9 - chr16
rfmix_job_lines 9 16 > job_list_tractormix_rfmix_chr9_chr16_v03272023.txt
# chr17 - chr22
rfmix_job_lines 17 22 > job_list_tractormix_rfmix_chr17_chr22_v03272023.txt

# load module dSQ
module load dSQ

# generate the dSQ batch script for each job list and submit it
# the batch script name is built once from today's date and passed to both dsq
# and sbatch, so the two always agree and the cell can be re-run on any day
# (same pattern as before: dsq-<job list name without .txt>-<YYYY-MM-DD>.sh)
run_date=$(date +%Y-%m-%d)
for job_file in \
  job_list_tractormix_rfmix_chr1_chr8_v03272023.txt \
  job_list_tractormix_rfmix_chr9_chr16_v03272023.txt \
  job_list_tractormix_rfmix_chr17_chr22_v03272023.txt
do
  batch_file="dsq-${job_file%.txt}-${run_date}.sh"
  # submit only when dsq generated the batch script successfully
  dsq --job-file "${job_file}" --batch-file "${batch_file}" --mem 500g --time 1-00:00:00 --partition=bigmem --cpus-per-task=30 --mail-type ALL \
    && sbatch "${batch_file}"
done
