PLINK 1.9 linkage disequilibrium (LD) documentation: https://www.cog-genomics.org/plink/1.9/ld

In [11]:
import time
# Absolute path to the PLINK executable; interpolated into the `!` shell commands below.
plink = "/cm/shared/apps/plink/1.90-beta7/bin/plink"
# Master copy of the genotype VCF (source of the `cp` in the copy cell).
ratgenes_master = "/projects/ps-palmer/hs_rats/round9_1/Heterogenous-stock_n14780_10182022_QC_Sex_Het_pass_n13526.vcf.gz"
# Scratch copy of the same VCF; this is the file PLINK actually reads via --vcf.
ratgenes = "/scratch/capstone/Heterogenous-stock_n14780_10182022_QC_Sex_Het_pass_n13526.vcf.gz"

In [2]:
# Copy file over (if necessary)

tic = time.time()
! cp {ratgenes_master} {ratgenes}
toc = time.time()
print(tic-toc)

-175.34983205795288


In [13]:
# Create BED files
bfile_no_id = "/scratch/capstone/ratgenes_unpruned"

tic = time.time()
! {plink} --vcf {ratgenes} --make-bed --out {bfile_no_id}
toc = time.time()
print(tic-toc)

PLINK v1.90b7 64-bit (16 Jan 2023)             www.cog-genomics.org/plink/1.9/
(C) 2005-2023 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /scratch/capstone/ratgenes_unpruned.log.
Options in effect:
  --make-bed
  --out /scratch/capstone/ratgenes_unpruned
  --vcf /scratch/capstone/Heterogenous-stock_n14780_10182022_QC_Sex_Het_pass_n13526.vcf.gz

95274 MB RAM detected; reserving 47637 MB for main workspace.
--vcf: /scratch/capstone/ratgenes_unpruned-temporary.bed +
/scratch/capstone/ratgenes_unpruned-temporary.bim +
/scratch/capstone/ratgenes_unpruned-temporary.fam written.
6533840 variants loaded from .bim file.
13526 people (0 males, 0 females, 13526 ambiguous) loaded from .fam.
Ambiguous sex IDs written to /scratch/capstone/ratgenes_unpruned.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 13526 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323

In [24]:
# Assign variant IDs
bfile = "/scratch/capstone/ratgenes_unpruned_id"
id_string = "@:#\$1,\$2"

tic = time.time()
! {plink} --bfile {bfile_no_id} --set-missing-var-ids {id_string} --make-bed --out {bfile} 
toc = time.time()
print(tic-toc)

PLINK v1.90b7 64-bit (16 Jan 2023)             www.cog-genomics.org/plink/1.9/
(C) 2005-2023 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /scratch/capstone/ratgenes_unpruned_id.log.
Options in effect:
  --bfile /scratch/capstone/ratgenes_unpruned
  --make-bed
  --out /scratch/capstone/ratgenes_unpruned_id
  --set-missing-var-ids @:#$1,$2

95274 MB RAM detected; reserving 47637 MB for main workspace.
6533840 variants loaded from .bim file.
6533840 missing IDs set.
13526 people (0 males, 0 females, 13526 ambiguous) loaded from .fam.
Ambiguous sex IDs written to /scratch/capstone/ratgenes_unpruned_id.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 13526 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total gen

In [25]:
# Do LD pruning
window_size = 50
step_size = 5
r2_threshold = 0.05
prunefile = f"/scratch/capstone/ld{window_size}_{step_size}_{r2_threshold}"

tic = time.time()
! {plink} --bfile {bfile} --indep-pairwise {window_size} {step_size} {r2_threshold} --allow-no-sex --out {prunefile}
toc = time.time()
print(tic-toc)
! head {prunefile}.prune.in

PLINK v1.90b7 64-bit (16 Jan 2023)             www.cog-genomics.org/plink/1.9/
(C) 2005-2023 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /scratch/capstone/ld50_5_0.05.log.
Options in effect:
  --allow-no-sex
  --bfile /scratch/capstone/ratgenes_unpruned_id
  --indep-pairwise 50 5 0.05
  --out /scratch/capstone/ld50_5_0.05

95274 MB RAM detected; reserving 47637 MB for main workspace.
6533840 variants loaded from .bim file.
13526 people (0 males, 0 females, 13526 ambiguous) loaded from .fam.
Ambiguous sex IDs written to /scratch/capstone/ld50_5_0.05.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 13526 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.993011.
6533840 variants and 13

In [26]:
# Extract pruned variants
final_file = "/scratch/capstone/ratgenes_pruned"

tic = time.time()
! {plink} --bfile {bfile} --extract {prunefile}.prune.in --make-bed --out {final_file}
toc = time.time()
print(tic-toc)

PLINK v1.90b7 64-bit (16 Jan 2023)             www.cog-genomics.org/plink/1.9/
(C) 2005-2023 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /scratch/capstone/ratgenes_pruned.log.
Options in effect:
  --bfile /scratch/capstone/ratgenes_unpruned_id
  --extract /scratch/capstone/ld50_5_0.05.prune.in
  --make-bed
  --out /scratch/capstone/ratgenes_pruned

95274 MB RAM detected; reserving 47637 MB for main workspace.
6533840 variants loaded from .bim file.
13526 people (0 males, 0 females, 13526 ambiguous) loaded from .fam.
Ambiguous sex IDs written to /scratch/capstone/ratgenes_pruned.nosex .
--extract: 96654 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 13526 founders and 0 nonfounders present.
Calculating allele frequencies... 101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979