// ReduceVCF.scala
package abeel.genometools.reducevcf
import java.io.File
import java.io.PrintWriter
import java.util.Properties
import scala.io.Source
import atk.util.Tool
import be.abeel.util.CountMap
import atk.compbio.vcf.VCFLine
import atk.compbio.vcf.Match
import atk.compbio.vcf._
import abeel.genometools.Main
import atk.compbio.vcf.SingleSubstitution
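/**
 * Program to whittle down a VCF file to the variant sites that pass all filters.
 *
 * Header lines are always copied to the output. When the keep option is set,
 * records that do not pass the filters are written to the output as well.
 */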
object ReduceVCF extends Tool with Main {

  /**
   * Mutable holder for the command-line options.
   *
   * The fields are filled in by the option parser in `main`; the two files stay
   * `null` until the corresponding option has been parsed.
   */
  class Config {
    var inputfile: File = null
    var outputfile: File = null
    var keep: Boolean = false
  }

  /**
   * Command-line entry point: prints the banner and version information, parses
   * the arguments and runs `processFile`.
   *
   * Exits the JVM with status -1 when the arguments cannot be parsed.
   *
   * @param args raw command-line arguments (`-i/--input`, `-o/--output`, `-k/--keep`)
   */
  override def main(args: Array[String]): Unit = {
    println("##----------------------------------------------")
    println("## ReduceVCF.scala")
    println("## ")
    println("## Tool to reduce the size of VCF files by removing")
    println("## all matches.")
    println("## ")
    println("## ")
    println("## The program will conclude with a message")
    println("## that indicates the run was successful.")
    println("## ")
    println("## By Thomas Abeel (tabeel@broadinstitute.org)")
    println("##----------------------------------------------")

    // Version information is best-effort: a failure here is reported but never aborts the run.
    try {
      // getResourceAsStream returns null when the resource is absent, so wrap it in an Option.
      Option(ReduceVCF.getClass().getResourceAsStream("/tool.properties")) match {
        case Some(stream) =>
          try {
            val prop = new Properties()
            prop.load(stream)
            println("## Program=" + prop.getProperty("program"))
            println("## Version=" + prop.getProperty("version"))
          } finally {
            stream.close()
          }
        case None =>
          System.err.println("Problem while reading version information")
      }
    } catch {
      case e: Exception =>
        System.err.println("Problem while reading version information")
        e.printStackTrace()
    }

    val config = new Config()
    val parser = new scopt.OptionParser[Unit]("Reducer") {
      opt[File]('i', "input").required().valueName("<file>").text("Input file")
        .foreach { (v: File) => config.inputfile = v }
      opt[File]('o', "output").required().valueName("<file>").text("Output file")
        .foreach { (v: File) => config.outputfile = v }
      opt[Unit]('k', "keep")
        .text("Keep all non-reference calls, i.e. non-reference calls without the PASS flag.")
        .foreach { _ => config.keep = true }
    }

    if (parser.parse(args)) {
      // Both options are declared required, so a successful parse implies they are set.
      assume(config.inputfile != null)
      assume(config.outputfile != null)
      processFile(config.inputfile, config.outputfile, config.keep)
    } else {
      println("Could not interpret command-line arguments, quitting!")
      System.exit(-1)
    }
  }

  /**
   * Copies a reduced version of `file` to `outFile` and writes a one-line summary
   * to `outFile` + ".log".
   *
   * Nothing is done (apart from a log message) when `outFile` already exists and
   * is non-empty. Otherwise:
   *  - lines starting with '#' are copied unchanged;
   *  - empty lines are skipped;
   *  - every other line is parsed as a `VCFLine`; a record is written when it is
   *    not a `Match` and passes, or, when `keep` is set, whenever it does not pass.
   *
   * The summary line is tab-separated: input file name, passing matches, failing
   * matches, passing non-matches, failing non-matches, number of records, and the
   * per-type counts of passing and failing records.
   *
   * All files are closed even when a record fails to parse.
   *
   * @param file    input VCF file
   * @param outFile destination for the reduced VCF
   * @param keep    when true, records that do not pass are written as well
   */
  def processFile(file: File, outFile: File, keep: Boolean): Unit = {
    if (outFile.exists() && outFile.length() > 0) {
      log("File already exists, aborting...")
    } else {
      // Open the input first so that a missing input does not leave empty output files behind.
      val source = Source.fromFile(file)
      try {
        val pw = new PrintWriter(outFile)
        try {
          val summary = new PrintWriter(outFile.toString() + ".log")
          try {
            var samePass = 0
            var sameFail = 0
            var diffPass = 0
            var diffFail = 0
            var lineCount = 0
            val passCM = new CountMap[String]()
            val failCM = new CountMap[String]()

            // Empty lines (e.g. a trailing blank line) carry no record and are skipped.
            for (line <- source.getLines() if line.nonEmpty) {
              if (line.charAt(0) == '#') {
                pw.println(line)
              } else {
                lineCount += 1
                val vcfLine = new VCFLine(line)
                val passed = vcfLine.pass
                val variation = vcfLine.variation
                (if (passed) passCM else failCM).count(variation.strType)

                variation match {
                  case Match =>
                    // Passing matches are the records this tool exists to drop.
                    if (passed) {
                      samePass += 1
                    } else {
                      if (keep) pw.println(line)
                      sameFail += 1
                    }
                  case _ =>
                    if (passed) {
                      pw.println(line)
                      diffPass += 1
                    } else {
                      if (keep) pw.println(line)
                      diffFail += 1
                    }
                }
              }
            }

            summary.println(
              List(file.getName(), samePass, sameFail, diffPass, diffFail, lineCount, passCM, failCM)
                .mkString("\t"))
          } finally {
            summary.close()
          }
        } finally {
          pw.close()
        }
      } finally {
        source.close()
      }
      // Inherited hook; presumably prints the completion message announced in the banner.
      finish()
    }
  }
}