/
halMafScanner.cpp
152 lines (135 loc) · 3.28 KB
/
halMafScanner.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/*
* Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu)
*
* Released under the MIT license, see LICENSE.txt
*/
#include <cassert>
#include <iostream>
#include <stdexcept>
#include <sstream>
#include <algorithm>
#include "halMafScanner.h"
using namespace std;
using namespace hal;
MafScanner::MafScanner()
{
}
MafScanner::~MafScanner()
{
}
void MafScanner::scan(const string& mafFilePath, const set<string>& targets)
{
_targets = targets;
_mafFile.open(mafFilePath.c_str());
_numBlocks = 0;
if (!_mafFile)
{
throw runtime_error("error opening path: " + mafFilePath);
}
_rows = 0;
_block.clear();
string buffer;
while (!_mafFile.eof() && _mafFile.good())
{
buffer.clear();
_mafFile >> buffer;
if (buffer == "a")
{
if (_rows > 0)
{
updateMask();
aLine();
++_numBlocks;
nextLine();
}
_rows = 0;
}
else if (buffer == "s")
{
++_rows;
if (_rows > _block.size())
{
_block.resize(_rows);
}
Row& row = _block[_rows - 1];
_mafFile >> row._sequenceName >> row._startPosition >> row._length
>> row._strand >> row._srcLength >> row._line;
if (!_mafFile.good())
{
throw hal_exception("error parsing sequence " + row._sequenceName);
}
if (_rows > 1 && row._line.length() != _block[_rows - 2]._line.length())
{
stringstream ss;
ss << "two lines in same block have different lengths: "
<< row._sequenceName << " " << row._startPosition << " and "
<< _block[_rows - 2]._sequenceName << " "
<< _block[_rows - 2]._startPosition;
throw hal_exception(ss.str());
}
if (_targets.size() > 1 && // (will always include reference)
_targets.find(genomeName(row._sequenceName)) == _targets.end())
{
// genome not in targets, pretend like it never happened.
--_rows;
}
else
{
sLine();
}
}
else
{
nextLine();
}
}
if (_rows > 0)
{
updateMask();
++_numBlocks;
}
end();
_mafFile.close();
}
void MafScanner::nextLine()
{
while (!_mafFile.eof() && !_mafFile.bad() && _mafFile.peek() != '\n')
{
_mafFile.get();
}
}
// the mask stores a bit for every column where a gap begins in any row
// a mask at position i implies segmentation [0-i-1][i-n]. ie the cut is
// on the left.
void MafScanner::updateMask()
{
if (_rows > 0)
{
size_t length = _block[0]._line.length();
_mask.resize(length);
fill(_mask.begin(), _mask.end(), false);
// scan left to right
for (size_t i = 1; i < length; ++i)
{
// scan up to down
for (size_t j = 0; j < _rows && _mask[i] == false; ++j)
{
// beginning of gap run. add position of first gap to mask
if (_block[j]._line[i] == '-' && _block[j]._line[i-1] != '-')
{
_mask[i] = true;
}
// end of gap run. add position of first non gap to mask
else if (_block[j]._line[i] != '-' && _block[j]._line[i-1] == '-')
{
_mask[i] = true;
}
}
}
}
}
std::string MafScanner::genomeName(const std::string fullName)
{
assert(fullName.find('.') != string::npos);
return fullName.substr(0, fullName.find('.'));
}