This repository has been archived by the owner on Jun 20, 2019. It is now read-only.
/
untag.d
189 lines (171 loc) · 5.02 KB
/
untag.d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
// PERMUTE_ARGS:
// EXTRA_FILES: extra-files/untag.html
import std.algorithm, std.ascii, std.conv, std.exception,
std.file, std.getopt, std.path, std.range, std.stdio,
std.string, std.traits;
/**
Evaluates the string predicate $(D pred) as a D expression in which the
names $(D a) and $(D b) refer to the two arguments, and returns the value
of that expression.
*/
auto binaryFun(string pred, T, U)(T a, U b)
{
    // The parameter names must stay `a` and `b`: the mixed-in text refers to them.
    return mixin(pred);
}
/**
Tests whether $(D r1) begins with $(D r2), comparing element pairs with
$(D pred). On a match the matched prefix is removed from $(D r1) and
$(D true) is returned; otherwise $(D r1) is left untouched and the result
is $(D false). An empty $(D r2) always matches and consumes nothing.
*/
bool startsWithConsume(alias pred = "a == b", R1, R2)(ref R1 r1, R2 r2)
{
    // Work on a copy so that r1 is only modified once the match is complete.
    auto cursor = r1; // .save();
    while (!r2.empty && !cursor.empty)
    {
        if (!binaryFun!pred(cursor.front, r2.front))
        {
            break;
        }
        cursor.popFront();
        r2.popFront();
    }
    // Leftover elements in r2 mean the prefix did not fully match.
    if (!r2.empty)
    {
        return false;
    }
    r1 = cursor;
    return true;
}
// Selects how writeWords (nested in untag) appends text to the paragraph:
// 0 appends one dchar at a time, 1 reserves capacity and then appends the
// whole slice with ~=, any other value rebuilds the string with binary ~.
// main may override it through the "bug" command-line option and enforces
// bug <= 2.
uint bug = 1;
/**
Program entry point. Reads the optional $(D bug) option from the command
line, checks that it does not exceed 2, then loads the HTML fixture and
passes it to $(D untag).

Returns: 0 when processing completes.
*/
int main(string[] args) {
    // Path of the fixture, also handed to untag as the file name.
    enum inputPath = "runnable/extra-files/untag.html";

    getopt(args, "bug", &bug);
    enforce(bug <= 2);

    untag(readText(inputPath), inputPath);
    return 0;
}
/**
Strips markup from $(D txt) and passes the accumulated plain text to
$(D writeParagraph) each time a paragraph ends.

Scanning begins at the first occurrence of the start-of-content marker; if
the marker is absent nothing is processed. Runs of plain text are appended
to a paragraph buffer, a closing `p` or `li` tag commits the buffer, every
other tag is skipped, and a fixed set of character entities is translated.
The buffer is never reset, so each commit hands over everything gathered so
far.

Params:
    txt = the document text to scan
    filename = not used by this function

Throws: an exception from $(D enforce) when a tag has no closing $(D '>'),
and $(D Exception) when an entity is not one of the recognised codes (this
includes an $(D '&') that is not followed by any $(D ';')).
*/
void untag(string txt, string filename) {
    string currentParagraph;
    string origtxt = txt;
    string origtxtcopy = txt.idup;

    // Jump to the beginning of the content
    txt = std.algorithm.find(txt, "<!-- start content -->\n");

    // Hands the whitespace-stripped paragraph buffer to writeParagraph
    void commit() {
        writeParagraph(strip(currentParagraph));
    }

    // Appends a single character, dropping a space that directly follows
    // another space
    void writeChar(dchar c) {
        immutable lastWritten = currentParagraph.length
            ? currentParagraph.back
            : dchar.init;
        if (c != ' ' || lastWritten != ' ') {
            currentParagraph ~= c;
        }
    }

    // Appends a run of plain text using the strategy selected by `bug`
    void writeWords(string s) {
        switch (bug) {
        case 0:
            // One decoded character at a time
            foreach (dchar c; s) {
                currentParagraph ~= c;
            }
            break;
        case 1:
            // Reserve room first, then append the slice in place
            reserve(currentParagraph, currentParagraph.length + s.length);
            currentParagraph ~= s;
            break;
        default:
            // Build a new string through concatenation
            currentParagraph = currentParagraph ~ s;
            break;
        }
    }

    // Parse the content
    while (!txt.empty) {
        // Measure the plain text that precedes the next '<' or '&'
        size_t plainLen = 0;
        for (; plainLen < txt.length; ++plainLen) {
            immutable ch = txt[plainLen];
            if (ch == '<' || ch == '&') {
                break;
            }
        }
        writeWords(txt[0 .. plainLen]);
        if (plainLen == txt.length) {
            // Input exhausted: flush what has been gathered and stop
            commit();
            return;
        }

        // Remember which delimiter stopped the scan and step past it
        immutable marker = txt[plainLen];
        txt = txt[plainLen + 1 .. $];

        if (marker == '<') {
            // This is a tag
            immutable endsParagraph = startsWithConsume(txt, `/p>`)
                || startsWithConsume(txt, `/li>`);
            if (endsParagraph) {
                commit();
            } else {
                // Uninteresting tag: skip up to and including its '>'
                enforce(findConsume(txt, '>'),
                        "Could not find closing tag: "~txt);
            }
        } else {
            // This is a character entity; collect its name up to and
            // including the ';'
            auto app = appender!string();
            findConsume(txt, ';', app);
            switch (app.data) {
            case "#160;", "#32;", "reg;", "nbsp;":
                writeChar(' ');
                break;
            case "amp;":
                writeChar('&');
                break;
            case "gt;":
                writeChar('>');
                break;
            case "lt;":
                writeChar('<');
                break;
            case "quot;":
                writeChar('"');
                break;
            default:
                throw new Exception(text("Unknown code: &", app.data));
            }
        }
    }
}
/**
Splits $(D sentence) at single spaces and lower-cases every piece. The
lower-cased pieces are not stored or printed, so the function has no
visible output.
*/
void writeParagraph(string sentence) {
    // Not referenced below. Treats every non-alphabetic character as a
    // separator (an extra test for '.' was commented out in the original).
    static bool isSeparator(dchar a) {
        immutable alphabetic = isAlpha(a);
        return !alphabetic;
    }

    foreach (string word; std.algorithm.splitter(sentence, ' ')) {
        auto lowered = toLower(word);
    }
}
/**
Searches $(D r1) for the sequence $(D r2). When the sequence is found,
$(D r1) is advanced to the position right after it and $(D true) is
returned. When it is not found, $(D r1) is left unchanged and the result is
$(D false). An empty $(D r1) never matches.
*/
bool findConsume(R1, R2)(ref R1 r1, R2 r2) if (isForwardRange!R2) {
    // Try every starting position using a copy, so that r1 is only
    // modified on success.
    for (auto probe = r1; !probe.empty; probe.popFront()) {
        if (startsWithConsume(probe, r2)) {
            r1 = probe;
            return true;
        }
    }
    return false;
}
/**
Searches $(D r) for the element $(D e). When the element is found, $(D r)
is advanced to the position right after its first occurrence and $(D true)
is returned. When it is not found, $(D r) is left unchanged and the result
is $(D false).
*/
bool findConsume(R, E)(ref R r, E e) if (is(typeof(r.front == e))) {
    auto rest = std.algorithm.find(r, e);
    if (!rest.empty) {
        // Step over the matched element before publishing the new position.
        rest.popFront();
        r = rest;
        return true;
    }
    return false;
}
/**
Searches $(D r1) for the element $(D e) while copying every visited element
into the output sink $(D r2). When the element is found it is written to
$(D r2) as well, $(D r1) is advanced to the position right after it, and
$(D true) is returned. When it is not found, $(D r1) is left unchanged and
the result is $(D false); $(D r2) has nevertheless received every element
of $(D r1).
*/
bool findConsume(R1, E, R2)(ref R1 r1, E e, R2 r2) if (is(typeof(r1.front == e))) {
    for (auto cursor = r1; !cursor.empty; ) {
        auto item = cursor.front;
        r2.put(item);
        cursor.popFront();
        if (item == e) {
            // cursor already sits just past the match.
            r1 = cursor;
            return true;
        }
    }
    return false;
}