forked from majewsky/rust-jmdict
-
Notifications
You must be signed in to change notification settings - Fork 0
/
build.rs
336 lines (289 loc) · 10.8 KB
/
build.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
/*******************************************************************************
* Copyright 2021 Stefan Majewsky <majewsky@gmx.net>
* SPDX-License-Identifier: Apache-2.0
* Refer to the file "LICENSE" for details.
*******************************************************************************/
//Abort compilation with a readable message when none of the "translations-XXX" features listed
//below is enabled.
#[cfg(not(any(
feature = "translations-eng",
feature = "translations-dut",
feature = "translations-fre",
feature = "translations-ger",
feature = "translations-hun",
feature = "translations-rus",
feature = "translations-slv",
feature = "translations-spa",
feature = "translations-swe"
)))]
compile_error!("no target languages selected (select at least one \"translations-XXX\" feature)");
use jmdict_enums::*;
use std::convert::TryInto;
use std::io::Write;
fn main() {
println!("cargo:rerun-if-changed=build.rs");
let opts = jmdict_traverse::Options {
is_db_minimal: cfg!(feature = "db-minimal"),
with_uncommon: cfg!(feature = "scope-uncommon"),
with_archaic: cfg!(feature = "scope-archaic"),
};
let mut omni: OmniBuffer = Default::default();
if cfg!(not(feature = "db-empty")) {
jmdict_traverse::process_dictionary(&mut omni, opts);
}
write_u32s(&path_to("entry_offsets.dat"), &omni.entry_offsets);
write_u32s(&path_to("payload.dat"), &omni.data);
std::fs::write(&path_to("strings.txt"), &omni.text).unwrap();
}
///Returns the location of `filename` inside the directory named by the `OUT_DIR` environment
///variable. Panics if `OUT_DIR` is not set.
fn path_to(filename: &str) -> std::path::PathBuf {
    let mut target = std::path::PathBuf::from(std::env::var_os("OUT_DIR").unwrap());
    target.push(filename);
    target
}
///Writes `vals` to the file at `path` (creating or truncating it) as a plain sequence of
///native-endian 32-bit words, without any header or padding.
///
///Panics if the file cannot be created or if any write fails.
fn write_u32s(path: &std::path::Path, vals: &[u32]) {
    let f = std::fs::File::create(path).unwrap();
    let mut f = std::io::BufWriter::new(f);
    for val in vals {
        f.write_all(&val.to_ne_bytes()).unwrap();
    }
    //Dropping a BufWriter flushes it, but discards any error from that final flush. Flush
    //explicitly so that a truncated data file fails the build instead of going unnoticed.
    f.flush().unwrap();
}
///Helper type for references into OmniBuffer::data or OmniBuffer::text.
///Gets constructed as `(start, end).into()` in the respective OmniBuffer methods.
struct StoredRef {
///Position in the referenced buffer where the stored range begins.
start: u32,
///Position in the referenced buffer just behind the stored range (exclusive end).
end: u32,
}
///Builds a StoredRef from a `(start, end)` pair of buffer positions.
///Panics if either position does not fit into a u32.
impl From<(usize, usize)> for StoredRef {
    fn from((start, end): (usize, usize)) -> Self {
        Self {
            start: start.try_into().unwrap(),
            end: end.try_into().unwrap(),
        }
    }
}
///Buffer where all payload gets accumulated before being written into the generated data files.
///Check the explanations in CONTRIBUTING.md for how this works, and why it was built this way.
#[derive(Default)]
struct OmniBuffer {
///For each processed entry, the start position of its encoded record within `data`.
///Written to "entry_offsets.dat" by main().
entry_offsets: Vec<u32>,
///All encoded records and arrays, as a flat sequence of u32. Written to "payload.dat" by main().
data: Vec<u32>,
///All strings, concatenated without separators. Written to "strings.txt" by main().
text: String,
}
impl OmniBuffer {
    ///Appends `text` to the string buffer and returns the byte range that it occupies there.
    ///Empty input does not touch the buffer and is reported as the range `(0, 0)`.
    pub fn push_str(&mut self, text: &str) -> StoredRef {
        if text.is_empty() {
            //nothing to store, so skip the bookkeeping
            return (0, 0).into();
        }
        let begin = self.text.len();
        self.text += text;
        (begin, self.text.len()).into()
    }

    ///Appends the given words to the data buffer and returns the range that they occupy there.
    ///Empty input does not touch the buffer and is reported as the range `(0, 0)`.
    pub fn push_data(&mut self, data: &[u32]) -> StoredRef {
        match data.len() {
            0 => (0, 0).into(),
            count => {
                let begin = self.data.len();
                self.data.extend_from_slice(data);
                (begin, begin + count).into()
            }
        }
    }

    ///Encodes every item of `data` into a slot of `T::size()` words, then appends all slots to
    ///the data buffer as one contiguous run and returns the range of that run.
    ///Empty input does not touch the buffer and is reported as the range `(0, 0)`.
    pub fn push_array<T: ToPayload>(&mut self, data: &[T]) -> StoredRef {
        if data.is_empty() {
            return (0, 0).into();
        }

        //Items are encoded into a scratch buffer first because encoding may itself append to
        //`self.data` (for nested arrays), and the slots of this array must stay contiguous.
        let width = T::size();
        let mut encoded = vec![0u32; data.len() * width];
        let mut cursor = 0;
        for item in data {
            let next = cursor + width;
            item.encode_one(self, &mut encoded[cursor..next]);
            cursor = next;
        }
        self.push_data(&encoded)
    }
}
impl jmdict_traverse::Visitor for OmniBuffer {
fn notify_data_file_path(&mut self, path: &str) {
println!("cargo:rerun-if-changed={}", &path);
}
fn process_entry(&mut self, entry: &jmdict_traverse::RawEntry) {
let size = jmdict_traverse::RawEntry::size();
let mut repr = vec![0u32; size];
entry.encode_one(self, &mut repr);
let r = self.push_data(&repr);
self.entry_offsets.push(r.start);
}
}
//Like omni.push_array(), but does not push the resulting array just yet.
fn push_array<T: ToPayload>(buf: &mut Vec<u32>, omni: &mut OmniBuffer, array: &[T]) -> u32 {
if !array.is_empty() {
let size = T::size();
let mut repr = vec![0u32; array.len() * size];
for (idx, elem) in array.iter().enumerate() {
elem.encode_one(omni, &mut repr[(idx * size)..((idx + 1) * size)]);
}
buf.extend(repr);
}
buf.len() as u32
}
///Helper trait for encoding types from the jmdict-traverse crate into a sequence of u32 for
///embedding in OmniBuffer::data.
///
///Every item of a type occupies the same number of u32 (see `size()`). Implementations get
///access to the OmniBuffer so that they can store nested strings or arrays there and write only
///the resulting references into their own slot.
trait ToPayload {
///How many u32 are needed to encode one item of this type.
fn size() -> usize;
///Encode one item of this type into the given preallocated buffer of length `Self::size()`.
fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]);
}
//Implements ToPayload for a type that provides `to_u32()`: one item takes up exactly one u32,
//which holds the value returned by `to_u32()`.
//
//NOTE: A blanket `impl ToPayload for T where T: EnumPayload` would be nicer than a macro, but it
//conflicts with all other `impl ToPayload` under the current specialization rules.
macro_rules! enum_to_payload {
    ($enum_type:ident) => {
        impl ToPayload for $enum_type {
            fn size() -> usize {
                1
            }

            fn encode_one(&self, _omni: &mut OmniBuffer, buf: &mut [u32]) {
                let encoded = self.to_u32();
                buf[0] = encoded;
            }
        }
    };
}
//Types that get encoded as a single u32 holding their `to_u32()` value.
enum_to_payload!(KanjiInfo);
enum_to_payload!(ReadingInfo);
enum_to_payload!(PartOfSpeech);
enum_to_payload!(SenseTopic);
enum_to_payload!(SenseInfo);
enum_to_payload!(Dialect);
///Encoding of one dictionary entry as 4 words:
///
///- `buf[0]`, `buf[1]`: range in `omni.data` holding the concatenated encodings of the `k_ele`,
///  `r_ele` and `sense` arrays (in this order)
///- `buf[2]`: split points within that range; the end of the `k_ele` part in the lower 16 bits,
///  the end of the `r_ele` part in the upper 16 bits
///- `buf[3]`: `ent_seq`
impl ToPayload for jmdict_traverse::RawEntry<'_> {
    fn size() -> usize {
        4
    }

    fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]) {
        //Instead of using `omni.push_array()` on each member and encoding each StoredRef
        //separately, we concatenate the payload representations of all member arrays and
        //`push_data()` them all at once. We then encode that StoredRef, plus offsets to split the
        //encoded array back into its constituents. Since each encoded array is rather short, each
        //offset fits into 16 bits, so both offsets share a single u32.
        //
        //Compared to the naive layout as 3 StoredRef + 1 u32 (28 bytes), we save 12 bytes per
        //entry.
        let mut dbuf = Vec::new();
        let offset1 = push_array(&mut dbuf, omni, &self.k_ele);
        let offset2 = push_array(&mut dbuf, omni, &self.r_ele);
        push_array(&mut dbuf, omni, &self.sense);

        //Both offsets are running lengths of `dbuf`, so `offset1 <= offset2` and checking the
        //larger one is sufficient. Without this check, an oversized offset would silently
        //corrupt the packed word below.
        assert!(
            offset2 <= 0xFFFF,
            "entry {}: array offset {} does not fit into 16 bits",
            self.ent_seq,
            offset2
        );

        let r = omni.push_data(&dbuf);
        buf[0] = r.start;
        buf[1] = r.end;
        buf[2] = offset1 | (offset2 << 16);
        buf[3] = self.ent_seq;
    }
}
impl ToPayload for jmdict_traverse::RawKanjiElement<'_> {
fn size() -> usize {
5
}
fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]) {
buf[0] = self.ke_pri.to_u32();
let r = omni.push_str(self.keb);
buf[1] = r.start;
buf[2] = r.end;
let r = omni.push_array(&self.ke_inf);
buf[3] = r.start;
buf[4] = r.end;
}
}
impl ToPayload for jmdict_traverse::RawReadingElement<'_> {
fn size() -> usize {
5
}
fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]) {
buf[0] = self.re_pri.to_u32();
let r = omni.push_str(self.reb);
buf[1] = r.start;
buf[2] = r.end;
let r = omni.push_array(&self.re_inf);
buf[3] = r.start;
buf[4] = r.end;
}
}
///Encoding of one sense as 5 words:
///
///- `buf[0]`, `buf[1]`: range in `omni.data` holding the concatenated encodings of all member
///  arrays, in the order `stagk`, `stagr`, `pos`, `xref`, `ant`, `field`, `misc`, `s_inf`,
///  `lsource`, `dial`, `gloss`
///- `buf[2]` to `buf[4]`: the end offsets of the first ten member arrays within that range, one
///  byte each, lowest byte first (the upper two bytes of `buf[4]` stay zero)
impl ToPayload for jmdict_traverse::RawSense<'_> {
    fn size() -> usize {
        5
    }

    fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]) {
        //Instead of using `omni.push_array()` on each member and encoding each StoredRef
        //separately, we concatenate the payload representations of all member arrays and
        //`push_data()` them all at once. We then encode that StoredRef, plus offsets to split the
        //encoded array back into its constituents. Since each encoded array is rather short, the
        //offsets fit into a single byte, so we can encode four at a time in a single u32.
        //
        //Compared to the naive layout as 11 StoredRef (88 bytes), we save 68 bytes per Sense.
        let mut dbuf = Vec::new();
        let offset1 = push_array(&mut dbuf, omni, &self.stagk);
        let offset2 = push_array(&mut dbuf, omni, &self.stagr);
        let offset3 = push_array(&mut dbuf, omni, &self.pos);
        let offset4 = push_array(&mut dbuf, omni, &self.xref);
        let offset5 = push_array(&mut dbuf, omni, &self.ant);
        let offset6 = push_array(&mut dbuf, omni, &self.field);
        let offset7 = push_array(&mut dbuf, omni, &self.misc);
        let offset8 = push_array(&mut dbuf, omni, &self.s_inf);
        let offset9 = push_array(&mut dbuf, omni, &self.lsource);
        let offset10 = push_array(&mut dbuf, omni, &self.dial);
        push_array(&mut dbuf, omni, &self.gloss);

        //All offsets are running lengths of `dbuf` and therefore never decrease, so checking
        //the last one covers all of them. Without this check, an oversized offset would
        //silently corrupt the neighboring bytes in the packed words below.
        assert!(
            offset10 <= 0xFF,
            "sense has array offset {} which does not fit into a single byte",
            offset10
        );

        let r = omni.push_data(&dbuf);
        buf[0] = r.start;
        buf[1] = r.end;
        buf[2] = offset1 | (offset2 << 8) | (offset3 << 16) | (offset4 << 24);
        buf[3] = offset5 | (offset6 << 8) | (offset7 << 16) | (offset8 << 24);
        buf[4] = offset9 | (offset10 << 8);
    }
}
///Encoding of one loanword source as 4 words: the range of `text` within `omni.text`, then the
///range of `lang` within `omni.text`. The two boolean attributes are stored as flag bits in the
///first word.
impl ToPayload for jmdict_traverse::RawLSource<'_> {
    fn size() -> usize {
        4
    }

    fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]) {
        //This relies on `omni.text` being significantly shorter than 2^28 bytes, which leaves
        //the highest bits of every text offset free to carry the two flags.
        const PARTIAL_FLAG: u32 = 0x10000000;
        const WASEI_FLAG: u32 = 0x20000000;

        let text = omni.push_str(self.text);
        let lang = omni.push_str(self.lang);

        let mut flags = 0;
        if self.is_partial {
            flags |= PARTIAL_FLAG;
        }
        if self.is_wasei {
            flags |= WASEI_FLAG;
        }

        buf[0] = text.start | flags;
        buf[1] = text.end;
        buf[2] = lang.start;
        buf[3] = lang.end;
    }
}
impl ToPayload for jmdict_traverse::RawGloss<'_> {
fn size() -> usize {
2
}
fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]) {
//`omni.text` is never larger than 30-40 MiB. That's slightly more than 2^24 bytes, but
//comfortably below 2^28 bytes. We can therefore use the upper 4 bits of `buf[0]` and
//`buf[1]`, respectively, to encode `self.lang` and `self.g_type`.
let r = omni.push_str(self.text);
buf[0] = r.start | (self.lang.to_u32() << 28);
buf[1] = r.end | (self.g_type.to_u32() << 28);
}
}
impl<'a> ToPayload for &'a str {
fn size() -> usize {
2
}
fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]) {
let r = omni.push_str(self);
buf[0] = r.start;
buf[1] = r.end;
}
}
///Encoding of a plain number: a single word holding the value itself.
impl ToPayload for u32 {
    fn size() -> usize {
        1
    }

    fn encode_one(&self, _omni: &mut OmniBuffer, buf: &mut [u32]) {
        let value = *self;
        buf[0] = value;
    }
}