/
mk_he_affix.c
243 lines (219 loc) · 8.26 KB
/
mk_he_affix.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
/* Copyright 2004-2012 Nadav Har'El and Dan Kenigsberg */
/* This little program creates hunspell or aspell dictionaries for Hebrew
* according to the hebrew.wgz* files.
* We create a single rule for each of hspell's "word specifiers". Each rule
* expands to all the prefixes that provide that specifier (the null prefix
* is implied, and NEEDAFFIX is specified for each word where the null prefix
* is not appropriate).
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "prefixes.c"
#include "hspell.h"
#define PREFIXFILE_COMMAND "gzip -dc hebrew.wgz.prefixes | ./specfilter"
/* Convert a number in the range 0..51 (currently) to a readable character
   that can be used as the rule (prefix set) name. To facilitate merging our
   word list with an English one (for spell-checking mixed text in software
   that does not support multiple word lists), we do not use the upper-case
   latin characters. Currently we use the lower-case letters, in addition to
   Hebrew characters (aspell and myspell have no problems with non-ascii
   characters) - but almost any symbols can be used too.
   A note for future expansion: Aspell has problems with a backslash, while
   Myspell works with them - so we will have to skip the backslash character
   if we use symbols. But with the digits and other symbols, there's plenty
   of room for future expansion.

   Mapping: 0..25 -> 'a'..'z', 26..51 -> bytes 0xE0..0xF9 (the Hebrew letters
   in ISO-8859-8, starting at alef). Any other value is an internal error:
   a message is printed to stderr and the program exits with status 1.
*/
static inline char num_to_char(int i)
{
  if (i < 0) {
    fprintf(stderr,"internal error: num_to_char(%d)\n",i);
    exit(1);
  }
  if (i < 26)
    return (char)('a' + i);
  if (i < 52) {
    /* Spell the ISO-8859-8 alef as an explicit byte value. A literal Hebrew
     * character here only works while this source file itself is stored in
     * ISO-8859-8; in any multi-byte encoding it becomes a multi-character
     * constant and the wrong byte is returned. */
    return (char)('\xE0' + (i - 26));
  }
  fprintf(stderr,"internal error: num_to_char(%d) ran out of symbols\n",i);
  exit(1);
}
/* Return fp unchanged, or report the failure with perror(what) and exit(1)
 * when fp is NULL (failed fopen/popen). */
static FILE *require_stream(FILE *fp, const char *what)
{
  if (fp == NULL) {
    perror(what);
    exit(1);
  }
  return fp;
}

/* Write "<prefix><filename>" into buf; exit(1) if it does not fit, so that a
 * truncated shell command is never executed. */
static void build_command(char *buf, size_t size,
                          const char *prefix, const char *filename)
{
  int n = snprintf(buf, size, "%s%s", prefix, filename);
  if (n < 0 || (size_t)n >= size) {
    fprintf(stderr, "file name too long: %s\n", filename);
    exit(1);
  }
}

/* Close an output stream opened with popen (is_pipe != 0) or fopen, and
 * exit(1) if the close reports a failure (for a pipe this includes a
 * non-zero exit status of the child command). */
static void close_output(FILE *fp, int is_pipe, const char *name)
{
  int status = is_pipe ? pclose(fp) : fclose(fp);
  if (status != 0) {
    fprintf(stderr, "error while finishing output file %s\n", name);
    exit(1);
  }
}

/* Usage: mk_he_affix <hunspell> <affixfile> <dictfile>
 * Where <hunspell> is: 0 for aspell, 1 for hunspell. Hunspell and aspell have
 * some different affix file features, and also different encoding requirements
 * (aspell requires ISO-8859-8, while hunspell is, for an unknown reason,
 * 10 times faster if we give it UTF-8).
 *
 * The program makes two passes over the specifier stream produced by
 * PREFIXFILE_COMMAND: the first writes one PFX rule set per distinct
 * specifier into <affixfile>; the second pairs each specifier with the
 * corresponding line of the word list and writes "word/flags" lines into
 * <dictfile>, followed by the stand-alone prefixes.
 *
 * Returns 0 on success; exits with status 1 on usage errors, on failure to
 * open or close any stream, or when the input streams are inconsistent.
 *
 * NOTE: the file-name arguments are pasted into shell commands in hunspell
 * mode, so they must not contain shell metacharacters.
 */
int main(int argc, char *argv[])
{
  int i, specifier;
  /* Specifiers are kept as int, exactly as returned by fgetc(), so that byte
   * values >= 0x80 compare equal regardless of the signedness of char.
   * num_to_char() aborts long before this table can fill up. */
  int seen_specifiers[100];
  char rulechar;
  int already_seen=0, seen, count;
  char needaffix=0;
  FILE *prefixfp, *wordsfp;
  FILE *afffp, *dicfp;
  int prefixes_size;
  char *prefix_is_word;
  int hunspell;

  if(argc!=4){
    fprintf(stderr,"%d\n",argc);
    fprintf(stderr,"Usage: %s <hunspell> <affixfile> <dictfile>\n", argv[0]);
    exit(1);
  }
  hunspell=atoi(argv[1]);

  /* Number of entries in prefixes_noH up to (not including) the terminating
   * null entry. Computed up front so it is valid even when the specifier
   * stream turns out to be empty. */
  for (prefixes_size=1; prefixes_noH[prefixes_size]!=0; prefixes_size++)
    ;

  if(hunspell){
    char s[256];
    /* Unfortunately, the dictionary file should start with an approximate
     * count of the number of words. Note that this count is only approximate
     * as we also add a list of stand-alone prefixes at the end.
     */
    build_command(s, sizeof(s),
                  "gzip -dc hebrew.wgz | ./wunzip | wc -l > ", argv[3]);
    if (system(s) != 0) {
      fprintf(stderr, "failed to write the word count to %s\n", argv[3]);
      exit(1);
    }
    /* Both output files are written in ISO-8859-8 and converted to UTF-8 on
     * the fly; the dictionary is appended after the word count. */
    build_command(s, sizeof(s), "iconv -f iso-8859-8 -t utf-8 >", argv[2]);
    afffp = require_stream(popen(s, "w"), argv[2]);
    build_command(s, sizeof(s), "iconv -f iso-8859-8 -t utf-8 >>", argv[3]);
    dicfp = require_stream(popen(s, "w"), argv[3]);
  } else {
    afffp = require_stream(fopen(argv[2], "w"), argv[2]);
    dicfp = require_stream(fopen(argv[3], "w"), argv[3]);
  }

  fprintf(afffp, "# This file was generated automatically from data prepared\n"
                 "# by the Hspell project (http://hspell.ivrix.org.il/).\n"
                 "# Hspell version %d.%d%s was used.\n"
                 "# The conversion was carried out in %s\n",
          HSPELL_VERSION_MAJOR,HSPELL_VERSION_MINOR,HSPELL_VERSION_EXTRA,
          __DATE__);
  fprintf(afffp, "# Copyright 2004-2012, Nadav Har'El and Dan Kenigsberg\n");
  fprintf(afffp, "# The dictionary (this file and the corresponding word list)\n"
                 "# is licensed under the GNU Affero General Public License\n"
                 "# (AGPL) version 3.\n");
  if(hunspell){
    fprintf(afffp,
            "SET UTF-8\n"
            "TRY éåäàòçë÷'\"ùñæãâáøðîèöúôíóêõïì\n"
            "WORDCHARS àáâãäåæçèéëìîðñòôö÷øùúíïêóõ'\"\n"
            "BREAK 3\n"
            "BREAK ^\"\n"
            "BREAK \"$\n"
            "BREAK ^'\n"
            "MAP 10\n"
            "MAP êëç\n"
            "MAP íî\n"
            "MAP ïð\n"
            "MAP óô\n"
            "MAP õö\n"
            "MAP ë÷\n"
            "MAP àò # for English\n"
            "MAP âä # for Russian\n"
            "MAP öñ # for Arabic\n"
            "MAP çëø # for French\n"
            );
  }

  /* Pass 1: one PFX rule set per distinct specifier. */
  prefixfp = require_stream(popen(PREFIXFILE_COMMAND, "r"), PREFIXFILE_COMMAND);
  while ((specifier=fgetc(prefixfp))!= EOF) {
    for(i=0, seen=0; (i<already_seen) && !seen; i++) {
      if (seen_specifiers[i] == specifier) seen = 1;
    }
    if (seen) continue;
    seen_specifiers[already_seen++] = specifier;

    /* count the number of rule lines: two for the waw prefix, four for
     * every other matching prefix (see the rules printed below) */
    for (i=1, count=0; prefixes_noH[i]!=0; i++) {
      if (masks_noH[i] & specifier) {
        if (!strcmp("å",prefixes_noH[i])) count += 2;
        else count += 4;
      }
    }
    rulechar = num_to_char(already_seen-1);
    fprintf(afffp, "PFX %c N %d\n",rulechar,count);

    /* print one rule for each legal prefix that goes with this word type,
     * and remember to double initial waw if a prefix is prepended.
     *
     * The empty prefix, prefixes_noH[0], needs special treatment. While
     * other allowed prefixes need to be explicitly added to the rules (as we
     * do below), the empty prefix is by default allowed, and if it is not
     * desired we need to explicitly disallow it with a special flag on
     * every word for which we don't want to allow the empty prefix, with a
     * special NEEDAFFIX flag.
     * Unfortunately, NEEDAFFIX is only supported by hunspell; Aspell ignores
     * it, and therefore mistakenly accepts the maqor natuy without a prefix,
     * e.g., éùåï, as in ìéùåï but without the prefix.
     */
    if (!(masks_noH[0] & specifier)){
      /* Only one NEEDAFFIX flag is allowed per affix file, so it cannot be
       * declared per rule here. Instead a single NEEDAFFIX flag is declared
       * after this loop and attached to the individual words that need it.
       */
      needaffix=1;
    }
    for (i=1; prefixes_noH[i]!=0; i++) {
      if (masks_noH[i] & specifier) {
        if (!strcmp("å",prefixes_noH[i])) {
          fprintf(afffp, "PFX %c 0 %s .\n",rulechar,prefixes_noH[i]);
          fprintf(afffp, "PFX %c 0 %s\" .\n",rulechar,prefixes_noH[i]);
        } else {
          fprintf(afffp, "PFX %c 0 %s [^å]\n",rulechar,prefixes_noH[i]);
          fprintf(afffp, "PFX %c 0 %s åå\n",rulechar,prefixes_noH[i]);
          fprintf(afffp, "PFX %c 0 %s\" .\n",rulechar,prefixes_noH[i]);
          fprintf(afffp, "PFX %c 0 %så å[^å]\n",rulechar,prefixes_noH[i]);
        }
      }
    }
    fprintf(afffp, "\n");
  }
  if (hunspell && needaffix) {
    /* from here on needaffix holds the flag character itself: the first
     * rule name not used by any specifier */
    needaffix = num_to_char(already_seen);
    fprintf(afffp, "NEEDAFFIX %c\n",needaffix);
  }
  pclose(prefixfp);
  close_output(afffp, hunspell, argv[2]);

  prefix_is_word = calloc((size_t)prefixes_size, sizeof *prefix_is_word);
  if (prefix_is_word == NULL) {
    perror("calloc");
    exit(1);
  }

  /* Pass 2: translate hebrew.wgz+hebrew.wgz.prefixes into an aspell-style
   * word list. The two streams are read in lockstep: one specifier byte per
   * word line. */
  prefixfp = require_stream(popen(PREFIXFILE_COMMAND, "r"), PREFIXFILE_COMMAND);
  wordsfp = require_stream(popen("gzip -dc hebrew.wgz|./wunzip", "r"),
                           "gzip -dc hebrew.wgz|./wunzip");
  while ((specifier=fgetc(prefixfp))!= EOF) {
    char word[100];
    int j;

    /* find the specifier's place (which determines which rule applies to
     * its word) */
    for(i=0; (i<already_seen) && (seen_specifiers[i]!=specifier) ; i++);
    if (i == already_seen) {
      fprintf(stderr, "internal error: specifier %d has no rule\n", specifier);
      exit(1);
    }

    /* NOTE(review): the 3 spare bytes are kept from the original code; their
     * purpose is not evident here. */
    if (fgets(word, sizeof(word)-3, wordsfp) == NULL) {
      fprintf(stderr, "word list ended before the specifier list\n");
      exit(1);
    }
    /* Strip the trailing newline (if any) BEFORE comparing against the
     * prefix table, otherwise the comparison below can never succeed. */
    word[strcspn(word, "\n")] = '\0';

    /* write down whether this word is also a legal prefix (and therefore
     * should not be written again later) */
    for (j=1; prefixes_noH[j]!=0; j++) {
      if (!strcmp(word,prefixes_noH[j])) {
        if (masks_noH[0] & specifier) /* this word is allowed on its own */
          prefix_is_word[j] = 1;
        break;
      }
    }

    fprintf(dicfp,"%s",word);
    putc('/', dicfp);
    putc(num_to_char(i), dicfp);
    if (hunspell && !(masks_noH[0] & specifier))
      /* because we can't specify NEEDAFFIX for several prefixes, unfortunately
       * we need to use one ("needaffix") and put it on individual words */
      putc(needaffix, dicfp);
    putc('\n', dicfp);
  }
  pclose(prefixfp);
  pclose(wordsfp);

  /* accept "dangling" prefixes, that many times precede numbers and latin,
   * but make sure not to repeat words that already appear in the dictionary.
   * This may cause unwanted warning. */
  for (i=1; prefixes_noH[i]!=0; i++) {
    if (!prefix_is_word[i])
      fprintf(dicfp, "%s\n", prefixes_noH[i]);
  }
  free(prefix_is_word);
  close_output(dicfp, hunspell, argv[3]);
  return 0;
}