/
filter_bin.py
38 lines (27 loc) · 1.3 KB
/
filter_bin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# encoding: UTF-8
import sys
"""
filter_bin.py
This simple script runs through the SHsnid.csv BÍN dump provided by http://bin.arnastofnun.is/gogn/
and outputs a list of all general words between 3 and 6 characters long.
Usage should be something along the lines of:
$ python filter_bin.py > dictionary.txt
This produces a file usable by the levelbuilder (build_levels.py).
"""
# Upper-case Icelandic letters mapped to their lower-case counterparts.
is_char = {
    u'Á': u'á',
    u'É': u'é',
    u'Í': u'í',
    u'Ó': u'ó',
    u'Ú': u'ú',
    u'Ý': u'ý',
    u'Þ': u'þ',
    u'Æ': u'æ',
    u'Ð': u'ð',
    u'Ö': u'ö',
}


def lowercase(s):
    """Return *s* lower-cased one character at a time.

    A character found in ``is_char`` is replaced by its entry in that
    table; every other character is replaced by the result of its own
    ``lower()`` method.  The converted characters are joined back into a
    single string.
    """
    converted = []
    for ch in s:
        converted.append(is_char.get(ch, ch.lower()))
    return ''.join(converted)
# Categories (fourth CSV column) whose rows are left out of the word list.
_EXCLUDED_CATEGORIES = frozenset([
    u'ism', u'örn', u'göt', u'fyr', u'föð', u'móð', u'bibl', u'lönd',
])


def filter_bin(filename):
    """Print each general, all-lowercase word of 3-6 characters in *filename*.

    *filename* is read as UTF-8 with six ``;``-separated columns per row:
    lemma, id, group, category, word and tag.  A word is written to
    standard output (UTF-8 encoded, one per line) the first time it is
    seen, provided that it

    * is between 3 and 6 characters long,
    * does not belong to one of the excluded categories,
    * contains no hyphen, and
    * is already entirely lower case.

    Blank lines are skipped.  Any other row that does not split into
    exactly six columns raises ``ValueError``; bytes that are not valid
    UTF-8 raise ``UnicodeDecodeError``.

    Runs unchanged on Python 2 and Python 3.
    """
    seen = set()

    # Python 3 exposes the underlying byte stream as ``sys.stdout.buffer``;
    # on Python 2 ``sys.stdout`` accepts encoded bytes directly.
    out = getattr(sys.stdout, 'buffer', sys.stdout)

    # Read bytes and decode explicitly so lengths are counted in characters
    # (not bytes) regardless of interpreter version or locale.
    with open(filename, 'rb') as source:
        for raw_line in source:
            line = raw_line.decode('UTF-8').strip()
            if not line:
                continue

            _lemma, _entry_id, _group, category, word, _tag = line.split(u';')

            if not 3 <= len(word) <= 6:
                continue
            if category in _EXCLUDED_CATEGORIES:
                continue
            if u'-' in word:
                continue
            # The word is decoded text, so lower() handles Icelandic
            # letters; anything containing an upper-case letter is dropped.
            if word != word.lower():
                continue
            if word in seen:
                continue

            seen.add(word)
            out.write(word.encode('UTF-8') + b'\n')
if __name__ == '__main__':
    # The first command-line argument, when present, names the input file;
    # without one the script falls back to the BÍN dump's file name.
    cli_args = sys.argv[1:]
    if cli_args:
        source_path = cli_args[0]
    else:
        source_path = 'SHsnid.csv'
    filter_bin(source_path)