-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.py
117 lines (106 loc) · 4.03 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#-*- encoding: utf-8 -*-
import os
import types
from functools import reduce
_GLOBAL_CONFIG = dict(
user_agent = 'citationdetective',
log_dir = os.path.join(os.path.expanduser('~'), 'cd_logs'),
)
# A base configuration that all languages "inherit" from.
_BASE_LANG_CONFIG = dict(
articles_sampling_fraction = 2e-2,
statement_max_size = 5000,
context_max_size = 5000,
min_sentence_length = 6,
min_citation_need_score = 0.5
)
# Language-specific config, inheriting from the base config above.
_LANG_CODE_TO_CONFIG = dict(
en = dict(
# A friendly name for the language
lang_name = 'English',
# The direction of the language, either ltr or rtl
lang_dir = 'ltr',
# The database to use on Tools Labs
database = 'enwiki_p',
# The domain for Wikipedia in this language
wikipedia_domain = 'en.wikipedia.org',
# These sections which content do not need citations
sections_to_skip = [
'See also',
'References',
'External links',
'Further reading',
'Notes',
'Additional sources',
'Sources',
'Bibliography',
],
# Dictionary of word to vector
vocb_path = os.path.expanduser('~/citationdetective/citation-needed/embeddings/word_dict_en.pck'),
# Dictionary of section title to vector
section_path = os.path.expanduser('~/citationdetective/citation-needed/embeddings/section_dict_en.pck'),
# Tensorflow models to detect Citation Need for English
model_path = os.path.expanduser('~/citationdetective/citation-needed/models/fa_en_model_rnn_attention_section.h5'),
# Argument for padding word vectors to the same length
# so as to use as the input for the RNN model
word_vector_length = 187,
),
it = dict(
lang_name = 'Italiano',
lang_dir = 'ltr',
database = 'itwiki_p',
wikipedia_domain = 'it.wikipedia.org',
sections_to_skip = [
'Note',
'Bibliografia',
'Voci correlate',
'Altri progetti',
'Collegamenti esterni',
],
vocb_path = os.path.expanduser('~/citationdetective/citation-needed/embeddings/word_dict_it.pck'),
section_path = os.path.expanduser('~/citationdetective/citation-needed/embeddings/section_dict_it.pck'),
model_path = os.path.expanduser('~/citationdetective/citation-needed/models/fa_it_model_rnn_attention_section.h5'),
word_vector_length = 319,
),
fr = dict(
lang_name = 'Français',
lang_dir = 'ltr',
database = 'frwiki_p',
wikipedia_domain = 'fr.wikipedia.org',
sections_to_skip = [
'Notes et références',
'Références',
'Annexes',
'Voir aussi',
'Liens externes',
],
vocb_path = os.path.expanduser('~/citationdetective/citation-needed/embeddings/word_dict_fr.pck'),
section_path = os.path.expanduser('~/citationdetective/citation-needed/embeddings/section_dict_fr.pck'),
model_path = os.path.expanduser('~/citationdetective/citation-needed/models/fa_fr_model_rnn_attention_section.h5'),
word_vector_length = 296,
),
)
# A Config is just an attribute bag: SimpleNamespace gives dot-access
# (cfg.database, cfg.log_dir, ...) over the merged settings dict.
Config = types.SimpleNamespace
def _inherit(base, child):
ret = dict(base) # shallow copy
for k, v in child.items():
if k in ret:
if isinstance(v, list):
v = ret[k] + v
elif isinstance(v, dict):
v = dict(ret[k], **v)
ret[k] = v
return ret
# Map each supported language code to its human-friendly name,
# e.g. {'en': 'English', ...}.
LANG_CODES_TO_LANG_NAMES = {
    code: cfg['lang_name'] for code, cfg in _LANG_CODE_TO_CONFIG.items()
}
def get_localized_config(lang_code='en'):
    """Return a Config for `lang_code`, merging global, base and language
    settings (later layers win; lists are concatenated by `_inherit`).

    If `lang_code` is None, the code is read from the CD_LANG environment
    variable instead. Raises KeyError for an unset or unsupported code.
    """
    if lang_code is None:
        lang_code = os.getenv('CD_LANG')
    if lang_code not in _LANG_CODE_TO_CONFIG:
        # Fail with a clear message rather than a bare `KeyError: None`
        # when CD_LANG is unset or names an unsupported language.
        raise KeyError(
            'unsupported language code %r (supported: %s)'
            % (lang_code, ', '.join(sorted(_LANG_CODE_TO_CONFIG))))
    lang_config = _LANG_CODE_TO_CONFIG[lang_code]
    # Layering order: global defaults < base language config < per-language.
    cfg = Config(lang_code = lang_code, **reduce(
        _inherit, [_GLOBAL_CONFIG, _BASE_LANG_CONFIG, lang_config]))
    cfg.lang_codes_to_lang_names = LANG_CODES_TO_LANG_NAMES
    return cfg