This repository has been archived by the owner on Feb 5, 2024. It is now read-only.
/
model_factory.py
99 lines (85 loc) · 3.2 KB
/
model_factory.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import spacy
from spacy.lang.fr import French
from spacy.tokenizer import Tokenizer
colors = {
"PERS": "#ff9933", # orange
"PHONE_NUMBER": "#ff9933",
"LICENCE_PLATE": "#ff9933",
"MAIL": "#ff9933",
"NUMEROS": "#ff9933",
"ADDRESS": "#ff99cc", # pink
"HOPITAL": "#ff99cc",
"RESIDENCE": "#ff99cc",
"ORGANIZATION": "#00ccff", # blue
"ETABLISSEMENT": "#00ccff",
"MEDIA": "#00ccff",
"FONDS": "#00ccff",
"ETAT": "#00ccff",
"GROUPE": "#00ccff",
"LAWYER": "#ccffcc", # light green
"JUDGE_CLERK": "#ccccff", # purple
"PERSONNE_DE_JUSTICE": "#ccccff",
"COURT": "#ccffff", # light blue
"RG": "#99ff99", # green
"DATE": "#ffcc99", # salmon
"SITE": "#ffcc99",
"BAR": "#ffe699", # light yellow
"UNKNOWN": "#ff0000", # red
}
def prevent_sentence_boundary_detection(doc):
"""
Disable the sentence splitting done by Spacy
More info: https://github.com/explosion/spaCy/issues/1032
:param doc: a Spacy doc
:return: a disable sentence splitting Spacy doc
"""
for token in doc:
# This will entirely disable spaCy's sentence detection
token.is_sent_start = False
return doc
def get_tokenizer(model: French) -> Tokenizer:
split_char = r"[ ,\\.()-/\\|:;'\"+=!?_+#“’'‘]"
extended_infix = [r'[:\\(\\)-\./#"“’\'—'] + model.Defaults.infixes
infix_re = spacy.util.compile_infix_regex(extended_infix)
prefix_re = spacy.util.compile_prefix_regex(tuple(list(model.Defaults.prefixes) + [split_char]))
suffix_re = spacy.util.compile_suffix_regex(tuple(list(model.Defaults.suffixes) + [split_char]))
tok = Tokenizer(
model.vocab,
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=None,
)
return tok
def get_empty_model(load_labels_for_training: bool) -> French:
"""
Generate an empty NER model
:rtype: object
"""
# Important to setup the right language because it impacts the tokenizer, sentences split, ...
nlp = spacy.blank(name="fr")
nlp.tokenizer = get_tokenizer(nlp)
nlp.add_pipe(prevent_sentence_boundary_detection, name="prevent-sbd", first=True)
ner = nlp.create_pipe("ner")
# add labels
if load_labels_for_training:
for token_type in list(colors.keys()):
ner.add_label(token_type)
nlp.add_pipe(ner, last=True)
return nlp