-
Notifications
You must be signed in to change notification settings - Fork 0
/
producing_tokens_and_lemmas.py
67 lines (59 loc) · 1.93 KB
/
producing_tokens_and_lemmas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# -*- coding: utf-8 -*-
"""
- author: Floriane Chiffoleau
- date: September 2022
- description: Cleaning ground truth to obtain a list of tokens and lemmas
- input: TXT file
- output: Python file
- usage :
======
python producing_tokens_and_lemmas.py arg1 arg2
arg1: input TXT file containing all the ground truth combined
arg2: output Python file that will receive the lists of tokens and lemmas
"""
import re
import sys
# Import the NLP library and load the language model we want to work with
import spacy
# French pipeline, loaded once when the script starts; `nlp` is the callable
# that turns the cleaned text into a spaCy document further down.
nlp = spacy.load('fr_core_news_lg')
# Characters treated as punctuation marks; each one is mapped to one space.
_PUNCTUATION = "!:;()\",?'’.°"
# Translation table built once so the text is cleaned in a single pass.
_PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(_PUNCTUATION, " "))


def delete_punctuation(text):
    """Replace the punctuation marks of the text with spaces.

    Every occurrence of one of the characters listed in ``_PUNCTUATION`` is
    replaced by a single space (it is not removed), e.g. ``l'ami`` becomes
    ``l ami``. All other characters, including hyphens, are left untouched.

    :param text: Text to clean
    :type text: str
    :returns: Text in which each punctuation mark has been replaced by a space
    :rtype: str
    """
    return text.translate(_PUNCTUATION_TABLE)
# Both file paths are required: stop with a usage message instead of an
# IndexError traceback when one of them is missing.
if len(sys.argv) < 3:
    sys.exit("usage: python " + sys.argv[0] + " arg1 arg2\n"
             "  arg1: file with all the ground truth combined\n"
             "  arg2: file that will receive the lists of tokens and lemmas")

# Read the whole ground truth. The encoding is explicit because the text
# contains non-ASCII characters (’, °, £, €, accented letters) and the
# platform default encoding is not UTF-8 everywhere.
with open(sys.argv[1], 'r', encoding='utf-8') as file_in:
    print("reading from "+sys.argv[1])
    text = file_in.read()

# Remove elements that can't be taken into account in the frequency list
# Lines of the form "- 12 -" (presumably page numbers - confirm with the data)
text = re.sub(r"- [0-9]{1,} -\n", "", text)
# Hyphen at the end of a line: drop it together with the line break so the
# two parts on either side are glued back together
text = re.sub(r"-\n", "", text)
# Remaining line breaks become plain spaces
text = re.sub(r"\n", " ", text)
# Digits
text = re.sub(r"[0-9]", "", text)
# Runs of three or more capital X
text = re.sub(r"X{3,}", "", text)
# Slashes
text = re.sub("/", "", text)
text = delete_punctuation(text)
text = text.replace("££", "")
text = text.replace("€", "")

# Lowercase everything so that case variants are counted as the same word
text = text.lower()

doc = nlp(text)

# Every token (and every lemma) is followed by one space, so both strings end
# with a trailing space, exactly as when they are built token by token.
tokens = "".join(token.text + " " for token in doc)
lemmas = "".join(token.lemma_ + " " for token in doc)

# The output is a Python source file, which Python reads as UTF-8 by default,
# so it has to be written as UTF-8 whatever the platform default is.
# NOTE(review): a backslash or a double quote inside the tokens/lemmas would
# not be escaped in the generated string literals - confirm the ground truth
# cannot contain any.
with open(sys.argv[2], 'w', encoding='utf-8') as file_out:
    print("writing to "+sys.argv[2])
    file_out.write("lemme = \"" + lemmas + "\"\n\n")
    file_out.write("token = \"" + tokens + '"')