creating_list_of_tokens_from_texts.py
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
"""
- author: Floriane Chiffoleau
- date: March 2023
- description: Creating a list of tokens
- input: TXT files
- output: TXT files
- usage :
======
python name_of_this_script.py arg1 arg2
arg1: folder with the texts from the test set
arg2: folder with the token lists from the test set
"""
import os
import re
import sys

import spacy

# Load the French spaCy model used to tokenize the texts
nlp = spacy.load('fr_core_news_lg')

for root, dirs, files in os.walk(sys.argv[1]):
    for filename in sorted(files):
        with open(os.path.join(sys.argv[1], filename), 'r') as file:
            print(f'Reading {os.path.join(sys.argv[1], filename)}')
            text = file.read()
            doc = nlp(text)
            token_list = []
            # Divide the text into tokens, keeping the punctuation, numbers, etc.
            for token in doc:
                token_list.append('"' + token.text + '"; ')
            tokens = "".join(token_list)
            tokens = re.sub('\n', " ", tokens)
            with open(os.path.join(sys.argv[2], "token_" + filename), 'w') as file_out:
                print(f'Writing to {os.path.join(sys.argv[2], "token_" + filename)}')
                file_out.write(filename.replace('.txt', '') + " = [" + tokens + "]")
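# A hedged illustration (not part of the original script), using hypothetical
# folders 'test_texts/' and 'test_tokens/':
#     python creating_list_of_tokens_from_texts.py test_texts/ test_tokens/
# For a file test_texts/letter1.txt containing "Bonjour, monde.", the script
# would write test_tokens/token_letter1.txt with content similar to:
#     letter1 = ["Bonjour"; ","; "monde"; "."; ]
# The exact token boundaries depend on the fr_core_news_lg model.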