-
Notifications
You must be signed in to change notification settings - Fork 0
/
erkpoly.py
executable file
·98 lines (54 loc) · 1.58 KB
/
erkpoly.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/home/narkem/Anaconda3/bin/python
import os
import re
import tempfile
from subprocess import call
# One line of ``polyglot ner`` output: the token, optional padding, and the
# tag as the final field.  Anchoring the tag at the end of the line keeps
# characters inside the token (e.g. the "O" in "PRO-life") from being
# mistaken for a tag.
# NOTE(review): assumes polyglot only emits O / I-PER / I-ORG / I-LOC and
# that the padding may be absent for very long tokens -- confirm against the
# installed polyglot version.
_POLYGLOT_LINE = re.compile(r"^(.*?)\s*(I-PER|I-ORG|I-LOC|O)$")


def _parse_polyglot_line(line):
    """Split one polyglot output line into ``(token, tag)``.

    Returns ``None`` for a blank line and raises ``ValueError`` for a
    non-blank line that does not end in a known tag.
    """
    stripped = line.strip()
    if not stripped:
        return None
    match = _POLYGLOT_LINE.match(stripped)
    if match is None or not match.group(1):
        raise ValueError("unexpected polyglot output line: {!r}".format(line))
    return match.group(1), match.group(2)


def _bracket_tags(entries):
    """Turn per-token ``I-XXX`` tags into bracketed span tags.

    ``entries`` is a list of ``(token, tag)`` pairs, with ``None`` standing
    for a blank line.  A ``None`` entry (or the start/end of the list) ends
    any running span.  Returns the ``(token, new_tag)`` pairs for the real
    tokens, in order.
    """
    tags = [entry[1] if entry is not None else None for entry in entries]
    bracketed = []
    for index, entry in enumerate(entries):
        if entry is None:
            continue
        token, tag = entry
        if tag != "O":
            kind = tag[2:]  # drop the "I-" prefix
            before = tags[index - 1] if index > 0 else None
            after = tags[index + 1] if index + 1 < len(tags) else None
            opener = "" if before == tag else "("
            closer = "" if after == tag else ")"
            tag = opener + kind + closer
        bracketed.append((token, tag))
    return bracketed


def ner(text, filename, folder):
    """Tag ``text`` with polyglot's Dutch NER model and save bracketed tags.

    Runs ``polyglot --lang nl ner`` on ``text``, converts each run of equal
    ``I-XXX`` tags into a bracketed span, and writes one ``token<TAB>tag``
    line per token to ``folder + "poly-tags_for_" + filename + ".txt"``.

    Span notation: ``(PER`` opens a span, ``PER`` continues it, ``PER)``
    closes it, ``(PER)`` is a one-token span, and ``O`` marks a token outside
    any entity (likewise for ``ORG`` and ``LOC``).

    Args:
        text: The text to tag.
        filename: Name embedded in the output file name.
        folder: Prefix for the output path.  It is concatenated as-is, so it
            must end with a path separator to name a directory.

    Returns:
        The list of bracketed tags, one per token, in output order.

    Raises:
        RuntimeError: If the ``polyglot`` command exits with a non-zero status.
        ValueError: If polyglot prints a non-blank line that cannot be parsed.
    """
    # A private scratch directory keeps concurrent calls from clobbering each
    # other's files and guarantees cleanup even when something fails.
    with tempfile.TemporaryDirectory() as workdir:
        text_path = os.path.join(workdir, "text.txt")
        tagged_path = os.path.join(workdir, "polyglot.txt")

        with open(text_path, "w") as text_file:
            text_file.write(text)

        # Equivalent to: polyglot --lang nl ner --input text.txt > polyglot.txt
        with open(tagged_path, "w") as tagged_file:
            status = call(
                ["polyglot", "--lang", "nl", "ner", "--input", text_path],
                stdout=tagged_file,
            )
        if status != 0:
            raise RuntimeError(
                "polyglot exited with status {}".format(status)
            )

        with open(tagged_path) as tagged_file:
            entries = [_parse_polyglot_line(line) for line in tagged_file]

    bracketed = _bracket_tags(entries)

    with open(folder + "poly-tags_for_" + filename + ".txt", "w") as out:
        out.writelines(
            token + "\t" + tag + "\n" for token, tag in bracketed
        )

    return [tag for _, tag in bracketed]