ner.py
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
from sklearn.model_selection import train_test_split
import codecs
import nltk
import json

# Read the data file and parse the XML. The "html5lib" parser (requires the
# html5lib package) lower-cases tag names, which is why the tags are queried
# in lowercase below.
with codecs.open("reuters.xml", "r", "utf-8") as infile:
    soup = bs(infile, "html5lib")
docs = []
for elem in soup.find_all("document"):
    texts = []

    # Loop through each child of the element under "textwithnamedentities"
    for c in elem.find("textwithnamedentities").children:
        if isinstance(c, Tag):
            if c.name == "namedentityintext":
                label = "N"  # part of a named entity
            else:
                label = "I"  # irrelevant word

            for w in c.text.split(" "):
                if len(w) > 0:
                    texts.append((w, label))

    docs.append(texts)
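
# Each entry of `docs` is now one document as a list of (token, label) pairs,
# e.g. [("London", "N"), ("is", "I"), ("big", "I")] (illustrative values, not
# taken from reuters.xml).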
data = []
for doc in docs:
    # Obtain the list of tokens in the document
    tokens = [t for t, label in doc]

    # Perform POS tagging (requires the NLTK "averaged_perceptron_tagger"
    # resource, available via nltk.download)
    tagged = nltk.pos_tag(tokens)

    # Take the word, POS tag, and its label
    data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])
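
# `data` now holds, per document, a list of (word, POS tag, label) triples,
# e.g. ("London", "NNP", "N") (illustrative values).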
'''
# Optional: dump the labelled, tagged data for inspection
with open("labelledtaggeddata.txt", "w") as out:
    for s in data:
        out.write(str(s) + "\n")
'''
def word2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i - 1][0]
        postag1 = doc[i - 1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc) - 1:
        word1 = doc[i + 1][0]
        postag1 = doc[i + 1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features
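
# For illustration only (made-up tokens), word2features(doc, 0) returns a list
# of feature strings such as:
#   ['bias', 'word.lower=london', 'word[-3:]=don', 'word[-2:]=on',
#    'word.isupper=False', 'word.istitle=True', 'word.isdigit=False',
#    'postag=NNP', 'BOS', '+1:word.lower=is', ...]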
# A function for extracting features from a document
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

# A function for generating the list of labels for each document
def get_labels(doc):
    return [label for (token, postag, label) in doc]
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Save the train/test splits as JSON for later use
with open('x_test1.txt', 'w') as thefile:
    json.dump(X_test, thefile)
with open('x_train1.txt', 'w') as thefile1:
    json.dump(X_train, thefile1)
with open('y_train1.txt', 'w') as thefile2:
    json.dump(y_train, thefile2)
with open('y_test1.txt', 'w') as thefile3:
    json.dump(y_test, thefile3)
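
# --- Not part of the original script: a minimal CRF training sketch. ---
# The feature strings above follow the python-crfsuite convention, so the
# splits can be fed straight into a CRF trainer. This assumes the
# python-crfsuite package is installed; hyperparameter values and the model
# file name 'crf.model' are arbitrary choices for illustration.
import pycrfsuite

trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)
trainer.set_params({
    'c1': 0.1,                            # L1 regularisation strength
    'c2': 0.01,                           # L2 regularisation strength
    'max_iterations': 200,                # cap on L-BFGS iterations
    'feature.possible_transitions': True  # also score unseen label transitions
})
trainer.train('crf.model')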