forked from pielstroem/Topics
-
Notifications
You must be signed in to change notification settings - Fork 13
/
mallet.py
526 lines (465 loc) · 22.3 KB
/
mallet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
# -*- coding: utf-8 -*-
"""MALLET wrapper for Python.
This module contains various `MALLET`_ related functions for Topic Modeling
provided by `DARIAH-DE`_.
.. _MALLET:
http://mallet.cs.umass.edu/
.. _DARIAH-DE:
https://de.dariah.eu
https://github.com/DARIAH-DE
"""
__author__ = "DARIAH-DE"
__authors__ = "Steffen Pielstroem, Sina Bock, Severin Simmler"
__email__ = "pielstroem@biozentrum.uni-wuerzburg.de"
import itertools
import logging
import numpy as np
import operator
import os
import pandas as pd
from platform import system
from subprocess import Popen, PIPE
log = logging.getLogger('mallet')
log.addHandler(logging.NullHandler())
logging.basicConfig(level=logging.ERROR,
format='%(levelname)s %(name)s: %(message)s')
def create_mallet_binary(path_to_mallet='mallet', path_to_file=False,
                         path_to_corpus=False, output_file=os.path.join('mallet_output', 'binary.mallet'),
                         encoding=None, token_regex=None, preserve_case=False,
                         remove_stopwords=True, stoplist=None, extra_stopwords=None,
                         stop_pattern_file=None, skip_header=False, skip_html=False,
                         replacement_files=None, deletion_files=None, gram_sizes=None,
                         keep_sequence=True, keep_sequence_bigrams=False,
                         binary_features=False, save_text_in_source=False,
                         print_output=False):
    r"""Creates a MALLET binary file.

    Description:
        Builds a ``mallet import-file`` (or ``import-dir`` when no single
        file is given) command line from the keyword arguments, runs it as
        a subprocess, and returns the path of the created binary.

    Args:
        path_to_mallet (str): Path to MALLET. Defaults to 'mallet'. If MALLET is
            not properly installed, use absolute path, e.g. '/home/workspace/mallet/bin/mallet'.
        path_to_file (str): Absolute path to text file, e.g. '/home/workspace/testfile.txt'.
        path_to_corpus (str): Absolute path to corpus folder, e.g. '/home/workspace/corpus_txt'.
        output_file (str): Path to output plus filename, e.g. '/home/workspace/mallet_output/binary.mallet'.
        encoding (str): Character encoding for input file. Defaults to UTF-8.
        token_regex (str): Divides documents into tokens using a regular
            expression (supports Unicode regex). Defaults to \p{L}[\p{L}\p{P}]+\p{L}.
        preserve_case (bool): If false, converts all word features to lowercase. Defaults to False.
        remove_stopwords (bool): Ignores a standard list of very common English tokens. Defaults to True.
        stoplist (str): Absolute path to plain text stopword list. Defaults to None.
        extra_stopwords (str): Read whitespace-separated words from this file,
            and add them to either the default English stoplist or the list
            specified by `stoplist`. Defaults to None.
        stop_pattern_file (str): Read regular expressions from a file, one per
            line. Tokens matching these regexps will be removed. Defaults to None.
        skip_header (bool): If true, in each document, remove text occurring
            before a blank line. This is useful for removing email or UseNet
            headers. Defaults to False.
        skip_html (bool): If true, remove text occurring inside <...>, as in
            HTML or SGML. Defaults to False.
        replacement_files (str): Files containing string replacements, one per
            line: 'A B [tab] C' replaces A B with C, 'A B' replaces A B with A_B.
            Defaults to None.
        deletion_files (str): Files containing strings to delete after
            `replacement_files` but before tokenization (i.e. multiword stop
            terms). Defaults to None.
        gram_sizes (int): Include among the features all n-grams of sizes
            specified. For example, to get all unigrams and bigrams, use `gram_sizes=1,2`.
            This option occurs after the removal of stop words, if removed.
            Defaults to None.
        keep_sequence (bool): Preserves the document as a sequence of word features,
            rather than a vector of word feature counts. Use this option for sequence
            labeling tasks. MALLET also requires feature sequences rather than
            feature vectors. Defaults to True.
        keep_sequence_bigrams (bool): If true, final data will be a
            FeatureSequenceWithBigrams rather than a FeatureVector. Defaults to False.
        binary_features (bool): If true, features will be binary. Defaults to False.
        save_text_in_source (bool): If true, save original text of document in source.
            Defaults to False.
        print_output (bool): If true, print a representation of the processed data
            to standard output. This option is intended for debugging. Defaults to
            False.

    Returns:
        String. Path to the created MALLET binary file (`output_file`).
    """
    # On Windows MALLET is invoked via a batch script, which needs the shell.
    shell = system() == 'Windows'
    # Guard against output_file having no directory component: dirname('')
    # would make os.makedirs raise.
    output_dir = os.path.dirname(output_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    param = [path_to_mallet]
    # Without an explicit input file, a whole corpus folder is imported.
    if not path_to_file:
        param.extend(['import-dir', '--input', path_to_corpus])
    else:
        param.extend(['import-file', '--input', path_to_file])
    if encoding is not None:
        param.extend(['--encoding', encoding])
    if token_regex is not None:
        param.extend(['--token-regex', token_regex])
    if preserve_case:
        param.append('--preserve-case')
    if remove_stopwords:
        param.append('--remove-stopwords')
    if stoplist is not None:
        param.extend(['--stoplist-file', stoplist])
    if extra_stopwords is not None:
        param.extend(['--extra-stopwords', extra_stopwords])
    if stop_pattern_file is not None:
        param.extend(['--stop-pattern-file', stop_pattern_file])
    if skip_header:
        param.append('--skip-header')
    if skip_html:
        param.append('--skip-html')
    if replacement_files is not None:
        param.extend(['--replacement-files', replacement_files])
    if deletion_files is not None:
        param.extend(['--deletion-files', deletion_files])
    if gram_sizes is not None:
        param.extend(['--gram-sizes', str(gram_sizes)])
    if keep_sequence:
        param.append('--keep-sequence')
    if keep_sequence_bigrams:
        param.append('--keep-sequence-bigrams')
    if binary_features:
        param.append('--binary-features')
    if save_text_in_source:
        param.append('--save-text-in-source')
    if print_output:
        param.append('--print-output')
    param.extend(['--output', output_file])
    # Initialize up front so the KeyboardInterrupt handler never touches
    # unbound names (the original code could raise NameError there).
    p = None
    mallet_info = ''
    try:
        log.info("Running MALLET with %s ...", param)
        log.info("Saving MALLET binary to %s ...", output_file)
        p = Popen(param, stdout=PIPE, stderr=PIPE, shell=shell)
        mallet_info = p.communicate()[0].decode('utf-8')
        if print_output:
            log.info(mallet_info)
    except KeyboardInterrupt:
        if p is not None:
            p.terminate()
        log.error(mallet_info)
    return output_file
def create_mallet_model(path_to_mallet='mallet', path_to_binary=None, input_model=None,
                        input_state=None, folder_for_output='mallet_output',
                        output_model=True, output_model_interval=0, output_state=True,
                        output_state_interval=0, inferencer_file=True, evaluator_file=True,
                        output_topic_keys=True, topic_word_weights_file=True,
                        word_topic_counts_file=True, diagnostics_file=True, xml_topic_report=True,
                        xml_topic_phrase_report=True, output_topic_docs=False, num_top_docs=100,
                        output_doc_topics=True, doc_topics_threshold=0.0,
                        num_topics=10, num_top_words=10,
                        num_iterations=1000, num_threads=1, num_icm_iterations=0,
                        no_inference=False, random_seed=0, optimize_interval=0,
                        optimize_burn_in=200, use_symmetric_alpha=False, alpha=5.0,
                        beta=0.01):
    """Creates MALLET model.

    Description:
        Builds a ``mallet train-topics`` command line from the keyword
        arguments and runs it as a subprocess. Output files are written to
        `folder_for_output`.

    Args:
        path_to_mallet (str): Path to MALLET. Defaults to 'mallet'. If MALLET is
            not properly installed use absolute path, e.g. '/home/workspace/mallet/bin/mallet'.
        path_to_binary (str): Path to previously created MALLET binary.
        input_model (str): Absolute path to the binary topic model created by `output_model`.
        input_state (str): Absolute path to the gzipped Gibbs sampling state created by `output_state`.
        folder_for_output (str): Folder for MALLET output.
        output_model (bool): Write a serialized MALLET topic trainer object.
            This type of output is appropriate for pausing and restarting training,
            but does not produce data that can easily be analyzed. Defaults to True.
        output_model_interval (int): The number of iterations between writing the
            model (and its Gibbs sampling state) to a binary file. You must also
            set the `output_model` parameter to use this option, whose argument
            will be the prefix of the filenames. Defaults to 0.
        output_state (bool): Write a compressed text file containing the words
            in the corpus with their topic assignments. The file format can easily
            be parsed and used by non-Java-based software. Defaults to True.
        output_state_interval (int): The number of iterations between writing the
            sampling state to a text file. You must also set the `output_state`
            to use this option, whose argument will be the prefix of the filenames.
            Defaults to 0.
        inferencer_file (bool): A topic inferencer applies a previously trained
            topic model to new documents. Defaults to True.
        evaluator_file (bool): A held-out likelihood evaluator for new documents.
            Defaults to True.
        output_topic_keys (bool): Write the top words for each topic and any
            Dirichlet parameters. Defaults to True.
        topic_word_weights_file (bool): Write unnormalized weights for every
            topic and word type. Defaults to True.
        word_topic_counts_file (bool): Write a sparse representation of topic-word
            assignments. By default this is null, indicating that no file will
            be written. Defaults to True.
        diagnostics_file (bool): Write measures of topic quality, in XML format.
            Defaults to True.
        xml_topic_report (bool): Write the top words for each topic and any
            Dirichlet parameters in XML format. Defaults to True.
        xml_topic_phrase_report (bool): Write the top words and phrases for each
            topic and any Dirichlet parameters in XML format. Defaults to True.
        output_topic_docs (bool): Currently not available. Write the most prominent
            documents for each topic, at the end of the iterations. Defaults to False.
        num_top_docs (int): Currently not available. Number of top documents for
            `output_topic_docs`. Defaults to 100.
        output_doc_topics (bool): Write the topic proportions per document, at
            the end of the iterations. Defaults to True.
        doc_topics_threshold (float): Do not print topics with proportions less
            than this threshold value within `output_doc_topics`. Defaults to 0.0.
        num_topics (int): Number of topics. Defaults to 10.
        num_top_words (int): Number of keywords for each topic. Defaults to 10.
        num_iterations (int): Number of iterations. Defaults to 1000.
        num_threads (int): Number of threads for parallel training. Defaults to 1.
        num_icm_iterations (int): Number of iterations of iterated conditional
            modes (topic maximization). Defaults to 0.
        no_inference (bool): Load a saved model and create a report. Equivalent
            to `num_iterations = 0`. Defaults to False.
        random_seed (int): Random seed for the Gibbs sampler. Defaults to 0.
        optimize_interval (int): Number of iterations between reestimating
            dirichlet hyperparameters. Defaults to 0.
        optimize_burn_in (int): Number of iterations to run before first
            estimating dirichlet hyperparameters. Defaults to 200.
        use_symmetric_alpha (bool): Only optimize the concentration parameter of
            the prior over document-topic distributions. This may reduce the
            number of very small, poorly estimated topics, but may disperse common
            words over several topics. Defaults to False.
        alpha (float): Sum over topics of smoothing over doc-topic distributions.
            alpha_k = [this value] / [num topics]. Defaults to 5.0.
        beta (float): Smoothing parameter for each topic-word. Defaults to 0.01.

    ToDo:
        Param 'output_topic_docs' is causing an internal error
        -> Exception in thread "main" java.lang.ClassCastException: java.net.URI cannot be cast to java.lang.String
        -> at cc.mallet.topics.ParallelTopicModel.printTopicDocuments(ParallelTopicModel.java:1773)
        -> at cc.mallet.topics.tui.TopicTrainer.main(TopicTrainer.java:281)
        Param 'num_top_docs' is obsolete, referring to 'output_topic_docs'

    Returns:
        Nothing.
    """
    # On Windows MALLET is invoked via a batch script, which needs the shell.
    shell = system() == 'Windows'
    os.makedirs(folder_for_output, exist_ok=True)
    param = [path_to_mallet, 'train-topics']
    if input_model is None:
        param.append('--input')
    else:
        param.extend(['--input-model', input_model])
    if path_to_binary is not None:
        param.append(path_to_binary)
    if input_state is not None:
        param.extend(['--input-state', input_state])
    log.debug("Choosing parameters ...")
    # `is not False` (not truthiness) so callers can pass False to skip a
    # flag while 0 is still forwarded as an explicit value.
    if num_topics is not False:
        param.extend(['--num-topics', str(num_topics)])
    if num_iterations is not False:
        param.extend(['--num-iterations', str(num_iterations)])
    if num_threads is not False:
        param.extend(['--num-threads', str(num_threads)])
    if num_top_words is not False:
        param.extend(['--num-top-words', str(num_top_words)])
    if num_icm_iterations is not False:
        param.extend(['--num-icm-iterations', str(num_icm_iterations)])
    if no_inference is not False:
        # Boolean switch: the flag alone enables it; the original appended a
        # stray 'True' value after it.
        param.append('--no-inference')
    if random_seed is not False:
        param.extend(['--random-seed', str(random_seed)])
    log.debug("Choosing hyperparameters ...")
    if optimize_interval is not None:
        param.extend(['--optimize-interval', str(optimize_interval)])
    if optimize_burn_in is not None:
        param.extend(['--optimize-burn-in', str(optimize_burn_in)])
    # Only pass the switch when actually requested; the original tested
    # `is not None`, which always added it despite the documented default False.
    if use_symmetric_alpha:
        param.append('--use-symmetric-alpha')
    if alpha is not None:
        param.extend(['--alpha', str(alpha)])
    if beta is not None:
        param.extend(['--beta', str(beta)])
    log.debug("Choosing output parameters ...")
    if output_topic_keys:
        param.extend(['--output-topic-keys',
                      os.path.join(folder_for_output, 'topic_keys.txt')])
    if output_doc_topics:
        param.extend(['--output-doc-topics',
                      os.path.join(folder_for_output, 'doc_topics.txt')])
    if doc_topics_threshold is not None:
        param.extend(['--doc-topics-threshold', str(doc_topics_threshold)])
    if topic_word_weights_file:
        param.extend(['--topic-word-weights-file',
                      os.path.join(folder_for_output, 'topic_word_weights.txt')])
    if word_topic_counts_file:
        param.extend(['--word-topic-counts-file',
                      os.path.join(folder_for_output, 'word_topic_counts.txt')])
    if diagnostics_file:
        param.extend(['--diagnostics-file',
                      os.path.join(folder_for_output, 'diagnostics.xml')])
    if xml_topic_report:
        param.extend(['--xml-topic-report',
                      os.path.join(folder_for_output, 'topic_report.xml')])
    if xml_topic_phrase_report:
        param.extend(['--xml-topic-phrase-report',
                      os.path.join(folder_for_output, 'topic_phrase_report.xml')])
    if output_model:
        param.extend(['--output-model',
                      os.path.join(folder_for_output, 'mallet.model')])
    if output_model_interval is not None:
        param.extend(['--output-model-interval', str(output_model_interval)])
    if output_state:
        param.extend(['--output-state',
                      os.path.join(folder_for_output, 'state.gz')])
    if output_state_interval is not None:
        param.extend(['--output-state-interval', str(output_state_interval)])
    if inferencer_file:
        param.extend(['--inferencer-filename',
                      os.path.join(folder_for_output, 'inferencer')])
    if evaluator_file:
        param.extend(['--evaluator-filename',
                      os.path.join(folder_for_output, 'evaluator')])
    # not yet working, see ToDo in the docstring
    if output_topic_docs:
        param.extend(['--output-topic-docs',
                      os.path.join(folder_for_output, 'topic_docs.txt')])
    if num_top_docs is not None:
        # Bug fix: the original passed str(topic_word_weights_file) here.
        param.extend(['--num-top-docs', str(num_top_docs)])
    # Initialize up front so the KeyboardInterrupt handler never touches
    # unbound names (the original code could raise NameError there).
    p = None
    out = ''
    try:
        log.info("Accessing Mallet with %s ...", param)
        p = Popen(param, stdout=PIPE, stderr=PIPE, shell=shell)
        # MALLET writes its progress to stderr.
        out = p.communicate()[1].decode('utf-8')
        log.debug(out)
    except KeyboardInterrupt:
        if p is not None:
            p.terminate()
        log.error(out)
def _grouper(n, iterable, fillvalue=None):
"""Collects data into fixed-length chunks or blocks.
Args:
Returns:
"""
args=[iter(iterable)] * n
return itertools.zip_longest(*args, fillvalue=fillvalue)
def show_doc_topic_matrix(output_folder, doc_topics='doc_topics.txt', topic_keys='topic_keys.txt',
                          easy_file_format=False):
    """Shows document-topic-mapping.

    Reads MALLET's doc_topics and topic_keys output files and returns a
    DataFrame of topic shares (topics as rows, documents as columns).
    Two doc_topics formats are handled: the older one with a leading '#'
    header line and (topic, share) pairs per document, and the newer
    tab-separated one without a header ("easy" format).

    Args:
        output_folder (str): Folder containing the MALLET output files.
        doc_topics (str): Name of MALLET's doc_topics file. Defaults to 'doc_topics.txt'.
        topic_keys (str): Name of MALLET's topic_keys file. Defaults to 'topic_keys.txt'.
        easy_file_format (bool): Force the newer tab-separated format.
            Also set automatically when no '#' header line is found.

    Returns:
        pd.DataFrame: Topics as rows (labeled by their top three key
        words), documents as columns (basenames), values are topic shares.

    ToDo: Prettify docnames
    """
    doc_topics=os.path.join(output_folder, doc_topics)
    # NOTE(review): this assert only checks the path string is non-empty,
    # which is always true here — it does NOT check the file exists.
    assert doc_topics
    topic_keys=os.path.join(output_folder, topic_keys)
    assert topic_keys
    doctopic_triples=[]   # (docname, topic number, share) per pair
    mallet_docnames=[]
    topics=[]
    # Topic labels: the first three key words of each topic.
    df=pd.read_csv(topic_keys, sep='\t', header=None, encoding='utf-8')
    labels=[]
    for index, item in df.iterrows():
        label=' '.join(item[2].split()[:3])
        labels.append(label)
    with open(doc_topics, encoding='utf-8') as f:
        for line in f:
            li=line.lstrip()
            if li.startswith("#"):
                # Old format: '#' header first, then one line per document
                # with docnum, docname and alternating topic/share columns.
                # f.readlines() continues from the line AFTER the header.
                lines=f.readlines()
                for line in lines:
                    docnum, docname, *values=line.rstrip().split('\t')
                    mallet_docnames.append(docname)
                    # values alternate topic, share, topic, share, ...
                    for topic, share in _grouper(2, values):
                        triple=(docname, int(topic), float(share))
                        topics.append(int(topic))
                        doctopic_triples.append(triple)
            else:
                # First line is not a header -> newer tab-separated format.
                easy_file_format=True
                break
    if easy_file_format:
        newindex=[]
        doc_topic_matrix=pd.read_csv(
            doc_topics, sep='\t', names=labels[0:], encoding='utf-8')
        # NOTE(review): assumes the index entries are 2-tuples whose second
        # element is the document path — verify against MALLET's output.
        for eins, zwei in doc_topic_matrix.index:
            newindex.append(os.path.basename(zwei))
        doc_topic_matrix.index=newindex
    else:
        # sort the triples
        # triple is (docname, topicnum, share) so sort(key=operator.itemgetter(0,1))
        # sorts on (docname, topicnum) which is what we want
        doctopic_triples=sorted(
            doctopic_triples, key=operator.itemgetter(0, 1))
        # sort the document names rather than relying on MALLET's ordering
        mallet_docnames=sorted(mallet_docnames)
        # collect into a document-term matrix
        num_docs=len(mallet_docnames)
        num_topics=max(topics) + 1
        # the following works because we know that the triples are in
        # sequential order
        data=np.zeros((num_docs, num_topics))
        for triple in doctopic_triples:
            docname, topic, share=triple
            row_num=mallet_docnames.index(docname)
            data[row_num, topic]=share
        topicLabels=[]
        # creates list of topic lables consisting of the 3 most weighed topics
        # NOTE(review): re-reads topic_keys and rebuilds the same labels as
        # computed above into `labels` — looks redundant.
        df=pd.read_csv(topic_keys, sep='\t', header=None, encoding='utf-8')
        labels=[]
        for index, item in df.iterrows():
            topicLabel=' '.join(item[2].split()[:3])
            topicLabels.append(topicLabel)
        # Use basenames instead of full paths as column labels.
        shortened_docnames=[]
        for item in mallet_docnames:
            shortened_docnames.append(os.path.basename(item))
        '''
        for topic in range(max(topics)+1):
            topicLabels.append("Topic_" + str(topic))
        '''
        doc_topic_matrix=pd.DataFrame(data=data[0:, 0:],
                                      index=shortened_docnames[0:],
                                      columns=topicLabels[0:])
    # Transpose so topics are rows and documents are columns.
    return doc_topic_matrix.T
def show_topics_keys(output_folder, topicsKeyFile="topic_keys.txt", num_topics=10):
    """Show topic-key-mapping.

    Reads MALLET's tab-separated topic_keys file (topic id, Dirichlet
    weight, space-separated key words) and returns the key words as a
    DataFrame with one row per topic.

    Args:
        output_folder (str): Folder containing MALLET output.
        topicsKeyFile (str): Name of MALLET's topic_keys file. Defaults to 'topic_keys.txt'.
        num_topics (int): Number of topics (rows) in the file. Defaults to 10.

    Returns:
        pd.DataFrame indexed 'Topic 1' .. 'Topic <num_topics>' with
        columns 'Key 1' .. 'Key <n>' for the n key words per topic.

    Note: Based on DARIAH-Tutorial -> https://de.dariah.eu/tatom/topic_model_mallet.html
    ToDo: Prettify index
    """
    path_to_topic_keys = os.path.join(output_folder, topicsKeyFile)
    topic_keys = []
    # 'infile' instead of shadowing the builtin 'input'; a missing file
    # raises FileNotFoundError here, which is clearer than the former
    # always-true `assert path_to_topic_keys`.
    with open(path_to_topic_keys, encoding='utf-8') as infile:
        for line in infile:
            _, _, words = line.split('\t')  # tab-separated columns
            topic_keys.append(words.rstrip().split(' '))  # drop trailing '\n'
    topicKeysMatrix = pd.DataFrame(topic_keys)
    topicKeysMatrix.index = ['Topic ' + str(x + 1) for x in range(num_topics)]
    # Label columns from the actual number of key words rather than a
    # hard-coded 10, so models trained with num_top_words != 10 work too.
    topicKeysMatrix.columns = ['Key ' + str(x + 1)
                               for x in range(len(topicKeysMatrix.columns))]
    return topicKeysMatrix