Skip to content

Commit

Permalink
mallet.py updated
Browse files Browse the repository at this point in the history
  • Loading branch information
sinabock committed Feb 22, 2017
1 parent 27d4854 commit 7c32eb4
Showing 1 changed file with 42 additions and 45 deletions.
87 changes: 42 additions & 45 deletions dariah_topics/mallet.py
Expand Up @@ -33,7 +33,7 @@
format = '%(levelname)s %(name)s: %(message)s')

def create_mallet_model(outfolder, path_to_corpus = os.path.join(os.path.abspath('.'), 'corpus_txt'), path_to_mallet="mallet", outfile = "malletModel.mallet",
remove_stopwords=True, stoplist = None):
stoplist = None):
"""Create a mallet binary file
Args:
Expand Down Expand Up @@ -68,20 +68,19 @@ def create_mallet_model(outfolder, path_to_corpus = os.path.join(os.path.abspath
param.append("--output")
param.append(output)
param.append ("--keep-sequence")
param.append("--remove-stopwords")

if remove_stopwords:
param.append("--remove-stopwords")
#if(tokens == "True"):
#param.append("--token-regex")
#token_regex = "'\p{L}[\p{L}\p{P}]*\p{L}'"
#param.append(token_regex)

if(stoplist != None):
param.append("--stoplist-file")
param.append(stoplist)
print(param)


try:
print("Accessing Mallet ...")
log.info("Accessing Mallet ...")
p = Popen(param, stdout=PIPE, stderr=PIPE, shell=shell)
out = p.communicate()
log.debug("Mallet file available.")
Expand All @@ -95,7 +94,7 @@ def create_mallet_model(outfolder, path_to_corpus = os.path.join(os.path.abspath


def create_mallet_output(path_to_malletModel, outfolder, path_to_mallet="mallet", num_topics = "10",
num_top_words = "10", #num_iterations = 10
#num_iterations = "10", num_top_words = "10"
):
"""Create mallet model
Expand All @@ -122,16 +121,16 @@ def create_mallet_output(path_to_malletModel, outfolder, path_to_mallet="mallet"
param.append(num_topics)
#param.append("--num-iterations")
#param.append(num_iterations)
param.append("--num-top-words")
param.append(num_top_words)
#param.append("--num-top-words")
#param.append(num_top_words)

sys = system()
if sys == 'Windows':
doc_topics = outfolder + "\\" + "doc_topics.txt"
topic_keys = outfolder + "\\" + "topic_keys.txt"
state = outfolder + "\\" + "state.gz"
word_topics_counts = outfolder + "\\" + "word_topic_counts.txt"
word_topics_weights = outfolder + "/" + "word_topic_weights.txt"
word_topics_weights = outfolder + "\\" + "word_topic_weights.txt"
log.debug(outfolder)
shell = True
else:
Expand Down Expand Up @@ -166,51 +165,51 @@ def create_mallet_output(path_to_malletModel, outfolder, path_to_mallet="mallet"
p.terminate()
log.debug("Mallet terminated.")

return outfolder

#return outfolder

def grouper(n, iterable, fillvalue=None):
    """Collect data into fixed-length chunks or blocks.

    Args:
        n (int): Number of items per chunk.
        iterable (iterable): Source of the items to be grouped.
        fillvalue: Value used to pad the last chunk when the number of
            items is not a multiple of n, default = None

    Returns:
        Iterator yielding tuples of length n.

    Note: From: DARIAH-Tutorial -> https://de.dariah.eu/tatom/topic_model_mallet.html#topic-model-mallet
    """
    # The *same* iterator object is repeated n times on purpose: zip_longest
    # draws from it round-robin, so every tuple holds n consecutive items.
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)


def show_docTopicMatrix(output_folder, docTopicsFile = "doc_topics.txt"):
"""Show document-topic-mapping
Args:
outfolder (str): Folder for Mallet output, default = 'tutorial_supplementals/mallet_output'
docTopicsFile (str): Name of Mallets' doc_topic file, default doc_topics.txt
Note: Based on DARIAH-Tutorial -> https://de.dariah.eu/tatom/topic_model_mallet.html#topic-model-mallet
ToDo: Prettify docnames
"""

doc_topics = os.path.join(output_folder, docTopicsFile)
assert doc_topics

topic_keys = os.path.join(output_folder, "topic_keys.txt")
assert topic_keys

doctopic_triples = []
mallet_docnames = []
topics = []

df = pd.read_csv(topic_keys, sep='\t', header=None)
labels=[]
for index, item in df.iterrows():
label= ' '.join(item[2].split()[:3])
labels.append(label)

easy_file_format = False

with open(doc_topics) as f:
Expand All @@ -221,10 +220,10 @@ def show_docTopicMatrix(output_folder, docTopicsFile = "doc_topics.txt"):
for line in lines:
docnum, docname, *values = line.rstrip().split('\t')
mallet_docnames.append(docname)
for topic, share in grouper(2, values):
triple = (docname, int(topic), float(share))
topics.append(int(topic))
doctopic_triples.append(triple)
for topic, share in grouper(2, values):
triple = (docname, int(topic), float(share))
topics.append(int(topic))
doctopic_triples.append(triple)
else:
easy_file_format = True
break
Expand All @@ -237,7 +236,7 @@ def show_docTopicMatrix(output_folder, docTopicsFile = "doc_topics.txt"):
for eins, zwei in docTopicMatrix.index:
newindex.append(os.path.basename(zwei))
docTopicMatrix.index = newindex

else:
# sort the triples
# triple is (docname, topicnum, share) so sort(key=operator.itemgetter(0,1))
Expand All @@ -248,7 +247,7 @@ def show_docTopicMatrix(output_folder, docTopicsFile = "doc_topics.txt"):
mallet_docnames = sorted(mallet_docnames)

# collect into a document-term matrix
num_docs = len(mallet_docnames)
num_docs = len(mallet_docnames)

num_topics = max(topics) + 1

Expand All @@ -259,62 +258,60 @@ def show_docTopicMatrix(output_folder, docTopicsFile = "doc_topics.txt"):
docname, topic, share = triple
row_num = mallet_docnames.index(docname)
data[row_num, topic] = share

topicLabels = []

#creates list of topic lables consisting of the 3 most weighed topics
df = pd.read_csv('tutorial_supplementals/mallet_output/topic_keys.txt', sep='\t', header=None)
labels=[]
for index, item in df.iterrows():

topicLabel= ' '.join(item[2].split()[:3])
topicLabels.append(topicLabel)

shortened_docnames=[]
for item in mallet_docnames:
shortened_docnames.append(os.path.basename(item))



'''
for topic in range(max(topics)+1):
topicLabels.append("Topic_" + str(topic))
'''
'''
docTopicMatrix = pd.DataFrame(data=data[0:,0:],
index=shortened_docnames[0:],
columns=topicLabels[0:])

return docTopicMatrix.T

def show_topics_keys(output_folder, topicsKeyFile="topic_keys.txt", topic_num=None, num_top_words=None):
    """Show topic-key-mapping

    Reads the tab-separated topic keys file (three fields per line; the
    third field holds the space-separated key words) and collects the key
    words of every topic into one row of a DataFrame.

    Args:
        output_folder (str): Folder for Mallet output,
        topicsKeyFile (str): Name of Mallets' topic_key file, default "topic_keys.txt"
        topic_num (int): Optional. If given, rows are labelled
            "Topic 1" ... "Topic <topic_num>"; must match the number of
            topics in the file. Default None keeps the numeric index.
        num_top_words (int): Optional. If given, columns are labelled
            "Key 1" ... "Key <num_top_words>"; must match the number of
            key words per topic. Default None keeps the numeric columns.

    Returns:
        pandas.DataFrame with one row per topic and one column per key word.

    Note: Based on DARIAH-Tutorial -> https://de.dariah.eu/tatom/topic_model_mallet.html#topic-model-mallet
    ToDo: Prettify index
    """
    path_to_topic_keys = os.path.join(output_folder, topicsKeyFile)

    topic_keys = []
    with open(path_to_topic_keys) as keys_file:
        for line in keys_file:
            # A blank (e.g. trailing) line has no tab-separated fields and
            # would make the three-way unpacking below fail.
            if not line.strip():
                continue
            _, _, words = line.split('\t')  # tab-separated
            # rstrip removes the trailing blank and '\n' before splitting
            topic_keys.append(words.rstrip().split(' '))

    # Labels are only applied on request; None falls back to pandas'
    # default integer index / columns.
    index = None
    if topic_num is not None:
        index = ["Topic " + str(x + 1) for x in range(topic_num)]
    columns = None
    if num_top_words is not None:
        columns = ["Key " + str(x + 1) for x in range(num_top_words)]

    return pd.DataFrame(topic_keys, index=index, columns=columns)


0 comments on commit 7c32eb4

Please sign in to comment.