Skip to content

Commit

Permalink
Update MALLET and demonstrator stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
Severin Simmler committed Mar 28, 2017
1 parent 891d6cb commit 321a1f6
Show file tree
Hide file tree
Showing 1,852 changed files with 143,084 additions and 338 deletions.
95 changes: 40 additions & 55 deletions IntegrationTest_txt_Mallet.ipynb

Large diffs are not rendered by default.

502 changes: 297 additions & 205 deletions dariah_topics/mallet.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion demonstrator/README.md
@@ -1,3 +1,3 @@
1. Make sure `dariah_topics` is locally installed (if not, run `pip3 install 'git+https://github.com/DARIAH-DE/Topics#egg=dariah_topics[demonstrator]'`)
2. Make sure MALLET is installed
2. Make sure MALLET is installed (if not, copy `mallet/bin/mallet` into the folder `demonstrator`)
3. Run `demonstrator.py`
98 changes: 48 additions & 50 deletions demonstrator/demonstrator.py
Expand Up @@ -13,8 +13,6 @@
https://github.com/DARIAH-DE
"""

import matplotlib
matplotlib.use('Agg')
from dariah_topics import preprocessing
from dariah_topics import visualization
from dariah_topics import mallet
Expand All @@ -28,58 +26,62 @@
import shutil
import threading
import webbrowser
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from werkzeug.utils import secure_filename

__author__ = "Severin Simmler"
__email__ = "severin.simmler@stud-mail.uni-wuerzburg.de"
__date__ = "2017-02-22"

app = Flask(__name__)

def tei(file):
    """Extract the plain-text content of the first <tei:text> element
    from a TEI-encoded XML file-like object."""
    tei_ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    tree = etree.parse(file)
    body = tree.xpath('//tei:text', namespaces=tei_ns)[0]
    # Concatenate every text node below <tei:text> into one string.
    return "".join(body.xpath('.//text()'))

@app.route('/')
def index():
    """Serve the demonstrator's landing page."""
    print("Rendering index.html ...")
    page = render_template('index.html')
    return page

@app.route('/upload', methods=['POST'])
def upload_file():
# access input
files = request.files.getlist('files')
lda = request.form['lda']
num_topics = int(request.form['number_topics'])
num_iterations = int(request.form['number_iterations'])
threshold = int(request.form['mfws'])

if request.files.get('stoplist', None):
stopwords = request.files['stoplist']

if 'gensim' in lda:
corpus = pd.Series()

print("Accessing and tokenizing files ...")
for file in files:
filename, extension = os.path.splitext(secure_filename(file.filename))

if 'mallet' in lda:
os.makedirs('./tmp_files', exist_ok=True)
os.makedirs('tmp_files', exist_ok=True)
if extension == '.txt':
file.save('./tmp_files/' + secure_filename(file.filename))
file.save('tmp_files/' + secure_filename(file.filename))
elif extension == '.xml':
ns = dict(tei="http://www.tei-c.org/ns/1.0")
text = etree.parse(file)
text = text.xpath('//tei:text', namespaces=ns)[0]
text = "".join(text.xpath('.//text()'))
with open('./tmp_files/' + secure_filename(file.filename), 'w+', encoding='utf-8') as f:
text = tei(file)
with open('tmp_files/' + secure_filename(file.filename), 'w+', encoding='utf-8') as f:
f.writelines(text)


else:
print("Error: File format is not supported.")

elif 'gensim' in lda:
if extension == '.txt':
text = file.read().decode('utf-8')
elif extension == '.xml':
ns = dict(tei="http://www.tei-c.org/ns/1.0")
text = etree.parse(file)
text = text.xpath('//tei:text', namespaces=ns)[0]
text = "".join(text.xpath('.//text()'))
text = tei(file)
else:
print("Error: File format is not supported.")
tokens = list(preprocessing.tokenize(text))
Expand All @@ -90,44 +92,42 @@ def upload_file():
if 'mallet' in lda:
print("Creating MALLET binary ...")
if request.files.get('stoplist', None):
os.makedirs('./stopwordlist', exist_ok=True)
stopwords = request.files['stoplist']
stopwords.save('./stopwordlist/' + secure_filename(stopwords.filename))
os.makedirs('stopwordlist', exist_ok=True)
stopwords.save('stopwordlist/' + secure_filename(stopwords.filename))
try:
mallet.create_mallet_model("./mallet_output", "./tmp_files", 'mallet', stoplist='./stopwordlist/' + secure_filename(stopwords.filename))
mallet.create_mallet_model('mallet_output', 'tmp_files', 'mallet', stoplist='stopwordlist/'+secure_filename(stopwords.filename))
except:
mallet.create_mallet_model("./mallet_output", "./tmp_files", './mallet/bin/mallet', stoplist='./stopwordlist/' + secure_filename(stopwords.filename))
shutil.rmtree('./stopwordlist')
mallet.create_mallet_model('mallet_output', 'tmp_files', 'mallet/bin/mallet', stoplist='stopwordlist/'+secure_filename(stopwords.filename))
shutil.rmtree('stopwordlist')
else:
try:
mallet.create_mallet_model("./mallet_output", "./tmp_files", 'mallet')
mallet.create_mallet_model('mallet_output', 'tmp_files', 'mallet')
except:
mallet.create_mallet_model("./mallet_output", "./tmp_files", './mallet/bin/mallet')
mallet.create_mallet_model('mallet_output', 'tmp_files', 'mallet/bin/mallet')

print("Training MALLET LDA model ...")
try:
mallet.create_mallet_output('./mallet_output/malletModel.mallet', './mallet_output', 'mallet', num_topics=str(num_topics), num_iterations=str(num_iterations))
mallet.create_mallet_output('mallet_output/malletModel.mallet', 'mallet_output', 'mallet', num_topics=str(num_topics), num_iterations=str(num_iterations))
except:
mallet.create_mallet_output('./mallet_output/malletModel.mallet', './mallet_output', './mallet/bin/mallet', num_topics=str(num_topics), num_iterations=str(num_iterations))
df = mallet.show_topics_keys('./mallet_output', num_topics=num_topics)
doc_topic = mallet.show_docTopicMatrix('./mallet_output')
mallet.create_mallet_output('mallet_output/malletModel.mallet', 'mallet_output', 'mallet/bin/mallet', num_topics=str(num_topics), num_iterations=str(num_iterations))
df = mallet.show_topics_keys('mallet_output', num_topics=num_topics)
doc_topic = mallet.show_docTopicMatrix('mallet_output')
heatmap = visualization.doc_topic_heatmap(doc_topic)
heatmap.savefig('./static/heatmap.png')
heatmap.savefig('static/heatmap.png')
heatmap.close()


with open ('./mallet_output/topic_keys.txt', 'r', encoding='utf-8') as f:
with open ('mallet_output/topic_keys.txt', 'r', encoding='utf-8') as f:
text = f.read()
wordcloud = WordCloud(width=800, height=600, background_color='white').generate(text)
plt.imshow(wordcloud)
plt.axis("off")
plt.savefig('./static/cloud.png')
plt.axis('off')
plt.savefig('static/cloud.png')
plt.close()
shutil.rmtree('./tmp_files')
shutil.rmtree('./mallet_output')
shutil.rmtree('tmp_files')
shutil.rmtree('mallet_output')
print("Rendering result.hml ...")
return render_template('result.html', tables=[df.to_html(classes='df')])

elif 'gensim' in lda:
labels = corpus.index.tolist()
tokens = corpus.tolist()
Expand All @@ -137,7 +137,6 @@ def upload_file():

if request.files.get('stoplist', None):
print("Accessing external stopword list and cleaning corpus ...")
stopwords = request.files['stoplist']
words = stopwords.read().decode('utf-8')
words = set(preprocessing.tokenize(words))
hapax = preprocessing.find_hapax(sparse_bow, id_types)
Expand All @@ -150,15 +149,14 @@ def upload_file():
hapax = preprocessing.find_hapax(sparse_bow, id_types)
feature_list = set(stopwords).union(hapax)
sparse_bow = preprocessing.remove_features(sparse_bow, id_types, feature_list)

print("Creating matrix market model ...")
preprocessing.save_bow_mm(sparse_bow, 'matrixmarket')

mm = MmCorpus('matrixmarket.mm')
doc2id = {value : key for key, value in doc_ids.items()}
type2id = {value : key for key, value in id_types.items()}


print("Training Gensim LDA with", num_topics, "topics ...")
model = LdaModel(corpus=mm, id2word=type2id, num_topics=num_topics, iterations=num_iterations, passes=10)

Expand All @@ -170,22 +168,22 @@ def upload_file():

wordcloud = WordCloud(width=800, height=600, background_color='white').fit_words(model.show_topic(1,100))
plt.imshow(wordcloud)
plt.axis("off")
plt.savefig('./static/cloud.png')
plt.axis('off')
plt.savefig('static/cloud.png')
plt.close()

# Todo: replace by DataFrame.to_html():
print("Accessing topics for HTML table ...")
df = preprocessing.gensim2dataframe(model)
print("Rendering result.html ...")
return render_template('result.html', tables=[df.to_html(classes='df')])


@app.after_request
def add_header(r):
    """Attach cache-defeating headers to every response.

    The demonstrator regenerates static images (heatmap/wordcloud) on each
    upload, so browsers must not serve them from cache.

    The rendered diff duplicated each assignment (old double-quoted lines
    next to the new single-quoted ones); only the post-commit set is kept.
    """
    r.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
    r.headers['Pragma'] = 'no-cache'
    r.headers['Expires'] = '0'
    # NOTE(review): this second assignment overwrites the Cache-Control value
    # set above — kept byte-for-byte because the final header value is what
    # clients actually observe; confirm whether 'public, max-age=0' is intended.
    r.headers['Cache-Control'] = 'public, max-age=0'
    return r

Expand Down

0 comments on commit 321a1f6

Please sign in to comment.