Skip to content

Commit

Permalink
Update MALLET and demonstrator stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
Severin Simmler committed Mar 28, 2017
1 parent 891d6cb commit 321a1f6
Show file tree
Hide file tree
Showing 1,852 changed files with 143,084 additions and 338 deletions.
95 changes: 40 additions & 55 deletions IntegrationTest_txt_Mallet.ipynb

Large diffs are not rendered by default.

502 changes: 297 additions & 205 deletions dariah_topics/mallet.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion demonstrator/README.md
@@ -1,3 +1,3 @@
1. Make sure `dariah_topics` is locally installed (if not, run `pip3 install 'git+https://github.com/DARIAH-DE/Topics#egg=dariah_topics[demonstrator]'`)
2. Make sure MALLET is installed
2. Make sure MALLET is installed (if not, copy `mallet/bin/mallet` into the folder `demonstrator`)
3. Run `demonstrator.py`
98 changes: 48 additions & 50 deletions demonstrator/demonstrator.py
Expand Up @@ -13,8 +13,6 @@
https://github.com/DARIAH-DE
"""

import matplotlib
matplotlib.use('Agg')
from dariah_topics import preprocessing
from dariah_topics import visualization
from dariah_topics import mallet
Expand All @@ -28,58 +26,62 @@
import shutil
import threading
import webbrowser
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from werkzeug.utils import secure_filename

__author__ = "Severin Simmler"
__email__ = "severin.simmler@stud-mail.uni-wuerzburg.de"
__date__ = "2017-02-22"

app = Flask(__name__)

def tei(file):
    """Extract the plain-text content of the first <tei:text> element
    from a TEI-encoded XML file-like object."""
    tei_ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    tree = etree.parse(file)
    body = tree.xpath('//tei:text', namespaces=tei_ns)[0]
    # Concatenate every text node below <tei:text> into one string.
    return "".join(body.xpath('.//text()'))

@app.route('/')
def index():
    """Serve the demonstrator's landing page."""
    print("Rendering index.html ...")
    page = render_template('index.html')
    return page

@app.route('/upload', methods=['POST'])
def upload_file():
# access input
files = request.files.getlist('files')
lda = request.form['lda']
num_topics = int(request.form['number_topics'])
num_iterations = int(request.form['number_iterations'])
threshold = int(request.form['mfws'])

if request.files.get('stoplist', None):
stopwords = request.files['stoplist']

if 'gensim' in lda:
corpus = pd.Series()

print("Accessing and tokenizing files ...")
for file in files:
filename, extension = os.path.splitext(secure_filename(file.filename))

if 'mallet' in lda:
os.makedirs('./tmp_files', exist_ok=True)
os.makedirs('tmp_files', exist_ok=True)
if extension == '.txt':
file.save('./tmp_files/' + secure_filename(file.filename))
file.save('tmp_files/' + secure_filename(file.filename))
elif extension == '.xml':
ns = dict(tei="http://www.tei-c.org/ns/1.0")
text = etree.parse(file)
text = text.xpath('//tei:text', namespaces=ns)[0]
text = "".join(text.xpath('.//text()'))
with open('./tmp_files/' + secure_filename(file.filename), 'w+', encoding='utf-8') as f:
text = tei(file)
with open('tmp_files/' + secure_filename(file.filename), 'w+', encoding='utf-8') as f:
f.writelines(text)


else:
print("Error: File format is not supported.")

elif 'gensim' in lda:
if extension == '.txt':
text = file.read().decode('utf-8')
elif extension == '.xml':
ns = dict(tei="http://www.tei-c.org/ns/1.0")
text = etree.parse(file)
text = text.xpath('//tei:text', namespaces=ns)[0]
text = "".join(text.xpath('.//text()'))
text = tei(file)
else:
print("Error: File format is not supported.")
tokens = list(preprocessing.tokenize(text))
Expand All @@ -90,44 +92,42 @@ def upload_file():
if 'mallet' in lda:
print("Creating MALLET binary ...")
if request.files.get('stoplist', None):
os.makedirs('./stopwordlist', exist_ok=True)
stopwords = request.files['stoplist']
stopwords.save('./stopwordlist/' + secure_filename(stopwords.filename))
os.makedirs('stopwordlist', exist_ok=True)
stopwords.save('stopwordlist/' + secure_filename(stopwords.filename))
try:
mallet.create_mallet_model("./mallet_output", "./tmp_files", 'mallet', stoplist='./stopwordlist/' + secure_filename(stopwords.filename))
mallet.create_mallet_model('mallet_output', 'tmp_files', 'mallet', stoplist='stopwordlist/'+secure_filename(stopwords.filename))
except:
mallet.create_mallet_model("./mallet_output", "./tmp_files", './mallet/bin/mallet', stoplist='./stopwordlist/' + secure_filename(stopwords.filename))
shutil.rmtree('./stopwordlist')
mallet.create_mallet_model('mallet_output', 'tmp_files', 'mallet/bin/mallet', stoplist='stopwordlist/'+secure_filename(stopwords.filename))
shutil.rmtree('stopwordlist')
else:
try:
mallet.create_mallet_model("./mallet_output", "./tmp_files", 'mallet')
mallet.create_mallet_model('mallet_output', 'tmp_files', 'mallet')
except:
mallet.create_mallet_model("./mallet_output", "./tmp_files", './mallet/bin/mallet')
mallet.create_mallet_model('mallet_output', 'tmp_files', 'mallet/bin/mallet')

print("Training MALLET LDA model ...")
try:
mallet.create_mallet_output('./mallet_output/malletModel.mallet', './mallet_output', 'mallet', num_topics=str(num_topics), num_iterations=str(num_iterations))
mallet.create_mallet_output('mallet_output/malletModel.mallet', 'mallet_output', 'mallet', num_topics=str(num_topics), num_iterations=str(num_iterations))
except:
mallet.create_mallet_output('./mallet_output/malletModel.mallet', './mallet_output', './mallet/bin/mallet', num_topics=str(num_topics), num_iterations=str(num_iterations))
df = mallet.show_topics_keys('./mallet_output', num_topics=num_topics)
doc_topic = mallet.show_docTopicMatrix('./mallet_output')
mallet.create_mallet_output('mallet_output/malletModel.mallet', 'mallet_output', 'mallet/bin/mallet', num_topics=str(num_topics), num_iterations=str(num_iterations))
df = mallet.show_topics_keys('mallet_output', num_topics=num_topics)
doc_topic = mallet.show_docTopicMatrix('mallet_output')
heatmap = visualization.doc_topic_heatmap(doc_topic)
heatmap.savefig('./static/heatmap.png')
heatmap.savefig('static/heatmap.png')
heatmap.close()


with open ('./mallet_output/topic_keys.txt', 'r', encoding='utf-8') as f:
with open ('mallet_output/topic_keys.txt', 'r', encoding='utf-8') as f:
text = f.read()
wordcloud = WordCloud(width=800, height=600, background_color='white').generate(text)
plt.imshow(wordcloud)
plt.axis("off")
plt.savefig('./static/cloud.png')
plt.axis('off')
plt.savefig('static/cloud.png')
plt.close()
shutil.rmtree('./tmp_files')
shutil.rmtree('./mallet_output')
shutil.rmtree('tmp_files')
shutil.rmtree('mallet_output')
print("Rendering result.hml ...")
return render_template('result.html', tables=[df.to_html(classes='df')])

elif 'gensim' in lda:
labels = corpus.index.tolist()
tokens = corpus.tolist()
Expand All @@ -137,7 +137,6 @@ def upload_file():

if request.files.get('stoplist', None):
print("Accessing external stopword list and cleaning corpus ...")
stopwords = request.files['stoplist']
words = stopwords.read().decode('utf-8')
words = set(preprocessing.tokenize(words))
hapax = preprocessing.find_hapax(sparse_bow, id_types)
Expand All @@ -150,15 +149,14 @@ def upload_file():
hapax = preprocessing.find_hapax(sparse_bow, id_types)
feature_list = set(stopwords).union(hapax)
sparse_bow = preprocessing.remove_features(sparse_bow, id_types, feature_list)

print("Creating matrix market model ...")
preprocessing.save_bow_mm(sparse_bow, 'matrixmarket')

mm = MmCorpus('matrixmarket.mm')
doc2id = {value : key for key, value in doc_ids.items()}
type2id = {value : key for key, value in id_types.items()}


print("Training Gensim LDA with", num_topics, "topics ...")
model = LdaModel(corpus=mm, id2word=type2id, num_topics=num_topics, iterations=num_iterations, passes=10)

Expand All @@ -170,22 +168,22 @@ def upload_file():

wordcloud = WordCloud(width=800, height=600, background_color='white').fit_words(model.show_topic(1,100))
plt.imshow(wordcloud)
plt.axis("off")
plt.savefig('./static/cloud.png')
plt.axis('off')
plt.savefig('static/cloud.png')
plt.close()

# Todo: replace by DataFrame.to_html():
print("Accessing topics for HTML table ...")
df = preprocessing.gensim2dataframe(model)
print("Rendering result.html ...")
return render_template('result.html', tables=[df.to_html(classes='df')])


@app.after_request
def add_header(r):
    """Attach cache-defeating headers to every response.

    The demonstrator regenerates static images (heatmap/wordcloud) on each
    upload, so browsers must not serve them from cache.

    The rendered diff duplicated each assignment (old double-quoted lines
    next to the new single-quoted ones); only the post-commit set is kept.
    """
    r.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
    r.headers['Pragma'] = 'no-cache'
    r.headers['Expires'] = '0'
    # NOTE(review): this second assignment overwrites the Cache-Control value
    # set above — kept byte-for-byte because the final header value is what
    # clients actually observe; confirm whether 'public, max-age=0' is intended.
    r.headers['Cache-Control'] = 'public, max-age=0'
    return r

Expand Down

0 comments on commit 321a1f6

Please sign in to comment.