Skip to content

Commit

Permalink
some small fixes in mallet.py
Browse files: browse the repository at this point in the history
  • Loading branch information
sinabock committed Feb 28, 2017
1 parent 95754ba commit 86f3ab0
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 28 deletions.
45 changes: 31 additions & 14 deletions IntegrationTest_txt_Mallet.ipynb

Large diffs are not rendered by default.

25 changes: 11 additions & 14 deletions dariah_topics/mallet.py
Expand Up @@ -57,14 +57,12 @@ def create_mallet_model(outfolder, path_to_corpus = os.path.join(os.path.abspath

sys = system()
if sys == 'Windows':
output = os.path.join(outfolder, outfile)
log.debug(output)
shell=True
else:
output = os.path.join(outfolder, outfile)
log.debug(output)
else:
shell=False

output = os.path.join(outfolder, outfile)
log.debug(output)
param.append("--output")
param.append(output)
param.append ("--keep-sequence")
Expand All @@ -79,7 +77,7 @@ def create_mallet_model(outfolder, path_to_corpus = os.path.join(os.path.abspath
param.append("--stoplist-file")
param.append(stoplist)

print(param)
log.debug(print(param))

try:
log.info("Accessing Mallet ...")
Expand Down Expand Up @@ -140,8 +138,8 @@ def create_mallet_output(path_to_malletModel, outfolder, path_to_mallet="mallet"
doc_topics = outfolder + "/" + "doc_topics.txt"
topic_keys = outfolder + "/" + "topic_keys.txt"
state = outfolder + "/" + "state.gz"
word_topic_counts = outfolder + "/" + "word_topic_counts.txt"
word_topics_weights = outfolder + "/" + "word_topic_weights.txt"
# word_topic_counts = outfolder + "/" + "word_topic_counts.txt"
# word_topics_weights = outfolder + "/" + "word_topic_weights.txt"
log.debug(outfolder)
shell = False

Expand All @@ -151,17 +149,16 @@ def create_mallet_output(path_to_malletModel, outfolder, path_to_mallet="mallet"
param.append(state)
param.append("--output-topic-keys")
param.append(topic_keys)
param.append("--word-topic-counts-file")
param.append(word_topic_counts)
param.append("--topic-word-weights-file")
param.append(word_topics_weights)
print(param)
# param.append("--word-topic-counts-file")
# param.append(word_topic_counts)
# param.append("--topic-word-weights-file")
# param.append(word_topics_weights)
#print(param)

try:
log.info("Accessing Mallet ...")
p = Popen(param, stdout=PIPE, stderr=PIPE, shell=shell)
out = p.communicate()
log.info(out)
log.debug("Mallet file available.")


Expand Down
18 changes: 18 additions & 0 deletions tutorial_supplementals/mallet_output/doc_topics.txt
@@ -0,0 +1,18 @@
#doc name topic proportion ...
0 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Poe_EurekaAProsePoem.txt 3 0.7704060564349622 5 0.13406744666207845 2 0.029387474191328287 7 0.025258086717136956 9 0.017825189263592566 0 0.010254645560908465 8 0.008121128699242945 1 0.0024088093599449415 4 0.0013076393668272539 6 9.635237439779766E-4
1 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Howard_TheDevilinIron.txt 1 0.4654215976331361 7 0.27144970414201186 2 0.0963387573964497 9 0.051960059171597635 5 0.04659763313609468 6 0.02440828402366864 3 0.015902366863905327 0 0.01164940828402367 8 0.009985207100591717 4 0.00628698224852071
2 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Lovecraft_TheShunnedHouse.txt 5 0.38999794196336696 7 0.1891335665774851 9 0.12595184194278658 2 0.08849557522123894 8 0.0572134183988475 3 0.04939287919324964 0 0.04157233998765178 1 0.03580983741510599 6 0.01173080880839679 4 0.010701790491870755
3 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Howard_SchadowsinZamboula.txt 1 0.456065457132693 7 0.31127712557808607 2 0.11028103877623621 5 0.040732835289932406 9 0.019565990750622556 0 0.017787264318747775 6 0.01672002845962291 3 0.012984702952685877 8 0.008537886872998933 4 0.006047669868374244
4 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Doyle_AStudyinScarlet.txt 0 0.28074815794445496 2 0.2085773663328925 7 0.17627054600415643 8 0.15926695635745325 5 0.09175640783424649 3 0.02481264563259651 9 0.02008942628629007 1 0.016058945777441904 6 0.013099061653756534 4 0.00932048617671138
5 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Poe_TheCaskofAmontillado.txt 5 0.37987012987012986 7 0.20562770562770563 2 0.13095238095238096 0 0.08008658008658008 3 0.05844155844155844 8 0.04220779220779221 9 0.03787878787878788 1 0.027056277056277056 4 0.025974025974025976 6 0.011904761904761904
6 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Poe_TheMasqueoftheRedDeath.txt 7 0.3899308983218164 1 0.13820335636722605 5 0.12931885488647582 2 0.08884501480750247 0 0.06712734452122408 8 0.05725567620927937 9 0.055281342546890426 3 0.04343534057255676 6 0.019743336623889437 4 0.01085883514313919
7 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Howard_GodsoftheNorth.txt 1 0.5100882723833543 7 0.2604035308953342 2 0.08448928121059268 9 0.05737704918032787 5 0.02459016393442623 6 0.02080706179066835 8 0.012610340479192938 3 0.012610340479192938 4 0.008827238335435058 0 0.00819672131147541
8 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Kipling_TheEndofthePassage.txt 2 0.5574482297929192 7 0.11656646626586506 5 0.06579826319305278 0 0.061790247160988646 8 0.05143620574482298 6 0.047762191048764197 1 0.02905811623246493 9 0.02839011356045424 3 0.02404809619238477 4 0.017702070808283232
9 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Doyle_TheSignoftheFour.txt 0 0.34150146246343843 2 0.23178420539486513 7 0.15996100097497562 8 0.11777705557361066 5 0.06753331166720832 3 0.021969450763730906 1 0.021319467013324666 9 0.016704582385440363 6 0.011699707507312317 4 0.009749756256093598
10 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Kipling_TheJungleBook.txt 6 0.5303192524819934 2 0.2976932061514503 7 0.11222503406657582 1 0.018103951722795407 0 0.011533969242748686 4 0.01119330348452404 8 0.005742651352929726 5 0.00559665174226202 9 0.004185322172474207 3 0.0034066575822464472
11 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Doyle_AScandalinBohemia.txt 8 0.33998038574697614 2 0.20333442301405688 0 0.14351095129127167 7 0.14220333442301406 5 0.09905197777051324 1 0.02582543314808761 3 0.020268061457992807 6 0.009480222294867604 4 0.008826413860738803 9 0.007518796992481203
12 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Poe_ThePurloinedLetter.txt 5 0.4403528960490986 3 0.174530111238972 2 0.10855389336401995 8 0.09512850019179134 0 0.07633294975067127 7 0.06866129650939777 9 0.014959723820483314 1 0.00805523590333717 6 0.006904487917146145 4 0.00652090525508247
13 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Lovecraft_AttheMountainofMadness.txt 9 0.5579517745868329 5 0.15697642915199134 7 0.12961257111893795 2 0.06410186941208344 3 0.04882145759956651 1 0.012842048225413167 8 0.011324844215659713 0 0.010078569493362232 6 0.004714169601733947 4 0.003576266594418857
14 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Kipling_ThyServantaDog.txt 4 0.5706797966963151 2 0.23117852604828462 7 0.07703303684879288 5 0.02874841168996188 0 0.027318932655654382 6 0.021998094027954256 8 0.018424396442185513 1 0.010403430749682339 3 0.007544472681067344 9 0.006670902160101652
15 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Howard_ShadowsintheMoonlight.txt 1 0.4640605296343001 7 0.2702215816969915 2 0.13168798414700053 9 0.03747072599531616 6 0.03548910106287156 5 0.02269861286254729 0 0.01405152224824356 8 0.01098901098901099 3 0.007566204287515763 4 0.005764727076202486
16 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Doyle_TheHoundoftheBaskervilles.txt 8 0.42712639206341485 2 0.20585517677381704 7 0.18008072752030346 5 0.07596167874337402 0 0.057044205612021594 3 0.01833390069542382 1 0.012887224626756797 9 0.012546807372465107 4 0.008413169284637456 6 0.0017507173077858289
Binary file not shown.
Binary file not shown.
Binary file added tutorial_supplementals/mallet_output/state.gz
Binary file not shown.
10 changes: 10 additions & 0 deletions tutorial_supplementals/mallet_output/topic_keys.txt
@@ -0,0 +1,10 @@
0 5 small holmes answered hope face great hand room mr case sherlock door asked men companion treasure mind father sholto found
1 5 conan eyes door sword girl men great head feet blood olivia trees cimmerian cliffs man chamber red aram giant limbs
2 5 man time long night good day made eyes looked make side don give head put life place white heart knew
3 5 centre point matter atoms fact idea sun universe force space general principle stars law tendency distance system truth earth condition
4 5 ravager dog smallest moore slippers master proper back wented told tags rat walk hunt nose dash things gods toby cat
5 5 years thought thing part nature moment found time true form house letter finally street uncle human ground pass matter suggested
6 5 mowgli jungle bagheera big wolf sea men nag rikki tikki back thou baloo khan head shere people toomai till thee
7 5 back black hand stood hands dark found voice open light set felt brought turned fear sound house half death people
8 5 sir holmes man moor henry watson friend dr house baskerville heard charles case stapleton mortimer hall told mr doubt room
9 5 lake great land camp made things city feet world ice danforth mountains place vast antarctic earth sea life snow range

0 comments on commit 86f3ab0

Please sign in to comment.