Skip to content

Commit

Permalink
Merge branch 'testing' of github.com:DARIAH-DE/Topics into testing
Browse files Browse the repository at this point in the history
testing.py minor changes
  • Loading branch information
Philip Dürholt committed Feb 16, 2017
2 parents da346b2 + ed8359a commit 9cf2099
Show file tree
Hide file tree
Showing 11 changed files with 714 additions and 1,598 deletions.
1,139 changes: 587 additions & 552 deletions IntegrationTest_txt_Mallet.ipynb

Large diffs are not rendered by default.

512 changes: 84 additions & 428 deletions IntegrationTest_txt_gensim.ipynb

Large diffs are not rendered by default.

569 changes: 0 additions & 569 deletions Tutorial_Toolbox.ipynb

This file was deleted.

24 changes: 12 additions & 12 deletions dariah_topics/mallet.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,18 +56,20 @@ def create_mallet_model(outfolder, path_to_corpus = os.path.join(os.path.abspath

sys = system()
if sys == 'Windows':
output = outfolder + "\\" + outfile
#output = outfolder + "\\" + outfile
output = os.path.join(outfolder, outfile)
log.debug(output)
shell=True
else:
output = outfolder + "/" + outfile
#output = outfolder + "/" + outfile
output = os.path.join(outfolder, outfile)
log.debug(output)
shell=False

param.append("--output")
param.append(output)
param.append ("--keep-sequence")
param.append("--remove-stopwords")
#param.append("--remove-stopwords")

try:
log.info("Accessing Mallet ...")
Expand All @@ -81,10 +83,9 @@ def create_mallet_model(outfolder, path_to_corpus = os.path.join(os.path.abspath
log.debug("Mallet terminated.")

return output



def create_mallet_output(path_to_malletModel, outfolder, path_to_mallet="mallet", num_topics = "10", doc_topics ="doc_topics.txt", topic_keys="topic_keys", **kwargs):
def create_mallet_output(path_to_malletModel, outfolder, path_to_mallet="mallet", num_topics = "10", doc_topics ="doc_topics.txt", topic_keys="topic_keys.txt", **kwargs):
"""Create mallet model
Args:
Expand All @@ -95,7 +96,8 @@ def create_mallet_output(path_to_malletModel, outfolder, path_to_mallet="mallet"
ToDo: **kwargs()
"""

outfolder = doc_topics = os.path.join(os.path.abspath('.'), outfolder)

param = []
param.append(path_to_mallet)
param.append("train-topics")
Expand All @@ -106,14 +108,14 @@ def create_mallet_output(path_to_malletModel, outfolder, path_to_mallet="mallet"

sys = system()
if sys == 'Windows':
doc_topics = outfolder + "\\" + doc_topics
doc_topics = outfolder + "\\" + "doc_topics.txt"
topic_keys = outfolder + "\\" + topic_keys
state = outfolder + "\\" + "state.gz"
word_top = outfolder + "\\" + "word_top.txt"
log.debug(outfolder)
shell = True
else:
doc_topics = outfolder + "/" + doc_topics
doc_topics = outfolder + "/" + "doc_topics.txt"
topic_keys = outfolder + "/" + topic_keys
state = outfolder + "/" + "state.gz"
word_top = outfolder + "/" + "word_top.txt"
Expand All @@ -126,9 +128,7 @@ def create_mallet_output(path_to_malletModel, outfolder, path_to_mallet="mallet"
param.append(state)
param.append("--output-topic-keys")
param.append(topic_keys)
param.append("–word-topic-counts")
param.append(word_top)


try:
log.info("Accessing Mallet ...")
p = Popen(param, stdout=PIPE, stderr=PIPE, shell=shell)
Expand Down Expand Up @@ -222,7 +222,7 @@ def show_docTopicMatrix(output_folder, docTopicsFile = "doc_topics.txt"):

return docTopicMatrix

def show_topics_keys(output_folder, topicsKeyFile = "topic_keys"):
def show_topics_keys(output_folder, topicsKeyFile = "topic_keys.txt"):
"""Show topic-key-mapping
Args:
Expand Down
4 changes: 4 additions & 0 deletions nano.save
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
x

x

34 changes: 17 additions & 17 deletions tutorial_supplementals/mallet_output/doc_topics.txt
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
#doc name topic proportion ...
0 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Poe_EurekaAProsePoem.txt 19 0.41097728836889197 10 0.2692704748795595 17 0.20388850653819685 7 0.039538885065381967 8 0.023640743289745355 16 0.0229525120440468 2 0.008912594631796283 11 0.0062284927735719205 9 0.0032002752924982793 12 0.002167928423950447 1 0.001686166551961459 5 0.0015485203028217482 15 0.0012044046799724708 0 0.0012044046799724708 3 0.0010667584308327598 6 6.538196834136269E-4 4 5.849965588437716E-4 14 5.161734342739161E-4 13 5.161734342739161E-4 18 2.4088093599449416E-4
1 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Howard_TheDevilinIron.txt 6 0.32830991124260356 2 0.1641087278106509 1 0.13526257396449703 11 0.10031434911242604 16 0.05704511834319527 9 0.05593565088757396 8 0.0472448224852071 7 0.029863165680473373 14 0.019323224852071007 5 0.01544008875739645 17 0.014885355029585799 10 0.012666420118343196 13 0.006749260355029586 0 0.003605769230769231 3 0.002311390532544379 12 0.001941568047337278 18 0.0017566568047337279 19 0.0013868343195266271 15 0.0013868343195266271 4 4.622781065088757E-4
2 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Lovecraft_TheShunnedHouse.txt 12 0.26847087878164233 8 0.18223914385676065 11 0.10341634081086644 7 0.07131096933525417 16 0.07069355834533855 10 0.05484667627083762 2 0.04723194072854497 17 0.044762296768882485 5 0.04455649310557728 9 0.027063181724634698 1 0.024181930438361803 0 0.012862728956575427 15 0.009981477670302531 4 0.008540852027166083 3 0.008540852027166083 14 0.0073060300473348425 19 0.006894422720724429 18 0.0033957604445359127 13 0.001955134801399465 6 0.001749331138094258
3 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Howard_SchadowsinZamboula.txt 6 0.37753468516542155 2 0.16835645677694772 11 0.11232657417289221 1 0.10734614016364283 9 0.07942013518320883 16 0.05327285663464959 8 0.01840981856990395 7 0.016986837424404128 5 0.011472785485592316 17 0.01111704019921736 0 0.009871931696905017 12 0.007559587335467805 3 0.005602988260405549 10 0.005425115617218072 15 0.005069370330843116 14 0.004357879758093206 13 0.002934898612593383 4 0.001511917467093561 19 9.782995375311276E-4 18 4.446816079686944E-4
4 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Doyle_AStudyinScarlet.txt 15 0.20930159329932616 5 0.15829082435921657 16 0.14292461741923296 7 0.09827445053214938 2 0.09191384847912337 9 0.07824800050381006 17 0.07522514012217393 11 0.02692235027394672 0 0.022765917249197052 1 0.020561748220920713 8 0.019931985641413186 4 0.015208766295106745 3 0.011052333270357075 12 0.006266137666099881 10 0.006140185150198375 18 0.004754707475281819 19 0.004439826185528056 6 0.0041249448957742935 13 0.0019837521254487058 14 0.001668870835694943
5 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Poe_TheCaskofAmontillado.txt 12 0.29274891774891776 16 0.12608225108225107 17 0.11093073593073594 11 0.0762987012987013 2 0.06547619047619048 9 0.06222943722943723 7 0.05357142857142857 5 0.039502164502164504 8 0.02867965367965368 1 0.026515151515151516 15 0.023268398268398268 14 0.017857142857142856 19 0.013528138528138528 3 0.013528138528138528 4 0.011363636363636364 0 0.010281385281385282 13 0.008116883116883116 10 0.007034632034632035 6 0.007034632034632035 18 0.005952380952380952
6 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Poe_TheMasqueoftheRedDeath.txt 11 0.4348469891411649 2 0.08933859822309971 16 0.0824284304047384 1 0.07156959526159921 17 0.06663376110562685 5 0.06367226061204344 19 0.03405725567620928 8 0.02912142152023692 7 0.022211253701875617 9 0.0192497532082922 12 0.01826258637709773 6 0.012339585389930898 10 0.011352418558736426 0 0.010365251727541954 18 0.009378084896347482 15 0.007403751233958539 13 0.005429417571569595 4 0.005429417571569595 14 0.004442250740375123 3 0.0024679170779861796
7 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Howard_GodsoftheNorth.txt 1 0.5368852459016393 2 0.13146279949558637 6 0.07219419924337957 11 0.06525851197982345 9 0.05390920554854981 16 0.04129886506935687 8 0.030580075662042874 7 0.012925598991172762 15 0.007250945775535939 17 0.006620428751576293 14 0.006620428751576293 10 0.005989911727616645 4 0.005989911727616645 19 0.005359394703656999 13 0.005359394703656999 3 0.004098360655737705 12 0.0022068095838587644 5 0.0022068095838587644 0 0.0022068095838587644 18 0.0015762925598991173
8 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Kipling_TheEndofthePassage.txt 0 0.40497661990647965 16 0.13677354709418837 9 0.12207748830995324 2 0.08600534402137608 7 0.04993319973279893 5 0.042585170340681364 17 0.026887107548430194 1 0.023547094188376753 11 0.022879091516366065 13 0.016199064796259186 8 0.013527054108216433 6 0.008183032732130929 12 0.007849031396125585 4 0.007849031396125585 3 0.00751503006012024 14 0.006847027388109553 10 0.006847027388109553 15 0.003841015364061456 18 0.003173012692050768 19 0.00250501002004008
9 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Doyle_TheSignoftheFour.txt 3 0.21719207019824505 16 0.1799480012999675 5 0.13873903152421188 7 0.08271043223919403 2 0.08011049723756906 9 0.07998050048748781 17 0.06815079623009425 4 0.03129671758206045 1 0.02642183945401365 11 0.02219694507637309 15 0.016737081572960676 0 0.015372115697107572 8 0.01420214494637634 10 0.006207344816379591 12 0.0058823529411764705 19 0.004127396815079623 18 0.004062398440038999 6 0.0030874228144296394 14 0.0018524536886577836 13 0.0017224569385765355
10 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Kipling_TheJungleBook.txt 13 0.4197732139380962 9 0.23211504769320615 16 0.16485789371228343 2 0.06358283044578548 1 0.05881350983064045 0 0.011168970216079424 5 0.010876970994744014 7 0.009903640256959315 11 0.005815651158263578 17 0.005085653104925054 6 0.0032363247031341248 8 0.003041658555577185 18 0.00279832587113101 10 0.0021656608915709558 3 0.0021169943546817207 15 0.001678995522678606 14 0.0011923301537862566 19 9.003309324508468E-4 12 5.109986373369671E-4 4 3.649990266692622E-4
11 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Doyle_AScandalinBohemia.txt 5 0.3803530565544295 16 0.14563582870219025 17 0.09267734553775744 7 0.08875449493298464 2 0.06979405034324943 9 0.05835240274599542 11 0.03775743707093822 4 0.02337365152010461 0 0.018470088264138606 15 0.017816279830009808 1 0.012258908139915006 12 0.009970578620464203 19 0.008336057535142204 10 0.008336057535142204 3 0.008336057535142204 6 0.007028440666884603 14 0.004740111147433802 8 0.0037593984962406013 18 0.003432494279176201 13 8.172605426610003E-4
12 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Poe_ThePurloinedLetter.txt 12 0.292481779823552 17 0.19198312236286919 7 0.12140391254315305 5 0.0949367088607595 19 0.07690832374376678 16 0.07575757575757576 2 0.029344073647871116 9 0.02819332566168009 11 0.0174530111238972 10 0.017069428461833524 8 0.012082853855005753 0 0.007479861910241657 15 0.0055619485999232835 3 0.0055619485999232835 6 0.005178365937859608 14 0.004794783275795934 18 0.004027617951668585 1 0.00364403528960491 4 0.003260452627541235 13 0.0028768699654775605
13 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Lovecraft_AttheMountainofMadness.txt 14 0.33879707396369546 8 0.2175833107558927 7 0.1146843673801138 16 0.06813871579517745 11 0.06201571389867245 10 0.06109455432132214 17 0.04619344351124356 2 0.038715795177458684 9 0.017420753183419128 1 0.009563803847195882 19 0.007179626117583311 12 0.005337306962882688 5 0.004524519100514766 6 0.001761040368463831 4 0.0015984827959902466 0 0.0015984827959902466 13 0.0011108100785694934 18 0.0010024383635871038 3 9.48252506095909E-4 15 7.315090761311298E-4
14 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Kipling_ThyServantaDog.txt 18 0.5152080686149937 9 0.1625238246505718 16 0.1394139135959339 2 0.038397395171537486 5 0.027199809402795427 7 0.02577033036848793 1 0.020925984752223636 17 0.016319885641677256 0 0.011157878017789072 11 0.008140088945362134 12 0.007345933926302414 8 0.0066311944091486654 13 0.006234116899618805 19 0.0036928208386277 3 0.003057496823379924 10 0.0026604193138500634 15 0.0019456797966963151 6 0.0019456797966963151 4 9.926937738246506E-4 14 4.3678526048284626E-4
15 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Howard_ShadowsintheMoonlight.txt 6 0.24995496306971718 1 0.23770491803278687 2 0.16978922716627634 11 0.10871914970275626 9 0.07052783282291479 16 0.05683660601693388 8 0.021347504954062332 7 0.01846514141596109 17 0.010178346243920014 13 0.009637903080526032 14 0.008557016753738066 5 0.008376869032606737 10 0.007836425869212755 0 0.005494505494505495 4 0.0038731760043235453 3 0.00351288056206089 15 0.0033327328409295622 12 0.002251846514141596 18 0.0020716987930102684 19 0.0015312556296162854
16 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Doyle_TheHoundoftheBaskervilles.txt 4 0.27940961921898555 16 0.17062199095462724 5 0.11571755094101055 2 0.10686670232942665 7 0.08663619121723484 17 0.07997373924038322 9 0.053226669260321935 11 0.0239021543549093 1 0.019087681758498273 8 0.01670476097845645 15 0.01339785050819433 10 0.007513495112580849 0 0.007416233039926081 12 0.00576277780479502 6 0.003331225988425813 18 0.0030394397704615085 19 0.002893546661479356 3 0.002844915625151972 14 0.0010941983173661431 13 5.592569177649176E-4
0 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Poe_EurekaAProsePoem.txt 6 0.5340270980371954 2 0.3495440729483283 1 0.050924733398588426 5 0.028746586986760084 3 0.02163721601153985 8 0.006310854670032456 7 0.0046880634691669676 4 0.002807686363402195 0 6.954819432280666E-4 9 6.182061717582814E-4
1 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Howard_TheDevilinIron.txt 4 0.41422293248102277 5 0.2636835797043548 1 0.11769876148621654 2 0.09013184178985217 7 0.030283659608469835 8 0.025809029165002 0 0.02261286456252497 3 0.018058330003995204 6 0.010866959648421893 9 0.006632041550139832
2 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Lovecraft_TheShunnedHouse.txt 2 0.22905889826663037 5 0.21943915055812688 3 0.18704056629458207 7 0.15591251474725473 1 0.08403666394409656 4 0.05100281332244305 6 0.033578364642889554 8 0.024321626281876757 9 0.008076957981668027 0 0.007532443960431981
3 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Howard_SchadowsinZamboula.txt 4 0.4406535319557905 5 0.2743072240909819 1 0.12229697260932244 2 0.0616690693576806 3 0.0304340861765177 8 0.02715040845747237 0 0.01561749159058145 7 0.014576325484542688 6 0.008569597949703668 9 0.004725292327406695
4 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Doyle_AStudyinScarlet.txt 1 0.2748863405034946 8 0.24554975006220173 5 0.16466490239985523 2 0.13781638053877995 3 0.11655470358055688 4 0.0218271470900907 0 0.011399877858451517 6 0.010676076089660945 9 0.009002284499332745 7 0.007622537377575717
5 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Poe_TheCaskofAmontillado.txt 3 0.24031653477717618 1 0.21990837151187007 5 0.17409412744689712 2 0.1116201582673886 8 0.1028738025822574 6 0.059975010412328195 9 0.029154518950437316 4 0.028321532694710536 7 0.021657642648896292 0 0.012078300708038317
6 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Poe_TheMasqueoftheRedDeath.txt 2 0.30553327987169204 5 0.23857257417802727 4 0.2016840417000802 1 0.08460304731355253 8 0.056134723336006415 6 0.0408981555733761 3 0.03408179631114675 7 0.024859663191659984 9 0.00681635926222935 0 0.00681635926222935
7 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Howard_GodsoftheNorth.txt 4 0.4482658959537572 5 0.28439306358381505 1 0.10317919075144509 2 0.044797687861271675 7 0.0338150289017341 8 0.028034682080924855 0 0.02254335260115607 3 0.014450867052023121 9 0.014161849710982659 6 0.006358381502890174
8 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Kipling_TheEndofthePassage.txt 1 0.31472807702315897 3 0.20986208691126723 5 0.17369242779078845 0 0.08873276086390841 8 0.07520166536559979 2 0.052823315118397086 4 0.03304709862086911 9 0.026671870934166016 6 0.016133229247983348 7 0.009107468123861567
9 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Doyle_TheSignoftheFour.txt 1 0.3236333781842231 8 0.239458140436498 5 0.1547811808707154 2 0.1256813154233848 3 0.10241966749526785 4 0.018586512805309128 9 0.011722046112796187 6 0.010467741567652625 0 0.008095965700472075 7 0.005154051403680813
10 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Kipling_TheJungleBook.txt 0 0.3805061455072906 1 0.3356553935414917 5 0.22125734788915186 2 0.017978471639056418 4 0.01565004962210856 3 0.012978089930529048 9 0.00797770822200168 8 0.0030536682189480115 7 0.0027864722497900605 6 0.002156653179632033
11 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Doyle_AScandalinBohemia.txt 1 0.28734046222835463 8 0.27802690582959644 5 0.14556743704725766 3 0.12590548464987927 2 0.10911808669656203 4 0.02092675635276532 6 0.014717718753593192 9 0.0072438771990341495 0 0.006553984132459468 7 0.004599287110497873
12 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Poe_ThePurloinedLetter.txt 6 0.25783435926133186 2 0.2148852825965305 1 0.17095691102406269 3 0.16759932848349188 8 0.10184667039731393 5 0.06435366536094013 4 0.007134862898712927 0 0.006715165081141578 9 0.004476776720761052 7 0.004196978175713486
13 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Lovecraft_AttheMountainofMadness.txt 2 0.3214432342357572 7 0.318472722003526 5 0.1802593764339363 1 0.07240321684739295 3 0.05904798705532881 6 0.031830366846185426 4 0.010602072113410776 0 0.0023909000893568722 9 0.0021976960417320742 8 0.001352428333373584
14 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Kipling_ThyServantaDog.txt 9 0.43832244627473255 1 0.27596326801098175 5 0.1522294802612894 2 0.04711414055350437 3 0.025371580043548236 0 0.02439332260405819 8 0.01760863391082079 4 0.010382151535233046 6 0.00583798794534381 7 0.0027769888604878665
15 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Howard_ShadowsintheMoonlight.txt 4 0.41651228352799036 5 0.2833668948852195 1 0.12605718888441403 2 0.06870720902134515 0 0.025291985501409583 3 0.0220700765203383 8 0.021425694724124044 7 0.018767619814740233 9 0.011196133709222714 6 0.006604913411196134
16 file:/home/sina/Uni/Dariah/DARIAH-Topics/Topics/corpus_txt/Doyle_TheHoundoftheBaskervilles.txt 1 0.31166647241970663 8 0.2676778608414778 5 0.13457984382544413 2 0.13246532691763374 3 0.11416726328232964 4 0.01796506884667255 6 0.010855630109388788 9 0.004845074174589167 7 0.0039293385058523835 0 0.0018481210769051464
Binary file not shown.
Binary file modified tutorial_supplementals/mallet_output/malletModel.mallet
Binary file not shown.
Binary file modified tutorial_supplementals/mallet_output/state.gz
Binary file not shown.

0 comments on commit 9cf2099

Please sign in to comment.