Skip to content

Commit

Permalink
add multithreading hint, Fix #62
Browse files Browse the repository at this point in the history
  • Loading branch information
ThoraHagen committed Jul 24, 2018
1 parent e0a3112 commit d17658f
Showing 1 changed file with 81 additions and 28 deletions.
109 changes: 81 additions & 28 deletions notebooks/IntroducingMallet.ipynb
Expand Up @@ -73,7 +73,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from cophi_toolbox import preprocessing\n",
Expand All @@ -92,7 +94,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import metadata_toolbox.utils as metadata\n",
Expand All @@ -110,7 +114,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import warnings\n",
Expand Down Expand Up @@ -143,7 +149,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"path_to_corpus = Path('data', 'grenzboten_sample')"
Expand Down Expand Up @@ -173,7 +181,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pattern = '{author}_{year}_{title}'"
Expand All @@ -190,7 +200,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"meta = pd.concat([metadata.fname2metadata(str(path), pattern=pattern) for path in path_to_corpus.glob('*.txt')])\n",
Expand All @@ -207,7 +219,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"corpus = list(preprocessing.read_files(meta.index))\n",
Expand Down Expand Up @@ -239,7 +253,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tokenized_corpus = [list(preprocessing.tokenize(document)) for document in corpus]"
Expand All @@ -255,7 +271,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tokenized_corpus[0][0:13]"
Expand All @@ -282,7 +300,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"document_term_matrix, document_ids, type_ids = preprocessing.create_document_term_matrix(tokenized_corpus,\n",
Expand All @@ -302,7 +322,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"document_term_matrix = preprocessing.create_document_term_matrix(tokenized_corpus, meta['title'])\n",
Expand Down Expand Up @@ -336,7 +358,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"stopwords = preprocessing.list_mfw(document_term_matrix, most_frequent_tokens=100)"
Expand All @@ -352,7 +376,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"stopwords[:5]"
Expand All @@ -368,7 +394,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"hapax_legomena = preprocessing.find_hapax_legomena(document_term_matrix)\n",
Expand All @@ -386,7 +414,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"path_to_stopwordlist = Path('data', 'stopwords', 'de.txt')\n",
Expand All @@ -403,7 +433,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"features = stopwords + hapax_legomena + external_stopwords\n",
Expand Down Expand Up @@ -431,7 +463,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"path_to_mallet = 'mallet'"
Expand All @@ -449,7 +483,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"Mallet = utils.Mallet(path_to_mallet)"
Expand All @@ -465,7 +501,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"mallet_corpus = Mallet.import_tokenized_corpus(clean_tokenized_corpus, meta['title'])"
Expand Down Expand Up @@ -527,7 +565,7 @@
"\n",
"So, now you know how to define the number of topics and the number of sampling iterations as well. A higher number of iterations will probably yield a better model, but also increases processing time. `alpha` and `beta` are so-called *hyperparameters*. They influence the model's performance, so feel free to play around with them. In the present example, we will leave the default values. Furthermore, there exist various methods for hyperparameter optimization, e.g. grid search or Gaussian optimization.\n",
"\n",
"**Warning: This step can take quite a while!** Meaning something between some seconds and some hours depending on corpus size and the number of iterations. Our example corpus should be done within a minute or two at `num_iterations=1000`."
"**Warning: This step can take quite a while!** This can mean anything from a few seconds to several hours, depending on corpus size and the number of iterations. Our example corpus should be done within a minute or two at `num_iterations=1000`. You can, however, increase the number of threads (`num_threads`, defaults to 1) for parallel training."
]
},
{
Expand All @@ -540,7 +578,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"output = Path('data', 'mallet_output')\n",
Expand All @@ -552,7 +592,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%%time\n",
Expand All @@ -561,6 +603,7 @@
" output_topic_keys=str(Path(output, 'topic_keys.txt')),\n",
" output_doc_topics=str(Path(output, 'doc_topics.txt')),\n",
" num_topics=10,\n",
" num_threads=1,\n",
" num_iterations=1000)"
]
},
Expand All @@ -583,7 +626,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"topics = postprocessing.show_topics(topic_keys_file=str(Path(output, 'topic_keys.txt')))\n",
Expand All @@ -609,7 +654,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"document_topics = postprocessing.show_document_topics(topics=topics,\n",
Expand All @@ -636,7 +683,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from bokeh.io import output_notebook, show\n",
Expand All @@ -647,7 +696,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"PlotDocumentTopics = visualization.PlotDocumentTopics(document_topics)\n",
Expand All @@ -664,7 +715,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"static_heatmap = PlotDocumentTopics.static_heatmap()\n",
Expand All @@ -691,7 +744,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.5.1"
}
},
"nbformat": 4,
Expand Down

0 comments on commit d17658f

Please sign in to comment.