Skip to content

Commit

Permalink
Remove doclist stuff back to doclist.py
Browse files Browse the repository at this point in the history
  • Loading branch information
Severin Simmler committed Jun 21, 2017
1 parent d432032 commit deb6535
Show file tree
Hide file tree
Showing 5 changed files with 181 additions and 663 deletions.
185 changes: 46 additions & 139 deletions IntegrationTest_Mallet_Export.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -3,100 +3,80 @@
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [],
"source": [
"from dariah_topics import preprocessing as pre\n",
"from dariah_topics import visualization as visual\n",
"from dariah_topics import evaluation"
"from dariah_topics import evaluation\n",
"from dariah_topics import doclist"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"metadata": {},
"source": [
"## Preprocessing"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"metadata": {},
"source": [
"#### Liste mit Dateinamen erzeugen"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['corpus_txt/Lovecraft_AttheMountainofMadness.txt',\n",
" 'corpus_txt/Howard_TheDevilinIron.txt',\n",
" 'corpus_txt/Poe_ThePurloinedLetter.txt',\n",
" 'corpus_txt/Lovecraft_TheShunnedHouse.txt',\n",
" 'corpus_txt/Poe_TheMasqueoftheRedDeath.txt']"
"['corpus_txt/Doyle_AScandalinBohemia.txt',\n",
" 'corpus_txt/Doyle_AStudyinScarlet.txt',\n",
" 'corpus_txt/Doyle_TheHoundoftheBaskervilles.txt',\n",
" 'corpus_txt/Doyle_TheSignoftheFour.txt',\n",
" 'corpus_txt/Howard_GodsoftheNorth.txt']"
]
},
"execution_count": 17,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path_to_corpus = \"corpus_txt\"\n",
"path_doc_list = pre.PathDocList(path_to_corpus)\n",
"path_doc_list = doclist.PathDocList(path_to_corpus)\n",
"\n",
"doclist_txt = path_doc_list.full_paths(as_str=True)\n",
"doclist_txt[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"metadata": {},
"source": [
"##### Liste mit Dokumentenlabels erzeugen (Funktion wird durch Thorstens generischere Funktion ersetzt)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Lovecraft_AttheMountainofMadness',\n",
" 'Howard_TheDevilinIron',\n",
" 'Poe_ThePurloinedLetter',\n",
" 'Lovecraft_TheShunnedHouse',\n",
" 'Poe_TheMasqueoftheRedDeath']"
"['Doyle_AScandalinBohemia',\n",
" 'Doyle_AStudyinScarlet',\n",
" 'Doyle_TheHoundoftheBaskervilles',\n",
" 'Doyle_TheSignoftheFour',\n",
" 'Howard_GodsoftheNorth']"
]
},
"execution_count": 18,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -108,45 +88,31 @@
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"metadata": {},
"source": [
"#### Corpus laden"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"corpus_txt = pre.read_from_txt(doclist_txt)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"metadata": {},
"source": [
"#### Tokenisieren"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"name": "stderr",
Expand All @@ -173,22 +139,15 @@
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"metadata": {},
"source": [
"#### Create Dictionaries"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [],
"source": [
"id_types = pre.create_dictionary(doc_tokens)\n",
Expand All @@ -197,22 +156,15 @@
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"metadata": {},
"source": [
"#### Sparse BOW Model"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [],
"source": [
"sparse_bow = pre.create_sparse_bow(doc_labels, doc_tokens, id_types, doc_ids)"
Expand All @@ -221,11 +173,7 @@
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"data": {
Expand Down Expand Up @@ -291,22 +239,15 @@
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"metadata": {},
"source": [
"##### Remove Features"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [],
"source": [
"import os.path\n",
Expand All @@ -321,11 +262,7 @@
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"name": "stderr",
Expand Down Expand Up @@ -360,11 +297,7 @@
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"data": {
Expand All @@ -388,11 +321,7 @@
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"data": {
Expand All @@ -412,11 +341,7 @@
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"data": {
Expand All @@ -437,9 +362,7 @@
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": [
Expand Down Expand Up @@ -480,11 +403,7 @@
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
Expand All @@ -497,11 +416,7 @@
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [
{
"data": {
Expand Down Expand Up @@ -1521,11 +1436,7 @@
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [],
"source": [
"def create_mallet_import(doc_tokens_cleaned, doc_labels, outpath = os.path.join('tutorial_supplementals', 'mallet_input')):\n",
Expand Down Expand Up @@ -1563,11 +1474,7 @@
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"metadata": {},
"outputs": [],
"source": [
"create_mallet_import(doc_tokens_cleaned, doc_labels)"
Expand Down Expand Up @@ -1599,9 +1506,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
"nbformat_minor": 1
}
Loading

0 comments on commit deb6535

Please sign in to comment.