In [1]:
from dariah_topics import preprocessing as pre
from dariah_topics import visualization as visual
from dariah_topics import evaluation

## Preprocessing

#### Liste mit Dateinamen erzeugen

In [2]:
path_txt = "corpus_txt"

doclist_txt = pre.create_document_list(path_txt)
assert doclist_txt, "No documents found"
doclist_txt[:5]

INFO preprocessing: Creating document list from TXT files ...


['corpus_txt/Lovecraft_AttheMountainofMadness.txt',
 'corpus_txt/Howard_TheDevilinIron.txt',
 'corpus_txt/Poe_ThePurloinedLetter.txt',
 'corpus_txt/Lovecraft_TheShunnedHouse.txt',
 'corpus_txt/Poe_TheMasqueoftheRedDeath.txt']

#####  Liste mit Dokumentenlabels erzeugen - (Funktion wird durch Thorsten's generischere Funktion ersetzt)

In [3]:
doc_labels = list(pre.get_labels(doclist_txt))
doc_labels[:5]

INFO preprocessing: Creating document labels ...


['Lovecraft_AttheMountainofMadness',
 'Howard_TheDevilinIron',
 'Poe_ThePurloinedLetter',
 'Lovecraft_TheShunnedHouse',
 'Poe_TheMasqueoftheRedDeath']

#### Corpus laden

In [4]:
corpus_txt = pre.read_from_txt(doclist_txt)

#### Tokenisieren

In [5]:
doc_tokens = [list(pre.tokenize(txt)) for txt in list(corpus_txt)]
len(doc_tokens)

INFO preprocessing: Accessing TXT documents ...


17

#### Create Dictionaries

In [6]:
id_types = pre.create_dictionary(doc_tokens)
doc_ids = pre.create_dictionary(doc_labels)

#### Sparse BOW Model

In [7]:
sparse_bow = pre.create_sparse_bow(doc_labels, doc_tokens, id_types, doc_ids)

In [8]:
sparse_bow[:5]

Unnamed: 0_level_0,Unnamed: 1_level_0,0
doc_id,token_id,Unnamed: 2_level_1
1,8195,1
1,8197,1
1,8198,2
1,8,11
1,16393,19


## Topic Modeling with Gensim

##### Saving Sparse BOW

##### Remove Features

In [9]:
import os.path
basepath = os.path.abspath('.')

with open(os.path.join(basepath, "tutorial_supplementals", "stopwords", "en.txt"), 'r', encoding = 'utf-8') as f: 
    stopword_list = f.read().split('\n')
    
stopword_list = set(stopword_list)

In [10]:
hapax_from_remove = pre.find_hapax(sparse_bow, id_types)
stopwords_from_remove = pre.find_stopwords(sparse_bow, id_types, mfw=75)

features_to_be_removed = stopwords_from_remove + hapax_from_remove

sparse_bow_short = pre.remove_features(sparse_bow, id_types, features_to_be_removed)
len(stopwords_from_remove)

INFO preprocessing: Finding hapax legomena ...
INFO preprocessing: Finding stopwords ...
INFO preprocessing: Removing features ...


75

In [11]:
doc2bow_list = []
for doc in sparse_bow.index.groupby(sparse_bow.index.get_level_values('doc_id')):
    temp = [(token, count) for token, count in zip(sparse_bow.loc[doc].index, sparse_bow.loc[doc][0])]
    doc2bow_list.append(temp)
len(doc2bow_list)

17

In [12]:
len(features_to_be_removed)

9862

In [13]:
len(doc_tokens[0])

39907

In [14]:
def remove_features_from_file(doc_token_list, features_to_be_removed):
    """Removes features using feature list.

    Description:
        With this function you can remove features from ppreprocessed files. 
        Commit a list of features.
        Use the function `tokenize()` to access your files.

    Args:
        doc_token_list Union(list[str], str): List of all documents in the corpus
            and their tokens.
        features_to_be_removed list[str]: List of features that should be
        removed
    Yields:
        cleaned token array

    Todo:

    Example:
        >>> doc_tokens = [['short', 'example', 'example', 'text', 'text']]
        >>> features_to_be_removed = ['example']
        >>> test = remove_features_from_file(doc_tokens, features_to_be_removed)
        >>> list(test)
        [['short', 'text', 'text']]
    """
    #log.info("Removing features ...")
    doc_token_array = np.array(doc_token_list)
    feature_array = np.array(features_to_be_removed)
    #get indices of features that should be deleted
    indices = np.where(np.in1d(doc_token_array, feature_array,))
    doc_token_array = np.delete(doc_token_array, indices)
    yield doc_token_array.tolist()

In [15]:
import numpy as np
doc_tokens_cleaned = []
for doc_token_list in doc_tokens:
    doc_token_list_cleaned = remove_features_from_file(doc_token_list, features_to_be_removed)
    doc_tokens_cleaned.append(list(doc_token_list_cleaned))

In [16]:
doc_tokens_cleaned[0]

[['mountains',
  'madness',
  'lovecraft',
  'chapter',
  'am',
  'forced',
  'speech',
  'because',
  'men',
  'science',
  'refused',
  'follow',
  'advice',
  'without',
  'knowing',
  'why',
  'altogether',
  'against',
  'tell',
  'reasons',
  'contemplated',
  'invasion',
  'antarctic',
  'its',
  'vast',
  'fossil',
  'hunt',
  'its',
  'wholesale',
  'boring',
  'melting',
  'ancient',
  'ice',
  'caps',
  'am',
  'reluctant',
  'because',
  'may',
  'vain',
  'doubt',
  'real',
  'facts',
  'must',
  'reveal',
  'inevitable',
  'yet',
  'suppressed',
  'seem',
  'extravagant',
  'incredible',
  'nothing',
  'left',
  'hitherto',
  'withheld',
  'photographs',
  'both',
  'ordinary',
  'aerial',
  'count',
  'favor',
  'damnably',
  'vivid',
  'graphic',
  'still',
  'doubted',
  'because',
  'great',
  'lengths',
  'clever',
  'can',
  'carried',
  'ink',
  'drawings',
  'course',
  'jeered',
  'obvious',
  'notwithstanding',
  'strangeness',
  'technique',
  'art',
  'ought',

In [18]:
def create_mallet_import(doc_tokens_cleaned, doc_labels, outpath = os.path.join('tutorial_supplementals', 'mallet_input')):
    """Creates files for mallet import.

    Description:
        With this function you can create preprocessed plain text files. 
        Commit a list of full paths or one single path as argument.
        Use the function `remove_features_from_file()` to create a list of tokens
        per document.

    Args:
        doc_tokens_cleaned Union(list[str], str): List of tokens per document
        doc_labels list[str]: List of documents labels.

    Todo:
    
    Example:
        >>> doc_labels = ['examplefile']
        >>> doc_tokens_cleaned = [['short', 'example', 'text']]
        >>> create_mallet_import(doc_tokens_cleaned, doc_labels)
        >>> outpath = os.path.join('tutorial_supplementals', 'mallet_input')
        >>> os.path.isfile(os.path.join(outpath, 'examplefile.txt'))
        True
    """
    #log.info("Generating mallet input files ...")
    if not os.path.exists(outpath):
                os.makedirs(outpath)
    
    for tokens, label in zip(doc_tokens_cleaned, doc_labels):
        with open(os.path.join(outpath,label+'.txt'), 'w', encoding="utf-8") as f:
            f.write(str(tokens))

In [19]:
create_mallet_import(doc_tokens_cleaned, doc_labels)