# TOC and index

2022 01 01

__Description:__
* This notebook renames all notebook located in the same folder so that the numbering becomes continuous. The step size can be set in `cintStep`.
* Additionally, a table of contents (TOC) and an index are created.

In [1]:
# own libraries
import Utilities as u
import MachineLearning as ml

# activate changes in libraries
import importlib
importlib.reload(u)
importlib.reload(ml)

# aliases
from Utilities import TypeChecker as t
from Utilities import PrintAlias as p

## Goal
* This Jupyter notebook lists all the notebooks of the projects: __table of contents__ or TOC.
* Furthermore, a raw __index__ is extracted:
    * words that are not common to all (or most) of the notebooks.
    * words without digits, underscores and camel-case.
* This notebook __re-numbers__ the notebooks
    * in strides of 1: final version.
    * in strides of 2: to make insertions easy during development.

In [2]:
import re
def WithoutAttachments(strSource):
    strSource = strSource.replace("\n", " ")
    return re.sub(r'"attachments":.*?}', "", strSource) # ? meaning non-greedy
if False:
    strTester =     '''
       before
       "attachments": {
        "a3e2149a-747f-41e8-a98a-3d13ad17d598.png": {
         "image/png": "iVBORw0KGgoAAAANSUhEUgAABAgUKFChQoECBAgUKnhOonzUB
        } after
    '''
    print(WithoutAttachments(strTester))

In [3]:
# get names of Jupyter notebooks
import os
import sys
import re
strDirectory = os.path.abspath(os.getcwd())
lstrFilenames = []
for strFilename in os.listdir(strDirectory):
    if strFilename.endswith(".ipynb") and not "index" in strFilename and not "Heeb" in strFilename:
        lstrFilenames.append(strFilename)
lstrFilenames.sort()

# cut current numbers
lstrFilenamesWithoutNumbers = []
for strFilename in lstrFilenames:
    lstrFilenamesWithoutNumbers.append(re.sub(r"^\d\d *", '', strFilename)) 

# create new numbers
if False: # productive
    cintStep = 1
else:
    cintStep = 2 # 2 allows for manual insertion of new notebooks
intNumber = cintStep
lstrFilenamesWithNewNumbers = []
lintNewNumbers = []
for strFilename in lstrFilenamesWithoutNumbers:
    lstrFilenamesWithNewNumbers.append(str(intNumber).zfill(2) + " " + strFilename)
    lintNewNumbers.append(intNumber)
    intNumber += cintStep

# rename
for intPointer,strOriginalFilename in enumerate(lstrFilenames):
    strNewFilename = lstrFilenamesWithNewNumbers[intPointer]
    os.rename(strOriginalFilename, strNewFilename)

# collect words, drop numbers
# create sets of chapter numbers for each word
dsintChapters = {} # dictionary of sets
print("Table of contents (TOC) ------------------------------".upper())
print()
for intPointer,strFilenameWithNewNumbers in enumerate(lstrFilenamesWithNewNumbers):
    print(strFilenameWithNewNumbers)
    intChapter = lintNewNumbers[intPointer]
    with open(strFilenameWithNewNumbers,encoding="utf-8") as objTextIOWrapper:
        strContents = objTextIOWrapper.read()
        strContents = WithoutAttachments(strContents)        
        lstrWords = re.findall(r"[\w']+|[;]", strContents)
        if False:
            lstrWords = lstrWords[:10]
        for strWord in lstrWords:
            strWord = strWord.replace("'", "")
            if not u.DoesContainDigits(strWord): 
                if not u.DoesContainUnderscore(strWord):
                    if not u.IsCamelCase(strWord):
                        try:
                            # consecutive entry
                            sintChapters = dsintChapters[strWord]
                            sintChapters.add(intChapter)
                            dsintChapters[strWord] = sintChapters
                        except:
                            # first entry
                            sintChapters = {intChapter}
                            dsintChapters[strWord] = sintChapters

# remove frequent entries
cfltProportionAllowed = 0.6
intEntriesAllowed = round(len(lstrFilenames) * cfltProportionAllowed)
dsintChaptersFrequentEntries = {}
for strWord in dsintChapters.keys():
    if len(dsintChapters[strWord]) <= intEntriesAllowed:
        dsintChaptersFrequentEntries[strWord.lower()] = dsintChapters[strWord]
     
# sort index by key
import collections
objOrderedDict = collections.OrderedDict(sorted(dsintChaptersFrequentEntries.items()))

# print index
print() 
print("Index -----------------------------------------------".upper())
print() 
for strWord, sintChapters in objOrderedDict.items():
    lintChapters = list(sintChapters)
    lintChapters.sort()
    lstrChapters = [str(intChapter) for intChapter in lintChapters] 
    print(f"{strWord}:" , ', '.join(lstrChapters))

TABLE OF CONTENTS (TOC) ------------------------------

02 dh Project overview.ipynb
04 dh EDA images.ipynb
06 dh Image preparation.ipynb
08 dh High-level features.ipynb
10 dh PCA.ipynb
12 dh Binary classifiers.ipynb
14 dh Sequence analysis.ipynb

INDEX -----------------------------------------------

;: 4, 6, 8, 12
aaae: 8
aadq: 6
aae: 6
ab: 10
abn: 10
above: 4, 10, 12
absolute: 10
abspath: 6, 8
ac: 6
according: 14
acg: 14
achieved: 12
achieves: 12
action: 2
ada: 14
add: 8
adjust: 6
adjusted: 6
ads: 8, 14
ae: 10
aekc: 14
af: 6
afg: 14
afgy: 14
afno: 10
after: 6
again: 6, 8, 12
agw: 14
ahr: 14
aix: 14
ajhr: 10
algorithm: 6
algorithms: 6
align: 4, 6, 8, 12
all: 12
alltogether: 12
alpha: 10, 12
alr: 6
already: 12
also: 14
although: 10
always: 6
an: 6, 10
analysed: 14
analyses: 4
analysis: 2, 10, 14
analyze: 2
analyzed: 4, 14
analyzing: 4, 14
anc: 10
and: 14
anmdv: 10
anti: 6
antialias: 6
ap: 6
app: 2
applies: 8
apply: 14
applymap: 6, 8
aq: 14
arange: 10
around: 6, 8
array: 6, 8, 10, 14
a