# TOC and index

2022 01 01

__Description:__
* This notebook renames all notebook located in the same folder so that the numbering becomes continuous. The step size can be set in `cintStep`.
* Additionally, a table of contents (TOC) and an index are created.

In [1]:
# own libraries
import Utilities as u
import MachineLearning as ml

# activate changes in libraries
import importlib
importlib.reload(u)
importlib.reload(ml)

# aliases
from Utilities import TypeChecker as t
from Utilities import PrintAlias as p

## Goal
* This Jupyter notebook lists all the notebooks of the projects: __table of contents__ or TOC.
* Furthermore, a raw __index__ is extracted:
    * words that are not common to all (or most) of the notebooks.
    * words without digits, underscores and camel-case.
* This notebook __re-numbers__ the notebooks
    * in strides of 1: final version.
    * in strides of 2: to make insertions easy during development.

In [2]:
import re
def WithoutAttachments(strSource):
    strSource = strSource.replace("\n", " ")
    return re.sub(r'"attachments":.*?}', "", strSource) # ? meaning non-greedy
if False:
    strTester =     '''
       before
       "attachments": {
        "a3e2149a-747f-41e8-a98a-3d13ad17d598.png": {
         "image/png": "iVBORw0KGgoAAAANSUhEUgAABAgUKFChQoECBAgUKnhOonzUB
        } after
    '''
    print(WithoutAttachments(strTester))

In [3]:
# get names of Jupyter notebooks
import os
import sys
import re
strDirectory = os.path.abspath(os.getcwd())
lstrFilenames = []
for strFilename in os.listdir(strDirectory):
    if strFilename.endswith(".ipynb") and not "index" in strFilename and not "Heeb" in strFilename:
        lstrFilenames.append(strFilename)
lstrFilenames.sort()

# cut current numbers
lstrFilenamesWithoutNumbers = []
for strFilename in lstrFilenames:
    lstrFilenamesWithoutNumbers.append(re.sub(r"^\d\d *", '', strFilename)) 

# create new numbers
if False: # productive
    cintStep = 1
else:
    cintStep = 2 # 2 allows for manual insertion of new notebooks
intNumber = cintStep
lstrFilenamesWithNewNumbers = []
lintNewNumbers = []
for strFilename in lstrFilenamesWithoutNumbers:
    lstrFilenamesWithNewNumbers.append(str(intNumber).zfill(2) + " " + strFilename)
    lintNewNumbers.append(intNumber)
    intNumber += cintStep

# rename
for intPointer,strOriginalFilename in enumerate(lstrFilenames):
    strNewFilename = lstrFilenamesWithNewNumbers[intPointer]
    os.rename(strOriginalFilename, strNewFilename)

# collect words, drop numbers
# create sets of chapter numbers for each word
dsintChapters = {} # dictionary of sets
print("Table of contents (TOC) ------------------------------".upper())
print()
for intPointer,strFilenameWithNewNumbers in enumerate(lstrFilenamesWithNewNumbers):
    print(strFilenameWithNewNumbers)
    intChapter = lintNewNumbers[intPointer]
    with open(strFilenameWithNewNumbers,encoding="utf-8") as objTextIOWrapper:
        strContents = objTextIOWrapper.read()
        strContents = WithoutAttachments(strContents)        
        lstrWords = re.findall(r"[\w']+|[;]", strContents)
        if False:
            lstrWords = lstrWords[:10]
        for strWord in lstrWords:
            strWord = strWord.replace("'", "")
            if not u.DoesContainDigits(strWord): 
                if not u.DoesContainUnderscore(strWord):
                    if not u.IsCamelCase(strWord):
                        try:
                            # consecutive entry
                            sintChapters = dsintChapters[strWord]
                            sintChapters.add(intChapter)
                            dsintChapters[strWord] = sintChapters
                        except:
                            # first entry
                            sintChapters = {intChapter}
                            dsintChapters[strWord] = sintChapters

# remove frequent entries
cfltProportionAllowed = 0.6
intEntriesAllowed = round(len(lstrFilenames) * cfltProportionAllowed)
dsintChaptersFrequentEntries = {}
for strWord in dsintChapters.keys():
    if len(dsintChapters[strWord]) <= intEntriesAllowed:
        dsintChaptersFrequentEntries[strWord.lower()] = dsintChapters[strWord]
     
# sort index by key
import collections
objOrderedDict = collections.OrderedDict(sorted(dsintChaptersFrequentEntries.items()))

# print index
print() 
print("Index -----------------------------------------------".upper())
print() 
for strWord, sintChapters in objOrderedDict.items():
    lintChapters = list(sintChapters)
    lintChapters.sort()
    lstrChapters = [str(intChapter) for intChapter in lintChapters] 
    print(f"{strWord}:" , ', '.join(lstrChapters))

TABLE OF CONTENTS (TOC) ------------------------------

02 Project overview.ipynb
04 dh Fitness by scoring.ipynb
06 dh Initialize learning process.ipynb
08 dh Starring candidates.ipynb
10 dh Learning parameters.ipynb
12 dh Gallery.ipynb

INDEX -----------------------------------------------

: 4, 6, 8, 10
;: 4, 8, 10
a: 4, 10
aaba: 8, 10
abbreviation: 4
able: 4
about: 4, 12
above: 6
action: 4
activate: 4, 6, 8, 10
actually: 4
adca: 4
add: 4
adds: 8, 10
adj: 4
adjust: 4
adjusts: 8, 10
administration: 4
administrative: 4
admission: 4
admissions: 4
adp: 4
adv: 4
advisory: 4
aead: 8
afbf: 4
after: 6, 8
again: 8, 12
ahhf: 10
aj: 10
algorithm: 4
aliases: 4, 6, 8, 10
align: 4, 8, 10
all: 4, 8, 10
always: 4
america: 4, 8
amerika: 4
aml: 10
amp: 4
an: 4, 8, 12
analyst: 4
analytic: 4
analytics: 4
and: 4, 8, 10
angeles: 4, 8
annot: 10
another: 4
any: 4, 6, 8, 10
append: 4, 8, 10
applies: 8, 10
apply: 4, 8, 10
approach: 4
are: 4, 8, 10, 12
area: 4, 8
argue: 4
arise: 4
army: 4
around: 12
art: 4
art