# TOC and index

2022 01 01

__Description:__
* This notebook renames all notebook located in the same folder so that the numbering becomes continuous. The step size can be set in `cintStep`.
* Additionally, a table of contents (TOC) and an index are created.

In [1]:
# own libraries
import Utilities as u
import MachineLearning as ml

# activate changes in libraries
import importlib
importlib.reload(u)
importlib.reload(ml)

# aliases
from Utilities import TypeChecker as t
from Utilities import PrintAlias as p

## Goal
* This Jupyter notebook lists all the notebooks of the projects: __table of contents__ or TOC.
* Furthermore, a raw __index__ is extracted:
    * words that are not common to all (or most) of the notebooks.
    * words without digits, underscores and camel-case.
* This notebook __re-numbers__ the notebooks
    * in strides of 1: final version.
    * in strides of 2: to make insertions easy during development.

In [2]:
import re
def WithoutAttachments(strSource):
    strSource = strSource.replace("\n", " ")
    return re.sub(r'"attachments":.*?}', "", strSource) # ? meaning non-greedy
if False:
    strTester =     '''
       before
       "attachments": {
        "a3e2149a-747f-41e8-a98a-3d13ad17d598.png": {
         "image/png": "iVBORw0KGgoAAAANSUhEUgAABAgUKFChQoECBAgUKnhOonzUB
        } after
    '''
    print(WithoutAttachments(strTester))

In [3]:
# get names of Jupyter notebooks
import os
import sys
import re
strDirectory = os.path.abspath(os.getcwd())
lstrFilenames = []
for strFilename in os.listdir(strDirectory):
    if strFilename.endswith(".ipynb") and not "index" in strFilename and not "Heeb" in strFilename:
        lstrFilenames.append(strFilename)
lstrFilenames.sort()

# cut current numbers
lstrFilenamesWithoutNumbers = []
for strFilename in lstrFilenames:
    lstrFilenamesWithoutNumbers.append(re.sub(r"^\d\d *", '', strFilename)) 

# create new numbers
if False: # productive
    cintStep = 1
else:
    cintStep = 2 # 2 allows for manual insertion of new notebooks
intNumber = cintStep
lstrFilenamesWithNewNumbers = []
lintNewNumbers = []
for strFilename in lstrFilenamesWithoutNumbers:
    lstrFilenamesWithNewNumbers.append(str(intNumber).zfill(2) + " " + strFilename)
    lintNewNumbers.append(intNumber)
    intNumber += cintStep

# rename
for intPointer,strOriginalFilename in enumerate(lstrFilenames):
    strNewFilename = lstrFilenamesWithNewNumbers[intPointer]
    os.rename(strOriginalFilename, strNewFilename)

# collect words, drop numbers
# create sets of chapter numbers for each word
dsintChapters = {} # dictionary of sets
print("Table of contents (TOC) ------------------------------".upper())
print()
for intPointer,strFilenameWithNewNumbers in enumerate(lstrFilenamesWithNewNumbers):
    print(strFilenameWithNewNumbers)
    intChapter = lintNewNumbers[intPointer]
    with open(strFilenameWithNewNumbers,encoding="utf-8") as objTextIOWrapper:
        strContents = objTextIOWrapper.read()
        strContents = WithoutAttachments(strContents)        
        lstrWords = re.findall(r"[\w']+|[;]", strContents)
        if False:
            lstrWords = lstrWords[:10]
        for strWord in lstrWords:
            strWord = strWord.replace("'", "")
            if not u.DoesContainDigits(strWord): 
                if not u.DoesContainUnderscore(strWord):
                    if not u.IsCamelCase(strWord):
                        try:
                            # consecutive entry
                            sintChapters = dsintChapters[strWord]
                            sintChapters.add(intChapter)
                            dsintChapters[strWord] = sintChapters
                        except:
                            # first entry
                            sintChapters = {intChapter}
                            dsintChapters[strWord] = sintChapters

# remove frequent entries
cfltProportionAllowed = 0.6
intEntriesAllowed = round(len(lstrFilenames) * cfltProportionAllowed)
dsintChaptersFrequentEntries = {}
for strWord in dsintChapters.keys():
    if len(dsintChapters[strWord]) <= intEntriesAllowed:
        dsintChaptersFrequentEntries[strWord.lower()] = dsintChapters[strWord]
     
# sort index by key
import collections
objOrderedDict = collections.OrderedDict(sorted(dsintChaptersFrequentEntries.items()))

# print index
print() 
print("Index -----------------------------------------------".upper())
print() 
for strWord, sintChapters in objOrderedDict.items():
    lintChapters = list(sintChapters)
    lintChapters.sort()
    lstrChapters = [str(intChapter) for intChapter in lintChapters] 
    print(f"{strWord}:" , ', '.join(lstrChapters))

TABLE OF CONTENTS (TOC) ------------------------------

02 dh Term Deposit.ipynb

INDEX -----------------------------------------------

: 2
;: 2
a: 2
aa: 2
aaaf: 2
ability: 2
able: 2
abm: 2
about: 2
above: 2
abs: 2
abyulfklr: 2
ac: 2
acbz: 2
account: 2
accuracy: 2
achieve: 2
acp: 2
activate: 2
activity: 2
actual: 2
actually: 2
acw: 2
ad: 2
add: 2
adding: 2
additional: 2
addressed: 2
adjusting: 2
adjustments: 2
adl: 2
admin: 2
ador: 2
ae: 2
af: 2
afknd: 2
after: 2
afzi: 2
age: 2
ah: 2
ahg: 2
aj: 2
al: 2
alb: 2
algcu: 2
aliases: 2
align: 2
all: 2
allowed: 2
almost: 2
alongside: 2
alpha: 2
also: 2
alut: 2
am: 2
among: 2
amount: 2
amtde: 2
an: 2
analysis: 2
anchor: 2
and: 2
another: 2
answer: 2
any: 2
ao: 2
ap: 2
append: 2
apply: 2
apr: 2
april: 2
apziva: 2
aqa: 2
ar: 2
ara: 2
are: 2
array: 2
artificial: 2
arz: 2
as: 2
ascending: 2
ask: 2
assess: 2
assessment: 2
assume: 2
assumption: 2
at: 2
atj: 2
au: 2
aug: 2
august: 2
autopct: 2
auxa: 2
auz: 2
av: 2
avcm: 2
average: 2
avl: 2
avoiding: 