# TOC and index

2022 01 01

__Description:__
* This notebook renames all notebooks located in the same folder so that the numbering becomes continuous. The step size can be set in `cintStep`.
* Additionally, a table of contents (TOC) and an index are created.

In [1]:
# own libraries
import Utilities as u
import MachineLearning as ml

# activate changes in libraries
import importlib
importlib.reload(u)
importlib.reload(ml)

# aliases
from Utilities import TypeChecker as t
from Utilities import PrintAlias as p

## Goal
* This Jupyter notebook lists all the notebooks of the project: the __table of contents__ or TOC.
* Furthermore, a raw __index__ is extracted:
    * words that are not common to all (or most) of the notebooks.
    * words without digits, underscores and camel-case.
* This notebook __re-numbers__ the notebooks
    * in strides of 1: final version.
    * in strides of 2: to make insertions easy during development.

In [2]:
import re
# Matches one '"attachments": ...' entry of a notebook's JSON text.
# * 1st alternative: the complete, brace-balanced object with at most one nested
#   level, so that cells with several attachments are removed as a whole.
#   The "unrolled" form [^{}]*(?:\{[^{}]*\}[^{}]*)* keeps the matching linear.
# * 2nd alternative: fallback for truncated / unbalanced text - cut up to the
#   first closing brace (non-greedy), which is what the previous version did.
_objAttachmentsPattern = re.compile(
    r'"attachments":(?:\s*\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}|.*?\})'
)


def WithoutAttachments(strSource):
    """Return `strSource` on a single line with all "attachments" entries removed.

    Embedded attachments carry long base64 payloads that would otherwise be
    picked up as "words" by the index.

    Parameters:
        strSource (str): raw text of a notebook file.

    Returns:
        str: the text with every newline replaced by a blank and every
        '"attachments": {...}' entry cut out. Text without such an entry is
        returned unchanged apart from the newline replacement.
    """
    # flatten first: the fallback alternative uses ".", which does not match "\n"
    strSource = strSource.replace("\n", " ")
    return _objAttachmentsPattern.sub("", strSource)
# Manual smoke test for WithoutAttachments: disabled by default, change the
# condition to True to run it.
# The sample mimics a (truncated) fragment of notebook JSON with an embedded PNG;
# the text between "before" and "after" is what the function is meant to cut out.
if False:
    strTester =     '''
       before
       "attachments": {
        "a3e2149a-747f-41e8-a98a-3d13ad17d598.png": {
         "image/png": "iVBORw0KGgoAAAANSUhEUgAABAgUKFChQoECBAgUKnhOonzUB
        } after
    '''
    print(WithoutAttachments(strTester))

In [3]:
# Renumber the notebooks of the current folder, print the table of contents
# and print a raw index (word -> chapter numbers).
import os
import sys
import re
import collections

# get names of Jupyter notebooks
# files whose name contains "index" or "Heeb" are neither renamed nor indexed
strDirectory = os.path.abspath(os.getcwd())
lstrFilenames = sorted(
    strFilename
    for strFilename in os.listdir(strDirectory)
    if strFilename.endswith(".ipynb")
    and "index" not in strFilename
    and "Heeb" not in strFilename
)

# cut current numbers
# \d+ (not \d\d): a prefix of any width is removed, so names stay intact
# once the numbering reaches 100
lstrFilenamesWithoutNumbers = [
    re.sub(r"^\d+ *", "", strFilename) for strFilename in lstrFilenames
]

# create new numbers
if False: # productive
    cintStep = 1
else:
    cintStep = 2 # 2 allows for manual insertion of new notebooks
lintNewNumbers = [
    cintStep * (intPointer + 1)
    for intPointer in range(len(lstrFilenamesWithoutNumbers))
]
# all prefixes get the same width (at least 2), so that the alphabetical sort
# above keeps the numeric order on the next run
intWidth = max(2, len(str(lintNewNumbers[-1]))) if lintNewNumbers else 2
lstrFilenamesWithNewNumbers = [
    str(intNumber).zfill(intWidth) + " " + strFilename
    for intNumber, strFilename in zip(lintNewNumbers, lstrFilenamesWithoutNumbers)
]

# rename
# Two phases: a new name may equal the current name of a notebook that has not
# been renamed yet (e.g. two notebooks with the same base name). A direct
# os.rename would then replace that notebook (POSIX) or raise (Windows).
cstrTemporarySuffix = ".renumbering"
ltstrRenames = [
    (strOriginalFilename, strNewFilename)
    for strOriginalFilename, strNewFilename in zip(lstrFilenames, lstrFilenamesWithNewNumbers)
    if strOriginalFilename != strNewFilename
]
for strOriginalFilename, _ in ltstrRenames:
    os.rename(
        os.path.join(strDirectory, strOriginalFilename),
        os.path.join(strDirectory, strOriginalFilename + cstrTemporarySuffix),
    )
for strOriginalFilename, strNewFilename in ltstrRenames:
    os.rename(
        os.path.join(strDirectory, strOriginalFilename + cstrTemporarySuffix),
        os.path.join(strDirectory, strNewFilename),
    )

# collect words, drop numbers
# create sets of chapter numbers for each word
dsintChapters = {} # dictionary of sets, keys are case-sensitive
print("Table of contents (TOC) ------------------------------".upper())
print()
for intChapter, strFilenameWithNewNumbers in zip(lintNewNumbers, lstrFilenamesWithNewNumbers):
    print(strFilenameWithNewNumbers)
    strPath = os.path.join(strDirectory, strFilenameWithNewNumbers)
    with open(strPath, encoding="utf-8") as objTextIOWrapper:
        strContents = objTextIOWrapper.read()
    strContents = WithoutAttachments(strContents)
    lstrWords = re.findall(r"[\w']+|[;]", strContents)
    if False: # debugging: look at the first words only
        lstrWords = lstrWords[:10]
    for strWord in lstrWords:
        strWord = strWord.replace("'", "")
        if not strWord:
            continue # token consisted of apostrophes only
        if u.DoesContainDigits(strWord):
            continue
        if u.DoesContainUnderscore(strWord):
            continue
        if u.IsCamelCase(strWord):
            continue
        # setdefault covers both the first and the consecutive entry
        dsintChapters.setdefault(strWord, set()).add(intChapter)

# merge case variants ("About" / "about") BEFORE filtering, so that
# * the chapters of all variants are united instead of overwriting each other
# * a frequent word cannot re-enter the index through a rarer capitalised variant
dsintChaptersLowercase = {}
for strWord, sintChapters in dsintChapters.items():
    dsintChaptersLowercase.setdefault(strWord.lower(), set()).update(sintChapters)

# remove frequent entries
cfltProportionAllowed = 0.6
intEntriesAllowed = round(len(lstrFilenames) * cfltProportionAllowed)
# NOTE: despite its name, this dictionary holds the entries that are KEPT,
# i.e. the words occurring in at most intEntriesAllowed notebooks
dsintChaptersFrequentEntries = {
    strWord: sintChapters
    for strWord, sintChapters in dsintChaptersLowercase.items()
    if len(sintChapters) <= intEntriesAllowed
}

# sort index by key
objOrderedDict = collections.OrderedDict(
    sorted(dsintChaptersFrequentEntries.items(), key=lambda tItem: tItem[0])
)

# print index
print()
print("Index -----------------------------------------------".upper())
print()
for strWord, sintChapters in objOrderedDict.items():
    lstrChapters = [str(intChapter) for intChapter in sorted(sintChapters)]
    print(f"{strWord}:" , ', '.join(lstrChapters))

TABLE OF CONTENTS (TOC) ------------------------------

02 dh Project Task.ipynb
04 dh Load data.ipynb
06 dh EDA.ipynb
08 dh Differencing.ipynb
10 dh Reduction 2020 - 2023.ipynb
12 dh Dummy prediction.ipynb
14 dh L2 regression.ipynb
16 dh ARIMA and SARIMA.ipynb
18 dh Prophet - tutorial.ipynb
20 dh Prophet - own wrapper.ipynb
22 dh LSTM.ipynb

INDEX -----------------------------------------------

aa: 18, 20
aaa: 18, 20
aaaaagv: 10
aabb: 18, 20
aadejuwx: 18, 20
aai: 18, 20
aaita: 18, 20
aaitabaar: 18, 20
aaxis: 18, 20
ab: 16
aba: 18, 20
aban: 18, 20
abandoned: 18, 20
abbe: 18
abbreviation: 16
abbreviations: 6
abc: 18, 20
abcdefghijklmnopqrst: 18, 20
ability: 18
abk: 14
able: 16
abnormally: 20
aboriginal: 18, 20
abort: 18, 20
aborted: 18, 20
aboukhadijeh: 18, 20
about: 4, 6, 16, 18, 20
above: 4, 16, 18, 20
abs: 18, 20
absolute: 14
abw: 22
abwr: 10
abyssinia: 18, 20
ac: 18, 20
acc: 18, 20
accent: 18, 20
accept: 18, 20
accepts: 18, 20
access: 18, 20
accessor: 18, 20
accesstoken: 18, 20
acc