In [None]:
# To get multiple outputs from one code cell (without using print()):
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
from IPython.display import HTML, Markdown, Image, Audio

import sys
from pathlib import Path

# For documenting the current environment:
def sys_info():
    """Print a short report of the running environment.

    The report lists the interpreter version (sys.version), the name of the
    environment folder (last component of sys.prefix), the platform
    identifier (sys.platform) and the current working directory.
    The report is wrapped in blank lines; nothing is returned.
    """
    report_lines = [
        '',  # leading blank line
        f'Python ver: {sys.version}',
        f'Python env: {Path(sys.prefix).name}',
        f'OS:         {sys.platform}',
        f'Current dir: {Path.cwd()}',
        '',  # trailing newline (print() adds one more)
    ]
    print('\n'.join(report_lines))

# For enabling imports from current project code:
def add_to_sys_path(this_path, up=False):
    """
    Make a folder importable by inserting it into sys.path.

    :param this_path: folder (str or Path) to register.
    :param up (bool, False): if True, register the parent folder of
           this_path (1 level up) instead.

    The folder is inserted at index 1 (right after the first entry) unless
    its POSIX-style string is already listed. The outcome is printed;
    nothing is returned.
    """
    target = Path(this_path)
    if up:
        target = target.parent
    # sys.path holds strings; the forward-slash form is used for the lookup.
    newp = target.as_posix()

    if newp in sys.path:
        print(F'Path already in sys.path: {newp}')
        return

    sys.path.insert(1, newp)
    print(F'Path added to sys.path: {newp}')

# Register the parent of the current working directory so that project code
# located one level up can be imported (the call prints the outcome):
add_to_sys_path(Path.cwd(), up=True)

# For py modules/methods discovery:
def filter_dir(mdl, filter_str=None, start_with_str='_', exclude=True):
    """List the attribute names of an object, for quick API discovery.

       Input:
       :param mdl (object): any object accepted by dir(), e.g. a module or
              a submodule such as mdl.submdl1.submdl2.
       :param filter_str (str, None): if given, keep only the names that
              contain this string (case-insensitive).
       :param start_with_str (str, '_'): prefix tested on every name.
       :param exclude (bool, True): if True, names starting with
              start_with_str are dropped; if False, only those names are kept.
              With the defaults, every name starting with an underscore
              (private and dunder) is dropped.
       Output:
       List of names, in the order returned by dir().
       Example:
       >filter_dir(re) # lists the public names of the re module.
    """
    selected = []
    for name in dir(mdl):
        has_prefix = name.startswith(start_with_str)
        # Keep the name when its prefix status differs from `exclude`.
        if has_prefix != exclude:
            selected.append(name)

    if filter_str is None:
        return selected

    needle = filter_str.lower()
    return [name for name in selected if needle in name.lower()]

# To create often-used subfolders:
def get_project_dirs(which=('data', 'images'),
                     use_parent=True):
    """Return the project folder(s) named in `which`, creating any that are missing.

    :param which: iterable of folder names (str or Path), relative to the
           base folder. A single str/Path is treated as one name. Nested
           names such as 'data/raw' are allowed: missing intermediate
           folders are created too.
    :param use_parent (bool, True): if True, the base folder is the parent
           of the current working directory; otherwise it is the current
           working directory itself.
    :return: list of Path objects, one per name, in the order given.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(which, (str, Path)):
        which = [which]

    base = Path.cwd().parent if use_parent else Path.cwd()

    dir_lst = []
    for name in which:
        folder = base.joinpath(name)
        if not folder.exists():
            # parents=True supports nested names; exist_ok=True tolerates the
            # folder appearing between the exists() check and this call.
            folder.mkdir(parents=True, exist_ok=True)
        dir_lst.append(folder)
    return dir_lst

# Bind the two default folders ('data' and 'images', created if missing):
DIR_DATA, DIR_IMG = get_project_dirs()

import pandas as pd
#pd.set_option("display.max_colwidth", 200)
from pprint import pprint as pp

    
def new_section(title='New section'):
    """Put the HTML of a centered section banner into a notebook input cell.

    :param title (str, 'New section'): text shown inside the banner.
    :return: whatever get_ipython().set_next_input returns.

    NOTE(review): IPython documents the signature as
    set_next_input(s, replace=False), so 'markdown' is received as a truthy
    `replace` flag rather than as a cell type - confirm this is intended.
    """
    banner_css = "text-align:center;background:#c2d3ef;padding:16px;color:#ffffff;font-size:2em;width:98%"
    banner_html = '<div style="{}">{}</div>'.format(banner_css, title)
    shell = get_ipython()
    return shell.set_next_input(banner_html, 'markdown')


# For documenting the current environment:
def show_versions():
    """Build an HTML box listing the Python and main package versions.

    Reports the interpreter version, the environment folder name, the
    versions of numpy, scipy, pandas and matplotlib, and the current
    working directory.

    The packages are imported on demand, so the function does not rely on
    the aliases np/sp/pd/mpl existing in the notebook namespace. A package
    that cannot be imported is reported as 'not installed'.

    :return: IPython.display.HTML object. It is rendered only when it is
             the value of a top-level cell expression or passed to display().
    """
    import importlib  # local import keeps the function self-contained

    def _version(module_name):
        """Return module_name's __version__, or a placeholder if unavailable."""
        try:
            module = importlib.import_module(module_name)
        except ImportError:
            return 'not installed'
        return getattr(module, '__version__', 'unknown')

    txt = '<pre><br>'
    txt += F'Python:\t\t{sys.version}<br>'
    txt += F'Python env:\t{Path(sys.prefix).name}<br>'
    txt += F'Numpy:\t\t{_version("numpy")}<br>'
    txt += F'Scipy:\t\t{_version("scipy")}<br>'
    txt += F'Pandas:\t\t{_version("pandas")}<br>'
    txt += F'Matplotlib:\t{_version("matplotlib")}<br>'
    txt += F'Currrent dir: {Path.cwd()}'
    txt += '</pre>'
    div = f"""<div class="alert alert-info"><b>Versions:</b><br>{txt}</div>"""
    return HTML(div)


# autoreload extension
# Load it only when it is not already listed among the loaded extensions:
if 'autoreload' not in get_ipython().extension_manager.loaded:
    get_ipython().run_line_magic('load_ext', 'autoreload')

# Mode 2: reload all modules (except those excluded by %aimport) before
# executing the code of each cell.
%autoreload 2

#..................
# Print the environment report:
sys_info()

no_wmark = False
try:
    %load_ext watermark
    %watermark
except ModuleNotFoundError:
    no_wmark = True

if no_wmark:
    show_versions()
else:
    %watermark -iv

---
---
# How To Amend Text Processing Files (Corrections, People, Names, Places)  
This step is at the moment not incorporated into the GUI EDIT page.  

## Use Case:
You have just launched an Editing session (via ManageGUI.ipynb) and you realize that some words/phrases that need some correcting are occurring with high frequency in the event transcript.  
Even if you have the Grammarly extension installed in your browser, the task of applying the suggested corrections - if they exist! - is quite tedious.  
This **How To** shows you how to augment any of the text processing files and re-run the transcription.

## 'Correctability' Criteria:  

### Proper casing:  
* Titlecasing: some words/phrases should be titlecased but were skipped because they are not found in the People, Names, or Places csv files.  
* Uppercasing: some acronyms/company names should be uppercased (e.g.: AMD, SMTP, SAP).  

### Corrections:  
* Errors from auto-generated captions. Examples: Reshama's name occurs in 7 flavors (all wrong); 'marco guerrelli' or 'marco gorilla' -> 'Marco Gorelli'; 'pi ladies' -> 'PyLadies'
* Special casing: e.g.: 'macbook' -> 'MacBook'; 'iot' -> 'IoT'.
* Speech utterances that need removing: e.g. 'uh'.

## Steps:
1. Read the transcript and keep a record of the frequently occurring words.
2. Gather the high-frequency terms that need amending into categories: People (first, last, or full names), Names (organizations/institutions, companies, software & libraries), Places (geographic, street names), or Corrections.
3. Continue by running the following cells using the data you want to add.
4. Redo the text processing.


## Functions: `EventTranscription.update_substitution_file` or `EventTranscription.add_corrections`
1. `.update_substitution_file`
This function takes two parameters: `which`, which indicates the file to augment (people, names, places, or upper), and `user_list`, the terms to add.  
Only new entries are added.  
2. `.add_corrections` 
This function accepts a list of `(wrong, corrected)` tuples, e.g.: `correction_lst = [('west mckinney', 'Wes McKinney'), ('rashama', 'Reshama')]`

### TODO: amend `EventTranscription.add_corrections` to only add new entries

In [None]:
from manage import (EventMeta as Meta,
                    EventTranscription as TRX)

## Add to Corrections

In [None]:
# Obtain the corrections as a dict:

corrections = TRX.get_corrections_dict()

In [None]:
# Run a check to avoid duplication: an output ending with '-1' indicates the entry does not exist

# Each entry is a (text as transcribed, corrected text) pair.
# Every tuple must be followed by a comma: two adjacent tuples without one
# are parsed as a call on the first tuple and raise a TypeError.
correction_lst = [('west mckinney', 'Wes McKinney'),
                  ('rashama', 'Reshama'),
                  ('university of edinburg', 'University of Edinburg'),
                  ('washington dc', 'Washington DC'),
                  ('dummy', 'enTRY'),
                  ]

# NOTE(review): presumably check_corrections adds -1 for every entry that is
# absent from `corrections`, so a total of -len(correction_lst) means that
# none of the entries exists yet - confirm against manage.EventTranscription.
check = TRX.check_corrections(corrections, correction_lst)
if check == -1 * len(correction_lst):
    print("OK to include all.")  

In [None]:
# Remove any existing entry from correction_lst if the previous check did not
# print "OK to include all.", then update.
# (Per the TODO in the Functions section, add_corrections does not yet skip
# entries that already exist.)

corrections = TRX.add_corrections(correction_lst)

## Add Names

In [None]:
# Terms to add to the Names file:
new_names = ['matplotlib', 'seaborn','plotly','fibonacci', 'wikipedia', 'markdown','windows', ]

# Update:
TRX.update_substitution_file(which='names', user_list=new_names)

# Optional: recall the list to check:
names_list = TRX.readcsv(TRX.names_file).names.tolist()

# Spot check: the boolean is shown as the cell output.
'fibonacci' in names_list

## Add Places

In [None]:
# Terms to add to the Places file:
new_terms = ['columbia university']
TRX.update_substitution_file(which='places', user_list=new_terms)

# Optional: uncomment to recall the list and check:
#places_list = TRX.readcsv(TRX.places_file).places.tolist()

## Add People

In [None]:
# Terms to add to the People file (first names and full names):
new_terms = ['melissa','brian', 'jeff', 'jeff ryback', 'marco', 'marco gorelli']

TRX.update_substitution_file(which='people', user_list=new_terms)
# Optional: uncomment to recall the list and check:
#people_list = TRX.readcsv(TRX.people_file).people.tolist()

## Add Upper terms

In [None]:
# Terms to add to the Upper file (acronyms to be uppercased):
new_terms = ['nyu']

TRX.update_substitution_file(which='upper', user_list=new_terms)
# Optional: uncomment to recall the list and check:
#upper_list = TRX.readcsv(TRX.upper_file).upper.tolist()

---
# Redo initial transcript

## Instantiate the event class for the event you want re-processed:

In [None]:
tr = Meta.TranscriptMeta(20, 2020)  # id, year

# This is how you access the event data:
# (both values are shown because ast_node_interactivity is set to 'all'
# in the setup cell)
tr.event_dict['transcript_md']
tr.event_dict['has_transcript']

## Case 1:  Redo if any of the Corrections, People, Names, Places, or Upper terms files was updated.  

In [None]:
# Re-process the captions and reinsert into transcript md file:

tr.redo_initial_transcript()
tr.insert_md_transcript()

## Case 2:  Redo if you want to change the wrap width or the time chunking
To change these formatting parameters, assign new values to `tr.new_minutes_mark` and `tr.new_wrap_width`.  <br>
Note:  Default values in YTVAudio class are 4 (minutes) and 120 (characters).  

In [None]:
# Override the formatting parameters before re-processing:
tr.new_minutes_mark = 5   # minutes
tr.new_wrap_width = 100   # characters
  
# Re-process the captions and reinsert into transcript md file:
tr.redo_initial_transcript()
tr.insert_md_transcript()

## Visualize: Print text

In [None]:
print(tr.event_dict['formatted_transcript'])

## Visualize: Render updated file:

In [None]:
# Path of the event's transcript file: REPO_PATH/<year>/<transcript_md>
# NOTE(review): joinpath needs str/path-like parts - confirm that
# event_dict['year'] is stored as a string.
mdfile = Meta.REPO_PATH.joinpath(tr.event_dict['year'], tr.event_dict['transcript_md'])
# Display the Markdown object built from that path as the cell output:
Markdown(mdfile)