_Run the first cell! (collapsed in JupyterLab)_

In [2]:
# Show the value of every top-level expression of a code cell, not only the last one
# (i.e. get multiple outputs from one cell without using print()):
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
from IPython.display import HTML, Markdown, Image, Audio

import sys
from pathlib import Path

# For documenting the current environment:
def sys_info():
    """Print the Python version, the name of the environment folder (sys.prefix),
    the platform identifier and the current working directory.
    """
    report_lines = [
        '',
        f'Python ver: {sys.version}',
        f'Python env: {Path(sys.prefix).name}',
        f'OS:         {sys.platform}',
        f'Current dir: {Path.cwd()}',
        '',
    ]
    # Leading/trailing empty items reproduce the blank line before and after the report.
    print('\n'.join(report_lines))

# For enabling imports from current project code:
def add_to_sys_path(this_path, up=False):
    """
    Insert this_path (in posix form) into sys.path at index 1, unless that
    exact string is already listed. A message reporting the outcome is printed.
    If up=True, the parent folder of this_path (1 level up) is used instead.
    """
    target = Path(this_path)
    if up:
        target = target.parent
    entry = target.as_posix()

    if entry in sys.path:
        print(f'Path already in sys.path: {entry}')
        return

    # Index 1 (not 0): the first sys.path entry is left in place.
    sys.path.insert(1, entry)
    print(f'Path added to sys.path: {entry}')

# Register the parent folder of the current working directory (up=True) in sys.path:
add_to_sys_path(Path.cwd(), up=True)

# For py modules/methods discovery:
def filter_dir(mdl, filter_str=None, start_with_str='_', exclude=True):
    """Return a filtered view of dir(mdl), for attribute/method discovery.
       Input:
       :param mdl (object): any object, e.g. a module or a submodule (mdl.submdl1.submdl2).
       :param filter_str (str, None): if given, keep only the names containing that
              string (case-insensitive match).
       :param start_with_str (str, '_'): prefix tested against each name.
       :param exclude (bool, True): if True, names starting with start_with_str are
              dropped; if False, only those names are kept.
              With the defaults, private and dunder names are left out.
       Example:
       >filter_dir(re) # lists the public names of the re module.
    """
    candidates = []
    for name in dir(mdl):
        has_prefix = name.startswith(start_with_str)
        # Keep the name when its prefix status differs from the `exclude` flag.
        if has_prefix != exclude:
            candidates.append(name)

    if filter_str is None:
        return candidates

    needle = filter_str.lower()
    return [name for name in candidates if needle in name.lower()]

# To create often-used subfolders:
def get_project_dirs(which=('data', 'images'),
                     use_parent=True):
    '''Return the folder(s) named in `which` as Path objects, creating the missing ones.

    :param which (str or iterable of str): folder name(s); a single string is
           treated as one name. Nested names such as 'data/raw' are allowed.
    :param use_parent (bool, True): if True, the folders are located in the parent
           of the current working directory (the ipynb parent level); if False,
           in the current working directory itself.
    :return (list of Path): one path per name, in the order given.
    '''
    # A bare string would otherwise be iterated character by character.
    if isinstance(which, str):
        which = [which]

    base_dir = Path.cwd().parent if use_parent else Path.cwd()

    dir_lst = []
    for name in which:
        target = base_dir.joinpath(name)
        if not target.exists():
            # parents=True: intermediate folders of a nested name are created too;
            # exist_ok=True: no error if the folder appears between the check and the call.
            target.mkdir(parents=True, exist_ok=True)
        dir_lst.append(target)
    return dir_lst

# Default call: paths of the 'data' and 'images' folders located one level above
# the current working directory (they are created if missing):
DIR_DATA, DIR_IMG = get_project_dirs()

import pandas as pd
#pd.set_option("display.max_colwidth", 200)
from pprint import pprint as pp

    
def new_section(title='New section'):
    """Build a styled <div> banner holding `title` and hand it to the running
    IPython shell through set_next_input; the result of that call is returned.
    """
    banner_css = "text-align:center;background:#c2d3ef;padding:16px;color:#ffffff;font-size:2em;width:98%"
    banner = f'<div style="{banner_css}">{title}</div>'
    shell = get_ipython()
    # NOTE(review): IPython documents the second parameter of set_next_input as
    # `replace`; confirm that passing 'markdown' has the intended effect.
    return shell.set_next_input(banner, 'markdown')


# For documenting the current environment:
def show_versions():
    """Return an HTML info box listing the Python version, the environment folder
    name, the versions of numpy, scipy, pandas and matplotlib, and the current
    working directory.

    The four libraries are looked up by name at call time, so the function does
    not depend on the `np`, `sp`, `pd` or `mpl` aliases being defined in the
    notebook; a library that cannot be imported is reported as 'not installed'.
    """
    from importlib import import_module

    def _version_of(module_name):
        # Version string of the named module, or a placeholder if unavailable.
        try:
            module = import_module(module_name)
        except ImportError:
            return 'not installed'
        return getattr(module, '__version__', 'unknown')

    numpy_ver = _version_of('numpy')
    scipy_ver = _version_of('scipy')
    pandas_ver = _version_of('pandas')
    mpl_ver = _version_of('matplotlib')

    txt = '<pre><br>'
    txt += F'Python:\t\t{sys.version}<br>'
    txt += F'Python env:\t{Path(sys.prefix).name}<br>'
    txt += F'Numpy:\t\t{numpy_ver}<br>'
    txt += F'Scipy:\t\t{scipy_ver}<br>'
    txt += F'Pandas:\t\t{pandas_ver}<br>'
    txt += F'Matplotlib:\t{mpl_ver}<br>'
    txt += F'Currrent dir: {Path.cwd()}'
    txt += '</pre>'
    div = f"""<div class="alert alert-info"><b>Versions:</b><br>{txt}</div>"""
    return HTML(div)


# autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    get_ipython().run_line_magic('load_ext', 'autoreload')

%autoreload 2

#..................
sys_info()

no_wmark = False
try:
    %load_ext watermark
    %watermark
except ModuleNotFoundError:
    no_wmark = True

if no_wmark:
    show_versions()
else:
    %watermark -iv

Path added to sys.path: C:/Users/catch/Documents/GitHub/DU-event-transcript-demo/resources/EventManagement

Python ver: 3.7.6 | packaged by conda-forge | (default, Jun  1 2020, 18:11:50) [MSC v.1916 64 bit (AMD64)]
Python env: p37
OS:         win32
Current dir: C:\Users\catch\Documents\GitHub\DU-event-transcript-demo\resources\EventManagement\notebooks

2021-01-28T14:46:56-05:00

CPython 3.7.6
IPython 7.16.1

compiler   : MSC v.1916 64 bit (AMD64)
system     : Windows
release    : 10
machine    : AMD64
processor  : Intel64 Family 6 Model 142 Stepping 10, GenuineIntel
CPU cores  : 8
interpreter: 64bit
pandas 1.0.5



---
---
# List of How-Tos:  

0. [Import required project modules](#0)
1. [How To Amend Text Processing Files (Corrections, People, Names, Places, Upper)](#01)
 * [Add to Corrections](#01.1)
 * [Add to Other Replacement Files](#01.2)
2. [How To Redo the Initial Transcript](#02)
 * [Instantiate the event class for the event you want re-processed](#02.0)
 * [Redo if any of the text processing files was updated](#02.1)
 * [Redo if you want different wrap-width or time-chunking values](#02.2)
 * [Visualize](#02.3)
3. [How To Download the Audio File](#03)


---
---
<a id='0'></a>
# 0. Import required project modules

In [3]:
from manage import (EventMeta as Meta,
                    EventTranscription as TRX)

---
---
<a id='01'></a>
# 1. How To Amend Text Processing Files (Corrections, People, Names, Places, Upper)  
#### Note 1:  This step is at the moment not incorporated into the GUI EDIT page.
#### Note 2: At the moment the `EventTranscription.clean_text` function _removes_ two speech utterances ('um' and 'uh'). You would need to submit a PR to add others.


### What do these files do & what are their contents?
* **People, Names, Places** csv files:
 - List of lowercase words or phrases for _conversion_ to titlecase, e.g. 'oxford university' -> 'Oxford University'
* **Upper** csv file:
 - List of lowercase words or phrases for _conversion_ to uppercase, e.g. 'aws' -> 'AWS'
* **Corrections** csv file:
 - List of tuples for _replacement_, e.g. ('pi ladies', 'PyLadies'), ("miriam's webster's", 'Merriam-Webster')
* **File paths**: Their paths are referenced in the `EventTranscription` module:  
```
people_file = Meta.DIR_DATA.joinpath('title_people.csv')
names_file = Meta.DIR_DATA.joinpath('title_names.csv')
places_file = Meta.DIR_DATA.joinpath('title_places.csv')
upper_file = Meta.DIR_DATA.joinpath('upper_terms.csv')
# Replacement pairs: (from , to); for special str & those mangled by Google's autocaptioning:
corrections_file = Meta.DIR_DATA.joinpath('corrections.csv')
```


## Use Case:
You have just launched an Editing session (via ManageGUI.ipynb) and you realize that some words/phrases that need some correcting are occurring with high frequency in the event transcript.  
Even if you have the Grammarly extension installed in your browser, the task of applying the suggested corrections - if they exist! - is quite tedious.   
This **How To** shows you how to augment any of the text processing files and re-run the transcription.

## 'Correctability' Criteria:  

### Proper casing:  
* Titlecasing: some words/phrases should be titlecased but were skipped because they are not found in the People, Names, or Places csv files.  
* Uppercasing: some acronyms/company names should be uppercased (e.g.: AMD, SMTP, SAP).  

### Corrections:  
* Errors from auto-generated captions. Examples: Reshama's name occurs in 7 flavors (all wrong); 'marco guerrelli' or 'marco gorilla' -> 'Marco Gorelli'; 'pi ladies' -> 'PyLadies'
* Special casing: e.g.: 'macbook' -> 'MacBook'; 'iot' -> 'IoT'.

## Steps:
1. Read the transcript and keep a record of the frequently occurring words.
2. Gather the high-frequency terms that need amending into categories: People (first, last, or full names), Names (organizations/institutions, companies, software & libraries), Places (geographic, street names), or Corrections.
3. Continue by running the following cells using the data you want to add.
4. Redo the text processing.


## Functions: `EventTranscription.update_conversion_file` or `EventTranscription.add_corrections`
1. `.update_conversion_file`
This function takes two parameters: `which`: to indicate which file to augment (people, names, places, or upper), and `user_list`: the terms to add.  
Only new entries are added.  
2. `.add_corrections` 
This function accepts a list of tuples, e.g.: correction_lst = [('west mckinney', 'Wes McKinney'), ('rashama', 'Reshama')]

### TODO: amend `EventTranscription.add_corrections` to only add new entries

<a id='01.1'></a>
## Add to Corrections

# Latest additions: 01-27-2021
```
# corrections:
correction_lst = [('andreas miller', 'Andreas Mueller'),
                  ('hi ladies', 'PyLadies'),
                  ('reishma', 'Reshama'),
                  ('pi data', 'PyData'),
                  ('sk learn', 'sk-learn'),
                  ('dave umbrella', 'Data Umbrella'),
                  ('vs code', 'VSCode'),
                  ('dusk', 'Dask'),
                  ('javascript', 'JavaScript'),
                  ('java script', 'JavaScript'),
                  ('washington dc', 'Washington DC')
                 ]

# people
['hugo','hashim','emily']

# names:
['binder','kubernetes','dask','python']

# upper:
['aws']
```

In [3]:
# Obtain the current corrections as a dict; it is the reference that the
# candidate pairs are compared against in the check cells of this section:

corrections = TRX.get_corrections_dict()

### Safest way: Run a check  in verbose mode to avoid duplication. You will see the existing pair with identical key but differing value.
The `reduced_list` will __not__ include these entries.  
_Note: verbose flag implemented to allow use in ipywidgets context._

In [32]:
# Run a check to avoid duplication.
# The last line of the output of the verbose version will indicate what to run next:

# Verbose version (default): prints one line per candidate pair with the value
# found for its key (or <not found> / <different val>) and returns two values.
correction_lst = [('andreas miller', 'Andreas Mueller'),
                  ('hi ladies', 'PyLadies'),
                  ('reishma', 'Reshama'),
                  ('pi data', 'PyData'),
                  ('sk learn', 'sk-learn'),
                  ('dave umbrella', 'Data Umbrella'),
                  ('vs code', 'VSCode'),
                  ('dusk', 'Dask'),
                  ('javascript', 'JavaScript'),
                  ('java script', 'JavaScript'),
                  ('washington dc', 'Washington DC'),
                  ('hiroku', 'HERoku')  # test for difference, which should not be added
                 ]
# NOTE(review): `check` is presumably a success/"something to add" flag and
# `reduced_list` the pairs still to be added - confirm in TRX.check_corrections.
check, reduced_list = TRX.check_corrections(corrections, correction_lst)

0 ('andreas miller', 'Andreas Mueller') Andreas Mueller
1 ('hi ladies', 'PyLadies') PyLadies
2 ('reishma', 'Reshama') <not found>
3 ('pi data', 'PyData') PyData
4 ('sk learn', 'sk-learn') sk-learn
5 ('dave umbrella', 'Data Umbrella') Data Umbrella
6 ('vs code', 'VSCode') VSCode
7 ('dusk', 'Dask') Dask
8 ('javascript', 'JavaScript') JavaScript
9 ('java script', 'JavaScript') JavaScript
10 ('washington dc', 'Washington DC') Washington DC
11 ('hiroku', 'HERoku') <different val>: existing: Heroku


#### Next, __run__: `TRX.add_corrections(<reduced list>)`

In [20]:
# Add the checked pairs and rebind `corrections` to the value returned
# (presumably the updated corrections dict - confirm in TRX.add_corrections):
corrections = TRX.add_corrections(reduced_list)

### ALTERNATE WAY: Silent version (param verbose=False):

In [34]:
# Candidate (from, to) replacement pairs:
correction_lst = [('andreas miller', 'Andreas Mueller'),
                  ('hi ladies', 'PyLadies'),
                  ('reishma', 'Reshama'),
                  ('pi data', 'PyData'),
                  ('sk learn', 'sk-learn'),
                  ('dave umbrella', 'Data Umbrella'),
                  ('vs code', 'VSCode'),
                  ('dusk', 'Dask'),
                  ('javascript', 'JavaScript'),
                  ('java script', 'JavaScript'),
                  ('washington dc', 'Washington DC')
                 ]
# With verbose=False nothing is printed and a third value, `msg`, is returned.
check, reduced_list, msg = TRX.check_corrections(corrections, correction_lst, verbose=False)
if check:
    # `msg` containing 'reduced' selects the reduced list (added only if non-empty);
    # otherwise the full candidate list is added.
    # NOTE(review): the exact wording of `msg` is defined in TRX.check_corrections - confirm.
    if 'reduced' in msg:
        if reduced_list:
            corrections = TRX.add_corrections(reduced_list)
    else:
        corrections = TRX.add_corrections(correction_lst)

<a id='01.2'></a>
## Titlecasing csv files: Names, Places, People
---
## Add Names

In [54]:
# Search the list (verbose mode):

# Candidate terms (lowercase):
new_names = ['binder', 'kubernetes','dask', 'python']

# Existing entries: the `names` column of the Names csv file
# (TRX.readcsv presumably returns a pandas DataFrame - confirm):
names_list = TRX.readcsv(TRX.names_file).names.tolist()

# Prints a <found> / <not found> line per candidate term:
check, reduced_list = TRX.check_list(names_list, new_names)

	 binder <found>
	 kubernetes <found>
	 dask <found>
	 python <found>


#### All found. Nothing to add.

---
## Add Places

In [55]:
# Candidate terms (lowercase) and the existing entries (`places` column of the Places csv file):
new_places = ['columbia university']
places_list = TRX.readcsv(TRX.places_file).places.tolist()

# Prints a <found> / <not found> line per candidate term:
check, reduced_list = TRX.check_list(places_list, new_places)

	 columbia university <found>


#### All found. Nothing to add.

---
## Add People

In [4]:
# Existing entries (`people` column of the People csv file) and the candidate terms (lowercase):
people_list = TRX.readcsv(TRX.people_file).people.tolist()
new_people = ['hugo','hashim','emily']

# Prints a <found> / <not found> line per candidate term:
check, reduced_list = TRX.check_list(people_list, new_people)

	 hugo <found>
	 hashim <found>
	 emily <found>


#### All found. Nothing to add.

---
## Add Upper terms

In [60]:
# Candidate terms (lowercase) and the existing entries (`upper` column of the Upper csv file):
new_terms = ['nyu', 'aws']
upper_list = TRX.readcsv(TRX.upper_file).upper.tolist()

# Prints a <found> / <not found> line per candidate term:
check, reduced_list = TRX.check_list(upper_list, new_terms)

	 nyu <found>
	 aws <not found>


#### Next, __run__: `TRX.update_conversion_file(which=<one of 'names', 'people', 'places', 'upper'>, user_list=<reduced list>)`

---
---
<a id='02'></a>
# 2. How To Redo the Initial Transcript

<a id='02.0'></a>
## Instantiate the event class for the event you want re-processed:

In [None]:
tr = Meta.TranscriptMeta('20', 2020)  # id, year

# This is how you access the event data:
# (both values are shown because the first cell sets ast_node_interactivity = 'all')
tr.event_dict['transcript_md']
tr.event_dict['has_transcript']

<a id='02.1'></a>
## Case 1:  Redo if any of the Corrections, People, Names, Places, or Upper terms files was updated.  

In [None]:
# Re-process the captions and reinsert into transcript md file:
# (requires `tr` from the instantiation cell above)

tr.redo_initial_transcript()
tr.insert_md_transcript()

<a id='02.2'></a>
## Case 2:  Redo if you want to change the wrap width or the time chunking
To change these formatting parameters, assign new values to `tr.new_minutes_mark` and `tr.new_wrap_width`.  <br>
Note:  Default values in YTVAudio class are 4 (minutes) and 120 (characters).  

In [None]:
# Override the formatting parameters: time chunk in minutes, wrap width in characters
tr.new_minutes_mark = 5
tr.new_wrap_width = 100
  
# Re-process the captions and reinsert into transcript md file:
tr.redo_initial_transcript()
tr.insert_md_transcript()

<a id='02.3'></a>
## Visualize: 

### Visualize the text only:

In [None]:
# print() is used here so that the line breaks of the text are rendered:
print(tr.event_dict['formatted_transcript'])

### Visualize the Markdown file:

In [None]:
# Path of the event's transcript file: <REPO_PATH>/<year>/<transcript_md>
# NOTE(review): joinpath only accepts str/path-like parts - confirm that
# event_dict['year'] is stored as a str (an int would raise TypeError).
mdfile = Meta.REPO_PATH.joinpath(tr.event_dict['year'], tr.event_dict['transcript_md'])
# NOTE(review): a path is passed to Markdown() - presumably the file content is
# rendered when the file exists; confirm with the installed IPython version.
Markdown(mdfile)

---
---
<a id='03'></a>
# 3. How To Download the Audio File

In [4]:
tr = Meta.TranscriptMeta('01', 2021)  # str(id), year

# set_YT() is called before tr.YT is used
# (presumably it creates the YT attribute - confirm in Meta.TranscriptMeta):
tr.set_YT()
tr.YT.download_audio()

'01-nick-janetakis-command.md'

True

---
---
