In [1]:
# To get multiple outputs from one code cell (without using print()):
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
from IPython.display import HTML, Markdown, Image, Audio
# for presentations:
#display(HTML("<style>.container { width:100% !important; }</style>"))

import sys
from pathlib import Path

# For documenting the current environment:
def sys_info():
    """Print a short report of the running interpreter and its location.

    The report lists the Python version string, the name of the environment
    folder (last component of ``sys.prefix``), the platform identifier and
    the current working directory. Nothing is returned.
    """
    report = (
        f'\nPython ver: {sys.version}'
        f'\nPython env: {Path(sys.prefix).name}'
        f'\nOS:         {sys.platform}'
        f'\nCurrent dir: {Path.cwd()}\n'
    )
    print(report)

# For enabling imports from current project code:
def add_to_sys_path(this_path, up=False):
    """
    Insert this_path into sys.path (at index 1) unless it is already listed.

    :param this_path (str, Path): folder to make importable.
    :param up (bool, False): if True, the parent folder of this_path
           (1 level up) is used instead of this_path itself.

    The path is stored in its posix form (forward slashes). A message
    stating whether the path was added or was already present is printed;
    nothing is returned.
    """
    target = Path(this_path).parent if up else Path(this_path)
    newp = target.as_posix()

    # Compare as Path objects rather than raw strings: the same folder may
    # already be listed with another spelling (native separators, trailing
    # slash), which a plain `newp in sys.path` test would miss and then
    # insert a duplicate. Empty and non-string entries are skipped.
    listed = any(isinstance(entry, str) and entry and Path(entry) == target
                 for entry in sys.path)

    if listed:
        msg = F'Path already in sys.path: {newp}'
    else:
        # Index 1 (not 0) keeps the first sys.path entry in front.
        sys.path.insert(1, newp)
        msg = F'Path added to sys.path: {newp}'
    print(msg)

# Make the project code importable from this notebook.
# If the current working directory's name starts with `nb_folder`
# (e.g. ./notebooks), the project code is assumed to reside 1 level up,
# so the parent folder is registered; otherwise the cwd itself is used.
nb_folder = 'notebooks'
add_to_sys_path(Path.cwd(), up=Path.cwd().name.startswith(nb_folder))


# For py modules/methods discovery:
def filter_dir(mdl, filter_str=None, start_with_str='_', exclude=True):
    """List attribute names of an object, for quick API discovery.

       Input:
       :param mdl (object): any object accepted by dir(), e.g. a module or
              a dotted submodule such as mdl.submdl1.submdl2.
       :param filter_str (str, None): if given, keep only the names that
              contain this string (case-insensitive).
       :param start_with_str (str, '_'): prefix tested on each name.
       :param exclude (bool, True): if True, names starting with
              start_with_str are dropped; if False, only those are kept.
              The defaults therefore return the public names.
       Output:
       list of matching names, in dir() order.
       Example:
       >filter_dir(re) # lists the public names of the re module.
    """
    names = [name for name in dir(mdl)
             if name.startswith(start_with_str) != exclude]

    if filter_str is not None:
        needle = filter_str.lower()
        names = [name for name in names if needle in name.lower()]
    return names

# To create often-used subfolders:
def get_project_dirs(which=['data', 'images'],
                     use_parent=True):
    '''Return (creating them if needed) the project folders named in `which`.

    :param which (list of str, or a single str/Path): folder name(s) to
           create; a single name is treated as a one-item list. The default
           list is only read, never modified.
    :param use_parent (bool, True): if True, folders are created in the
           parent of the current working directory (the ipynb parent
           level); otherwise in the current working directory itself.
    :return: list of Path objects, in the same order as `which`.
    '''
    base = Path.cwd().parent if use_parent else Path.cwd()

    # A bare string would otherwise be iterated character by character.
    if isinstance(which, (str, Path)):
        which = [which]

    dir_lst = []
    for name in which:
        folder = base.joinpath(name)
        # parents=True allows nested names such as 'data/raw'; exist_ok=True
        # replaces the separate exists() check and makes re-runs harmless.
        folder.mkdir(parents=True, exist_ok=True)
        dir_lst.append(folder)
    return dir_lst

# Default call: creates (if missing) and returns the 'data' and 'images'
# folders located one level above the current working directory.
DIR_DATA, DIR_IMG = get_project_dirs()
    
import numpy as np
import scipy as sp
from scipy import stats as sps
import pandas as pd
#pd.set_option("display.max_colwidth", 200)

import matplotlib as mpl
from matplotlib import pyplot as plt
plt.ion()
# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in
# Matplotlib 3.6 and the old names were later removed; use whichever name
# this installation provides so the setup cell runs on old and new versions.
mpl_style = 'seaborn-muted' if 'seaborn-muted' in plt.style.available else 'seaborn-v0_8-muted'
plt.style.use(mpl_style)

from pprint import pprint as pp

# For adding a colorful divider in the notebook:
def add_div(div_class='info', div_start='Tip:', 
            div_text='Some tip here', output_string=True):
    """
    Build a Bootstrap "alert" <div> for use as a colored note in the notebook.

    :param div_class (str, 'info'): one of 'info', 'warning', 'danger'
           (case-insensitive); an optional 'alert-' prefix is accepted,
           e.g. 'alert-warning'.
    :param div_start (str, 'Tip:'): bold lead-in text.
    :param div_text (str): body text of the note.
    :param output_string (bool, True): see below.
    :return: None if output_string=True (the html is sent to the front end),
             else a Markdown display object. A Markdown object holding an
             error div is returned if div_class is not accepted.

    Behaviour with default `output_string=True`:
    The cell is overwritten with the output, but the cell mode is still 'code',
    not 'markdown'.
    Workaround: After running the function, click on the new cell, press ESC, 
                type 'm', then run the new cell.
    If `output_string=False`, the output is displayed in an new cell with the 
    code cell visible.
    ```
    [x]
    add_div('alert-warning', 'Tip: ', 'some tip here', output_string=True)
    [x]
    <div class="alert alert-warning"><b>Tip: &nbsp;&nbsp;</b>some tip here</div>
    ```
    """
    accepted = ['info', 'warning', 'danger']
    div_class = div_class.lower()
    # Accept the full Bootstrap class name too ('alert-warning' -> 'warning').
    prefix = 'alert-'
    if div_class.startswith(prefix):
        div_class = div_class[len(prefix):]

    if div_class not in accepted:
        # The value being validated is `div_class` (not `div_start`).
        msg = f'<div class="alert"><b>Wrong class:&nbsp;</b> `div_class` not in: {accepted}.</div>'
        return Markdown(msg)
    
    div = f"""<div class="alert alert-{div_class}"><b>{div_start}&nbsp;&nbsp;</b>{div_text}</div>"""
    if output_string:
        # The second parameter of set_next_input is `replace` (a flag), not a
        # cell type: the former positional 'markdown' was merely a truthy
        # value. replace=True keeps that behaviour (current cell overwritten).
        return get_ipython().set_next_input(div, replace=True)
    else:
        return Markdown(div)

    
def new_section(title='New section'):
    """Overwrite the current cell with a centered, colored section banner.

    :param title (str, 'New section'): text shown in the banner.
    :return: None; the html <div> is handed to the front end as the new
             content of the cell (which stays in 'code' mode until it is
             switched to markdown by hand).
    """
    style = "text-align:center;background:#c2d3ef;padding:16px;color:#ffffff;font-size:2em;width:98%"
    div = f'<div style="{style}">{title}</div>'
    # The second parameter of set_next_input is `replace` (a flag), not a
    # cell type: the former positional 'markdown' was merely a truthy value.
    return get_ipython().set_next_input(div, replace=True)


# For documenting the current environment:
def show_versions():
    """Return an HTML info box listing the interpreter and library versions.

    Rows: Python version, environment name (last component of sys.prefix),
    Numpy, Scipy, Pandas and Matplotlib versions, and the current working
    directory. Relies on the module-level imports np, sp, pd and mpl.

    :return: IPython.display.HTML object wrapping an 'alert-info' <div>.
    """
    rows = [
        F'Python:\t\t{sys.version}',
        F'Python env:\t{Path(sys.prefix).name}',
        F'Numpy:\t\t{np.__version__}',
        F'Scipy:\t\t{sp.__version__}',
        F'Pandas:\t\t{pd.__version__}',
        F'Matplotlib:\t{mpl.__version__}',
        F'Current dir: {Path.cwd()}',  # label was misspelled 'Currrent'
    ]
    # Rows are separated by <br>; the last row is followed by </pre> only.
    txt = '<pre><br>' + '<br>'.join(rows) + '</pre>'
    div = f"""<div class="alert alert-info"><b>Versions:</b><br>{txt}</div>"""
    return HTML(div)


# autoreload extension: load it only if the extension manager does not
# already list it as loaded.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    get_ipython().run_line_magic('load_ext', 'autoreload')

# Mode 2 of the autoreload extension.
%autoreload 2

#..................
# Print the interpreter / environment summary.
sys_info()

# Environment report: use the `watermark` extension when it can be loaded;
# if loading raises ModuleNotFoundError, fall back to show_versions().
no_wmark = False
try:
    %load_ext watermark
    %watermark
except ModuleNotFoundError:
    no_wmark = True

if no_wmark:
    show_versions()
else:
    # -iv: versions of the imported packages (see the cell output).
    %watermark -iv

Path added to sys.path: C:/Users/catch/Documents/GitHub/DU-event-transcript-demo/resources/EventManagement

Python ver: 3.7.6 | packaged by conda-forge | (default, Jun  1 2020, 18:11:50) [MSC v.1916 64 bit (AMD64)]
Python env: p37
OS:         win32
Current dir: C:\Users\catch\Documents\GitHub\DU-event-transcript-demo\resources\EventManagement\notebooks

2020-12-03T12:38:08-05:00

CPython 3.7.6
IPython 7.16.1

compiler   : MSC v.1916 64 bit (AMD64)
system     : Windows
release    : 10
machine    : AMD64
processor  : Intel64 Family 6 Model 142 Stepping 10, GenuineIntel
CPU cores  : 8
interpreter: 64bit
matplotlib 3.3.1
scipy      1.5.0
pandas     1.0.5
numpy      1.19.0



In [2]:
# DIR_DATA.parents[2] is three folder levels above the data folder;
# presumably the repository root, where the project README lives -- verify
# if the folder layout changes.
fname = DIR_DATA.parents[2].joinpath('README.md')
# Both expressions are displayed: a heading, then the README file rendered
# as Markdown.
Markdown('# This project README:\n---\n')
Markdown(filename=fname)

# This project README:
---


# DEMO's README!
## Author: Cat Chenal
---
# Purpose:
This **README** documents proposed changes to the [data-umbrella/event-transcripts](https://github.com/data-umbrella/event-transcripts) Repo (downloaded 11/14/2020), in order to enable the semi-automation of two workflows depending on the user's task/role, i.e. either as an Administrator or as a Contributor.  
 
The code enabling this resides in the 'EventManagement' project folder, which has been added to the existing 'resources' folder. My approach was to split the events management into two main tasks: 
* First, the setup of a new event in the README table (new row) along with the creation of the initial transcript Markdown file in the related year folder by an Administrator.
* Second, the _editing_ of the pre-processed video captions by a Contributor. As this step takes care of the transcription itself &mdash;by cleaning up the auto-generated captions as much as possible&mdash; a Contributor's task becomes that of an Editor.

# Modifications needed for this implementation

| Step | Modification | Applies to | Reasons | Benefits |
| ---  | ---          | ---        | ---     | ---      |
| 1    | add the `EventManagement` project to `./resources` | Repo | functionality | "automate the boring stuff" |
| 2    | add a gitignore in repo (filter out .mp4 files, among other) | Repo |best practice | avoid clutter, size limits; mp4 stay local |
| 3    | replace 'N/A' with 'N.A.' |  Repo | 'N/A' is 'N over A' (math) | written English |
| 4    | change '? [needs a transcriber]' to '?' | Repo | redundant | Occam's |
| 5    | indicate the start & end of table by a comment line | README | enable the identification of the "main table" (in case another table is added to the file in the future) | table becomes a pd.df, which is used for updating |
| 6    | split the comments from the Status & put them into Notes| README table | data standardization | implemented as an Enum, Status could be the trigger for e.g. a Github Action if it is changed to PARTIAL_HELP = "Partial (new editor requested)"|
| 7    | add a paragraph under the table to urge (plead?) contributors to only use the project for editing | README | it's better | maintain consistency |
| _x_  | move the Notes column at the end | README table | mostly empty column at end | esthetics | 


# Decisions to make related to the implementation

<strong>Note:</strong><br>
The added advantage of using a template is to prevent information from a different event transcript file to appear in a new one as a result of a 'cut & paste' operation when that info has escaped the editor's attention. For example, this is the case in `./2020/17-carol-python.md`:
* Presentation title: 'Contributing to Core Python'
* Video thumbnail `alt`: "Data Science and Machine Learning at Scale"
* Video link is missing thumbnail: the template would automatically create one!  
Additionally it helps prevent inconsistencies, e.g. in the README table, for the transcript '06', the Transcriber is listed as "Reshama / Mark" (which should be changed to "Reshama, Mark"), but in the transcript file, "Mark" is missing.


## Template fields (keys) and their order
As this automation project makes use of jinja templates (with an additional jinja-markdown extension), the 'starter transcript' Markdown file has been turned into a template with the most common entries under the 'Key Links' header turned into keys. Hence, the template (./resources/EventManagement/templates/transcript_header.md) has to be reviewed. 

##### The keys in the 'Key Links' portion of the template header are:  
```
- Transcript:  https://github.com/data-umbrella/event-transcripts/blob/main/{{year}}/{{transcript_md}}  
- Meetup Event:  {{meetup_url}}  
- Video:  https://youtu.be/{{yt_video_id}}  
- Slides:  {{slides_url}}  
- GitHub repo:  {{repo_url}}  
- Jupyter Notebook:  {{notebook_url}}  
- Transcriber:  {{transcriber}}  
{{extra_references}}
```
The `extra_references` key contains any other pair (list heading, value) that is not among the main ones defined under the 'Key Links' H2 header.


## Status Enum class
The 'transcription' Status has been standardized via the following Enum class:
```
class TrStatus(Enum):
    NOREC = "Not recorded"                            # for 'legacy' events or new, non-recorded events
    TODO = "Not yet processed (editor needed)"        # for initial setup of the 'starter transcript'
    PARTIAL_WIP = "Partial (w.i.p.)"                  # to indicate a partial update by Contributor
    PARTIAL_HELP = "Partial (new editor requested)"   # to indicate Contributor will not complete the editing
    REVIEW = "Needs reviewer"                         # to indicate the editing needs final approval
    COMPLETE = "Complete"                             # to indicate the contribution is acceptable
```

## Transcript chunking and formatting
The automated transcription implemented in EventTranscription.py is designed to prevent the following issues seen in the current transcript files (even the 'Complete' ones), all of which could render the editing task overwhelming:
1. Too short lines (as in the auto-generated captions <= 50 characters)
2. Too wide lines
3. No or too long paragraphs  

To this end, two parameters are used:  
* A `minute_mark`: it is set with a default of 8 (as per my experimentation, 10 minutes can lead to still too long paragraphs: people talk a lot in 10 minutes!), and is currently changeable by the user.
* A `wrap_width`: set with a default of 90 and is also changeable by the user.
##### To be decided:
Whether to keep the `wrap_width` changeable by the user.

## Other considerations

### Completeness:  
The number of transcript files was small enough for me to check each one of them. It appears to me that there is a need for a definition of completeness, as many of the files flagged with 'Complete' do not comply with basic publishing standards (e.g. no capitalization or punctuation, too short/wide lines, etc.). This is to keep the __reader__ as end-user in mind: reading should not be a dreadful experience!  
While the pre-processing of the initial transcript will remedy many of these shortcomings (if used consistently), there ought to be a minimal number of criteria met before marking a contribution complete.  

### People's names:
In my opinion, anyone related to an event (presenter(s), transcriber(s), reviewer(s)) should be listed with their full names (First, Last).

### Automated review/re-processing:
Upon incorporation of this project, the existing files could be re-processed in order to apply agreed standardization and/or to provide an audit about which file is problematic. See TODO at bottom.

# Demo Documentation overview (partial, W.I.P.)

## New folder structure (with partial file listing):  
```
.
|   .gitignore
|   CODE_OF_CONDUCT.md
|   CONTRIBUTING.md
|   README.md
|   README_original.md
|   
+---.github
|       
+---2020
|   |   03-ty-shaikh-webscraping.md
|   |   [...]
|   |   17-carol-python.md   [last file as per 11-14-2020 Repo copy used in Demo]
|   |       
|   +---images
|   |   |   1280px-Scikit_learn_logo_small.svg.png
|   |   |   [...]
|   |   |   sklearn_video3.png
|   |   |   
|   |   \---emily_robinson_career
|   |           erc_main.png
|   |           [...]
|   |           erc_s9.png
|   |           
|   \---meta [NEW: contains data dict (json) for starter transcript creation,   
|                  transcript editing and main table update;  
|                  format: transcript str(id).json]
|       |   03_prev.json
|               
+---images
|       .keep
|       full_logo_transparent.png
|       
\---resources
    |   plotly-code.ipynb
    |       
    \---EventManagement [NEW]
        |   README.md [INSTRUCTIONS: which file to use, reqs, etc]
        |   requirements.txt [not finalized]
        |       
        +---data
        |   |   12_dqab-FcAirA.mp4
        |   |   12_dqab-FcAirA.xml
        |   |   
        |   +---backup [to save readme before changes]
        |   |       README_original.md.bkp
        |   |       
        |   \---documentation
        |           topdir_tree.txt
        |           flowchart_examples.txt
        |           
        +---images [mostly for documentation]
        |       
        +---manage
        |   |   EventMeta.py
        |   |   EventTranscription.py
        |   |   Utils.py
        |   |   Workflow.py [not yet functional; Panel dashboard app]
        |   |       
        |   +---Documentation
        |   |       gviz.py
        |   |       port.graphml
        |           
        +---notebooks
        |   |   Documentation.ipynb
        |   |   Implementation.ipynb
        |   |   Workflow.ipynb
        |               
        \---templates [jinja-markdown template, 
                       default str replacements dicts (json)]
            |   transcript_header.md
```

# Workflow 'equations':
## Administrator:
```
   (1) Metadata (created if none exists) 
 + (2) transcript_header.md
=> (3.1) transcript.md;
   (3.2) README update}
```
## Contributor:
```
   (1) Metadata
 + (2) transcript.md
=> (3.1) new transcript.md;
   (3.2) README update}
```

# TODO:
[ ] Create a Panel dashboard with two tabs: "Add Event" | "Edit Transcript"
[ ] Add flowcharts to documentation: if done properly, one picture could explain each task workflow.
[ ] To be decided: re-process all transcript Markdown files to:
- 1. save the metadata
- 2. standardize names/flags
- 3. flag inconsistencies

---
---
# Utils for documenting the project
---

In [2]:
from manage import EventMeta as Meta

# TO DO:

1. Produce the program flow chart depending on user status, e.g

In [2]:
# test: https://nbviewer.jupyter.org/github/xflr6/graphviz/blob/master/examples/notebook.ipynb

from graphviz import Digraph, Source

In [3]:
# List the public attribute names of the Digraph class.
filter_dir(Digraph)

['attr',
 'clear',
 'copy',
 'directed',
 'directory',
 'edge',
 'edges',
 'encoding',
 'engine',
 'filepath',
 'format',
 'node',
 'pipe',
 'render',
 'save',
 'source',
 'subgraph',
 'view']

```
Digraph?
Init signature:
Digraph(
    name=None,
    comment=None,
    filename=None,
    directory=None,
    format=None,
    engine=None,
    encoding='utf-8',
    graph_attr=None,
    node_attr=None,
    edge_attr=None,
    body=None,
    strict=False,
)
```

In [4]:
# IPython help lookup (signature and docstring) for Digraph.render.
Digraph.render?

[1;31mSignature:[0m
[0mDigraph[0m[1;33m.[0m[0mrender[0m[1;33m([0m[1;33m
[0m    [0mself[0m[1;33m,[0m[1;33m
[0m    [0mfilename[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mdirectory[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mview[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mcleanup[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mformat[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mrenderer[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mformatter[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mquiet[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mquiet_view[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Save the source to file and render with the Graphviz engine.

Args:
    filename: Filename for saving the source (defaults to ``name`` + ``'.gv'``)
    directory: (Sub)directo

In [10]:
import os

In [13]:
# Inspect the two environment variables read by set_gv_envir().
os.environ['PROGRAMFILES']
os.environ['CONDA_PREFIX']
# Presumably the Graphviz install folder on this machine (to be confirmed):
#C:\Program Files\Graphviz 2.44.1\bin

'C:\\Program Files'

'C:\\Users\\catch\\Anaconda3\\envs\\p37'

In [None]:
def set_gv_envir(gv_version='2.44.1'):
    """ Ad-hoc fix to have Graphviz working on my (Windows, conda) system.

    Appends two folders to the PATH environment variable of this process,
    unless they are already listed (so re-running the cell does not grow
    PATH):
      1. <PROGRAMFILES>/Graphviz <gv_version>/bin
      2. <CONDA_PREFIX>/Library/bin/python-graphviz

    :param gv_version (str, '2.44.1'): version used in the name of the
           Graphviz installation folder.
    :return: tuple (gviz, cnd_gv) with the two folder paths.
    :raises KeyError: if PROGRAMFILES or CONDA_PREFIX is not defined; in
           that case PATH is left untouched.

    Note that in case the error ExecutableNotFound occurs, the path to 
    graphviz must be added to the PATH variable, e.g:
    > "FileNotFoundError: [WinError 2] The system cannot find the file specified" 
    > "ExecutableNotFound: 
       failed to execute ['dot', '-Tsvg'], make sure the Graphviz executables are
       on your systems' PATH"
    The above is not sufficient: the error occurred even though graphviz, dot and
    neato are all on my system path.
    Calling this function on failed `try` solved the problem. (?)
    """
    # Resolve both folders first so that a missing variable cannot leave
    # PATH half-modified.
    gviz = os.path.join(os.environ['PROGRAMFILES'], f'Graphviz {gv_version}', 'bin')
    cnd_gv = os.path.join(os.environ['CONDA_PREFIX'], 'Library', 'bin', 'python-graphviz') #'graphviz')

    for folder in (gviz, cnd_gv):
        current = os.environ.get('PATH', '')
        if folder not in current.split(os.pathsep):
            os.environ['PATH'] = current + os.pathsep + folder if current else folder
    return gviz, cnd_gv

# Extend PATH for this kernel session; the returned folder pair is displayed.
set_gv_envir()

In [17]:
# test: build a small directed graph to check the Graphviz setup.
# The graph source file is placed in the images folder (DIR_IMG).
gvfile = DIR_IMG.joinpath('tbl.gv')

dot_dg = Digraph(comment='The Round Table', filename=gvfile, engine='dot')

# Three nodes: (identifier, label).
dot_dg.node('A', 'King Arthur')
dot_dg.node('B', 'Sir Bedevere the Wise')
dot_dg.node('L', 'Sir Lancelot the Brave')

# Each two-letter string is one edge (tail, head): A->B and A->L.
dot_dg.edges(['AB', 'AL'])
# Extra edge B->L carrying the Graphviz edge attribute constraint='false'.
dot_dg.edge('B', 'L', constraint='false')

In [18]:
# Render the graph to png and open it in a viewer.
# NOTE(review): the saved output of this cell is an ExecutableNotFound
# error ('dot.bat' not found), i.e. the Graphviz setup was still failing.
dot_dg.render(format='png', view=True)

ExecutableNotFound: failed to execute ['dot.bat', '-Tpng', '-O', 'tbl.gv'], make sure the Graphviz executables are on your systems' PATH

In [None]:
# Draft of the decision tree used for the user-type flow chart
# (work in progress). Spelling fixed: 'Tanscriber' -> 'Transcriber'.
dtree = {'User Type:':['Admin', 'Transcriber'],
        }