_Run the first cell! (collapsed in JupyterLab)_

In [1]:
# To get multiple outputs from one code cell (without using print()):
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
from IPython.display import HTML, Markdown, Image

import sys
from pathlib import Path

# For documenting the current environment:
def sys_info():
    """Print the Python version, environment name, platform and current directory."""
    template = (
        '\nPython ver: {}\nPython env: {}\n'
        'OS:         {}\nCurrent dir: {}\n'
    )
    # The environment name is the last component of the interpreter prefix.
    fields = (sys.version, Path(sys.prefix).name, sys.platform, Path.cwd())
    print(template.format(*fields))

# For enabling imports from current project code:
def add_to_sys_path(this_path, up=False, verbose=True):
    """
    Register a folder on sys.path (inserted at index 1) unless it is
    already listed there.

    :param this_path: Folder to register.
    :param up: If True, register the parent of this_path (1 level up) instead.
    :param verbose: If True, print whether the path was added or already present.
    """
    target = Path(this_path)
    if up:
        target = target.parent
    # sys.path holds strings; the posix (forward-slash) form is the one stored.
    posix_path = target.as_posix()

    if posix_path in sys.path:
        status = F'Path already in sys.path: {posix_path}'
    else:
        sys.path.insert(1, posix_path)
        status = F'Path added to sys.path: {posix_path}'

    if verbose:
        print(status)

# If this notebook runs from a folder whose name starts with `nb_folder`
# (e.g. ./notebooks), the project code is assumed to reside 1 level up,
# so the parent of the working directory is put on sys.path instead:
nb_folder = 'notebooks'
add_to_sys_path(Path.cwd(), up=Path.cwd().name.startswith(nb_folder))


# For py modules/methods discovery:
def filter_dir(mdl, filter_str=None, start_with_str='_', exclude=True):
    """List the names in dir(mdl), filtered by prefix and, optionally, by substring.
       Input:
       :param mdl (object): module, optionally with submodule path(s), e.g. mdl.submdl1.submdl2.
       :param filter_str (str, None): if given, keep only the names containing
              that string (case-insensitive).
       :param start_with_str (str, '_'): prefix tested against each name.
       :param exclude (bool, True): if True, names starting with start_with_str
              are dropped; if False, only those names are kept.
       :return: list of the matching names, in dir() order.
       Example:
       >filter_dir(re) # lists the names of the re module not starting with '_'.
    """
    names = []
    for attr in dir(mdl):
        has_prefix = attr.startswith(start_with_str)
        # Keep the name when its prefix status is the opposite of `exclude`.
        if has_prefix != exclude:
            names.append(attr)

    if filter_str is None:
        return names

    needle = filter_str.lower()
    return [attr for attr in names if needle in attr.lower()]

# To create often-used subfolders:
def get_project_dirs(which=['data', 'images'],
                     use_parent=True):
    '''Return the Path of each folder named in `which`, creating the missing ones.

    The folders are located under the parent of the current working
    directory if `use_parent` is True, else under the current working
    directory itself.
    '''
    base_dir = Path.cwd().parent if use_parent else Path.cwd()

    project_dirs = []
    for folder_name in which:
        folder = base_dir.joinpath(folder_name)
        if not folder.exists():
            folder.mkdir()
        project_dirs.append(folder)
    return project_dirs

DIR_DATA, DIR_IMG = get_project_dirs()
    
import numpy as np
import pandas as pd
#pd.set_option("display.max_colwidth", 200)
from pprint import pprint as pp


# For documenting the current environment:
def show_versions():
    """
    Build an HTML info box listing the Python, Numpy, Scipy, Pandas and
    Matplotlib versions, the Python environment name and the current directory.

    Scipy and Matplotlib are imported on demand here (no `sp` / `mpl` alias is
    needed at notebook level); a package that cannot be imported is reported
    as 'not installed' instead of raising.

    :return: IPython.display.HTML object wrapping an "alert-info" div.
    """
    import importlib

    def pkg_version(pkg_name):
        # Lazy import so that a missing optional package does not break the report.
        try:
            return importlib.import_module(pkg_name).__version__
        except ImportError:
            return 'not installed'

    txt = '<pre><br>'
    txt += F'Python:\t\t{sys.version}<br>'
    txt += F'Python env:\t{Path(sys.prefix).name}<br>'
    txt += F'Numpy:\t\t{np.__version__}<br>'
    txt += F"Scipy:\t\t{pkg_version('scipy')}<br>"
    txt += F'Pandas:\t\t{pd.__version__}<br>'
    txt += F"Matplotlib:\t{pkg_version('matplotlib')}<br>"
    txt += F'Currrent dir: {Path.cwd()}'
    txt += '</pre>'
    div = f"""<div class="alert alert-info"><b>Versions:</b><br>{txt}</div>"""
    return HTML(div)


# autoreload extension: load it only if it is not loaded yet
if 'autoreload' not in get_ipython().extension_manager.loaded:
    get_ipython().run_line_magic('load_ext', 'autoreload')

# autoreload mode 2: picks up edits made to the imported project modules
%autoreload 2

#..................
# Document the environment this notebook runs in:
sys_info()

# Prefer the watermark extension for the versions report; if it is not
# installed (ModuleNotFoundError on load), fall back to show_versions().
no_wmark = False
try:
    %load_ext watermark
    %watermark
except ModuleNotFoundError:
    no_wmark = True

if no_wmark:
    show_versions()
else:
    %watermark -iv

Path added to sys.path: C:/Users/catch/Documents/GitHub/DU-event-transcript-demo/resources/EventManagement

Python ver: 3.7.6 | packaged by conda-forge | (default, Jun  1 2020, 18:11:50) [MSC v.1916 64 bit (AMD64)]
Python env: p37
OS:         win32
Current dir: C:\Users\catch\Documents\GitHub\DU-event-transcript-demo\resources\EventManagement\notebooks

2021-02-02T12:47:57-05:00

CPython 3.7.6
IPython 7.16.1

compiler   : MSC v.1916 64 bit (AMD64)
system     : Windows
release    : 10
machine    : AMD64
processor  : Intel64 Family 6 Model 142 Stepping 10, GenuineIntel
CPU cores  : 8
interpreter: 64bit
numpy  1.19.0
pandas 1.0.5



In [2]:
from manage import (EventMeta as Meta,
                    EventTranscription as TRX,
                    Controls as CTR,
                    Utils as UTL,
                    Audit as AUD)

import ipywidgets as ipw
import re

---
---
# Audit: coverage of split_url
RE: regex in get_id_from_YT_url not working for all md files
## Q1: Is `parse_href` working? A1: Yes
## Q2: Is video_url working? A2: Yes
# Conclusion: Updated (fixed) regex in get_id_from_YT_url.

In [47]:
UTL.test_split_url()

Test meetup...
Test youtube...
All done.


---
---
# Audit: which xml files are not lowercase?
- Answer by testing 1st paragraph => Modified `xml_caption_to_text` to obtain `Audit.audit_xml_captions`

## Audit conclusion:
The xml files have consistently been lowercase since event 12, so implementing a bypass of the text
cleaning for files that are not lowercase is not warranted (the corrections would still need to be applied
to such files, but they would not be optimal without adding special cases for non-lowercase text).

In [50]:
AUD.get_all_transcripts(audit_captions=True)

Captions case check (on 1st P with minutes_mark= 1):
03, 2020:: Lower= False
everyone I am doing a recording of the scraping presentation and the original recording from the webinar didn't come out well so this is just a recording in today's presentation I'm going to talk about web scraping we're going to look at the website Poshmark comm and we're going to use Python and some additional packages to gather the data so agenda I'm gonna give a quick introduction about myself and the group then we're going to talk about web scraping and high level then we'll walk through a code example I'm going to share the code files so you can walk through it on your own as well and then during the webinar there was obviously QA it's a little bit about me I'm a product manager with General Assembly I used to run operations at an online data science bootcamp and that's kind of where I picked up everything I know about Python and programming and data science and 
)
04, 2020:: Lower= False
all right it's 

---
---
## DONE: Test: Change GridspecLayout ("a regularly-spaced grid": missed that!) to GridBox

---
---
# Current task: Incorporate modification to propercasing files in Edit page.

In [5]:
AC = CTR.AppControls()  # class, GUI controls instantiation
app = AC.app            # AppLayout method
app

AppLayout(children=(Accordion(children=(VBox(children=(ToggleButtons(button_style='info', options=('Enter Info…

In [153]:
grid = AC.PC.page.children[0]  #grid header
type(grid)
grid

GridBox(children=(HTML(value='<H3>Select the Event Year, Id, AV player (and if need be, update the Transcriber…

In [172]:
from functools import partial

# global (self.) wgts:
# Event selectors (year, id), status and file selectors.
sel_yr = ipw.Select(options=[2020, 2021, 2022, 2023, 2024, 2025,2026,
                             2027, 2028,2029],
                    value=None,
                    layout=ipw.Layout(width='55px'))
# NOTE(review): '03' is listed twice and '01' is absent -- this looks like a
# typo for ['01','02','03']; confirm.
sel_idn = ipw.Select(options=['03','02','03'],
                     value=None,
                     layout=ipw.Layout(width='55px'))
sel_status = ipw.Select(options=CTR.status_opts,
                        value=None,
                        disabled=True)
# Names of the text-processing files the user can update:
sel_files = ipw.widgets.Select(options=['People','Names','Places',
                                        'Upper','Corrections'],
                               value=None,
                               disabled=False)

av_radio = ipw.RadioButtons(options=['Audio','Video'], value='Audio')
transcriber_txt = ipw.Text(value='? (transcriber)')
# Free-text box for the new entries of the selected file:
txt_input = ipw.Text(layout=ipw.Layout(width='420px'))

# Output areas, one per control that reports back to the user:
lo_out = ipw.Layout(height='30px')
sel_files_out = ipw.Output()
sel_idn_out = ipw.Output(layout=lo_out)
btn_load_out = ipw.Output()
btn_update_out = ipw.Output()
btn_redo_out = ipw.Output()

# Buttons; LOAD and REPROCESS start disabled:
btn_load = ipw.Button(description='LOAD',
                      button_style='info',
                      disabled=True)
btn_update = ipw.Button(description='UPDATE',
                        tooltip='Validate & save your changes.',
                        button_style='info',
                        disabled=False)
btn_redo = ipw.Button(description='REPROCESS',
                      tooltip='Redo the transcription with the new files.',
                      button_style='info',
                      disabled=True)
# ........................................................... 

def click_btn_redo(b):
    """Click handler for btn_redo: replace the content of btn_redo_out with a
    confirmation message (placeholder). `b` is the handler argument (unused)."""
    btn_redo_out.clear_output()
    with btn_redo_out:
        print("Clicked!")
        
def click_btn_update(b):
    """
    Click handler for btn_update (placeholder for the real file update).

    Reads the selected file name and the typed entries; if either is
    missing, asks the user for them, otherwise runs the (fake) update and,
    on success, enables btn_redo.

    :param b: Handler argument (unused).
    """
    # Start from a clean output area on every click (as click_btn_redo does),
    # so that a message from a previous click is never left next to the new one.
    btn_update_out.clear_output()

    # sel_files and txt_input are the widgets that get_update_grid() places in
    # the footer grid; they are read directly rather than through the
    # page_hdr_grid children chain (HBox . Accordion . GridBox), which breaks
    # as soon as the layout nesting changes.
    v_file = sel_files.value
    v_entries = txt_input.value or None

    with btn_update_out:
        if v_file is None or v_entries is None:
            print('Select a file and provide new entries.')
            return

        print('Doing fake update...')
        update_ok = True  # placeholder outcome until the real update is wired in
        if update_ok:
            # Enable the other sidebar btn
            btn_redo.disabled = False
        else:
            print('Coud not fake update!')

        
def obs_sel_files(change):
    """Observe fn for sel_files: show, in sel_files_out, the entry-format
    hint matching the value currently selected in the observed widget."""
    selected = change['owner'].value
    if selected is None:
        hint = "<h5>Use this text box to enter your list of entries:</h5>"
    elif selected == 'Corrections':
        hint = ("<h5>Provide your entries as a list of string tuples, "
                "e.g.: <pre>('&lt;from&gt;', '&lt;to&gt;'), ...</pre></h5>")
    else:
        hint = "<h5>Separate your entries with a comma.</h5>"

    # Replace, rather than append to, the previous hint:
    sel_files_out.clear_output()
    with sel_files_out:
        display(HTML(hint))

# Only react to changes of the selected value: without `names`, the handler
# is registered for all the traits of the widget and would run (and redraw
# the hint) several times for a single user selection.
sel_files.observe(obs_sel_files, names='value')
btn_update.on_click(click_btn_update)
btn_redo.on_click(click_btn_redo)


def get_update_grid():
    """
    Build the grid (main + sidebar areas only) used to update the
    propercasing or corrections files.

    main:    file selector, then the hint output stacked over the text input.
    sidebar: UPDATE and REPROCESS buttons, each stacked over its output area.
    """
    update_grid = get_grid(2, 'Update the propercasing or corrections files',
                           exclude=['header', 'footer'])

    # Initial hint displayed above the text input:
    with sel_files_out:
        display(HTML("<h5>Use this text box to enter your entries:</h5>"))

    # A single Layout instance is shared by all the column containers below:
    column_layout = ipw.Layout(display='flex',
                               flex_flow='column',
                               align_items='flex-start')

    def column(widgets):
        return ipw.VBox(widgets, layout=column_layout)

    main_area = update_grid.children[0]
    sidebar_area = update_grid.children[1]
    main_area.children = [sel_files,
                          column([sel_files_out, txt_input])]
    sidebar_area.children = [column([btn_update, btn_update_out]),
                             column([btn_redo, btn_redo_out])]
    return update_grid


def populate_grid(idx, g):
    """
    Fill the main (children[1]) and sidebar (children[2]) areas of a page
    header GridBox and, for any idx other than 0 and 1, its footer
    (children[3]) as well.
    :param: idx: Page index; nothing is done for idx 0.
    :param: g: GridBox of a page header.
    """
    if idx == 0:
        return

    if idx == 1:
        g.children[1].children = [sel_yr, sel_idn, sel_idn_out]
        g.children[2].children = [btn_load, btn_load_out]
        return

    # Remaining pages: event selectors plus AV player, transcriber and status.
    id_and_player = ipw.VBox([sel_idn_out,
                              av_radio])
    transcriber_and_status = ipw.VBox([transcriber_txt,
                                       sel_status])
    g.children[1].children = [sel_yr, sel_idn,
                              id_and_player,
                              transcriber_and_status]
    g.children[2].children = [btn_load, btn_load_out]
    # load controls for updating text processing files in Accordion:
    g.children[3].children = [CTR.wgt_Accord([get_update_grid()])]


def get_grid(idx, grid_name=None, header_fn=None, exclude=None):
    """
    Wrapper to get starter GridBox with pre-defined areas to
    obtain, at most, a 3x3 grid: an optional header row, the
    main + sidebar row (always present) and an optional footer row.

    :param: idx: index referencing a caller widget, i.e. tab index;
                 only 1 and 2 are handled.
    :param: grid_name: Name (attribute) to id the grid; stored as `grid.name`
                       ('' if None).
    :param: header_fn: Function populating the page header grid area if any;
                       Takes idx as param: header_fn(idx). If None, an empty
                       HBox is used as header.
    :param: exclude: None (default), or header|footer areas to exclude, e.g ['footer']
    :return: The GridBox, or None if idx is not 1 or 2.
             Children order: [header,] main, sidebar [, footer].
    """
    if idx not in [1,2]:
        return

    def lo_grid_area(a):
        return ipw.Layout(width='auto', grid_area=a)

    # Decide each optional area on its own, from the content of `exclude`,
    # so that an empty list or an unrelated entry cannot drop the header.
    if isinstance(exclude, str):
        # A bare area name is accepted as well as a list of names.
        exclude = [exclude]
    excluded = set(exclude or ())
    with_header = 'header' not in excluded
    with_footer = 'footer' not in excluded

    # always included:
    main = ipw.HBox(children=[],
                    layout=lo_grid_area('main'))
    sidebar = ipw.VBox(children=[],
                       layout=lo_grid_area('sidebar'))

    # Template rows and children are built in the same (top to bottom) order.
    area_rows = []
    kids = []
    if with_header:
        if header_fn is None:
            header = ipw.HBox(children=[],
                              layout=lo_grid_area('header'))
        else:
            header = header_fn(idx)
        area_rows.append('"header header header"')
        kids.append(header)

    area_rows.append('"main main sidebar"')
    kids.extend([main, sidebar])

    if with_footer:
        footer = ipw.HBox(children=[],
                          layout=lo_grid_area('footer'))
        area_rows.append('"footer footer footer"')
        kids.append(footer)

    # CSS track lists are space-separated: with commas the declaration is
    # invalid, so the three equal-width columns would not be applied.
    lo_grid = ipw.Layout(grid_template_rows='auto auto auto',
                         grid_template_columns='1fr 1fr 1fr',
                         grid_template_areas='\n'.join(area_rows))
    grid = ipw.GridBox(children=kids,
                       layout=lo_grid)

    setattr(grid, 'name', grid_name or '')

    return grid

page_hdr_grid = get_grid(2, header_fn=CTR.get_info_banner)
populate_grid(2, page_hdr_grid)

In [173]:
page_hdr_grid

GridBox(children=(HTML(value='<H3>Select the Event Year, Id, AV player (and if need be, update the Transcriber…

In [195]:
# Walk the widget tree down to the update grid held in the footer area:
# footer HBox (children[3]) . Accordion . GridBox
footer_g = page_hdr_grid.children[3].children[0].children[0]

# main area of that grid: [file selector, VBox([hint output, text input])]
v_file = footer_g.children[0].children[0].value
v_entries = footer_g.children[0].children[1].children[1].value or None
v_file, v_entries

('Corrections', "('dummy', 'Entry'), ('foo', 'bar'),")

In [201]:
fname = TRX.substitutions['upper']
fname

WindowsPath('C:/Users/catch/Documents/GitHub/DU-event-transcript-demo/resources/EventManagement/data/upper_terms.csv')

In [None]:
# Validate the text typed by the user, then add the new entries to the
# selected file (corrections, or one of the propercasing lists).
# Requires validate_user_list, v_entries, v_file and btn_update_out to be
# defined already (run their cells first).
valid, msg = validate_user_list(v_entries, v_file, verbose=False)
if valid is None:
    # Parsing failed: report why in the UPDATE button output area.
    with btn_update_out:
        print(msg)
else:
    if v_file == 'Corrections':
        corrections = TRX.get_corrections_dict()
        tot, reduced_list, msg = TRX.check_corrections(corrections, valid,
                                                       verbose=False)
        if tot:
            if 'reduced' in msg:
                if reduced_list:
                    TRX.add_corrections(reduced_list, return_dict=False)
            else:
                TRX.add_corrections(valid, return_dict=False)
        else:
            with btn_update_out:
                print(msg)

    else:
        v_file = v_file.lower()
        fname = TRX.substitutions[v_file]
        current_list = TRX.readcsv(fname)[v_file].tolist()

        # NOTE(review): check_list is unpacked into two values only, so `msg`
        # below is still the one returned by validate_user_list -- confirm
        # that the 'reduced' test can ever be true in this branch.
        tot, reduced_list = TRX.check_list(current_list, valid, verbose=False)
        if tot:
            if 'reduced' in msg:
                if reduced_list:
                    TRX.update_conversion_file(v_file, reduced_list)
            else:
                # Both arguments positional, as in the call above: a positional
                # argument after a keyword argument is a SyntaxError.
                TRX.update_conversion_file(v_file, valid)
        else:
            with btn_update_out:
                print(msg)


In [179]:
from collections import Counter

In [204]:
def validate_user_list(entries, file, verbose=False):
    """
    Parse and validate the entries typed by the user for one of the
    text-processing files.

    :param entries (str): Comma-separated quoted strings, e.g. "'foo', 'bar'",
           or, if file == 'Corrections', comma-separated 2-tuples of quoted
           strings, e.g. "('from', 'to'), ...". One trailing comma is ignored.
    :param file (str): Name of the target file: 'Corrections' selects the
           tuple format, any other value the plain list format.
    :param verbose (bool, False): Only changes how a failure is reported (below).
    :return: On success: (validated, 'OK'), where validated is a list of
             lowercased values or, for 'Corrections', a list of (from, to)
             tuples with only `from` lowercased.
             On failure: (None, msg) if verbose is False; if verbose is True,
             msg is printed and a bare None is returned.
    """
    # Function-scope import: ast is not imported at notebook level.
    import ast

    def fail(msg):
        # Single place implementing the verbose / non-verbose failure contract.
        if verbose:
            print(msg)
            return None
        return None, msg

    entries = entries.strip()
    if entries.endswith(','):
        entries = entries[:-1].rstrip()
    if not entries:
        # Nothing to validate (also avoids indexing into an empty string).
        return fail("No entries provided!")

    if file == 'Corrections':
        cnt = Counter(entries)
        if cnt['('] != cnt[')']:
            if cnt['('] > cnt[')']:
                return fail("Missing a closing parenthesis!")
            return fail("Missing an opening parenthesis!")

        validated = []
        head, _, rest = entries.partition(')')
        while head:
            # head is "(<from>, <to>": drop the '(' and split the pair.
            pair = head[1:].split(',')
            try:
                # literal_eval only accepts Python literals, so text typed in
                # the GUI is never executed (which eval() would do).
                str_from = ast.literal_eval(pair[0].strip().lower())
                str_to = ast.literal_eval(pair[1].strip())
            except Exception:
                return fail("Could not parse tuples!")

            validated.append((str_from, str_to))
            if rest == '':
                break
            # rest is ", (<from>, <to>) ...": skip the separating character.
            head, _, rest = rest[1:].strip().partition(')')
        return validated, 'OK'

    try:
        validated = [ast.literal_eval(e.strip().lower())
                     for e in entries.split(',')]
    except Exception:
        return fail("Could not parse list!")

    return validated, 'OK'
        

def test_validate_user_list():
    """
    Check validate_user_list on well-formed input and on input given in the
    format of the other kind of file; raises AssertionError on a mismatch.
    """
    corr_val1 = "('dummy', 'Entry')"
    corr_val2 = "('dummy', 'Entry'), ('foo', 'list'), "
    lst_val1 = "'dummy', 'Entry'"
    lst_val2 = "'dummy', 'Entry', 'foo', 'bar', "
    lst_val3 = "'cat chenal', 'will tell', "

    # Corrections: only the `from` part of each tuple is lowercased.
    assert validate_user_list(corr_val1, 'Corrections') == (
        [('dummy', 'Entry')], 'OK')
    assert validate_user_list(corr_val2, 'Corrections') == (
        [('dummy', 'Entry'), ('foo', 'list')], 'OK')
    # Plain lists: every entry is lowercased; the trailing comma is ignored.
    assert validate_user_list(lst_val1, 'Names') == (
        ['dummy', 'entry'], 'OK')
    assert validate_user_list(lst_val2, 'Places') == (
        ['dummy', 'entry', 'foo', 'bar'], 'OK')
    assert validate_user_list(lst_val3, 'People') == (
        ['cat chenal', 'will tell'], 'OK')

    #new tests: input in the wrong format is reported, not raised
    assert validate_user_list(lst_val1, 'Corrections') == (
        None, 'Could not parse tuples!')
    assert validate_user_list(corr_val1, 'Upper') == (
        None, 'Could not parse list!')
    


---

---
---
# Test: Horizontal RadioButtons

In [8]:
lo_radio = ipw.Layout(flex_flow='row')
av_radio2 = ipw.RadioButtons(options=['Audio','Video'], value='Audio',
                            layout=lo_radio)
av_radio2

RadioButtons(layout=Layout(flex_flow='row'), options=('Audio', 'Video'), value='Audio')

---
---
# Debug new update_readme(): FIXED
- doubles up the table
- add a row even for an event update

---
---
# Fix problem with event numbering from df :: new event dict in Meta :: FIXED
```
    def new_event_dict(self):
        """
        Create a 'starter' event dict with event id generated
        from the readme table df.
        """
        new_dict = self.get_event_dict()

        # Update dict with defaults:
        new_dict['year'] = self.year
        new = self.df.index.argmax() + self.row_offset
        self.idn = idn_frmt(new)
        new_dict['idn'] = self.idn
        new_dict['transcriber'] = '?'
        new_dict['extra_references'] = ''
        new_dict['has_transcript'] = False
        new_dict['status'] = TrStatus.TODO.value
        new_dict['notes'] = ''
        new_dict['video_href_w'] = DEF_IMG_W #thumbnail
        
        v1 = self.insertion_idx(HDR_TPL.format(**new_dict))
        new_dict['trans_idx'] = v1
        return new_dict
``` 

---
---

# TO DO:

1. Produce the program flow chart depending on user status, e.g

---
---
# Utils for documenting the project
---

In [None]:
# test: https://nbviewer.jupyter.org/github/xflr6/graphviz/blob/master/examples/notebook.ipynb

import os
from graphviz import Digraph, Source

In [None]:
filter_dir(Digraph)

```
Digraph?
Init signature:
Digraph(
    name=None,
    comment=None,
    filename=None,
    directory=None,
    format=None,
    engine=None,
    encoding='utf-8',
    graph_attr=None,
    node_attr=None,
    edge_attr=None,
    body=None,
    strict=False,
)
```

In [None]:
Digraph.render?

In [None]:
os.environ['PROGRAMFILES']
os.environ['CONDA_PREFIX']
#C:\Program Files\Graphviz 2.44.1\bin

In [None]:
def set_gv_envir(gv_version='2.44.1'):
    """ Ad-hoc fix to have Graphviz working on my system: append the Graphviz
    `bin` folder and the conda `python-graphviz` folder to the PATH
    environment variable of the current process (each folder only once).

    Note that in case the error ExecutableNotFound occurs, the path to
    graphviz must be added to the PATH variable, e.g:
    > "FileNotFoundError: [WinError 2] The system cannot find the file specified"
    > "ExecutableNotFound:
       failed to execute ['dot', '-Tsvg'], make sure the Graphviz executables are
       on your systems' PATH"
    The above is not sufficient: the error occurred even though graphviz, dot and
    neato are all on my system path.
    Calling this function on failed `try` solved the problem. (?)

    :param gv_version (str, '2.44.1'): Version used in the name of the Graphviz
           install folder, i.e. <PROGRAMFILES>/Graphviz <gv_version>/bin.
    :return: tuple (Graphviz bin folder, conda python-graphviz folder).
    :raises KeyError: if PROGRAMFILES or CONDA_PREFIX is not set.
    """
    gviz = os.path.join(os.environ['PROGRAMFILES'], 'Graphviz ' + gv_version, 'bin')
    cnd_gv = os.path.join(os.environ['CONDA_PREFIX'], 'Library', 'bin', 'python-graphviz') #'graphviz')

    # Append each folder only if it is not listed yet, so that re-running the
    # cell does not keep growing PATH with duplicates.
    current = os.environ.get('PATH', '')
    path_entries = current.split(os.pathsep) if current else []
    for folder in (gviz, cnd_gv):
        if folder not in path_entries:
            path_entries.append(folder)
    os.environ['PATH'] = os.pathsep.join(path_entries)
    return gviz, cnd_gv

set_gv_envir()

In [None]:
# test: build a small sample graph whose source file is <DIR_IMG>/tbl.gv
gvfile = DIR_IMG.joinpath('tbl.gv')

dot_dg = Digraph(comment='The Round Table', filename=gvfile, engine='dot')

# Nodes: (id, label)
dot_dg.node('A', 'King Arthur')
dot_dg.node('B', 'Sir Bedevere the Wise')
dot_dg.node('L', 'Sir Lancelot the Brave')

# Edges between the node ids declared above:
dot_dg.edges(['AB', 'AL'])
dot_dg.edge('B', 'L', constraint='false')

In [None]:
dot_dg.render(format='png', view=True)

In [None]:
dtree = {'User Type:':['Admin', 'Tanscriber'],
        }