_Run the first cell! (collapsed in JupyterLab)_

In [1]:
# Show the result of every expression statement in a cell, not only the last
# one, so that several values can be inspected without using print().
# NOTE(review): presumably this only applies to cells run after this one - confirm.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
from IPython.display import HTML, Markdown, Image

import sys
from pathlib import Path

# For documenting the current environment:
def sys_info():
    """Print the Python version, environment name, platform and current folder."""
    template = ('\nPython ver: {}\nPython env: {}\n'
                'OS:         {}\nCurrent dir: {}\n')
    # The environment name is the last component of the interpreter prefix.
    values = (sys.version,
              Path(sys.prefix).name,
              sys.platform,
              Path.cwd())
    print(template.format(*values))

# For enabling imports from current project code:
def add_to_sys_path(this_path, up=False, verbose=True):
    """
    Make this_path importable by inserting it into sys.path (once).

    The path is inserted at index 1 (right after the first entry), in its
    forward-slash (posix) spelling.

    :param this_path (str, Path): folder to add.
    :param up (bool, False): if True, the parent folder of this_path
           (1 level up) is added instead.
    :param verbose (bool, True): print whether the path was added or was
           already present.
    """
    target = Path(this_path).parent if up else Path(this_path)
    newp = target.as_posix()

    # Compare as Path objects rather than raw strings, so that an entry that is
    # spelled differently (native separators, trailing slash) is recognised and
    # not inserted a second time.
    # Empty entries are skipped: Path('') would otherwise be read as '.'.
    already_listed = any(
        isinstance(entry, str) and entry and Path(entry) == target
        for entry in sys.path
    )

    if already_listed:
        msg = F'Path already in sys.path: {newp}'
    else:
        sys.path.insert(1, newp)
        msg = F'Path added to sys.path: {newp}'
    if verbose:
        print(msg)

# When the notebook runs from a folder whose name starts with 'notebooks',
# the project code is expected one level up, so the parent folder is added;
# otherwise the current folder itself is added.
nb_folder = 'notebooks'
add_to_sys_path(
    this_path=Path.cwd(),
    up=Path.cwd().name.startswith(nb_folder),
)


# For py modules/methods discovery:
def filter_dir(mdl, filter_str=None, start_with_str='_', exclude=True):
    """Return a filtered view of dir(mdl), for attribute/method discovery.

       Input:
       :param mdl (object): module (or any object), optionally with submodule
              path(s), e.g. mdl.submdl1.submdl2.
       :param filter_str (str, None): if given, keep only the names containing
              that string (case-insensitive).
       :param start_with_str (str, '_'): prefix used for the first selection.
       :param exclude (bool, True): if True, names starting with start_with_str
              are dropped; if False, only those names are kept.
              The defaults list the names that do not start with an underscore.
       Example:
       >filter_dir(re) # lists the public names of the re module.
    """
    selected = []
    for name in dir(mdl):
        has_prefix = name.startswith(start_with_str)
        # Keep the name when its prefix status is the opposite of `exclude`.
        if has_prefix != exclude:
            selected.append(name)

    if filter_str is not None:
        needle = filter_str.lower()
        selected = [name for name in selected if needle in name.lower()]
    return selected

# To create often-used subfolders:
def get_project_dirs(which=['data', 'images'],
                     use_parent=True):
    '''Create (if needed) and return the project folder(s) named in `which`.

    :param which (list of str, or a single str/Path): folder name(s); nested
           names such as 'data/raw' are allowed. The default list is only
           read, never modified.
    :param use_parent (bool, True): if True, the folders are located in the
           parent of the current working folder (i.e. at the ipynb parent
           level); if False, in the current working folder itself.
    :return: list of Path objects, in the order given in `which`.
    :raises FileExistsError: if one of the names exists but is not a folder.
    '''
    # A lone name would otherwise be iterated character by character.
    if isinstance(which, (str, Path)):
        which = [which]

    base = Path.cwd().parent if use_parent else Path.cwd()

    dir_lst = []
    for name in which:
        folder = base.joinpath(name)
        # exist_ok avoids the check-then-create race; parents allows nesting.
        folder.mkdir(parents=True, exist_ok=True)
        dir_lst.append(folder)
    return dir_lst

#DIR_DATA, DIR_IMG = get_project_dirs()
    
import numpy as np
import pandas as pd
#pd.set_option("display.max_colwidth", 200)
from pprint import pprint as pp


# For documenting the current environment:
def show_versions():
    """Return an HTML info box with the Python and main package versions.

    Lists the Python version, the environment name (last component of
    sys.prefix), the Numpy, Scipy, Pandas and Matplotlib versions and the
    current folder. Packages are looked up by name, so a package that is not
    installed is reported as such instead of raising an error.

    :return: IPython.display.HTML object.
    """
    from importlib import import_module

    def pkg_version(module_name):
        # Import by name: the notebook does not need a matching global alias.
        try:
            return getattr(import_module(module_name), '__version__', 'unknown')
        except ImportError:
            return 'not installed'

    txt = '<pre><br>'
    txt += F'Python:\t\t{sys.version}<br>'
    txt += F'Python env:\t{Path(sys.prefix).name}<br>'
    txt += F"Numpy:\t\t{pkg_version('numpy')}<br>"
    txt += F"Scipy:\t\t{pkg_version('scipy')}<br>"
    txt += F"Pandas:\t\t{pkg_version('pandas')}<br>"
    txt += F"Matplotlib:\t{pkg_version('matplotlib')}<br>"
    txt += F'Current dir: {Path.cwd()}'
    txt += '</pre>'
    div = f"""<div class="alert alert-info"><b>Versions:</b><br>{txt}</div>"""
    return HTML(div)


# autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    get_ipython().run_line_magic('load_ext', 'autoreload')

%autoreload 2

#..................
sys_info()

no_wmark = False
try:
    %load_ext watermark
    %watermark
except ModuleNotFoundError:
    no_wmark = True

if no_wmark:
    show_versions()
else:
    %watermark -iv

Path added to sys.path: C:/Users/catch/Documents/GitHub/DU-event-transcript-demo/resources/EventManagement

Python ver: 3.7.11 (default, Jul 27 2021, 09:42:29) [MSC v.1916 64 bit (AMD64)]
Python env: du37
OS:         win32
Current dir: C:\Users\catch\Documents\GitHub\DU-event-transcript-demo\resources\EventManagement\notebooks

Last updated: 2021-10-14T15:36:17.268559-04:00

Python implementation: CPython
Python version       : 3.7.11
IPython version      : 7.27.0

Compiler    : MSC v.1916 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 142 Stepping 10, GenuineIntel
CPU cores   : 8
Architecture: 64bit

sys   : 3.7.11 (default, Jul 27 2021, 09:42:29) [MSC v.1916 64 bit (AMD64)]
numpy : 1.20.3
pandas: 1.3.3



In [2]:
from manage import (EventMeta as Meta,
                    EventTranscription as TRX,
                    Controls as CTR,
                    Utils as UTL,
                    Audit as AUD)

from collections import OrderedDict, Counter
import ipywidgets as ipw
import re

## TESTS

In [3]:
from manage.tests import test_EventMeta as testMeta

In [4]:
# Run the add-event test helper; its return value is reused in the next cell.
dum = testMeta.test_add_event()

Add dummy event:
 - video url: https://www.youtube.com/watch?v=MHAjCcBfT_A
 - meetup url: https://www.meetup.com/data-umbrella/events/274778387/
.update_dict - Assigning new dummy event dict to Meta object...
Updated dict:
OrderedDict([('presenter', 'Cat Chenal, Reshama Shaikh'), ('title', 'Automating Audio Transcription.'), ('event_url', 'https://www.meetup.com/data-umbrella/events/274778387/'), ('yt_video_id', 'MHAjCcBfT_A'), ('slides_url', 'N.A.'), ('repo_url', 'https://github.com/CatChenal'), ('notebook_url', 'N.A.'), ('transcriber', '?'), ('extra_references', '## Other References\n- Binder:  <url>\n- Paper:  <Paper url or citation>  \n- Wiki:  This is an excellent [wiki on audio foo](http://en.wikipedia.org/wiki/Main_Page)  \n'), ('video_href', 'http://www.youtube.com/watch?feature=player_embedded&v=MHAjCcBfT_A'), ('video_href_src', 'http://img.youtube.com/vi/MHAjCcBfT_A/0.jpg'), ('video_href_alt', 'Automating Audio Transcription.'), ('video_href_w', '25%'), ('formatted_transcript

In [8]:
# Automated video description, shown two ways for comparison:

# 1. wrapped in a Markdown display object:
Markdown(dum.get_video_desc())
# OR:
# 2. printed as plain text:
print(dum.get_video_desc())

- Speakers: Cat Chenal and  Reshama Shaikh 
- Transcript:  https://github.com/data-umbrella/event-transcripts/blob/main/02-cat-reshama-audio-foo.md 
- Meetup Event:  https://www.meetup.com/data-umbrella/events/274778387/ 
- Video:  https://www.youtube.com/watch?v=MHAjCcBfT_A 
## Other References
- Binder:  <url>
- Paper:  <Paper url or citation>  
- Wiki:  This is an excellent [wiki on audio foo](http://en.wikipedia.org/wiki/Main_Page)  


- Speakers: Cat Chenal and  Reshama Shaikh 
- Transcript:  https://github.com/data-umbrella/event-transcripts/blob/main/02-cat-reshama-audio-foo.md 
- Meetup Event:  https://www.meetup.com/data-umbrella/events/274778387/ 
- Video:  https://www.youtube.com/watch?v=MHAjCcBfT_A 
## Other References
- Binder:  <url>
- Paper:  <Paper url or citation>  
- Wiki:  This is an excellent [wiki on audio foo](http://en.wikipedia.org/wiki/Main_Page)  



---
---
# Test: Using consecutive event numbering instead of per-year numbering (current): What has to change?

In [43]:
# Existing event; renumbered as per the consecutive numbering scheme
first_yr_21 = Meta.TranscriptMeta(idn=1, year=2021)


# List the keys of the event's metadata dict:
first_yr_21.event_dict.keys()

odict_keys(['presenter', 'title', 'event_url', 'yt_video_id', 'slides_url', 'repo_url', 'notebook_url', 'transcriber', 'extra_references', 'video_href', 'video_href_src', 'video_href_alt', 'video_href_w', 'formatted_transcript', 'year', 'idn', 'video_url', 'title_kw', 'transcript_md', 'audio_track', 'audio_text', 'has_transcript', 'trans_idx', 'status', 'notes', 'video_embed'])

In [44]:
# Report the length of the formatted transcript, but only when the event is
# flagged as having a transcript:
if first_yr_21.event_dict['has_transcript']:
    print('Trx len: ', len(first_yr_21.event_dict['formatted_transcript']))

Trx len:  84380


In [46]:
# Bare expression: shown through the object's __repr__
first_yr_21
# print(): shown through the object's __str__
print(first_yr_21)

EventMeta.TranscriptMeta(idn=01, year=2021)

In [50]:
# Build the path of the main README from the Meta constants, then display it:
mdfile = Meta.REPO_PATH.joinpath(Meta.MAIN_README)
UTL.show_md_file(mdfile)

### Local README file: README.md
---
---
<p >
 <a href="https://www.dataumbrella.org" target="_blank"> <img src="images/full_logo_transparent.png" height="30%" width="30%" /> </a>
</p>

# Event Transcripts (Demo README)

## [Videos](https://www.youtube.com/c/DataUmbrella/videos)
Subscribe to our YouTube [Data Umbrella channel](https://www.youtube.com/c/DataUmbrella/videos).

## [Contributing Guide](CONTRIBUTING.md)
Review our [Contributing Instructions](CONTRIBUTING.md) before beginning editing / transcribing work.  

<!-- main_tbl_start -->
| #  | Speaker             | Talk Transcript  | Transcriber  | Status | Notes |
|--- |---                  |---               |---           |---     |---    | 
| 01| Hugo Bowne-Anderson| Bayesian Data Science| N.A.| Not recorded| | 
| 02| Bruno Goncalves| Time Series Modeling| N.A.| Not recorded| | 
| 03| Ty Shaikh| [Webscraping Poshmark](2020/03-ty-shaikh-webscraping.md)| Ty Shaikh| Needs reviewer| | 
| 04| Ali Spittel| [Navigating Your Tech Career](2020/04-ali-spittel-career.md)| Janine| Needs reviewer| | 
| 05| Andreas Mueller| [Crash Course in Contributing to Scikit-learn](2020/05-andreas-mueller-contributing.md)| Reshama Shaikh| Complete| | 
| 06| Reshama Shaikh| [Example PR for Scikit-learn](2020/06-reshama-shaikh-sklearn-pr.md)| Reshama Shaikh, Mark| Complete| | 
| 07| Shailvi Wakhlu| [Fixing Bad Data (Using SQL)](2020/07-shailvi-wakhlu-fixing-data.md)| Juanita| Complete| | 
| 08| Matt Brems| [Data Science with Missing Data](2020/08-matt-brems-missing-data.md)| Barbara Graniello Batlle| Needs reviewer| | 
| 09| Sam Bail| [Intro to Terminal](2020/09-sam-bail-terminal.md)| Isaack| Complete| | 
| 10| Emily Robinson| [Build a Career in Data Science](2020/10-emily-robinson-career.md)| Kevin| Complete| | 
| 11| Rebecca Kelly| [Kdb Time Series Database](2020/11-rebecca-kelly-kdb.md)| Coretta| Needs reviewer| Paragraphs are too long| 
| 12| Mridu Bhatnagar| [Build a Bot](2020/12-mridu-bhatnagar-bot.md)| ?| Not yet processed (editor needed)| | 
| 13| Liz DiLuzio| [Creating Nimble Data Processes](2020/13-liz-diluzio-data-process.md)| Lily| Complete| | 
| 14| Megan Robertson| [3 Lessons From 3 Years of Data Science](2020/14-megan-robertson-career.md)| Sethupathy| Needs reviewer| Headers should not be in capital letters, etc| 
| 15| Emma Gouillart| [Data Visualization with Plotly](2020/15-emma-gouillart-plotly.md)| ?| Not yet processed (editor needed)| | 
| 16| Hugo Bowne-Anderson, James Bourbeau| [Data Science and Machine Learning at Scale](2020/16-hugo-james-dask.md)| Cynthia| Needs reviewer| | 
| 17| Carol Willing| [Contributing to Core Python](2020/17-carol-willing-python.md)| ?| Not yet processed (editor needed)| | 
| 18| Thomas Fan| [Streamlit for Data Science](2020/18-thomas-fan-streamlit.md)| ?| Not yet processed (editor needed)| | 
| 19| Matti Picus| [Contributing to NumPy](2020/19-matti-picus-numpy.md)| ?| Not yet processed (editor needed)| | 
| 20| Marco Gorelli| [Contributing to pandas](2020/20-marco-gorelli-pandas.md)| ?| Not yet processed (editor needed)| | 
| 21| Cat Chenal| [Automating Audio Tanscription.](2020/21-cat-chenal-foo-demo.md)| Billy Bop| Not yet processed (editor needed)| | 
| 01| Nick Janetakis| [Creating a Command Line Focused Development Environment](2021/01-nick-janetakis-command.md)| ?| Partial (new editor requested)| | 
| 02| Cat Chenal, Reshama Shaikh| [Automating Audio Transcription.](2021/02-cat-reshama-audio-foo.md)| ?| Not yet processed (editor needed)| Dummy entry for demo.| 
<!-- main_tbl_end -->
<!-- Note: There should not be any empty table row before the end of table marker above.-->

## NEW!
### This is the README file of the event management demo (or dev) project for the Data Umbrella transcript repo. Further details about the implementation and its benefits, i.e. the 'sales points' and specifications of the project, can be found in the [Dev Project README](./resources/EventManagement/dev_only_docs/README.md) and associated files in the `/EventManagement/dev_only_docs` folder. 


## Try this demo!
- Check the installation instructions in [DEMO.md](./DEMO.md)


### [Project README](./resources/EventManagement/README.md)

---
---

In [28]:
# Events table held by the object.
# NOTE(review): presumably parsed from the README table - confirm.
first_yr_21.df

Unnamed: 0,N,Speaker,Talk Transcript,Transcriber,Status,Notes,year,name
0,1,Hugo Bowne-Anderson,Bayesian Data Science,N.A.,Not recorded,,N.A.,N.A.
1,2,Bruno Goncalves,Time Series Modeling,N.A.,Not recorded,,N.A.,N.A.
2,3,Ty Shaikh,[Webscraping Poshmark](2020/03-ty-shaikh-websc...,Ty Shaikh,Needs reviewer,,2020,03-ty-shaikh-webscraping.md
3,4,Ali Spittel,[Navigating Your Tech Career](2020/04-ali-spit...,Janine,Needs reviewer,,2020,04-ali-spittel-career.md
4,5,Andreas Mueller,[Crash Course in Contributing to Scikit-learn]...,Reshama Shaikh,Complete,,2020,05-andreas-mueller-contributing.md
5,6,Reshama Shaikh,[Example PR for Scikit-learn](2020/06-reshama-...,"Reshama Shaikh, Mark",Complete,,2020,06-reshama-shaikh-sklearn-pr.md
6,7,Shailvi Wakhlu,[Fixing Bad Data (Using SQL)](2020/07-shailvi-...,Juanita,Complete,,2020,07-shailvi-wakhlu-fixing-data.md
7,8,Matt Brems,[Data Science with Missing Data](2020/08-matt-...,Barbara Graniello Batlle,Needs reviewer,,2020,08-matt-brems-missing-data.md
8,9,Sam Bail,[Intro to Terminal](2020/09-sam-bail-terminal.md),Isaack,Complete,,2020,09-sam-bail-terminal.md
9,10,Emily Robinson,[Build a Career in Data Science](2020/10-emily...,Kevin,Complete,,2020,10-emily-robinson-career.md


True

In [30]:
# Pretty-print the full metadata dict of the event:
pp(first_yr_21.event_dict)

OrderedDict([('presenter', 'Nick Janetakis'),
             ('title',
              'Creating a Command Line Focused Development Environment'),
             ('event_url',
              'https://www.meetup.com/data-umbrella/events/274778387/'),
             ('yt_video_id', 'y4fYxmE0HZM'),
             ('slides_url',
              'https://github.com/nickjj/nyhackr-cli-dev-env/blob/master/nyhackr-cli-dev-env.pdf'),
             ('repo_url', 'https://github.com/nickjj/nyhackr-cli-dev-env'),
             ('notebook_url', 'N.A.'),
             ('transcriber', '?'),
             ('extra_references',
              '## Timestamps\n'
              '- NOTE TO EDITOR:  these timestamps should be incorporated into '
              'the transcript as paragraph headers. (They have been '
              'reformatted into list items for proper parsing.) \n'
              '- Intro:  0:00 \n'
              "- What we're going to cover:  7:14; Talk starts \n"
              '- A few practical demos of using 

---
---
# Test: text wrap fn as externally defined TW obj

In [33]:
# Instantiate existing event
idn, year = 8, 2021

tm = Meta.TranscriptMeta(idn, year)
# Value stored under the 'transcript_md' key (the transcript file name):
print(tm.event_dict['transcript_md'])

08-cat-reshama-audio-demo.md


In [34]:
# Re-run the transcript cleanup on the stored formatted transcript:
formatted_trx = tm.event_dict['formatted_transcript']
new_txt = tm.redo_transcript_cleanup(formatted_trx)

# Position of the first occurrence of 'Bayesian' in the reprocessed text:
i = new_txt.find('Bayesian')
i

4131

In [35]:
# Show the first 1024 characters before, then after, reprocessing:
Markdown(formatted_trx[:1024])
Markdown(new_txt[:1024])

<!-- Editing Guide: The pipe (|) position in this comment is 120:                                                       | -->
### Introduction

Okay hello and welcome to Data Umbrella's webinar for October so I'm just going to go over the agenda I'm going to do a  
brief introduction then there will be the workshop by Hugo and James and you can ask questions along the way in the chat  
or actually the best place to ask questions is the Q&A and there's an option to upvote as well so yet asking the Q&A if  
you happen to post it on the chat by mistake I can also transfer it over to Q&A so that would be fine too and this  
webinar is being recorded briefly about me I am a statistician and data scientist and I am the founder of Data Umbrella  
I am on a lot of platforms as Reshama so feel free to follow me on Twitter and LinkedIn we have a code of conduct we're  
dedicated to providing harassment free experience for everyone thank you for helping to make this a welcoming friendly  
professional community for all 

<!-- Editing Guide: The pipe (|) position in this comment is 120:                                                       | -->
### Introduction

Okay hello and welcome to Data Umbrella's webinar for October so I'm just going to go over the agenda I'm going to do a  
brief introduction then there will be the workshop by Hugo and James and you can ask questions along the way in the chat  
or actually the best place to ask questions is the Q&A and there's an option to upvote as well so yet asking the Q&A if  
you happen to post it on the chat by mistake I can also transfer it over to Q&A so that would be fine too and this  
webinar is being recorded briefly about me I am a statistician and data scientist and I am the founder of Data Umbrella  
I am on a lot of platforms as Reshama so feel free to follow me on Twitter and LinkedIn we have a code of conduct we're  
dedicated to providing harassment free experience for everyone thank you for helping to make this a welcoming friendly  
professional community for all 

In [36]:
txt = """I gave either at the end of February or early March was data umbrella's inaugural  
tutorial and Meetup if I recall correctly on bayesian thinking and hacker statistics and simulation and  
that type of stuff so it's just wonderful to be back particularly with my colleague and friend James we're  
building really cool distributed data science products at coiled we'll say a bit about that but we'll do some  
introductions in a bit I just wanted to get you all accustomed to it was February thank you Reshama we're working """

# Is the reprocessed text identical to the stored one? (False: they differ.)
formatted_trx == new_txt

False

## Update text_processing files (see How_Tos.ipynb)
Do these with gui:
```
# For the 'upper' file, textbox entry is list of lower-case names/acronyms to upper-case
upper	nyc
# For the 'corrections' file, textbox entry is a list of tuples:
corrections	 [('ipad', 'iPad'),('iphone', 'iPhone'),('coyle','coil'),('job lib', 'joblib'),('dars', 'Dask')]
```

In [81]:
# Load the text-processing lists from their files:
# - entries of the 'upper' file:
uppercase_list = TRX.readcsv(TRX.upper_file).upper.tolist()
# - entries of the people, names and places files, concatenated in that order:
titlecase_list = (TRX.readcsv(TRX.people_file).people.tolist()
                + TRX.readcsv(TRX.names_file).names.tolist()
                + TRX.readcsv(TRX.places_file).places.tolist())
corrections = TRX.get_corrections_dict()

# Formatted transcript currently stored for the event in `tm`:
current_txt = tm.event_dict['formatted_transcript']

---
---
# Test: New transcript
## 5 steps to new transcript:
```
# 0. Instantiate new event
tm = Meta.TranscriptMeta()

# 1. Get dummy or actual data:
# Load the starter kv pairs
demo_kvs = Meta.dummy_kv_pairs.copy()
# Modify at will
demo_kvs.append(('transcriber', 'cat chenal'))
d = Meta.get_dummy_data(new_kv_pairs=demo_kvs)

# 2. update starter event_dict:
tm.update_dict(d)
# updated:
#tm.event_dict

# 3. update_readme()
tm.update_readme()

# 4. save_transcript_md()
tm.save_transcript_md()
```

### OK: New event in current year:

In [11]:
# The 5 steps listed above, run for a new dummy event; no `year` argument is
# passed to get_dummy_data() in this cell.

# 0. Instantiate new event
tm = Meta.TranscriptMeta()

# 1. Get dummy or actual data:
# Load the starter kv pairs
demo_kvs = Meta.dummy_kv_pairs.copy()
# Modify: add a transcriber pair
demo_kvs.append(('transcriber', 'max chenal'))
d = Meta.get_dummy_data(new_kv_pairs=demo_kvs)

# 2. update starter event_dict:
tm.update_dict(d)
# updated:
#tm.event_dict

# 3. update_readme()
tm.update_readme()

# 4. save_transcript_md()
tm.save_transcript_md()

### OK: New event in other year:

In [24]:
# Same 5 steps, this time passing year=2020 to get_dummy_data().

# 0. Instantiate new event
tm = Meta.TranscriptMeta()

# 1. Get dummy or actual data:
# Load the starter kv pairs
demo_kvs = Meta.dummy_kv_pairs.copy()
# Modify: replace the values of the 1st and 3rd pairs (keys are kept),
# then add a transcriber pair
demo_kvs[0] = (demo_kvs[0][0], 'cat chenal')
demo_kvs[2] = (demo_kvs[2][0], 'foo demo')
demo_kvs.append(('transcriber', 'Billy Bop'))

d = Meta.get_dummy_data(year=2020,
                        new_kv_pairs=demo_kvs)

# 2. update starter event_dict:
tm.update_dict(d)
# updated: check the year and the transcript file name
tm.event_dict['year'], tm.event_dict['transcript_md']

# 3. update_readme()
tm.update_readme()

# 4. save_transcript_md()
tm.save_transcript_md()

# Events table held by the object after the update:
tm.df

('2020', '21-cat-chenal-foo-demo.md')

Unnamed: 0,N,Speaker,Talk Transcript,Transcriber,Status,Notes,year,name
0,1,Hugo Bowne-Anderson,Bayesian Data Science,N.A.,Not recorded,,N.A.,N.A.
1,2,Bruno Goncalves,Time Series Modeling,N.A.,Not recorded,,N.A.,N.A.
2,3,Ty Shaikh,[Webscraping Poshmark](2020/03-ty-shaikh-websc...,Ty Shaikh,Needs reviewer,,2020,03-ty-shaikh-webscraping.md
3,4,Ali Spittel,[Navigating Your Tech Career](2020/04-ali-spit...,Janine,Needs reviewer,,2020,04-ali-spittel-career.md
4,5,Andreas Mueller,[Crash Course in Contributing to Scikit-learn]...,Reshama Shaikh,Complete,,2020,05-andreas-mueller-contributing.md
5,6,Reshama Shaikh,[Example PR for Scikit-learn](2020/06-reshama-...,"Reshama Shaikh, Mark",Complete,,2020,06-reshama-shaikh-sklearn-pr.md
6,7,Shailvi Wakhlu,[Fixing Bad Data (Using SQL)](2020/07-shailvi-...,Juanita,Complete,,2020,07-shailvi-wakhlu-fixing-data.md
7,8,Matt Brems,[Data Science with Missing Data](2020/08-matt-...,Barbara Graniello Batlle,Needs reviewer,,2020,08-matt-brems-missing-data.md
8,9,Sam Bail,[Intro to Terminal](2020/09-sam-bail-terminal.md),Isaack,Complete,,2020,09-sam-bail-terminal.md
9,10,Emily Robinson,[Build a Career in Data Science](2020/10-emily...,Kevin,Complete,,2020,10-emily-robinson-career.md


In [None]:
# Get the events table and a second value, unpacked as `tbl_delims`.
# NOTE(review): presumably the table start/end markers of the README - confirm.
df, tbl_delims = Meta.df_from_readme_tbl()
df

---
# Test: update of existing transcript: need to keep same year & idn

## 5 steps to update transcript:
```
# 0. Instantiate existing event
year = 2021
idn = '06'  # or 6
tm = Meta.TranscriptMeta(idn, year)

# 1. Get dummy or actual data, keeping idn:
# Load the starter kv pairs
demo_kvs = Meta.dummy_kv_pairs.copy()
demo_kvs.append(('idn', idn))

# Modify at will
demo_kvs.append(('transcriber', 'sing song'))
# change kw to get new file:
demo_kvs[2] = (demo_kvs[2][0], 'better demo')

d = Meta.get_dummy_data(new_kv_pairs=demo_kvs)

# 2. update starter event_dict:
tm.update_dict(d)
tm.to_delete  # should not be None

# 3. update_readme()
tm.update_readme()

# 4. save_transcript_md()
tm.save_transcript_md()
```

In [119]:
# 0. Instantiate existing event
year = 2021
idn = '06'  # or 6
tm = Meta.TranscriptMeta(idn, year)


# 1. Get dummy or actual data, keeping idn:

# Load the starter kv pairs
demo_kvs = Meta.dummy_kv_pairs.copy()
# Pass the existing idn along with the new data:
demo_kvs.append(('idn', idn))
# Modify at will
demo_kvs.append(('transcriber', 'mae song'))
# change kw to get new file:
demo_kvs[2] = (demo_kvs[2][0], 'better demo')

d = Meta.get_dummy_data(new_kv_pairs=demo_kvs)
# Check that year and idn are the ones of the existing event:
d['year'], d['idn']

('2021', '06')

In [120]:
# 2. update starter event_dict:
tm.update_dict(d)
# Should not be None after this update (see the steps listed above):
tm.to_delete

WindowsPath('C:/Users/catch/Documents/GitHub/DU-event-transcript-demo/2021/06-cat-reshama-new-demo.md')

In [121]:
# Last two steps of the update of the existing event:

# 3. update_readme()
tm.update_readme()

# 4. save_transcript_md()
tm.save_transcript_md()

---
---
# DONE: Test: Trap missing tbl delimiters in README

---
---
# Audit: coverage of split_url
RE: regex in get_id_from_YT_url not working for all md files
## Q1: Is `parse_href` working? Yes
## Q2: Is video_url working? Yes
# Conclusion: Updated (fixed) regex in get_id_from_YT_url.

In [47]:
# Run the split_url test (prints its progress for the meetup and youtube cases):
UTL.test_split_url()

Test meetup...
Test youtube...
All done.


---
---
# Audit: which xml files are not lowercase?
- Answer by testing 1st paragraph => Modified `xml_caption_to_text` to obtain `Audit.audit_xml_captions`

## Audit conclusion:
The xml files have consistently been lowercase since event 12; hence, implementing a bypass of the text
cleaning for files that are not lowercase is not warranted (the corrections would still need to be applied,
but they would not be optimal for non-lowercase text unless special cases were added).

In [3]:
# Audit all events, with the captions case check enabled; replace_xml=True
# selects the replacement of the xml files (as reported in the printed header).
AUD.audit_all_events(audit_captions=True,
                     replace_xml=True)

AUDIT ALL EVENTS
* Replacement of xml files selected.
* Captions case check (on 1st P with minutes_mark= 1):
03, 2020:: Lower= False
everyone I am doing a recording of the scraping presentation and the original recording from the webinar didn't come out well so this is just a recording in today's presentation I'm going to talk about web scraping we're going to look at the website Poshmark comm and we're going to use Python and some additional packages to gather the data so agenda I'm gonna give a quick introduction about myself and the group then we're going to talk about web scraping and high level then we'll walk through a code example I'm going to share the code files so you can walk through it on your own as well and then during the webinar there was obviously QA it's a little bit about me I'm a product manager with General Assembly I used to run operations at an online data science bootcamp and that's kind of where I picked up everything I know about Python and programming and dat

---
---
# DONE: Test: Change GridspecLayout ("a regularly-spaced grid": missed that!) to GridBox

---
---
# DONE: Incorporate modification to propercasing files in Edit page + reprocess
**Note**: This _might_ disappear once punctuation is restored with an NLP model

---
---
# DONE: Added NullHandler: if not DEBUG_MODE, no Output widget created

---
---

In [81]:
# Build the app controls, then display the GUI they expose:
AC = CTR.AppControls()  # class, GUI controls instantiation
gui = AC.app            # AppLayout method
gui

Output(layout=Layout(border='1px solid black', height='160px', width='100%'))

AppLayout(children=(Accordion(children=(VBox(children=(ToggleButtons(button_style='info', options=('Enter Info…

In [65]:
# Clear the output of the first child of the center pane's second child.
# NOTE(review): presumably the Output widget of the page - confirm.
gui.center.children[1].children[0].clear_output()

Output(outputs=({'name': 'stdout', 'text': 'Update dict: OK!\nUpdate readme: OK!\nSave: Done!\n', 'output_type…

In [60]:
# Locate the input form inside the current page, then collect its entries
# into a dict:
input_form = AC.PC.page.children[1].children[0]

d = CTR.get_accordion_entries(input_form)
d

{'year': '2021',
 'presenter': 'Cat Chenal, Reshama Shaikh',
 'title': 'Automating Audio Tanscription.',
 'title_kw': 'audio demo',
 'video_url': 'https://youtu.be/MHAjCcBfT_A',
 'video_href': 'http://www.youtube.com/watch?feature=player_embedded&v=MHAjCcBfT_A',
 'video_href_src': 'http://img.youtube.com/vi/MHAjCcBfT_A/0.jpg',
 'video_href_alt': 'Automating Audio Tanscription.',
 'event_url': 'N.A.',
 'slides_url': 'N.A.',
 'repo_url': 'N.A.',
 'notebook_url': 'N.A.',
 'transcriber': 'Mama Chenal',
 'status': 'Not yet processed (editor needed)',
 'notes': 'N.A.',
 'extra_references': '## Other References\n- Paper:  <Paper url or citation> \n'}

In [79]:
# Data dict attached to the GUI object:
gui.data_dict

OrderedDict([('presenter', 'Cat Chenal, Reshama Shaikh'),
             ('title', 'Automating Audio Tanscription.'),
             ('event_url', 'N.A.'),
             ('yt_video_id', 'MHAjCcBfT_A'),
             ('slides_url', 'https://www.example.com'),
             ('repo_url', 'https://www.example.com'),
             ('notebook_url', 'N.A.'),
             ('transcriber', 'Bibi Chenal'),
             ('extra_references',
              '## Other References\n- Binder:  url \n- Paper:  Paper url or citation \n- Wiki:  This is an excellent [wiki](http://en.wikipedia.org/wiki/Main_Page) \n'),
             ('video_href',
              'http://www.youtube.com/watch?feature=player_embedded&v=MHAjCcBfT_A'),
             ('video_href_src', 'http://img.youtube.com/vi/MHAjCcBfT_A/0.jpg'),
             ('video_href_alt', 'Automating Audio Tanscription.'),
             ('video_href_w', '25%'),
             ('formatted_transcript',
              "<!-- Editing Guide: The pipe (|) position in this comm

In [50]:
# Index of the currently selected item of the left sidebar:
menu_idx = AC.app.left_sidebar.selected_index
# Reset the `index` of the first child of that item to None.
# NOTE(review): presumably this deselects the toggle buttons - confirm.
AC.app.left_sidebar.children[menu_idx].children[0].index = None

In [28]:
# Inspect the second widget of the page's second child. The direct `.value`
# echo below is left commented out -- presumably because it would print the
# whole text (the recorded length is 60133 characters).
#AC.PC.page.children[1].children[1].value
txa = AC.PC.page.children[1].children[1]
# Both bare expressions are displayed because the notebook's first cell sets
# InteractiveShell.ast_node_interactivity = 'all'.
type(txa)
len(txa.value)

ipywidgets.widgets.widget_string.Textarea

60133

In [31]:
T = """
'<!-- Editing Guide: The pipe (|) position in this comment is 120:                                                       | -->\n### Introduction\n\nOkay hello and welcome to Data Umbrella\'s webinar for October so I\'m just going to go over the agenda I\'m going to do a  \nbrief introduction then there will be the workshop by Hugo and James and you can ask questions along the way in the chat  \nor actually the best place to ask questions is the Q&A and there\'s an option to upvote as well so yet asking the Q&A if  \nyou happen to post it on the chat by mistake I can also transfer it over to Q&A so that would be fine too and this  \nwebinar is being recorded. Briefly about me. I am a statistician and data scientist and I am the founder of Data Umbrella.  \nI am on a lot of platforms as Reshama so feel free to follow me on Twitter and LinkedIn. We have a code of conduct we\'re  \ndedicated to providing a harassment-free experience for everyone. Thank you for helping to make this a welcoming friendly  \nprofessional community for all and this code of conduct applies to the chat as well. So our mission is to provide an  \ninclusive community for underrepresented persons in data science and we are an all volunteer-run organization you can  \nsupport Data Umbrella by doing the following things: You can follow our code of conduct and keep our community a place  \nwhere everybody wants to keep coming to; You can donate to our open collective and that helps to pay meet-up dues and  \nother operational costs and you can check out this link here on GitHub we have this new initiative where all the videos  \nare being transcribed and so is to make them more accessible. So we take the YouTube videos and we put the raw there and  \nso we\'ve had a number of volunteers help us transcribe it so feel free to check out this link and maybe if you do this  \nvideo maybe the two speakers will follow you on Twitter. 
I can\'t promise anything, but it\'s possible Data Umbrella has a  \njob board and it\'s at jobs.org and once this gets started I\'ll put some links in the chat. The job that we are  \nhighlighting today is the machine learning engineer job by Development Seed. Development Seed is based in  \nWashington DC and Lisbon Portugal and they do I\'m going to go to the next slide what they do is they\'re doing social  \ngood work and so they\'re doing for instance mapping elections from Afghanistan to the US analyzing public health and  \neconomic data from Palestine to Illinois and leading the strategy and development behind data world bank and some other  \norganizations. I will share a link to their job posting in the chat as well as soon as I finish this brief  \nintroduction. Check out our website for resources there\'s a lot of resources on learning Python and R also for  \ncontributing to open source also for guides on accessibility and responsibility and allyship. We have a monthly  \nnewsletter that goes out towards the end of the month and it has information on our upcoming events. We have two great  \nevents coming up in November and December on open source so subscribe to our newsletter to be in the know. We are on all  \nsocial media platforms as Data Umbrella Meetup is the best place to join to find out about upcoming events our website  \nhas resources follow us on Twitter we also share a lot of information on LinkedIn and if you want to subscribe to our  \nYouTube channel we record all of our talks and post them there within about a week of the talk so it\'s a good way to get  \ninformation. OK and now we are ready to get started so I will put myself on mute and I will hand it over to Hugo and James  \nand let you take over but thank you all for joining!   
\n \n\n#### 00:04:03,120::\t\t4 minutes Mark -> new paragraph \n \nI just want to thank Reshama Christina and everyone else who tied all the tireless effort that  \nthat goes into putting these meet-ups and these online sessions together I think one thing I want to say is actually  \nthe last in-person workshop I gave either at the end of February or early March was Data Umbrella\'s inaugural  \ntutorial and Meetup if I recall correctly on Bayesian thinking and hacker statistics and simulation and  \nthat type of stuff so it\'s just wonderful to be back particularly with my colleague and friend James we\'re  \nbuilding really cool distributed data science products at coiled we\'ll say a bit about that but we\'ll do some  \nintroductions in a bit I just wanted to get you all accustomed to it was February thank you Reshama we\'re working  \nwith Jupyter notebooks in a GitHub repository the repository is pinned to the top of the chat this is what it looks like  \nthese are all the files this is the file system now we use something called Binder which is a project out of and related  \nto project Jupyter which provides infrastructure to run notebooks without any local installs so there are two  \nways you can code along on this tutorial the first is and I won\'t get you to do this yet is to launch Binder the  \nreason I won\'t get you to do that yet is because once you launch it we have 10 minutes to start coding or the Binder  \nsession Times out I\'ve been burnt by that before actually several Times I\'m surprised I even remembered it this time the  \nother thing you can do is install everything locally by cloning the repository downloading anaconda creating a conda  \nenvironment if you haven\'t done that I suggest you do not do that now and you launch the Binder James is going to  \nstart by telling us a few things about GAs and distributed computing in general my question for you James is  \nif we get people to launch this now will we get to execute a cell code 
cell in 10 minutes I would let\'s hold off for now  \nmaybe yep maybe I\'ll indicate when we should launch Binder OK fantastic cool and just what I\'m looking at right now is  \nthe GitHub repository on your browser OK exactly so I will not launch Binder now I will not get you to now I\'ve I\'m  \ndoing this locally and we see that I\'m in notebook zero and if you want to actually have a look at this notebook before  \nlaunching Binder, it\'s in the notebooks Data Umbrella subdirectory and it\'s notebook zero and we\'re going to hopefully  \nmake it through the overview then chatting about Dask, Dask delayed and data framing and machine learning great so we  \nhave Hashim has said you could open in VSCode as well. You could. I mean that would require all your local installs and  \nthat that type of stuff as well but we\'re to introduce me and James we work at coiled where we build products for  \ndistributed computing in infrastructure as we\'ll see one of the big problems with like bursting to the cloud is all the  \nlike Kubernetes AWS docker stuff so we build a one-click host of deployments for das but for data science and machine  \nlearning in general James maintains task along with Matt Matt Rocklin who created Dask with a team of people and was  \nworking with Continuum Anaconda at the Time and James is a software engineer at coiled and I run data science evangelism  \nMarketing work on a bunch of product stuff as well wear a bunch of different hats occasionally.  
\n \n\n#### 00:08:01,680::\t\t4 minutes Mark -> new paragraph \n \nThere are many ways to think about distributed compute and how to do it in in Python we\'re going to present  \nhey James you\'re muted I\'m taking it I went away based on what I see in the chat you did you did but now we\'re back I\'ve  \nintroduced you I\'ve introduced me I\'ve mentioned that there are many ways to do distributed compute in the Python  \necosystem and we\'ll be chatting about one called Dask and maybe I\'ll pass you in a second but I\'ll say one thing that I  \nreally like about my background isn\'t in distributed compute my background\'s in Pythonic data science when thinking  \nabout bursting to larger data sets and larger models there are a variety of options the thing that took me attracted me  \nto desk originally I saw Cameron\'s note the ghost in the machine aren\'t playing nice tonight I think that ain\'t that the  \ntruth is that dark plays so nicely with the entire PyData ecosystem so as we\'ll see if you want to write dash code for  \ndata frames dash data frames it really mimics your Pandas code same with numpy same with scikit-learn OK and the other  \nthing is dark essentially runs the Python code under the hood so your mental model of what\'s happening is actually  \ncorresponds to the code being being executed OK now I\'d like to pass over to James but it looks like he\'s disappeared  \nagain I\'m still here if you can hear me I\'ve just turned my camera off oh yeah OK great I\'m gonna turn my camera  \nhopefully that will help yeah and I might do do the same for bandwidth bandwidth issues so if if you want to jump in and  \nand talk about dark at a high level I\'m sharing my screen and we can scroll through yeah that sounds great so that\'s  \nsort of a nutshell you can think of it as being composed of two main well components the first we call collections these  \nare the user interfaces that you use to actually construct a computation you would like to compute 
in parallel or on  \ndistributed hardware there are a few different interfaces that Dask implements. For instance, there\'s Dask array for doing  \nnd array computations there\'s Dask dataframe for working with tabular data you can think of those as like Dask array as  \na parallel version of numpy. Dask dataframe has a parallel version of Pandas and so on there are also a couple other  \ninterfaces that we\'ll be talking about das delayed for instance we\'ll talk about that today we\'ll also talk about the  \nfutures API those are sort of for lower level custom algorithms in sort of paralyzing existing existing code the main  \ntakeaway is that there are several sort of familiar APIs that desk implements and that will use today to actually  \nconstruct your computation so that\'s the first part of desk it is these dash collections you then take these collections  \nset up your steps for your computation and then pass them off to the second component which are desk schedulers and  \nthese will actually go through and execute your computation potentially in parallel there are two flavors of schedulers  \nthat desk offers the first is a are called single machine schedulers and these just take advantage of your local  \nhardware they will spin up a a local thread or process pool and start submitting tasks in your computation to to be  \nexecuted in parallel either on multiple threads or multiple processes there\'s also a distributed scheduler or maybe a  \nbetter term for would actually be called the advanced scheduler because it works well on a single machine but it also  \nscales out to multiple machines so for instance as you\'ll see later we will actually spin up a distributed scheduler  \nthat has workers on remote  \n \n\n#### 00:12:00,160::\t\t4 minutes Mark -> new paragraph \n \nmachines on AWS so you can actually scale out beyond your local resources like say what\'s on your laptop kind of  \nscrolling down then to the image of the cluster we can see the main 
components of the distributed scheduler and James I  \nmight get people to spin up the Binder now because we\'re going to execute codes now is a good point yep so just here\'s a  \nquick break point before you know a teaser for schedulers and what\'s happening there I\'ll ask you to in the repository  \nthere\'s also the link to the Binder click on launch Binder I\'m going to open it in a new tab and what this will create  \nis an environment in which you can just execute the code in in the notebooks OK so hopefully by the Time we\'ve gotten  \ngone through this section this will be ready to start executing code so if everyone wants to do that to code along  \notherwise just watch or if you\'re running things locally also cool thanks James yeah yeah no problem thank you so so  \nyeah looking at the image for the distributed scheduler we\'re not gonna have Time to go into the a lot of detail about  \nthe distributed scheduler in this workshop so but we do want to provide at least a high level overview of the the  \ndifferent parts and components of the distributed scheduler so the first part I want to talk about is in the diagram  \nwhat\'s labeled as a client so this is the user facing entry point to a cluster so wherever you are running your Python  \nsession that could be in a Jupyter lab session like we are here that could be in a Python script somewhere you will  \ncreate and instantiate a client object that connects to the second component which is the das scheduler so each desk  \ncluster has a single scheduler in it that sort of keeps track of all of the state for all of the the state of your  \ncluster and all the tasks you\'d like to compute so from your client you might start submitting tasks to the cluster the  \nschedule will receive those tasks and compute things like all the dependencies needed for that task like say you\'re  \nimplementing you say you want to compute task c but that actually requires first you have to compute task b and task a  
\nlike there are some dependency structures there it\'ll compute those dependencies as well as keep track of them it\'ll  \nalso communicate with all the workers to understand what worker is working on which task and as space frees up on the  \nworkers it will start farming out new tasks to compute to the workers so in this particular diagram there are three das  \ndistributed workers here however you can have as you can have thousands of workers if you\'d like so the workers are the  \nthings that actually compute the tasks they also store the results of your tasks and then serve them back to you and the  \nclient the scheduler basically manages all the state needed to perform the computations and you submit tasks from the  \nclient so that\'s sort of a quick whirlwind tour of the different components for the distributed scheduler and at this  \npoint I think it\'d be great to actually see see some of this in action Hugo would like to take over absolutely thank you  \nfor that wonderful introduction to Dask and and the schedulers in particular and we are going to see that with dark in  \naction I\'ll just note that this tab in which I launched the Binder is up and running if you\'re going to execute code  \nhere click on notebooks click on Data Umbrella oop and then go to the overview notebook and you can drag around we\'ll  \nsee the utility of these these dashboards in a second but you can you know drag your stuff around to to make you know  \nhowever you want to want to structure it and then you can execute code in here I\'m not going to do that I\'m going to do  \nthis locally at the moment but just to see dust in action to begin with I\'m going to I\'m actually going to  \n \n\n#### 00:16:02,720::\t\t4 minutes Mark -> new paragraph \n \nrestart kernel and clear my outputs so I\'m going to import from dash distributed the client the sorry the other thing I  \nwanted to mention is we made a decision around content for this we do have a notebook that we we love 
to teach on  \nschedulers but we decided to switch it out for machine learning for this workshop in particular we are teaching a  \nsimilar although distinct workshop at PyData global so we may see some of you there in which we\'ll be going more in  \ndepth into schedulers as well so if you want to check that out definitely do so we instantiate the client which as James  \nmentioned is kind of what we work with as the user to submit our code so that will take take a few seconds OK it\'s got a  \nport in you so it\'s going going elsewhere what I\'ll just first get you to notice is that it tells us where our dashboard  \nis and we\'ll see those tools in a second tells us about our cluster that we have four workers eight cores between eight  \nand nine gigs of of ram OK now this is something I really love about Dask all the diagnostic tools if I click on the  \nlittle desk thing here and we\'ve modified the Binder so that that exists there as well we can see I\'ll hit search and it  \nshould that now corresponds to the the scheduler now I want to look at the task stream which will tell us in real Time  \nwhat\'s happening I also want to look at the cluster map so we see here this is already really cool we\'ve got all of our  \nworkers around here and our scheduler scheduler there and when we start doing some compute we\'ll actually see  \ninformation flowing between these and the other thing maybe I\'ll yeah I\'ll include a little progress and that can be an  \nalternate tab to ask I\'m wondering perhaps I also want to include something about the workers yeah OK great so we\'ve got  \na bunch of stuff that\'s that\'s pretty interesting there and so the next thing I\'m going to do we\'ve got a little utility  \nfile which downloads some of the data and this is what it does is if you\'re in Binder it downloads a subset of the data  \nif you\'re anywhere else it loads a larger set for this particular example we\'re dealing with a small data set you see  \nthe utility of 
dark and distributed compute when it generalizes to larger data sets but for pedagogical purposes we\'re  \ngoing to sit with a smaller data set so that we can actually run run the code there\'s a trade-off there so actually that  \nwas already downloaded it seems but you should all see it download I\'m actually going to run that in the Binder just to  \nyou should start seeing downloading NYC flights data set done extracting creating json data etc OK now what we\'re going  \nto do is we\'re going to read in this data as a Dask data frame and what I want you to notice is that it really the das  \ncode mimics Pandas code so instead of pd read csv we\'ve got dd read csv we\'ve got you know this is the file path the  \nfirst argument we\'re doing some parse date setting some data types OK we\'ve got a little wild card regular expression  \nthere to to join to do a bunch of them and then we\'re performing a group by OK so we\'re grouping by the origin of these  \nflight flight data we\'re looking at the the mean departure delay group by origin the the one difference I want to make  \nclear is that in das we need a compute method that\'s because das performs lazy computation it won\'t actually do anything  \nbecause you don\'t want it to do anything on really large data sets until you explicitly tell it tell it to compute so  \nI\'m going to execute this now and we should see some information  \n \n\n#### 00:20:01,520::\t\t4 minutes Mark -> new paragraph \n \ntransfer between the scheduler and the workers and we should see tasks starting starting to be done OK so moment of  \ntruth fantastic so we call this a pew pew plot because we see pew pew pew we saw a bunch of data transfer happening  \nbetween them these are all our cause and we can see tasks happening it tells us what tasks there are we can see that  \nmost of the Time was spent reading csvs then we have some group bias on chunks and and that type of stuff so  \nthat\'s a really nice diagnostic tool to see what 
most of your work is is actually doing under dark work as you can see  \nmemory used CPU use more fine-grained examples there so I I\'d love to know if in the Q&A I\'m going to ask were you able  \nto execute this code and if you were in Binder just a thumb up a vote would be no would be fantastic much appreciated so  \nas we\'ve mentioned I just wanted to say a few things about tutorial goals the goal is to cover the basics of dark and  \ndistributed compute we\'d love for you to walk away with an understanding of when to use it when to not what it has to  \noffer we\'re going to be covering the basics of Dask delayed which although not immediately applicable to data science  \nprovides a wonderful framework for thinking about Dask how dark works and understanding how it works under the hood then  \nwe\'re going to go into dark data frames and then machine learning hopefully due to the technical considerations with  \nwe\'ve got less Time than than we thought we would but we\'ll definitely do the best we can we may have less Time to do  \nexercises so we\'ve had two people who are able to execute this code if you if you tried to execute it in Binder and were  \nnot able to perhaps post that in the Q&A but we also have several exercises and I\'d like you to take a minute just to do  \nthis exercise the I I\'m not asking you to do this because I want to know if you\'re able to print hello world I\'m  \nessentially asking you to do it so you get a sense of how these exercises work so if you can take 30 seconds to print  \nhello world then we\'ll we\'ll move on after that so just take 30 seconds now and it seems like we have a few more people  \nwho are able to execute code which which was great OK fantastic so you will put your solution there for some reason I  \nhave an extra cell here so I\'m just going to clip that and to see a solution I\'ll just get you to execute this cell and  \nit provides the solution and then we can execute it and compare it to the the 
output of what you had OK hello world so  \nas as we saw I\'ve done all this locally you may have done it on Binder there is an option to work directly from the  \ncloud and I\'ll I\'ll take you through this there are many ways to do this as I mentioned we\'re working on one way with  \ncoil and I\'ll explain the rationale behind that in in a second but I\'ll show you how easy it is to get a cluster up and  \nrunning on on AWS without even interacting with AWS for free for example you can follow along by signing into coiled  \ncloud to be clear this is not a necessity and it does involve you signing up to our product so I just wanted to be  \nabsolutely transparent about that it does not involve any credit card information or anything  \n \n\n#### 00:24:01,520::\t\t4 minutes Mark -> new paragraph \n \nalong those lines and in my opinion it does give a really nice example of how to run stuff on the cloud to do so you can  \nsign in at cloud.coiled.io you can also pip install coiled and then do authentication you can also spin up this  \nthis hosted coiled notebook so I\'m going to spin that up now and I\'m going to post that here actually yep I\'m gonna post  \nthat in the ch chat if you let me get this right if you\'ve if you\'ve never logged in to code before it\'ll ask you to  \nsign up using gmail or GitHub so feel free to do that if you\'d like if not that\'s also also cool but I just wanted to be  \nexplicit about that the reason I want to do this is to show how Dask can be leveraged to do work on really large datasets  \nso you will recall that I had between eight and nine gigs of ram on my local system. Oh wow! Anthony says on iPad  \n"unable to execute" on Binder, incredible! I don\'t have a strong sense of how Binder works on iPad. 
I do know that I was able  \nto to check to use a Binder on my iPhone several years ago on my way to scipy doing code review for someone for Eric  \nMaher I think for what that that\'s worth but back to this we have this NYC taxi data set which is over 10 gigs it won\'t  \neven I can\'t even store that in local memory I don\'t have enough ram to store that so we do need either to do it locally  \nin an out of core mode of some sort or we can we can burst to the cloud and we\'re actually going to burst to the cloud  \nusing using coiled so the notebook is running here for me and but I\'m actually gonna do it from my local local notebook  \nbut you\'ll see and once again feel free to code along here it\'s spinning up a notebook and James who is is my co-  \ninstructor here is to be I\'m I\'m so grateful all the work is done on our notebooks in coiled you can launch the cluster  \nhere and then analyze the entire over 10 gigs of data there I\'m going to do it here so to do that I import coiled and  \nthen I import the dash distributed stuff and then I can create my own software environment cluster configuration I\'m not  \ngoing to do that because the standard coiled cluster configuration software environment works now I\'m going to spin up a  \ncluster and instantiate a client now because we\'re spinning up a cluster in in the cloud it\'ll take it\'ll take a minute  \na minute or two enough Time to make a cup of coffee but it\'s also enough Time for me to just talk a bit about why this  \nis important and there are a lot of a lot of good good people working on on similar things but part of the motivation  \nhere is that if you want to you don\'t always want to do distributed data science OK first I\'d ask you to look at instead  \nof using dark if you can optimize your Pandas code right second I\'d ask if you\'ve got big data sets it\'s a good question  \ndo you actually need all the data so I would if you\'re doing machine learning plot your learning curve see how 
accurate  \nsee how your accuracy or whatever your metric of interest is improves as you increase the amount of data right and if it  \nplateaus before you get to a large data size then you may as well most of the Time use your small data see if sub  \nsampling can actually give you the results you need so you can get a bigger bigger access to a bigger machine so you  \ndon\'t have to burst to the cloud but after all these things if you do need to boast burst to the cloud until recently  \nyou\'ve had to get an AWS account you\'ve had to you know set up containers with docker and or Kubernetes and do all of  \nthese kind of  \n \n\n#### 00:28:00,640::\t\t4 minutes Mark -> new paragraph \n \nI suppose devopsy software engineering foo stuff which which if you\'re into that I I absolutely encourage you encourage  \nyou to do that but a lot of working data scientists aren\'t paid to do that and I don\'t necessarily want to so that\'s  \nsomething we\'re working on is thinking about these kind of one-click hosted deployments so you don\'t have to do all of  \nthat having said that I very much encourage you to try doing that stuff if if you\'re interested we\'ll see that the the  \ncluster has just been created and what I\'m going to do we see that oh I\'m sorry I\'ve done something funny here I\'m I\'m  \nreferencing the previous client anna James yeah it looks like you should go ahead and connect a new client to the coil  \ncluster and making sure not to re-execute the cluster creation exactly so would that be how would I what\'s the call here  \nI would just open up a new cell and say client equals capital client and then pass in the cluster like open parentheses  \ncluster yeah great OK fantastic and what we\'re seeing is a slight version this we don\'t need to worry about this this is  \nessentially saying that the environment on the cloud mis is there\'s a slight mismatch with my with my local environment  \nwe\'re fine with that I\'m going to look here for a 
certain reason the the dashboard isn\'t quite working here at the  \nmoment James would you suggest I just click on this and open a new yeah click on the ecs dashboard link oh yes fantastic  \nso yep there\'s some bug with the local dashboards that we\'re we\'re currently currently working on but what we\'ll see now  \njust a SEC I\'m going to remove all of this we\'ll see now that I have access to 10 workers I have access to 40 cores and  \nI have access to over 170 gigs of memory OK so now I\'m actually going to import this data set and it\'s the entire year  \nof data from 2019 and we\'ll start seeing on on the diagnostics all the all the processing happening OK so oh actually  \nnot yet because we haven\'t called compute OK so it\'s done this lazily we\'ve imported it it shows kind of like Pandas  \nwhen you show a data frame the column names and data types but it doesn\'t show the data because we haven\'t loaded it yet  \nit does tell you how many partitions it is so essentially and we\'ll see this soon das data frames correspond to  \ncollections of Pandas data frames so they\'re really 127 Pandas data frames underlying this task data frame so now I\'m  \ngoing to do the compute well I\'m going to set myself up for the computation to do a group by passenger gown and look at  \nthe main tip now that took a very small amount of Time we see the IPython magic Timing there because we haven\'t computed  \nit now we\'re actually going to compute and James if you\'ll see in the chat Eliana said her coil coiled authentication  \nfailed I don\'t know if you\'re able to to help with that but if you are that would be great and it may be difficult to  \ndebug in but look as we see we have the task stream now and we see how many you know we\'ve got 40 cores working together  \nwe saw the processing we saw the bytes stored it\'s over 10 gigs as I said and we see we were able  \n \n\n#### 00:32:01,519::\t\t4 minutes Mark -> new paragraph \n \nto do our basic analytics we 
were able to do it on a 10 plus gig data set in in 21.3 seconds which is pretty pretty  \nexceptional if any any code based issues come up and they\'re correlated in particular so if you have questions about the  \ncode execution please ask in the Q&A not in the chat because others cannot vote it and I will definitively prioritize  \nquestions on technical stuff particularly ones that up that are upvoted but yeah I totally agree thanks thanks very much  \nso yeah let\'s jump into into data frames so of course we write here that in the last exercise we used ask delayed to  \nparallelize loading multiple csv files into a Pandas DataFrame we\'re not we we haven\'t done that but you can definitely  \ngo through and have a look at that but I think perhaps even more immediately relevant for a data science crowd and an  \nanalytics crowd is which is what I see here from the reasons people people have joined is jumping into Dask dataframes  \nand as I said before, a Dask dataframe really feels like a Pandas data frame but internally it\'s composed of many  \ndifferent data frames this is one one way to think about it that we have all these Pandas data frames and the  \ncollection of them is a dark data frame and as we saw before they\'re partitioned we saw when we loaded the taxi data set  \nin the dash data frame was 127 partitions right where each partition was a normal panda Pandas data frame and they can  \nlive on disk as they did early in the first example dark in action or they can live on other machines as when I spun up  \na coiled cluster and and did it on on AWS something I love about Dask data frames I mean I ran about this all the  \ntime it\'s how it\'s the Pandas API and and Matt Matt Rocklin actually has a post on on the blog called a brief history of  \nDask in which he talks about the technical goals of us but also talks about a social goal of task which in Matt\'s words  \nis to invent nothing he wanted and the team wanted the Dask API to be as 
comfortable and familiar for users as possible  \nand that\'s something I really appreciate about it so we see we have element element wires on operations we have the our  \nfavorite row eyes selections we have loc we have the common aggregations we saw group buyers before we have is-ins we  \nhave date Time string accessors oh James we forgot to I forgot to edit this and I it should be grouped by I don\'t know  \nwhat what a fruit buy is but that\'s something we\'ll make sure the next iteration to to get right at least we\'ve got it  \nright there and in the code but have a look at the dash data frame API docs to check out what\'s happening and a lot of  \nthe Time dash data frames can serve as drop in replacements for Pandas data frames the one thing that I just want to  \nmake clear as I did before is that you need to call compute because of the lazy laser compute property of das so this is  \nwonderful to talk about when to use data frames so if your data fits in memory use Pandas if your data fits in memory  \nand your code doesn\'t run super quickly I wouldn\'t go to Dask I\'d try to I\'d do my best to optimize my Pandas code  \nbefore trying to get gains gains and efficiency but dark itself becomes useful when the data set you want to analyze is  \nlarger than your machine\'s ram where you normally run into memory errors and that\'s what we saw  \n \n\n#### 00:36:01,520::\t\t4 minutes Mark -> new paragraph \n \nwith the taxicab example the other example that we\'ll see when we get to [music] machine learning is you can do machine  \nlearning on a small data set that fits in memory but if you\'re building big models or training over like a lot of  \ndifferent hyper parameters or different types of models you can you can parallelize that using using dark so there is  \nyou know you want to use dash perhaps in the big data or medium to big data limit as we see here or in the medium to big  \nmodel limit where training for example takes and takes a lot of Time 
OK so without further ado let\'s get started with  \ndas data frames you likely ran this preparation file to get the data in the previous notebook but if you didn\'t execute  \nthat now we\'re going to get our file names by doing doing a few joins and we see our file is a string data NYC flights a  \nwildcard to access all of them dot dot csv and we\'re going to import our Dask.dataframe and read in our dataframe  \nparsing some dates setting some sending some data types OK I\'ll execute that we\'ll see we have 10 partitions as we noted  \nbefore if this was a Pandas data frame we\'d see a bunch of entries here we don\'t we see only the column names and the  \ndata types of the columns and the reason is as we\'ve said it explicitly here is the representation of the data frame  \nobject contains no data it\'s done Dask has done enough work to read the start of the file so that we know a bit about it  \nsome of the important stuff and then further column types and column names and data types OK but we don\'t once again we  \ndon\'t let\'s say we\'ve got 100 gigs of data we don\'t want to like do this call and suddenly it\'s reading all that stuff  \nin and doing a whole bunch of compute until we explicitly tell it to OK now this is really cool if you know a bit of  \nPandas you\'ll know that you can there\'s an attribute columns which prints out it\'s well it\'s actually the columns form  \nan index right the Pandas index object and we get the we get the column names there cool Pandas in dark form we can  \ncheck out the data types as well as we would in Pandas we see we\'ve got some ins for the day of the week we\'ve got some  \nfloats for departure Time maybe we\'d actually prefer that to be you know a date Time at some point we\'ve got some  \nobjects which generally are the most general on objects so generally strings so that\'s all Pandasey type stuff in  \naddition das data frames have an attribute n partitions which tells us the number of partitions and we saw 
before that  \nthat\'s 10 so I\'d expect to see 10 here hey look at that now this is something that we talk about a lot in the delayed  \nnotebook is really the task graph and I don\'t want to say too much about that but really it\'s a visual schematic of of  \nthe order in which different types of compute happen OK and so the task graph for read csv tells us what happens when we  \ncall compute and essentially it reads csv 10 ten Times zero indexed of course because Python it reads csv ten different  \nTimes into these ten different Pandas Pandas data frames and if there were group buys or stuff after that we\'d see them  \nhappen in in the in the graph there and we may see an example of this in a second so once again as with Pandas we\'re  \ngoing to view the the head of the data frame great and we see a bunch of stuff you know we we see the first first five  \nrows I\'m actually also gonna gonna have a look at the  \n \n\n#### 00:40:02,240::\t\t4 minutes Mark -> new paragraph \n \nthe tail the final five rows that may take longer because it\'s accessing the the final I I there\'s a joke and it may not  \neven be a joke how much data analytics is actually biased by people looking at the first five rows before actually you  \nknow interrogating the data more more seriously so how would all of our results look different if if our files were  \nordered in in a different way that\'s another conversation for a more philosophical conversation for another Time so now  \nI want to show you some computations with dark data frames OK so since dash data frames implement a Pandas like API we  \ncan just write our familiar Pandas codes so I want to look at the column departure delay and look at the maximum of that  \ncolumn I\'m going to call that max delay so you can see we\'re selecting the column and then applying the max method as we  \nwould with Pandas. 
Oh what happened there gives us some Dask scalar series and what\'s happened is we haven\'t called compute  \nright so it hasn\'t actually done the compute yet we\'re going to do compute but first we\'re going to visualize the task  \ngraph like we did here and let\'s try to reason what the task graph would look like right so the task graph first it\'s  \ngoing to read in all of these things and then it\'ll probably perform this selector on each of these different Pandas  \ndata frames comprising the dash data frame and then it will compute the max of each of those and then do a max on all  \nthose maxes I think that\'s what I would assume is happening here great so that\'s what we\'re what we\'re doing we\'re  \nreading this so we read the first perform the first read csv get this das data frame get item I think is that selection  \nthen we\'re taking the max we\'re doing the same for all of them then we take all of these max\'s and aggregate them and  \nthen take the max of that OK so that that\'s essentially what\'s happening when I call compute which I\'m going to do now  \nmoment of truth OK so that took around eight seconds and it tells us the max and I I\'m sorry let\'s let\'s just get out  \nsome of our dashboards up as well huh I think in this notebook we are using the single machine scheduler Hugo so I don\'t  \nthink there is a dashboard to be seen exactly yeah thank you for that that that catch James great is even better James  \nwe have a question around using dark for reinforcement learning can you can you speak to that yeah so it depends on this  \nI mean yeah short answer yes you can use GAs to train reinforcement learning models so there\'s a package that Hugo will  \ntalk about called Dask ML that we\'ll see in the next notebook for distributing machine learning that paralyzes and and  \ndistributes some existing models using desks so for instance things like random forces forest inside kit learn so so yes  \nyou can use das to do distributed 
training for models I\'m not actually sure if Dask ML implements any reinforcement  \nlearning models in particular but that is certainly something that that can be done yeah and I\'ll I\'ll build on that by  \nsaying we are about to jump into machine  \n \n\n#### 00:44:00,000::\t\t4 minutes Mark -> new paragraph \n \nlearning I don\'t think as James said I don\'t think there\'s reinforcement learning explicitly that that one can do but  \nyou of course can use the das scheduler yourself to you know to distribute any reinforcement learning stuff you you have  \nas well and that\'s actually another another point to make that maybe James can speak to a bit more is that the dark team  \nof course built all of these high-level collections and task arrays and dust data frames and were pleasantly surprised  \nwhen you know maybe even up to half the people using dust came in all like we love all that but we\'re going to use the  \nscheduler for our own bespoke use cases right yeah exactly yeah the original intention was to like make basically a num  \nlike a parallel numpy so that was like the desk array stuff like run run numpy and parallel on your laptop and and yeah  \nso in order to do that we ended up building a distributed scheduler which sort of does arbitrary task computations so  \nnot just things like you know parallel numpy but really whatever you\'d like to throw at it and it turns out that ended  \nup being really useful for people and so yeah now people use that sort of on their own just using the distributed  \nscheduler to do totally custom algorithms in parallel in addition to these like nice collections like you saw Hugo  \npresents the dash data frame API is you know the same as the panda\'s API so there is this like familiar space you can  \nuse things like the high-level collections but you can also run whatever custom like Hugo said bespoke computations you  \nmight have exactly and it\'s it\'s been wonderful to see so many people so many people do 
that and the first thing as  \nwe\'ll see here the first thing to think about is if if you\'re doing lifestyle compute if there\'s anything you can you  \nknow parallelize embarrassingly as they say right so just if you\'re doing a hyper parameter search you just run some on  \none worker and some on the other and there there\'s no interaction effect so you don\'t need to worry about that as  \nopposed to if you\'re trying to do you know train on streaming data where you may require it all to happen on on on the  \nsame worker OK yeah so even think about trying to compute the standard deviation of a of a a univariate data set right  \nin in that case you can\'t just send you can\'t just compute the standard deviation on two workers and then combine the  \nresult in some some way you need to do something slightly slightly more nuanced and slightly slightly clever more clever  \nI mean you still can actually in in that case but you can\'t just do it as naively as that but so now we\'re talking about  \nparallel and distributed machine learning we have 20 minutes left so this is kind of going to be a whirlwind tour but  \nyou know whirlwinds when safe exciting and informative I just want to make clear the material in this notebook is based  \non the open source content from Dask\'s tutorial repository as there\'s a bunch of stuff we\'ve shown you today the reason  \nwe\'ve done that is because they did it so well so I just want to give a shout out to all the das contributors OK so what  \nwe\'re going to do now is just break down machine learning scaling problems into two categories just review a bit of  \nscikit-learn in passing solve a machine learning problem with single Michelle single Michelle I don\'t know who she is  \nbut single Michelle wow single machine and parallelism with scikit-learning joblib then solve an l problem with an ML  \nproblem with multiple machines and parallelism using dark as well and we won\'t have Time to burst for the cloud I don\'t  
\nthink but you can also play play around with that OK so as I mentioned before when thinking about distributed compute a  \nlot of people do it when they have large data they don\'t necessarily think about the large model limit and this  \nschematic kind of speaks to that if you\'ve got a small model that fits in ram you don\'t need to think about  \n \n\n#### 00:48:00,480::\t\t4 minutes Mark -> new paragraph \n \ndistributed compute if your data size if your data is larger than your ram so your computer\'s ram bound then you want to  \nstart going to a distributed setting or if your model is big and CPU bound such as like large-scale hyper-parameter  \nsearches or like ensemble blended models of like machine learning algorithms whatever it is and then of course we have  \nthe you know big data big model limit where distributed computer desk is incredibly handy as I\'m sure you could imagine  \nOK and that\'s really what I\'ve what I\'ve gone through here a bird\'s-eye view of the strategies we think about if it\'s in  \nmemory in the bottom left quadrant just use scikit-learn or your favorite ML library otherwise known as scikit-learn  \nfor me anyway I was going to make a note about XGBoost but I but I won\'t for large models you can use joblib and your  \nfavorite circuit learn estimator for large data sets use our dark ML estimators so we\'re gonna do a whirlwind tour of  \nscikit-learn in in five minutes we\'re going to load in some data so we\'ll actually generate it we\'ll import scikit-  \nlearn for our ML algorithm create an estimator and then check the accuracy of the model OK so once again I\'m actually  \ngoing to clear all outputs after restarting the kernel OK so this is a utility function of scikit-learn to create some  \ndata sets so I\'m going to make a classification data set with four features and 10 000 samples and just have a quick  \nview of some of it so just a reminder on ML x is the samples matrix the size of x is the number of samples in 
terms of  \nrows number of features as columns and then a feature or an attribute is what we\'re trying to predict essentially OK so  \nwhy is the predictor variable which we\'re where which we\'re or the target variable which we\'re trying to predict so  \nlet\'s have a quick view of why it\'s zeros and ones in in this case OK so yep that\'s what I\'ve said here why are the  \ntargets which are real numbers for regression tasks or integers for classification or any other discrete sets of values  \nno words about unsupervised learning at the moment we\'re just going to support we\'re going to fit a support vector  \nclassifier for this example so let\'s just load the appropriate scikit-learn module we don\'t really need to discuss what  \nsupport vector classifiers are at the moment now this is one of the very beautiful things about the scikit-learn API in  \nterms of fitting the the model we instantiate a classifier and we want to fit it to the features with respect to the  \ntarget OK so the first argument is the features second argument is the target variable so we\'ve done that now I\'m not  \ngoing to worry about inspecting the learn features I just want to see how accurate it was OK and once we see how  \naccurate it was I\'m not gonna do this but then we can make a prediction right using estimator dot predict on a new a new  \ndata set so this estimator will tell us so this score will tell us the accuracy and essentially that\'s the proportion or  \npercentage a fraction of the results that were that the estimator got correct and we\'re doing this on the training data  \nset we\'ve just trained the model on this so this is telling us the accuracy on the on the training data set OK so it\'s  \n90  \n \n\n#### 00:52:01,760::\t\t4 minutes Mark -> new paragraph \n \naccurate on the training data set if you dive into this a bit more you\'ll recognize that if we we really want to know  \nthe accuracy on a holdout set or a test set and it should be probably a bit 
lower because this is what we use to fit it  \nOK but all that having been said I expect you know if if this is all resonating with you it means we can really move on  \nto the distributed stuff in in a second but the other thing that that\'s important to note is that we\'ve trained it but a  \nlot of model a lot of estimators and models have hyper parameters that affect the fit but you that we need to specify up  \nfront instead of being learned during training so you know there\'s a c parameter here there\'s a are we using shrinking  \nor not so we specify those we didn\'t need to specify them because there are default values but here we specify them OK  \nand then we\'re going to look at the score now OK this is amazing we\'ve got 50 accuracy which is the worst score possible  \njust think about this if if you\'ve got binary classification task and you\'ve got 40 accuracy then you just flip the  \nlabels and that changes to 60 accuracy so it\'s amazing that we\'ve actually hit 50 accuracy we\'re to be congratulated on  \nthat and what I want to note here is that we have two sets of hyper parameters we\'ve used one\'s created 90 actual model  \nwith 90 accuracy another one one with 50 accuracy so we want to find the best hyper parameters essentially and that\'s  \nwhy hyper parameter optimization is is so important there are several ways to do hyper parameter optimization one is  \ncalled grid search cross validation I won\'t talk about cross validation it\'s essentially a more robust analogue of train  \ntest split where you train on a subset of your data and compute the accuracy on a test on a holdout set or a test set  \ncross validation is a as I said a slightly more robust analog of this it\'s called grid search because we have a grid of  \nhyper parameters so we have you know in this case we have a hyper parameter c we have a hyper parameter kernel and we  \ncan imagine them in a in a grid and we\'re performing we\'re checking out the score over all this gr 
over this entire grid  \nof hyper parameters OK so to do that I import grid search csv now I\'m going to compute the estimator over over these  \ntrain the estimator over over this grid and as you see this is taking Time now OK and what I wanted to make clear and I  \nthink should be becoming clearer now is that if we have a large hyper parameter sweep we want to do on a small data set  \ndas can be useful for that OK because we can send some of the parameters to one worker some to another and they can  \nperform them in parallel so that\'s embarrassingly parallel because you\'re you\'re doing the same work as you would  \notherwise but sending it to a bunch of different workers we saw that took 30 seconds which is in my realm of comfort as  \na data scientist I\'m happy to wait 30 seconds if I had to wait much longer if this grid was bigger I\'d start to get  \nprobably a bit frustrated but we see that it computed it for c is equal to all combinations of these essentially OK so  \nthat\'s really all I wanted to say there and then we can see the best parameters and the best score so the best score was  \n0.098 and it was c10 and the kernel rbf a radial basis function it doesn\'t even Matter what that is though for the  \npurposes of this so we\'ve got 10 minutes left we\'re going to we\'re going to make it I can feel it I have a good I have a  \ngood sense  \n \n\n#### 00:56:00,400::\t\t4 minutes Mark -> new paragraph \n \n a good after the I mean this demo is actually going incredibly well given the initial technical hurdles so touchwood  \nHugo OK so what we\'ve done is we\'ve really segmented ML scaling problems into two categories CPU bound and ram bound and  \nI I really I can\'t emphasize that enough because I see so many people like jumping in to use new cool technologies  \nwithout perhaps taking it being a bit mindful and intentional about it and reasoning about when things are useful and  \nand when not I suppose the one point there is that sure data 
science is a technical discipline but there are a lot of  \nother aspects to it involving this type of reasoning as well so we then carried out a typical sklearn workflow for ML  \nproblems with small models and small data and we reviewed hyper parameters and hyper parameter optimization so in this  \nsection we\'ll see how joblib which is a set of tools to provide lightweight pipelining in Python gives us parallelism  \non our laptop and then we\'ll see how dark ML can give us awesome parallelism on on clusters OK so essentially what I\'m  \ndoing here is I\'m doing exactly the same as above with a grid search but I\'m using the quark the keyword argument n jobs  \nwhich tells you how many tasks to run in parallel using the cause available on your local workstation and specifying  \nminus one jobs means the it just runs them the maximum possible OK so I\'m going to execute that great so we should be  \ndone in a second feel free to ask any questions in the chat oh Alex has a great question in the Q&A does das have see a  \nsequel and query optimizer I\'m actually so excited that [music] and James maybe you can provide a couple of links to  \nthis we\'re really excited to have seen dark dust SQL developments there recently so that\'s dark hyphen hyphen SQL and  \nwe\'re actually we\'re working on some some content and a blog post and maybe a live live coding session about that in in  \nthe near future so if anyone if you want updates from from coil feel free to go to our website and sign up for our  \nmailing list and we\'ll let you know about all of this type of stuff but the short answer is yes Alex and it\'s getting  \nbetter and if James is able to post post a link there that would be that would be fantastic so we\'ve done link in the  \nchat fantastic [music] and so we\'ve we\'ve seen how we have [music] single machine parallelism here using the using the  \nend jobs quark and in the final minutes let\'s see multiple multi-machine parallelism with Dask OK so 
what I\'m going to  \ndo is I\'m going to do my imports and create my client incentive my client and check it out OK so once again I\'m working  \nlocally I hit search and that\'ll task is pretty smart in terms of like knowing which which client I want to check out do  \nthe tasks stream because it\'s my favorite I\'ll do the cluster map otherwise known as the pew pew map and then I want  \nsome progress we all we all crave progress don\'t we and maybe my workers tab OK great so we\'ve got that up and running  \nnow I\'m going to do a slightly larger hyper parameter search OK so remember we had just a couple for c a couple for  \nkernel we\'re going to do more we have some for shrinking now I\'m actually going to comment that out because I don\'t know  \nhow long that\'s going to take if you\'re coding them on Binder now this May actually take far far too long for you but  \nwe\'ll we\'ll see so I\'ll execute this code and we should see just sick no we shouldn\'t see any work happening yet but  \nwhat I\'m doing here is oh looks like OK my clusters back up great we\'re doing our grid search but we\'re going to use  \nDask as as the back end right and this is a context manager where we\'re asserting that and and we can just discuss the  \nthe syntax there but it\'s not particularly important currently I\'m going to execute this now and let\'s see fantastic  \nwe\'ll see all this data transfer happening here we\'ll see our tasks happening here we can see these big batches of fit  \nand score fit so fitting fitting the models then finding how well they perform via this k-fold cross validation which is  \nreally cool and let\'s just yep we can see what\'s happening here we can see we currently have 12 processing we\'ve got  \nseven in memory and we have several more that we need to do our desk workers we can see us oh we can see our CPU usage  \nwe can see how we can see CPU usage across all the workers which is which is pretty cool seeing that distribution is is  
\nreally nice whenever some form of b swarm plot if you have enough would would be useful there or even some form of  \ncumulative distribution function or something like that not a histogram people OK you can go to my Bayesian tutorial  \nthat I\'ve taught here before to hear me rave about the the horrors of histograms so we saw that talk a minute which is  \ngreat and we split it across you know eight cores or whatever it is and now we\'ll have a look once again we get the same  \nbest performer which is which is a sanity check and that\'s pretty cool I think we have a we actually have a few minutes  \nleft so I am gonna just see if I can oh let me think yeah I will see if I can burst burst to the cloud and and and do  \nthis that will take a minute a minute or two to create the cluster again but while we\'re while we\'re doing that I\'m  \nwondering if we have any any questions or if anyone has any feedback on on this workshop I very much welcome welcome  \nthat perhaps if there are any final messages you\'d you\'d like to say James while we\'re spinning this up you can you can  \nlet me know yeah sure I just also first off wanted to say thanks everyone for attending and like bearing  \n \n\n#### 01:04:01,119::\t\t4 minutes Mark -> new paragraph \n \nbearing with us with the technical difficulties really appreciate that real quick I\'m just yeah so if you have if you  \nhave questions please post in the Q&A section while the cold cluster\'s spinning up Theodore posted in the last largest  \nexample of grid search how much performance gain did we get from using das and not just in jobs hmm that\'s a great  \nquestion and we actually didn\'t see let\'s see so it took 80 seconds ah let me get this they\'re actually not comparable  \nbecause I did the grid search over a different set of hyper parameters I did it over a larger set of hyper parameters  \nright so when I did end jobs I did it there were only it was a two by two grid of hyper parameters whereas when I 
did it  \nwith with Dask it was a one two three four five six six by three so let\'s just reason about that this one was eighteen  \nsix by three is eighteen which took eighty seconds and this one was two by two so it was four and it took 26 seconds so  \na minor gain I think with this hyper parameter search if you multiply that by by four you\'ll well 4.2 4.5 you\'ll need  \nthat would have taken maybe two minutes or something something like that so we saw some increase in efficiency not a  \ngreat deal but James maybe you can say more to this part of the reason for that is that we\'re doing it on kind of a very  \nsmall example so we won\'t necessarily see the gains in efficiency with a data set this size and with a small hyper  \nparameter suite like this is that right yeah yeah and yeah exactly and I guess also this is more of an kind of an  \nillustrative point here I guess so you\'re just using directly using in jobs with something like joblib by default we\'ll  \nuse local threads and processes on like whatever machine you happen to be running on so like in this case on Hugo\'s  \nlaptop one of the real advantages of using joblib with the das back in will actually dispatch back to to run tasks on a  \nDask cluster is that your cluster can expand beyond what local resources you have so you can run you know you can  \nbasically scale out like for instance using the coil cluster to have many many CPUs and a large amount of ram that you  \nwouldn\'t have on your locally table to run and there you\'ll see both large performance gains as well as you\'ll be able  \nto expand your the set of possible problems you can solve to larger than ram scenarios so you\'re out of out of core  \ntraining exactly and thank you Jack this was absolutely unplanned and we didn\'t plan that question but that\'s a  \nwonderful segue into me now performing exactly the same compute with the same code using the Dask as the parallel back  \nend on a on a coiled cluster which is an AWS 
cluster right so we can I\'m more currently anyway so I will execute this  \ncode and it\'s exactly the same as we did whoa OK great so we see our tasks task stream here you see once again we see  \nthe majority is being batch fit and and getting the scores out similarly we see the same result being the best I\'ll just  \nnotice that for this for this small task doing it on the cloud took 20 seconds doing it locally for me took 80 seconds  \nso that\'s a four-fold increase in performance on a very small task so imagine what that does if you can take the same  \ncode as you\'ve written  \n \n\n#### 01:08:00,240::\t\t4 minutes Mark -> new paragraph \n \nhere and burst to the cloud with with one click or however however you do it I think that that\'s incredibly powerful and  \nthat the fact that your code and what\'s happening in the back end with Dask generalizes immediately to the new setting  \nof working on a cluster I personally find very exciting and if you work with larger data sets or building larger models  \nor big hyper parameter sweeps I\'m pretty sure it\'s an exciting option for all of you also so on that note I\'d like to  \nreiterate James what James said and thanking you all so much for joining us for asking great questions and for bearing  \nwith us through some some technical technical hurdles but it made it even even funnier when when we got up and running  \nonce again I\'d love to thank Mark Christina and and the rest of the organizers for doing such a wonderful job and doing  \nsuch a great service to the data science and machine learning community and ecosystem worldwide so thank you once again  \nfor having us thank you Hugo and James I have to say like with all the technical difficulties I was actually giggling  \nbecause it was kind of funny yeah but we\'re very sorry and we thank you for your patience and sticking through it and I  \nwill be editing this video to you know make it as efficient as possible and have that available Tim 
supercard thank you  \ngreat and I\'ll just ask you if you are interested in checking out coiled go to our website if you want to check out our  \nproduct go to cloud.coil.io we started building this company in February we\'re really excited about building a new  \nproduct so if you\'re interested reach out we\'d love to chat with you about what we\'re doing and what we\'re up to and  \nit\'s wonderful to be in the same community as you all, so thanks!  \n   '"""

In [32]:
# Print T rather than echoing it: print() renders the embedded newlines as real
# line breaks, whereas a bare `T` would show the escaped repr on one line.
print(T)


'<!-- Editing Guide: The pipe (|) position in this comment is 120:                                                       | -->
### Introduction

Okay hello and welcome to Data Umbrella's webinar for October so I'm just going to go over the agenda I'm going to do a  
brief introduction then there will be the workshop by Hugo and James and you can ask questions along the way in the chat  
or actually the best place to ask questions is the Q&A and there's an option to upvote as well so yet asking the Q&A if  
you happen to post it on the chat by mistake I can also transfer it over to Q&A so that would be fine too and this  
webinar is being recorded. Briefly about me. I am a statistician and data scientist and I am the founder of Data Umbrella.  
I am on a lot of platforms as Reshama so feel free to follow me on Twitter and LinkedIn. We have a code of conduct we're  
dedicated to providing a harassment-free experience for everyone. Thank you for helping to make this a welcoming friendly 

In [154]:
# Read the formatted transcript text from the app's widget tree and work out
# how many fixed-size chunks are needed to cover it.
formatted_transcript = AC.PC.page.children[1].children[1].value  # trx text
txt_len = len(formatted_transcript)
chunksize = 1024 * 4  # 4096 characters per chunk
# Integer ceiling division: exact for any length, gives 0 for empty text, and
# does not rely on `math` happening to be reachable through the TRX module.
n_chunks = -(-txt_len // chunksize)
n_chunks

15

In [151]:
# Handle on the widget at page.children[1].children[1] -- the same node whose
# `.value` is read as the transcript text when computing n_chunks.
txa = AC.PC.page.children[1].children[1]

In [140]:
# Walk the page's widget tree down to the footer controls and read the values
# currently entered there. All positions are hard-wired to the page layout.
grid = AC.PC.page.children[0]
# NOTE(review): children[3] is treated as the footer and its first child as the
# Accordion, matching the exploratory cell further down -- confirm if the layout changes.
footer_g = grid.children[3].children[0].children[0]
v_file = footer_g.children[0].children[0].value  # 'Names' in the recorded run
# A falsy (e.g. empty) value is normalised to None.
v_entries = footer_g.children[0].children[1].children[1].value or None
print(v_file, '\n', v_entries)

Names 
 'bayesian'


In [141]:
# validate_user_list returns a pair that is unpacked into (valid, msg).
# In the recorded run msg prints as 'OK' and valid displays as ['bayesian'].
valid, msg = CTR.validate_user_list(v_entries, v_file)
print(msg)
valid

OK


['bayesian']

In [145]:
import logging

class OutputWidgetHandler(logging.Handler):
    """ Custom logging handler sending logs to an output widget.

    Each formatted record is prepended to the widget's outputs, so the most
    recent message is always listed first.
    """

    def __init__(self, *args, **kwargs):
        """ Create the handler and the Output widget it writes to.

        All arguments are forwarded unchanged to logging.Handler.__init__.
        """
        super().__init__(*args, **kwargs)
        layout = {
            'width': '100%',
            'height': '160px',
            'border': '1px solid black'
        }
        self.out = ipw.Output(layout=layout)

    def emit(self, record):
        """ Overload of logging.Handler method.

        Formats the record and inserts it at the top of the widget's outputs.
        Any failure while formatting or updating the widget is handed to
        handleError(record), as the standard library handlers do, so that a
        logging problem never propagates into the code that is logging.
        """
        try:
            formatted_record = self.format(record)
            new_output = {
                'name': 'stdout',
                'output_type': 'stream',
                'text': formatted_record+'\n'
            }
            # Prepend: newest entry first.
            self.out.outputs = (new_output, ) + self.out.outputs
        except Exception:
            self.handleError(record)

    def show_logs(self):
        """ Show the logs (renders the Output widget in the notebook) """
        display(self.out)

    def clear_logs(self):
        """ Clear the current logs """
        self.out.clear_output()


# Dedicated logger for this demo; DEBUG lets every record through.
logger = logging.getLogger('Audit_Tests')
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack a new widget handler on top of the old ones
# and each message would be emitted once per earlier run. Drop earlier widget
# handlers first. Matching is by class name because re-running the cell also
# redefines the class, which makes isinstance() miss the old instances.
for _stale in [h for h in logger.handlers
               if type(h).__name__ == 'OutputWidgetHandler']:
    logger.removeHandler(_stale)

handler = OutputWidgetHandler()
handler.setFormatter(logging.Formatter('%(asctime)s  - [%(levelname)s] %(message)s'))
logger.addHandler(handler)

handler.show_logs()

Output(layout=Layout(border='1px solid black', height='160px', width='100%'))

In [159]:
# Demo of the widget logger: start from an empty log pane, then log a
# deliberately failing operation together with its traceback.
handler.clear_logs()
logger.info('Starting program')

try:
    logger.info('About to try something dangerous...')
    1.0/0.0
except Exception:
    # logger.exception logs at ERROR level and attaches the active traceback
    # by itself, so neither the exception object nor exc_info is needed here.
    logger.exception('Oops: ')

ipywidgets.widgets.widget_string.Textarea

In [88]:
# Exploratory walk of the page layout. Every bare type(...) expression is echoed
# because the first cell sets ast_node_interactivity = 'all'.
grid = AC.PC.page.children[0]  # top-level container (GridBox in the recorded output)
type(grid)
main = grid.children[1]  # HBox in the recorded output
type(main)
main_vbx1 = main.children[2]  # VBox in the recorded output
type(main_vbx1)
# Text of the first output held by the VBox's first child, without its first
# five characters and its last character.
# NOTE(review): presumably this strips a fixed label prefix and a trailing
# newline to leave the bare file name -- confirm.
main_out_idn = main_vbx1.children[0].outputs[0]['text'][5:-1]
footer = grid.children[3]
footer_acc = footer.children[0]  # Accordion in the recorded output
type(footer_acc)

ipywidgets.widgets.widget_box.GridBox

ipywidgets.widgets.widget_box.HBox

ipywidgets.widgets.widget_box.VBox

ipywidgets.widgets.widget_selectioncontainer.Accordion

---
# Current EDIT tasks:
* Problem: cannot disable footer or its Accordion => footer loading done by ckick_btn_load()?
* Problem: reprocessing the transcript after the text-processing files are updated should not entail redoing the processing of the xml_captions.
* FEAT: Only the MODIFY page will have the option to change the time-chunking parameter (which is in the xml file); right now: not exposed.

## Solution?
* Current: TR.redo_initial_transcript() is used
* Needed: TR.YT.reprocess_text() => existing_transcript = initial_transcript + all changes saved by Editor => prior changes will be kept

### => refactor self.xml_caption_to_text


- Extract wrap function
- Remove TW from clean_text
- Add do_wrap param to clean_text

### Limitation: only the remaining lowercase text will be affected since all replacements assume lowercase text.
```
self.YT.get_initial_transcript(replace=True)
def reprocess_text(txt):
    """
    txt could be from self.txa_editarea.value or get_transcript_text(self)
    """
```    

In [18]:
# Walk the page's widget tree by child position and show each widget's type
# (every bare expression is displayed: ast_node_interactivity = 'all').
grid = AC.PC.page.children[0]  #grid header
type(grid)
main = grid.children[1]
type(main)
# In this run main.children[2] is the Output widget itself (see the types shown below).
main_out_idn = main.children[2]
type(main_out_idn)
# Text of its first captured output, minus the first 5 characters and the last one.
main_out_idn.outputs[0]['text'][5:-1]

ipywidgets.widgets.widget_box.GridBox

ipywidgets.widgets.widget_box.HBox

ipywidgets.widgets.widget_output.Output

'05-mini-max-demo-foo.md'

---

---
---
# Test: Horizontal RadioButtons

In [8]:
# A flex row layout makes the radio options sit side by side instead of stacked.
lo_radio = ipw.Layout(flex_flow='row')
av_radio2 = ipw.RadioButtons(
    options=['Audio', 'Video'],
    value='Audio',
    layout=lo_radio,
)
av_radio2

RadioButtons(layout=Layout(flex_flow='row'), options=('Audio', 'Video'), value='Audio')

---
---
# Fixed: problem with event numbering from df :: new event dict in Meta
```
    def new_event_dict(self):
        """
        Create a 'starter' event dict with event id generated
        from the readme table df.
        """
        new_dict = self.get_event_dict()

        # Update dict with defaults:
        new_dict['year'] = self.year
        new = self.df.index.argmax() + self.row_offset
        self.idn = idn_frmt(new)
        new_dict['idn'] = self.idn
        new_dict['transcriber'] = '?'
        new_dict['extra_references'] = ''
        new_dict['has_transcript'] = False
        new_dict['status'] = TrStatus.TODO.value
        new_dict['notes'] = ''
        new_dict['video_href_w'] = DEF_IMG_W #thumbnail
        
        v1 = self.insertion_idx(HDR_TPL.format(**new_dict))
        new_dict['trans_idx'] = v1
        return new_dict
``` 

---
---

# TO DO:

1. Produce the program flow chart depending on user status, e.g

---
---
# Utils for documenting the project - Networkx, Graphviz needed
---

In [None]:
# test: https://nbviewer.jupyter.org/github/xflr6/graphviz/blob/master/examples/notebook.ipynb

import os
from graphviz import Digraph, Source

In [None]:
filter_dir(Digraph)

```
Digraph?
Init signature:
Digraph(
    name=None,
    comment=None,
    filename=None,
    directory=None,
    format=None,
    engine=None,
    encoding='utf-8',
    graph_attr=None,
    node_attr=None,
    edge_attr=None,
    body=None,
    strict=False,
)
```

In [None]:
Digraph.render?

In [None]:
# Show the two environment variables that set_gv_envir() builds its paths from.
# NOTE(review): indexing os.environ raises KeyError when a variable is unset
# (typically PROGRAMFILES outside Windows, CONDA_PREFIX outside a conda env).
os.environ['PROGRAMFILES']
os.environ['CONDA_PREFIX']
#C:\Program Files\Graphviz 2.44.1\bin

In [None]:
def set_gv_envir(gv_version='2.44.1'):
    """ Ad-hoc fix to have Graphviz working on my system.

    Appends two directories to the PATH of the current process, unless they
    are already listed there (so the function can be called repeatedly):
      - <PROGRAMFILES>/Graphviz <gv_version>/bin
      - <CONDA_PREFIX>/Library/bin/python-graphviz

    Background: in case the error ExecutableNotFound occurs, the path to
    graphviz must be added to the PATH variable, e.g:
    > "FileNotFoundError: [WinError 2] The system cannot find the file specified"
    > "ExecutableNotFound:
       failed to execute ['dot', '-Tsvg'], make sure the Graphviz executables are
       on your systems' PATH"
    The above is not sufficient: the error occurred even though graphviz, dot and
    neato are all on my system path.
    Calling this function on failed `try` solved the problem. (?)

    :param gv_version (str, '2.44.1'): version suffix of the 'Graphviz <version>'
           installation folder under PROGRAMFILES.
    :return (tuple): (gviz, cnd_gv), the two directories described above.
    :raises KeyError: if PROGRAMFILES or CONDA_PREFIX is not set; both are
            resolved before PATH is touched, so PATH is never left half-updated.
    """
    gviz = os.path.join(os.environ['PROGRAMFILES'], 'Graphviz ' + gv_version, 'bin')
    cnd_gv = os.path.join(os.environ['CONDA_PREFIX'], 'Library', 'bin', 'python-graphviz') #'graphviz')

    current = os.environ.get('PATH', '')
    entries = current.split(os.pathsep) if current else []
    # Compare case-normalised entries: Windows paths are case-insensitive.
    known = {os.path.normcase(entry) for entry in entries}
    for new_dir in (gviz, cnd_gv):
        if os.path.normcase(new_dir) not in known:
            entries.append(new_dir)
            known.add(os.path.normcase(new_dir))
    os.environ['PATH'] = os.pathsep.join(entries)
    return gviz, cnd_gv

# Apply the PATH fix for this kernel session; the returned (gviz, cnd_gv)
# pair is shown as the cell output.
set_gv_envir()

In [None]:
# test: build a small three-node graph whose source is saved next to the images
gvfile = DIR_IMG / 'tbl.gv'

dot_dg = Digraph(
    comment='The Round Table',
    filename=gvfile,
    engine='dot',
)

# Nodes: short identifier plus display label
dot_dg.node('A', label='King Arthur')
dot_dg.node('B', label='Sir Bedevere the Wise')
dot_dg.node('L', label='Sir Lancelot the Brave')

# Edges: each two-character string is a tail/head pair of node identifiers
dot_dg.edges(['AB', 'AL'])
dot_dg.edge('B', 'L', constraint='false')

In [None]:
# Render the graph as PNG; view=True asks graphviz to open the result in the
# default viewer (see the graphviz `render` documentation).
dot_dg.render(format='png', view=True)

In [None]:
# Maps a prompt to its possible answers; presumably the seed of the
# user-status flow chart listed under TO DO.
dtree = {'User Type:': ['Admin', 'Transcriber'],  # label was misspelled 'Tanscriber'
        }