In [None]:
# export
from local.imports import *
from local.notebook.core import *
import nbformat,inspect

In [None]:
# default_exp notebook.export
# default_cls_lvl 3

# Converting notebooks to modules

> The functions that transform the dev notebooks in the fastai library

## Reading a notebook

### What's a notebook?

A jupyter notebook is a json file behind the scenes. We can just read it with the json module, which will return a nested dictionary of dictionaries/lists of dictionaries, but there are some small differences between reading the json and using the tools from `nbformat` so we'll use this one.

In [None]:
#export
def read_nb(fname):
    "Read the notebook in `fname` (a string or a `Path`) and return it parsed by `nbformat` as a version 4 notebook."
    # Notebooks are JSON files, hence UTF-8: don't depend on the platform default encoding
    with open(Path(fname), 'r', encoding='utf-8') as f: return nbformat.reads(f.read(), as_version=4)

`fname` can be a string or a pathlib object.

In [None]:
test_nb = read_nb('91_notebook_export.ipynb')

The root has four keys: `cells` contains the cells of the notebook, `metadata` some stuff around the version of python used to execute the notebook, `nbformat` and `nbformat_minor` the version of nbformat. 

In [None]:
test_nb.keys()

dict_keys(['cells', 'metadata', 'nbformat', 'nbformat_minor'])

In [None]:
test_nb['metadata']

{'kernelspec': {'display_name': 'Python 3',
  'language': 'python',
  'name': 'python3'},
 'language_info': {'codemirror_mode': {'name': 'ipython', 'version': 3},
  'file_extension': '.py',
  'mimetype': 'text/x-python',
  'name': 'python',
  'nbconvert_exporter': 'python',
  'pygments_lexer': 'ipython3',
  'version': '3.7.1'}}

In [None]:
f"{test_nb['nbformat']}.{test_nb['nbformat_minor']}"

'4.2'

The cells key then contains a list of cells. Each one is a new dictionary that contains entries like the type (code or markdown), the source (what is written in the cell) and the output (for code cells).

In [None]:
test_nb['cells'][0]

{'cell_type': 'code',
 'execution_count': 1,
 'metadata': {'hide_input': False},
 'outputs': [],
 'source': '# export\nfrom local.imports import *\nfrom local.notebook.core import *\nimport nbformat,inspect'}

### Finding patterns

In [None]:
# export
def check_re(cell, pat):
    "Search `pat` in the source of `cell`: `False` for a non-code cell, otherwise the match object (`None` if no match)."
    is_code = cell['cell_type'] == 'code'
    if not is_code: return False
    # The search ignores case and lets `^`/`$` anchor on every line of the cell
    flags = re.IGNORECASE | re.MULTILINE
    return re.search(pat, cell['source'], flags)

In [None]:
cell = test_nb['cells'][0].copy()
assert check_re(cell, '# export') is not None
assert check_re(cell, '# bla') is None

In [None]:
# export
def is_export(cell, default):
    """Check if `cell` is to be exported and returns the name of the module.

    Returns `default` when the cell carries a bare `#export`/`#exports` flag, the module path (dots replaced
    by the OS path separator) when the flag is followed by a module name, and `None` when there is no flag.
    """
    if check_re(cell, r'^\s*#\s*exports?\s*$'):
        if default is None:
            # No destination known: warn (showing the cell) instead of stopping the whole export in a debugger
            print(f"This cell doesn't have an export destination and was ignored:\n{cell['source']}")
        return default
    # At least one space is required before the module name, so a comment like `# exporter` isn't taken for a flag
    tst = check_re(cell, r'^\s*#\s*exports?\s+(\S+)\s*$')
    return os.path.sep.join(tst.groups()[0].split('.')) if tst else None

The cells to export are marked with an `#export` or `#exports` code, potentially with a module name where we want it exported. The default is given in a cell of the form `#default_exp bla` inside the notebook (usually at the top), though in this function, it needs to be passed (the final script will read the whole notebook to find it).

In [None]:
cell = test_nb['cells'][0].copy()
assert is_export(cell, 'export') == 'export'
cell['source'] = "# exports" 
assert is_export(cell, 'export') == 'export'
cell['source'] = "# export mod" 
assert is_export(cell, 'export') == 'mod'
cell['source'] = "# export mod.file" 
assert is_export(cell, 'export') == 'mod/file'
cell['source'] = "# expt mod.file"
assert is_export(cell, 'export') is None

In [None]:
# export
def find_default_export(cells):
    "Return the module named by the first `#default_exp` flag found in `cells` (`None` if there isn't any)."
    pat = r'^\s*#\s*default_exp\s*(\S*)\s*$'
    # Lazy on purpose: cells after the first hit are never examined
    matches = (check_re(c, pat) for c in cells)
    return next((m.groups()[0] for m in matches if m), None)

Stops at the first cell containing a `#default_exp` code and returns the value after it. Returns `None` if no cell contains that code.

In [None]:
assert find_default_export(test_nb['cells']) == 'notebook.export'
assert find_default_export(test_nb['cells'][2:]) is None

### Exporting notebooks

We're now ready to export notebooks!

In [None]:
# export
def _create_mod_file(fname, nb_path):
    "Create a module file for `fname`."
    fname.parent.mkdir(parents=True, exist_ok=True)
    with open(fname, 'w') as f: 
        f.write(f"#AUTOGENERATED! DO NOT EDIT! File to edit: dev/{nb_path.name} (unless otherwise specified).")
        f.write('\n\n__all__ = []')

In [None]:
# export
def _not_private(n):
    "`True` unless one of the dot-separated parts of `n` starts with an underscore."
    return not any(part.startswith('_') for part in n.split('.'))

def export_names(code, func_only=False):
    "Find the names of the objects, functions or classes defined in `code` that are exported."
    # Rewrite `@patch def func(x:Class, ...)` as `Class.func = ` so it is caught by the assignment regex below
    code = re.sub(r'@patch\s*def\s+([^\(\s]*)\s*\([^:]*:\s*([^,\)\s]*)\s*(?:,|\))', r'\2.\1 = ', code)
    # Only definitions starting in the first column count: `^` rules out anything indented
    found = re.findall(r'^(?:def|class)\s+([^\(\s]*)\s*(?:\(|:)', code, re.MULTILINE)
    if not func_only: found.extend(re.findall(r'^([^\(\s]*)\s*=', code, re.MULTILINE))
    return list(filter(_not_private, found))

This function only picks the zero-indented objects, functions or classes (we don't want the class methods for instance) and excludes private names (that begin with `_`). It only returns func and class names when `func_only=True`.

In [None]:
assert export_names("def my_func(x):\n  pass\nclass MyClass():") == ["my_func", "MyClass"]
#Indented funcs are ignored (funcs inside a class)
assert export_names("  def my_func(x):\n  pass\nclass MyClass():") == ["MyClass"]
#Private funcs are ignored
assert export_names("def _my_func():\n  pass\nclass MyClass():") == ["MyClass"]
#trailing spaces
assert export_names("def my_func ():\n  pass\nclass MyClass():") == ["my_func", "MyClass"]
#class without parenthesis
assert export_names("def my_func ():\n  pass\nclass MyClass:") == ["my_func", "MyClass"]
#object and funcs
assert export_names("def my_func ():\n  pass\ndefault_bla = []:") == ["my_func", "default_bla"]
assert export_names("def my_func ():\n  pass\ndefault_bla = []:", func_only=True) == ["my_func"]
#Private objects are ignored
assert export_names("def my_func ():\n  pass\n_default_bla = []:") == ["my_func"]
#Objects with dots are privates if one part is private
assert export_names("def my_func ():\n  pass\ndefault.bla = []:") == ["my_func", "default.bla"]
assert export_names("def my_func ():\n  pass\ndefault._bla = []:") == ["my_func"]
#Monkey-patches with @patch are properly renamed
assert export_names("@patch\ndef my_func(x:Class):\n  pass") == ["Class.my_func"]
assert export_names("some code\n@patch\ndef my_func(x:Class, y):\n  pass") == ["Class.my_func"]

In [None]:
# export
def extra_add(code):
    """Catch adds to `__all__` required by a cell with `_all_=`

    Returns a tuple `(names, code)`: `names` is the list of entries found in every zero-indented
    `_all_ = [...]` statement of `code` (double quotes turned into single quotes, surrounding
    whitespace removed, empty entries dropped) and `code` is the input with those statements and
    any trailing newlines removed. When there is no such statement, `code` is returned untouched.
    """
    pat = re.compile(r'^_all_\s*=\s*\[([^\]]*)\]', re.MULTILINE)
    found = pat.findall(code)
    if not found: return [],code
    names = []
    # Every `_all_` statement is removed from the code below, so collect the names of all of them
    for content in found:
        for name in content.split(','):
            name = name.strip().replace('"', "'")
            if name: names.append(name)  # `_all_ = []` or a trailing comma must not add an empty entry
    return names,pat.sub('', code).rstrip('\n')

In [None]:
assert extra_add('_all_ = ["func", "func1", "func2"]') == (["'func'", "'func1'", "'func2'"],'')
assert extra_add('_all_ = ["func",   "func1" , "func2"]') ==  (["'func'", "'func1'", "'func2'"],'')
assert extra_add("_all_ = ['func','func1', 'func2']\n") ==  (["'func'", "'func1'", "'func2'"],'')
assert extra_add('code\n\n_all_ = ["func", "func1", "func2"]') == (["'func'", "'func1'", "'func2'"],'code')

In [None]:
#export
def _add2add(fname, names, line_width=120):
    if len(names) == 0: return
    with open(fname, 'r') as f: text = f.read()
    tw = TextWrapper(width=120, initial_indent='', subsequent_indent=' '*11, break_long_words=False)
    re_all = re.search(r'__all__\s*=\s*\[([^\]]*)\]', text)
    start,end = re_all.start(),re_all.end()
    text_all = tw.wrap(f"{text[start:end-1]}{'' if text[end-2]=='[' else ', '}{', '.join(names)}]")
    with open(fname, 'w') as f: f.write(text[:start] + '\n'.join(text_all) + text[end:])

In [None]:
fname = 'test_add.txt'
with open(fname, 'w') as f: f.write("Bla\n__all__ = [my_file, MyClas]\nBli")
_add2add(fname, ['new_function'])
with open(fname, 'r') as f: 
    assert f.read() == "Bla\n__all__ = [my_file, MyClas, new_function]\nBli"
_add2add(fname, [f'new_function{i}' for i in range(10)])
with open(fname, 'r') as f: 
    assert f.read() == """Bla
__all__ = [my_file, MyClas, new_function, new_function0, new_function1, new_function2, new_function3, new_function4,
           new_function5, new_function6, new_function7, new_function8, new_function9]
Bli"""
os.remove(fname)

In [None]:
# export
def _relative_import(name, fname):
    mods = name.split('.')
    splits = str(fname).split(os.path.sep)
    if mods[0] not in splits: return name
    splits = splits[splits.index(mods[0]):]
    while splits[0] == mods[0]: splits,mods = splits[1:],mods[1:]
    return '.' * (len(splits)) + '.'.join(mods)

In [None]:
assert _relative_import('local.core', Path('local')/'data.py') == '.core'
assert _relative_import('local.core', Path('local')/'vision'/'data.py') == '..core'
assert _relative_import('local.vision.transform', Path('local')/'vision'/'data.py') == '.transform'
assert _relative_import('local.notebook.core', Path('local')/'data'/'external.py') == '..notebook.core'

In [None]:
# export
def _deal_import(code_lines, fname):
    """Return `code_lines` with the imports from the `local` package made relative to the module file `fname`.

    Only lines starting (without indentation) with `from local.xxx import ...` are rewritten, whatever
    follows `import` (a single name, `*`, or several comma-separated names) being kept as is. Every
    line also gets the single-underscore spelling of "file" replaced by the double-underscore one.
    """
    # The dot is escaped so that only submodules of `local` match, not any package whose name starts with it
    pat = re.compile(r'from (local\.\S*) import (.*)$')
    lines = []
    for line in code_lines:
        # The pattern is built by concatenation: written in full, this very line would be rewritten on export
        line = re.sub('_'+'file_', '__'+'file__', line)
        match = re.match(pat, line)
        if match: lines.append(f"from {_relative_import(match.group(1), fname)} import {match.group(2)}")
        else: lines.append(line)
    return lines

In [None]:
#hide
#Tricking jupyter notebook to have a __file__ attribute. All _file_ will be replaced by __file__
_file_ = Path('local').absolute()/'notebook'/'export.py'

In [None]:
#export
def _get_index():
    "Load the index stored in `index.txt` next to this module and return it as a dict (empty if there is no such file)."
    index_path = Path(_file_).parent/'index.txt'
    if not index_path.exists(): return {}
    # Context manager so the file handle is closed as soon as the index is read
    with open(index_path, 'r') as f: return json.load(f)

def _save_index(index):
    "Write `index` as indented JSON in `index.txt` next to this module, replacing any previous content."
    # Context manager so the content is flushed and the handle closed before returning
    with open(Path(_file_).parent/'index.txt', 'w') as f: json.dump(index, f, indent=2)
def _reset_index():
    "Delete `index.txt` next to this module, if it exists."
    index_path = Path(_file_).parent/'index.txt'
    if index_path.exists(): os.remove(index_path)

In [None]:
#hide
ind,ind_bak = Path(_file_).parent/'index.txt',Path(_file_).parent/'index.bak'
# Put any existing index aside so the checks start without one
if ind.exists(): shutil.move(ind, ind_bak)
try:
    assert _get_index() == {}
    _save_index({'foo':'bar'})
    assert _get_index() == {'foo':'bar'}
finally:
    # Always remove the dummy index (it used to be left behind when there was no backup) and restore the real one
    _reset_index()
    if ind_bak.exists(): shutil.move(ind_bak, ind)

In [None]:
#export 
def _notebook2script(fname):
    """Finds cells starting with `#export` and puts them into a new module

    The module given by the notebook's `#default_exp` flag (if any) is created afresh under `local/` in the
    current directory; the code of each exported cell is appended to its destination module, the names it
    defines are added to that module's `__all__` and recorded in the index with the name of the notebook.
    """
    fname = Path(fname)
    nb = read_nb(fname)
    default = find_default_export(nb['cells'])
    if default is not None:
        default = os.path.sep.join(default.split('.'))
        _create_mod_file(Path.cwd()/'local'/f'{default}.py', fname)
    index = _get_index()
    # Work out the destination of every cell first, then keep the ones that have one
    targets = [is_export(cell, default) for cell in nb['cells']]
    exported = [(cell,dest) for cell,dest in zip(nb['cells'], targets) if dest is not None]
    for cell,dest in exported:
        fname_out = Path.cwd()/'local'/f'{dest}.py'
        if dest == default: origin = ''
        else: origin = f'#Comes from {fname.name}.\n'
        # The first line of the cell is dropped (expected to be the export flag)
        body = _deal_import(cell['source'].split('\n')[1:], fname_out)
        code = '\n\n' + origin + '\n'.join(body)
        names = export_names(code)
        extra,code = extra_add(code)
        # Dotted names (e.g. patched methods) are indexed but can't go in `__all__`
        public = [f"'{n}'" for n in names if '.' not in n]
        _add2add(fname_out, public + extra)
        for n in names: index[n] = fname.name
        # remove trailing spaces
        code = re.sub(r' +$', '', code, flags=re.MULTILINE)
        with open(fname_out, 'a') as f: f.write(code)
    _save_index(index)
    print(f"Converted {fname}.")

In [None]:
_notebook2script('92_notebook_showdoc.ipynb')

Converted 92_notebook_showdoc.ipynb.


In [None]:
#export 
def _get_sorted_files(all_fs: Union[bool,str], up_to=None):
    "Return the list of files corresponding to `g` in the current dir."
    if (all_fs==True): ret = glob.glob('*.ipynb') # Checks both that is bool type and that is True
    else: ret = glob.glob(all_fs) if isinstance(g,str) else []
    if len(ret)==0: print('WARNING: No files found')
    ret = [f for f in ret if not f.startswith('_')]
    if up_to is not None: ret = [f for f in ret if str(f)<=str(up_to)]
    return sorted(ret)

In [None]:
#export 
def notebook2script(fname=None, all_fs=None, up_to=None):
    """Convert `fname` or all the notebook satisfying `all_fs`.

    `all_fs` is `True` for every notebook of the current directory, or a glob expression; `up_to`
    restricts the conversion to the notebooks whose name is not greater than it, and implies
    `all_fs=True` when `all_fs` isn't given. The index is reset before converting several notebooks.
    """
    # Resolve `up_to` first: passed on its own it is a valid call and must not fail the check below
    if (all_fs is None) and (up_to is not None): all_fs=True
    assert fname or all_fs, "Pass `fname`, `all_fs` or `up_to`"
    if all_fs: _reset_index()
    fnames = _get_sorted_files(all_fs, up_to=up_to) if all_fs else [fname]
    for f in fnames: _notebook2script(f)

Finds cells starting with `#export` and puts them into the appropriate module.
* `fname`: the filename of one notebook to convert
* `all_fs`: `True` if you want to convert all notebook files in the folder or a glob expression
* `up_to`: converts all notebooks respecting the previous arg up to a certain number

Examples of use in console:
```
notebook2script                                 # Parse all files
notebook2script --fname 00_export.ipynb         # Parse 00_export.ipynb
notebook2script --all_fs=nb*                    # Parse all files starting with nb*
notebook2script --up_to=10                      # Parse all files with (name<='10')
notebook2script --all_fs=*_*.ipynb --up_to=10   # Parse all files with an '_' and (name<='10')
```

### Finding the way back to notebooks

In [None]:
# export
def get_name(obj):
    "Get the name of `obj`: its `__name__`, else a truthy `_name`, else the last dotted part of its (or its `__origin__`'s) string form."
    if hasattr(obj, '__name__'): return obj.__name__
    if getattr(obj, '_name', False): return obj._name
    # No usable name attribute: fall back on a string representation, `__origin__` taking precedence (for types)
    ref = obj.__origin__ if hasattr(obj, '__origin__') else obj
    return str(ref).split('.')[-1]

In [None]:
from local.data.pipeline import *
assert get_name(Pipeline) == 'Pipeline'
assert get_name(Pipeline.composed) == 'composed'
assert get_name(Union[Tensor, float]) == 'Union'

In [None]:
# export
def qual_name(obj):
    "Get the qualified name of `obj`: its `__qualname__` if it has one, `owner.function` for a bound method, else its name."
    if hasattr(obj,'__qualname__'): return obj.__qualname__
    # A bound method exposes its function as `__func__` (the previous code referenced an undefined name here)
    if inspect.ismethod(obj):       return f"{get_name(obj.__self__)}.{get_name(obj.__func__)}"
    return get_name(obj)

In [None]:
assert qual_name(Pipeline) == 'Pipeline'
assert qual_name(Pipeline.composed) == 'Pipeline.composed'

In [None]:
# export
def source_nb(func, is_name=None, return_all=False):
    """Return the name of the notebook where `func` was defined

    `func` is looked up in the index by name (itself if `is_name` is true or `func` is a string, its
    qualified name otherwise), the last dotted component being dropped after each miss. Returns `None`
    if nothing is found and, with `return_all=True`, a tuple of the name that was found and the notebook.
    """
    is_name = is_name or isinstance(func, str)
    index = _get_index()
    name = func if is_name else qual_name(func)
    while len(name) > 0:
        if name in index:
            nb_name = index[name]
            return (name,nb_name) if return_all else nb_name
        # Not indexed under that name: try the enclosing object
        name = name.rpartition('.')[0]
    return None

You can either pass an object or its name (by default `is_name` will look if `func` is a string or not, but you can override if there is some inconsistent behavior). 

If passed a method of a class, the function will return the notebook in which the longest matching part of its name was defined, in case there is a monkey-patch that defines `class.method` in a different notebook than `class`. If `return_all=True`, the function will return a tuple with the name by which the function was found and the notebook.

In [None]:
from local.data.pipeline import Transform
from local.test import test_fail
from local.core import opt_call

assert source_nb(test_fail) == '00_test.ipynb'
assert source_nb(Transform) == '02_data_pipeline.ipynb'
assert source_nb(Transform.create) == '02_data_pipeline.ipynb'
#opt_call is in the core module but defined in 02
assert source_nb(opt_call) == '02_data_pipeline.ipynb'
assert source_nb(Tensor) is None
#Added through a monkey-patch
assert source_nb('Path.ls') == '01_core.ipynb'

#Test with name
assert source_nb('Pipeline') == '02_data_pipeline.ipynb'
assert source_nb('Pipeline.decode') == '02_data_pipeline.ipynb'

#Test return_all
assert source_nb(Pipeline, return_all=True) == ('Pipeline','02_data_pipeline.ipynb')
assert source_nb(Pipeline.decode, return_all=True) == ('Pipeline','02_data_pipeline.ipynb')

In [None]:
#hide
notebook2script(all_fs=True)

Converted 00_test.ipynb.
Converted 01_core.ipynb.
Converted 02_data_pipeline.ipynb.
Converted 03_data_external.ipynb.
Converted 04_data_core.ipynb.
Converted 05_data_source.ipynb.
Converted 06_vision_core.ipynb.
Converted 07_pets_tutorial.ipynb.
Converted 08_augmentation.ipynb.
Converted 10_layers.ipynb.
Converted 11_optimizer.ipynb.
Converted 12_learner.ipynb.
Converted 90_notebook_core.ipynb.
Converted 91_notebook_export.ipynb.
Converted 92_notebook_showdoc.ipynb.
Converted 93_notebook_export2html.ipynb.
Converted 94_index.ipynb.
