# Parsing Jupyter Notebooks

In [1]:
import nbformat
import io

# Reading Notebooks

In [107]:
def read_nb(nb):
    """Read the notebook file at path ``nb`` and return the parsed notebook.

    The file is opened as UTF-8 text and parsed with ``nbformat.read``,
    passing ``nbformat.NO_CONVERT`` as the version argument.
    """
    with io.open(nb, 'r', encoding='utf8') as handle:
        return nbformat.read(handle, nbformat.NO_CONVERT)

def write_nb(nb, fn):
    """Write notebook ``nb`` to the file ``fn`` as UTF-8 text.

    ``.ipynb`` is appended to ``fn`` when it does not already end with it.
    The notebook is serialized with ``nbformat.write``, passing
    ``nbformat.NO_CONVERT`` as the version argument.
    """
    target = fn if fn.endswith('.ipynb') else fn + '.ipynb'
    with io.open(target, 'w', encoding='utf8') as handle:
        nbformat.write(nb, handle, nbformat.NO_CONVERT)

def dump_nb(nb, cells=5, lines=5):
    """Print a short preview of the leading cells of a notebook.

    Parameters:
    * nb: object exposing a ``cells`` sequence whose items can be indexed
      with ``'cell_type'`` and ``'source'``.
    * cells: maximum number of cells to preview. Notebooks with fewer cells
      are previewed in full instead of raising IndexError.
    * lines: maximum number of source lines shown per cell; longer sources
      are truncated and followed by a ``...`` line.

    Returns None; everything is written to stdout.
    """
    # Slicing (rather than indexing 0..cells-1) tolerates short notebooks;
    # max() keeps a negative count meaning "show nothing".
    for cell in nb.cells[:max(cells, 0)]:
        print("====== " + cell['cell_type'] + " ======")
        src = cell['source'].splitlines()
        if len(src) > lines:
            print('\n'.join(src[:lines]))
            print("...")
        else:
            print(cell['source'])

In [48]:
#source_nb = 'atoms/visualization/choropleth_classification.ipynb'
source_nb = 'atoms/foundations/Functions.ipynb'

inb = read_nb(source_nb)

In [16]:
type(inb)

nbformat.notebooknode.NotebookNode

In [17]:
inb.keys()

[u'nbformat_minor', u'cells', u'nbformat', u'metadata']

In [18]:
inb.metadata

{u'anaconda-cloud': {},
 u'kernelspec': {u'display_name': u'Python [Root]',
  u'language': u'python',
  u'name': u'Python [Root]'},
 u'language_info': {u'codemirror_mode': {u'name': u'ipython', u'version': 2},
  u'file_extension': u'.py',
  u'mimetype': u'text/x-python',
  u'name': u'python',
  u'nbconvert_exporter': u'python',
  u'pygments_lexer': u'ipython2',
  u'version': u'2.7.12'}}

In [19]:
cells = inb['cells']

In [20]:
type(cells)

list

In [21]:
len(cells)

45

In [22]:
type(cells)

list

In [95]:
dump_nb(inb)

# Notebook-8: Introduction to Functions
### Lesson Content 

- Function Anatomy 101
    - Function definiton & call
    - Arguments
...
## Functions 101

We've already met and used some functions, especially when we dealt with lists and dictionaries:
myList = [1,"two", False, 9.99]
print len(myList) # A function
print myList.index("two") # A different function!
print range(len(myList)) # Results of one function passed to another function!!!
That last one bears a closer look! To make it more clear we could rewrite it:
```python
range(
    len(myList)
)
...


## Let's replace the cells in the in-memory notebook to create a new one

In [26]:
# keep every other cell: the odd-indexed ones (2nd, 4th, ...)
new_cells = cells[1::2]

In [27]:
len(new_cells)

22

In [28]:
inb['cells'] = new_cells

In [29]:
# Reuse the shared helper instead of repeating the open/write boilerplate;
# the name already ends in '.ipynb', so the helper writes to the same file.
write_nb(inb, 'smaller.ipynb')

In [30]:
snb = read_nb('smaller.ipynb')

In [31]:
len(snb.cells)

22

## Notebook Cells

In [32]:
c0 = snb.cells[0]

In [33]:
type(c0)

nbformat.notebooknode.NotebookNode

In [34]:
c0.keys()

[u'source', u'cell_type', u'metadata']

In [35]:
c0['cell_type']

u'markdown'

In [36]:
c0['source']

u'### Lesson Content \n\n- Function Anatomy 101\n    - Function definiton & call\n    - Arguments\n    - Return statement\n- Function calling!\n    - Assign a function to a variable\n    - Function as a parameter to another function\n\n\nWelcome to the eighth Code Camp notebook! In this lesson we\'ll cover *functions* in Python, a concept that you\'ve already encountered but to which you\'ve not yet been formally introduced. Now we\'re going to dig into this a little bit more because writing functions is where lazy programmers become good programmers.\n\nIn other words, as we saw with the concept of *iteration*, programmers are lazy and they tend want to avoid doing boring tasks over and over again. The idea is to avoid "wasting time re-inventing the wheel" and programmers have abbreviated this idea to the acronym **D.R.Y.** (Do not Repeat Yourself): if you are doing something more than once or twice, ask yourself if there\'s a way to encapsulate what you are doing in a function: you w

In [37]:
c0['metadata']

{}

## Types of cells in this notebook

In [38]:
from collections import defaultdict
def get_structure(cells):
    """Group cell positions by cell type.

    Walks ``cells`` in order and returns a ``defaultdict(list)`` mapping each
    ``cell_type`` value to the list of positions (0-based, ascending) at
    which cells of that type occur.
    """
    positions_by_type = defaultdict(list)
    position = 0
    for cell in cells:
        positions_by_type[cell.cell_type].append(position)
        position += 1
    return positions_by_type
            

In [39]:
cell_types = get_structure(snb.cells)

In [40]:
cell_types.keys()

[u'code', u'markdown']

In [41]:
# Use a dedicated loop name: binding `cells` here would overwrite the
# notebook-level `cells` list defined earlier and break re-running cells
# that depend on it.
for ct, idxs in cell_types.items():
    print('Cell Type: %s\t %d cells'% (ct, len(idxs)))

Cell Type: code	 10 cells
Cell Type: markdown	 12 cells


In [42]:
code_cell_idx = cell_types['code'][0]
code_cell_idx

1

In [43]:
snb.cells[code_cell_idx]

{u'cell_type': u'code',
 u'execution_count': None,
 u'metadata': {u'collapsed': False},
 u'outputs': [],
 u'source': u'myList = [1,"two", False, 9.99]\nprint len(myList) # A function\nprint myList.index("two") # A different function!\nprint range(len(myList)) # Results of one function passed to another function!!!'}

In [44]:
mkd_cell_idx = cell_types['markdown'][0]
mkd_cell_idx

0

In [45]:
snb.cells[mkd_cell_idx]

{u'cell_type': u'markdown',
 u'metadata': {},
 u'source': u'### Lesson Content \n\n- Function Anatomy 101\n    - Function definiton & call\n    - Arguments\n    - Return statement\n- Function calling!\n    - Assign a function to a variable\n    - Function as a parameter to another function\n\n\nWelcome to the eighth Code Camp notebook! In this lesson we\'ll cover *functions* in Python, a concept that you\'ve already encountered but to which you\'ve not yet been formally introduced. Now we\'re going to dig into this a little bit more because writing functions is where lazy programmers become good programmers.\n\nIn other words, as we saw with the concept of *iteration*, programmers are lazy and they tend want to avoid doing boring tasks over and over again. The idea is to avoid "wasting time re-inventing the wheel" and programmers have abbreviated this idea to the acronym **D.R.Y.** (Do not Repeat Yourself): if you are doing something more than once or twice, ask yourself if there\'s a 

## Turning Output Cells OFF

In [46]:
def remove_outputs(nb):
    """Reset ``outputs`` to an empty list on every code cell of ``nb``.

    Cells whose ``cell_type`` is not ``'code'`` are left untouched. The
    notebook is modified in place and nothing is returned.
    """
    code_cells = (c for c in nb.cells if c.cell_type == 'code')
    for code_cell in code_cells:
        code_cell.outputs = []

def clear_notebook(old_ipynb, new_ipynb):
    """Copy a notebook to a new file with all code-cell outputs removed.

    Parameters:
    * old_ipynb: path of the notebook to read.
    * new_ipynb: path the cleared notebook is written to (used as given).

    Both files are handled as UTF-8 text. The read side previously relied on
    the platform default encoding, unlike read_nb and the write side here.
    """
    with io.open(old_ipynb, 'r', encoding='utf8') as f:
        nb = nbformat.read(f, nbformat.NO_CONVERT)

    remove_outputs(nb)

    with io.open(new_ipynb, 'w', encoding='utf8') as f:
        nbformat.write(nb, f, nbformat.NO_CONVERT)

source_nb = 'atoms/visualization/choropleth_classification.ipynb'

new_nb = 'nout.ipynb'
clear_notebook(source_nb, new_nb)

## Importing by Type

```
@include {
    'resource' = 'http://geopyter.org/atoms/fundamentals/lists.ipynb',
    'select' = 'h1.Understanding Lists'
}
```

In [185]:
import re
def transHeader(matchobj):
    """Turn a matched header selector into its '#' form.

    Expects a match object with two groups: group 1 is an optional prefix
    that is kept verbatim, group 2 is a string of digits giving the number
    of '#' characters to emit (e.g. groups ('-', '3') -> '-###').
    """
    prefix, level = matchobj.group(1, 2)
    return prefix + '#' * int(level)

def parseSelect(s):
    """Parse one select statement and print each selector as a header line.

    ``s`` is a sequence of ``h<N>.<title>`` selectors, each optionally
    prefixed with ``-`` (e.g. ``'h2.Layout of a Function h3.Function
    Definition'``). For every selector one line is printed: the selector
    rewritten as ``N`` '#' characters (keeping a leading ``-``), a space,
    and the title text that follows it.

    Returns None; output goes to stdout.
    """
    # Split into [leading text, selector, title, selector, title, ...]; the
    # capture group keeps the selectors in the result. Matching is
    # case-insensitive so it agrees with the substitution below.
    parts = re.split(r'\s*(-?h\d+)\.', s, flags=re.IGNORECASE)
    for i in range(1, len(parts), 2):
        # Rewrite only the selector token. Applying the substitution to the
        # title as well would corrupt titles that happen to contain
        # 'h<digit>' (e.g. 'Graph3').
        marker = re.sub(
            r'(-?)h(\d+)',
            lambda m: m.group(1) + '#' * int(m.group(2)),
            parts[i],
            flags=re.IGNORECASE,
        )
        print(' '.join([marker, parts[i + 1]]))
    
def includeContent(nb, select=None, ctype=None, clear=True):
    """Import content from another notebook.

    Parameters:
    * nb: path of the notebook to read (passed to read_nb).
    * select: specification of content within notebook to include/exclude.
      The string is split on ';' and each statement is handed to
      parseSelect. At present the selection is only parsed and printed; it
      does not filter any cells yet.
    * ctype: code/markdown; when given, only cells whose cell_type equals
      this value are kept and all others are dropped.
    * clear: boolean; clear outputs while importing? Only the literal True
      triggers clearing.

    Returns the notebook object after the requested filtering / clearing.

    Not implemented yet: selecting a particular version of a notebook
    (via Git/GitHub). Other params?
    """

    # Read the notebook into a composition
    cnb = read_nb(nb)

    # If we only want to *keep* the specified
    # cell type and delete all others...
    if ctype is not None:
        print("Ctype: " + str(ctype))
        # Filter in a single pass. Deleting by precomputed index, one cell
        # type at a time, goes wrong as soon as two or more types have to be
        # removed: the first round of deletions shifts the positions recorded
        # for the next type.
        cnb.cells[:] = [c for c in cnb.cells if c.cell_type == ctype]

    # Remove any code output from the included
    # notebook as part of the process
    if clear is True:
        print("Clear: " + str(clear))
        remove_outputs(cnb)

    if select is not None:
        # Split on semi-colons in the select statement
        # and break each statement into sub-selects
        for s in select.split(";"):
            parseSelect(s)

        # TODO: use the parsed selections to include/exclude cells of cnb.
        # The data structure for the parsed patterns still needs some
        # thought (a dictionary of dictionaries was the working idea).

    return cnb

foo = includeContent('atoms/foundations/Functions.ipynb', select='h2.Layout of a Function h3.Function Definition')
write_nb(foo, 'test')

Clear: True
## Layout of a Function
### Function Definition
