# Parsing Jupyter Notebooks

In [1]:
import nbformat
import io

# Reading Notebooks

### Metadata Params

#### Required

* author = set[ author1, author2, ... ]
* level = pick1: beginner, novice, intermediate, advanced
* duration = 45 (implicit: minutes)

#### Derived
* libraries = (compiled from cell)
* git-info = dict{version; source; sha1; last-modified-date}

In [263]:
from git import Repo

def gitter(path='.'):
    """
    Try to collect GitHub information to use in tracking 
    authorship contributions and allow specification of
    particular versions of notebooks.
    
    Parameters
    ==========
    path: String
        The path to a GitHub repository. Defaults to '.'
    
    Returns
    =======
    rp: dict
        A dictionary containing relevant git metadata
    """
    repo = Repo(path)
    
    rp = {}
    
    rp['active_branch'] = str(repo.active_branch)
    
    hc = repo.head.commit
    rp['author.name'] = hc.author.name
    rp['authored_date'] = datetime.datetime.fromtimestamp(hc.authored_date).strftime('%y-%m-%d %H:%M:%S')
    rp['committer.name'] = hc.committer.name
    rp['committed_date'] = datetime.datetime.fromtimestamp(hc.committed_date).strftime('%y-%m-%d %H:%M:%S')
    rp['sha'] = hc.hexsha
    
    return rp

In [264]:
print(gitter())

{'author.name': u'jreades', 'committer.name': u'jreades', 'sha': u'e3386a990462c6c79c81f6ae108824e0770b7e17', 'committed_date': '17-02-22 11:03:01', 'active_branch': 'master', 'authored_date': '17-02-22 11:03:01'}


In [None]:
def read_nb(nb, ext=True):
    """
    Read a notebook file and return a notebook object.
    
    Parameters
    ==========
    nb: String
        Path to the notebook file; if the path does not end
        in '.ipynb' then this will be appended unless you
        override this by setting the 'ext' to False.
    ext: boolean
        Defaults to True, meaning that the '.ipynb'
        extension will be automatically added. If you do not
        want this behaviour for some reason then set ext to False.
    
    Returns
    =======
    An object of class nbformat.notebooknode.NotebookNode
    """
    
    # Append file extension if missing and ext is True
    if not nb.endswith('.ipynb') and ext is True:
        nb += '.ipynb'
    
    # Read-only in UTF-8, note NO_CONVERT.
    with io.open(nb, 'r', encoding='utf8') as f:
        nb = nbformat.read(f, nbformat.NO_CONVERT)
    
    return nb

def write_nb(nb, fn):
    """
    Write a notebook to the path specified.
    
    Parameters
    ==========
    nb: nbformat.notebooknode.NotebookNode
        A notebook object to write to disk.
    fn: String
        Path to which you want the notebook written. _Note:_ 
        for simplicity's sake this will automatically append 
        '.ipynb' to the filename; however we recommend that 
        you not get lazy and rely on this feature since it may
        go away in the future.
    
    Returns
    =======
    Void.
    """
    
    # Append file extension
    if not fn.endswith('.ipynb'):
        fn += '.ipynb'
    
    # Write raw notebook content
    with io.open(fn, 'w', encoding='utf8') as f:
        nbformat.write(nb, f, nbformat.NO_CONVERT)

from collections import defaultdict
def get_nb_structure(nb):
    cell_types = defaultdict(list)
    for i, cell in enumerate(nb['cells']):
        cell_types[cell.cell_type].append(i)
    return cell_types

def dump_nb(nb, cells=5, lines=5):
    """
    Dump content of a notebook to STDOUT to aid in debugging.
    
    Parameters
    ==========
    nb: nbformat.notebooknode.NotebookNode
        A notebook object from which to dump content.
    cells: int
        Select an arbitrary number of cells to output. Defaults to 5.
    lines: int
        Select an arbitrary number of lines from each cell to output. Defaults to 5.
    
    Returns
    =======
    Void.
    """
    
    # For the cell-range specified
    for c in xrange(0, cells):
        
        # Check we still have cells to read
        if c < len(nb.cells):
            
            # And dump the contents to STDOUT
            print("====== " + nb.cells[c]['cell_type'] + " ======")
            src = nb.cells[c]['source'].splitlines()
            if len(src) > lines:
                print('\n'.join(src[0:lines]))
                print("...")
            else:
                print(nb.cells[c]['source'])

def write_metadata(nb, nm, val, namespace=unicode('geopyter')):
    """
    Add or append metadata values to the geopyter parameter.
    
    Parameters
    ==========
    nb: nbformat.notebooknode.NotebookNode
        A notebook object to which to add Geopyter metadata.
    nm: String
        The name of the key within the Geopyter dictionary that we want to update.
    val: String, List, Dictionary
        The value to associate with the key.
    
    Returns
    =======
    Void.
    """
    
    # Check for the namespace in the notebook metadata
    if not namespace in nb.metadata:
        nb.metadata[namespace] = {}
    
    # And write it
    nb.metadata[namespace][nm] = val

def get_metadata(nb, nm, namespace=unicode('geopyter')):
    """
    Retrieve metadata values from the geopyter parameter.
    
    Parameters
    ==========
    nb: nbformat.notebooknode.NotebookNode
        A notebook object to which to add Geopyter metadata.
    nm: String
        The name of the key within the Geopyter dictionary that we want to retrieve.
    
    Returns
    =======
    Void.
    """
    
    # Check for the namespace in the notebook metadata
    if not nb.metadata.has_key(namespace):
        nb.metadata[namespace] = {}
    
    # And write it
    nb.metadata[namespace][nm] = val

In [244]:
import re
import importlib
def find_libraries(nb):
    """
    Utility function to find libraries imported by notebooks 
    and assemble them into a group for reporting and testing
    purposes.
    
    Parameters
    ==========
    nb: nbformat.notebooknode.NotebookNode
        A notebook object to search for import statements
        
    Returns
    =======
    libs: Set
        A set containing the libraries imported by the notebook
    """
    
    # Find and classify the cells by type [code, markdown]
    cell_types = get_nb_structure(nb)
    
    libs  = set()
    vlibs = {}
    
    # Iterate over the code cell-types
    for c in cell_types['code']:
        try:
            #print("-" * 25)
            #print(nb.cells[c]['source'])
            
            # Convert the code into a block of lines
            block = nb.cells[c]['source'].splitlines()
            # Loop over the lines looking for import-type statements
            for l in block: 
                m = re.match("(?:from|import) (\S+)", l)
                if m:
                    libs.add(m.group(1))
        except IndexError: #Catch index error (not sure where this comes from)
            pass
    
    # Try to get the versions in use on the machine
    for l in libs: 
        l = l.split('.')[0]
        #print("Checking version of " + l)
        mod = importlib.import_module(l)
        ver = None
        try:
            ver = mod.__version__
        except AttributeError:
            try: 
                ver = mod.version
            except AttributeError:
                print("Unable to determine version for: " + l)
                print("Currently we check <module>.__version__ and <moduled>.version")
                pass
        vlibs[l] = ver
    return vlibs

In [234]:
#source_nb = 'atoms/foundations/Dictionaries.ipynb'
source_nb = 'atoms/visualization/choropleth_classification.ipynb'
inb = read_nb(source_nb)

In [4]:
type(inb)

nbformat.notebooknode.NotebookNode

In [5]:
inb.keys()

[u'nbformat_minor', u'cells', u'nbformat', u'metadata']

In [158]:
print(gitter())

{'author.name': u'Jon Reades', 'committer.name': u'Jon Reades', 'sha': u'92489294558bb26a6b4571b66bf86fda0c4b497b', 'committed_date': 1487680868, 'active_branch': 'master', 'authored_date': datetime.datetime(2017, 2, 21, 12, 41, 8)}


In [245]:
libs = find_libraries(inb)
print(libs)

{u'pysal': '1.13.0', u'scipy': '0.17.1', u'numpy': '1.11.1', u'seaborn': '0.7.1', u'sklearn': '0.17.1'}


In [248]:
write_metadata(inb, unicode('author'), ['J. Reades', 'S. Rey'])
write_metadata(inb, unicode('libraries'), find_libraries(inb))
write_metadata(inb, unicode('git'), gitter())

In [249]:
inb.metadata

{u'anaconda-cloud': {},
 u'geopyter': {u'author': ['J. Reades', 'S. Rey'],
  u'git': {'active_branch': 'master',
   'author.name': u'jreades',
   'authored_date': datetime.datetime(2017, 2, 22, 11, 3, 1),
   'committed_date': 1487761381,
   'committer.name': u'jreades',
   'sha': u'e3386a990462c6c79c81f6ae108824e0770b7e17'},
  u'libraries': {u'numpy': '1.11.1',
   u'pysal': '1.13.0',
   u'scipy': '0.17.1',
   u'seaborn': '0.7.1',
   u'sklearn': '0.17.1'}},
 u'kernelspec': {u'display_name': u'Python [Root]',
  u'language': u'python',
  u'name': u'Python [Root]'},
 u'language_info': {u'codemirror_mode': {u'name': u'ipython', u'version': 3},
  u'file_extension': u'.py',
  u'mimetype': u'text/x-python',
  u'name': u'python',
  u'nbconvert_exporter': u'python',
  u'pygments_lexer': u'ipython3',
  u'version': u'3.5.2'}}

In [67]:
write_nb(inb, 'test-metadata.ipynb')

In [166]:
dump_nb(inb, cells=2)

# Notebook-6: Dictionaries
### Lesson Content 

Welcome back to the fifth Code Camp notebook! In this lesson we'll contiune our exploration of more advanced data structures. Last time we took a peek at a way to represent ordered collections of items via **lists**.

This time we'll use **dictionaries** to create collections of unordered items (this is just an easy distinction - there's much more to it - but it's a good way to start wrapping your head around the subject).
...


## Notebook Cells

In [18]:
c0 = snb.cells[0]

In [19]:
type(c0)

nbformat.notebooknode.NotebookNode

In [20]:
c0.keys()

[u'source', u'cell_type', u'metadata']

In [21]:
c0['cell_type']

u'markdown'

In [22]:
c0['source']

u'## Introduction\n\n* what is classification\n* role in choropleth mapping\n* explore classification using PySAL'

In [23]:
c0['metadata']

{u'collapsed': True}

## Types of cells in  this notebook

In [24]:
from collections import defaultdict
def get_structure(cells):
    cell_types = defaultdict(list)
    for i, cell in enumerate(cells):
        cell_types[cell.cell_type].append(i)
    return cell_types
            

In [25]:
cell_types = get_structure(snb.cells)

In [26]:
cell_types.keys()

[u'code', u'markdown']

In [27]:
for ct, cells in cell_types.items():
    print('Cell Type: %s\t %d cells'% (ct, len(cells)))

Cell Type: code	 38 cells
Cell Type: markdown	 11 cells


In [28]:
code_cell_idx = cell_types['code'][0]
code_cell_idx

2

In [29]:
snb.cells[code_cell_idx]

{u'cell_type': u'code',
 u'execution_count': 3,
 u'metadata': {u'collapsed': False},
 u'outputs': [{u'data': {u'text/plain': u'{\'description\': \'Mexican states regional income 1940-2000\',\n \'explanation\': [\'Data used in   Rey, S.J. and M.L.  Sastre Gutierrez. (2010) "Interregional inequality\',\n  \'dynamics in Mexico." Spatial Economic Analysis, 5: 277-298\',\n  \'* mexico.csv: attribute data\',\n  \'* mexico.gal: spatial weights in GAL format\',\n  \'Polygon data, n=32, k=13\'],\n \'name\': \'mexico\'}'},
   u'execution_count': 3,
   u'metadata': {},
   u'output_type': u'execute_result'}],
 u'source': u"ps.examples.explain('mexico')"}

In [30]:
mkd_cell_idx = cell_types['markdown'][0]
mkd_cell_idx

0

In [31]:
snb.cells[mkd_cell_idx]

{u'cell_type': u'markdown',
 u'metadata': {u'collapsed': True},
 u'source': u'## Introduction\n\n* what is classification\n* role in choropleth mapping\n* explore classification using PySAL'}

## Turning Output Cells OFF

In [32]:
def remove_outputs(nb):
    """Set output attribute of all code cells to be empty"""
    for cell in nb.cells:
        if cell.cell_type == 'code':
            cell.outputs = []

def clear_notebook(old_ipynb, new_ipynb):
    with io.open(old_ipynb, 'r') as f:
        nb = nbformat.read(f, nbformat.NO_CONVERT)

    remove_outputs(nb)
    
    with io.open(new_ipynb, 'w', encoding='utf8') as f:
        nbformat.write(nb, f, nbformat.NO_CONVERT)

source_nb = 'atoms/visualization/choropleth_classification.ipynb'

new_nb = 'nout.ipynb'
clear_notebook(source_nb, new_nb)

## Notebook Class for Querying

In [174]:
source_nb = 'atoms/foundations/Dictionaries-Test.ipynb'
nb = read_nb(source_nb)

In [225]:
source_nb = 'atoms/foundations/Dictionaries-Test.ipynb'
nb = read_nb(source_nb)

import re
import markdown
from bs4 import BeautifulSoup

md = markdown.Markdown()

cell_types = get_nb_structure(nb)    

# Iterate over the code cell-types
for c in cell_types['markdown']:
    
    # Delete code blocks -- this is a bit brutal 
    # and it might be better to escape them in some
    # way... but this at least works well enough
    src = re.sub(r'```.+?```', '', nb.cells[c]['source'], flags=re.S)
    
    print("-"*20 + "New Cell" + "-"*20)
    soup = BeautifulSoup(md.convert(src), 'html.parser')
    
    h1 = soup.findAll('h1')
    print( ", ".join([x.contents[0] for x in h1]))
    
    h2 = soup.findAll('h2')
    print( ", ".join([x.contents[0] for x in h2]))
    
    h3 = soup.findAll('h3')
    print( ", ".join([x.contents[0] for x in h3]))

--------------------New Cell--------------------
Notebook-6: Dictionaries


--------------------New Cell--------------------


Lesson Content, In this Notebook
--------------------New Cell--------------------
Dictionaries


--------------------New Cell--------------------



--------------------New Cell--------------------

Accessing Dictionaries

--------------------New Cell--------------------



--------------------New Cell--------------------



--------------------New Cell--------------------

Creating a Simple Phone Book

--------------------New Cell--------------------

Useful Dictionary Methods

--------------------New Cell--------------------



--------------------New Cell--------------------



--------------------New Cell--------------------


Are You On the List? (Part 2)
--------------------New Cell--------------------


What Do You Do if You're Not On the List?
--------------------New Cell--------------------



--------------------New Cell--------------------

Lists of 

In [34]:
import re
rh1 = re.compile('^# ')
rh2 = re.compile('^## ')
rh3 = re.compile('^### ')
rh4 = re.compile('^#### ')
rh = re.compile('^#+')

class NoteBook(object):
    def __init__(self, ipynb):
        self.nb = read_nb(ipynb)
        self.structure = get_structure(self.nb.cells)
        
    def get_cells_by_type(self, cell_type=None):
        if cell_type:
            cell_type = cell_type.lower()
            return [self.nb.cells[i] for i in self.structure[cell_type]]
        else:
            return self.nb.cells
    
    def get_cells_by_id(self, ids=[]):
        return [self.nb.cells[i] for i in ids]
    
    def get_header_cells(self):
        hs = []
        if 'markdown' in self.structure:
            idxs = self.structure['markdown']
            pairs = zip(idxs, self.get_cells_by_type('markdown'))
            hs = [(idx, cell) for idx, cell in pairs if rh.match(cell['source'])]
        return hs
        
        
    

In [35]:
nb = NoteBook(source_nb)

In [36]:
cid = nb.get_cells_by_id()

In [37]:
cid

[]

In [38]:
cid = nb.get_cells_by_id([7, 10, 2])

In [39]:
cid

[{u'cell_type': u'code',
  u'execution_count': 5,
  u'metadata': {u'collapsed': True},
  u'outputs': [],
  u'source': u"y = f.by_col_array('pcgdp2000')"},
 {u'cell_type': u'markdown',
  u'metadata': {},
  u'source': u'#### Sample Mean\n\n$\\bar{y} = \\sum_{i=1}^n y_i$'},
 {u'cell_type': u'code',
  u'execution_count': 1,
  u'metadata': {u'collapsed': True},
  u'outputs': [],
  u'source': u'import pysal as ps'}]

In [40]:
nb.get_header_cells()

[(0,
  {u'cell_type': u'markdown',
   u'metadata': {},
   u'source': u'# Classification for Choropleth Mapping\n'}),
 (1,
  {u'cell_type': u'markdown',
   u'metadata': {u'collapsed': True},
   u'source': u'## Introduction\n\n* what is classification\n* role in choropleth mapping\n* explore classification using PySAL'}),
 (3,
  {u'cell_type': u'markdown',
   u'metadata': {},
   u'source': u'## Data Set: Mexico State Gross Domestic Product'}),
 (9,
  {u'cell_type': u'markdown',
   u'metadata': {},
   u'source': u'### Numerical summaries'}),
 (10,
  {u'cell_type': u'markdown',
   u'metadata': {},
   u'source': u'#### Sample Mean\n\n$\\bar{y} = \\sum_{i=1}^n y_i$'}),
 (12,
  {u'cell_type': u'markdown',
   u'metadata': {},
   u'source': u'#### Sample Standard Deviation\n\n$\\hat{\\sigma} = \\sqrt{\\frac{\\sum_{i=1}^n (y_i-\\bar{y})^2}{n-1}}$'}),
 (14, {u'cell_type': u'markdown', u'metadata': {}, u'source': u'#### Median'}),
 (22,
  {u'cell_type': u'markdown',
   u'metadata': {},
   u'source

In [41]:
hdict = defaultdict(list)
for idx, cell in nb.get_header_cells():
    level = cell['source'].count("#")
    hdict[level].append(idx)
    

In [42]:
hdict

defaultdict(list,
            {1: [0],
             2: [1, 3, 27, 64],
             3: [9, 22, 28, 35, 37, 39, 41, 43, 45, 48, 50, 52],
             4: [10, 12, 14]})

In [43]:
# find the start and end cells for each H? block
keys = list(hdict.keys())
keys.sort(reverse=True)
all_keys = keys.copy()
start_end = []
last_stop = len(nb.nb.cells)
while keys:
    current = keys.pop(0)
    for element in hdict[current]:
        above = [k for k in all_keys.copy() if k <= current]
        stop = last_stop
        while above:
            key_above = above.pop()
            larger = [v for v in hdict[key_above] if v > element]
            if larger:
                if larger[0] < stop:
                    stop = larger[0]
        start_end.append([element, stop])
        

AttributeError: 'list' object has no attribute 'copy'

In [59]:
start_end # for each H? cell report the start and end cells

[[10, 12],
 [12, 14],
 [14, 22],
 [9, 22],
 [22, 27],
 [28, 35],
 [35, 37],
 [37, 39],
 [39, 41],
 [41, 43],
 [43, 45],
 [45, 48],
 [48, 50],
 [50, 52],
 [52, 64],
 [1, 3],
 [3, 27],
 [27, 64],
 [64, 99],
 [0, 99]]

In [51]:
hdict

defaultdict(list,
            {1: [0],
             2: [1, 3, 27, 64],
             3: [9, 22, 28, 35, 37, 39, 41, 43, 45, 48, 50, 52],
             4: [10, 12, 14]})

In [60]:
len(start_end)

20

In [61]:
len(nb.get_header_cells())

20

In [68]:
# second h2 section with all children
se2 = [ v for v in start_end if v[0]==3][0]
block = nb.get_cells_by_id(range(*se2))
for cell in block:
    print(cell['source'])

## Data Set: Mexico State Gross Domestic Product
ps.examples.available()
ps.examples.explain('mexico')
f = ps.open(ps.examples.get_path('mexico.csv'))
f.header
y = f.by_col_array('pcgdp2000')
y
### Numerical summaries
#### Sample Mean

$\bar{y} = \sum_{i=1}^n y_i$
y_mean = y.mean()
y_mean
#### Sample Standard Deviation

$\hat{\sigma} = \sqrt{\frac{\sum_{i=1}^n (y_i-\bar{y})^2}{n-1}}$
y_std = y.std()
y_std
#### Median
import numpy as np
y_median = np.median(y)
y_median
(y < y_mean).sum()
(y > y_mean).sum()
y_sorted = np.sort(y, axis=0)
y_sorted
y_sorted[15]
y_sorted[16]
(y_sorted[15]+y_sorted[16])/2.
### Univariate Distribution Visualization
%pylab inline
import seaborn as sns
sns.distplot(y)
sns.distplot(y, kde=False, rug=True)
sns.distplot(y, bins=5, kde=False, rug=True)
sns.distplot(y, hist=False,  rug=True)


In [69]:
# first h3 section in second h2 section with all children
se3 = [ v for v in start_end if v[0]==9][0]
block = nb.get_cells_by_id(range(*se3))
for cell in block:
    print(cell['source'])

### Numerical summaries
#### Sample Mean

$\bar{y} = \sum_{i=1}^n y_i$
y_mean = y.mean()
y_mean
#### Sample Standard Deviation

$\hat{\sigma} = \sqrt{\frac{\sum_{i=1}^n (y_i-\bar{y})^2}{n-1}}$
y_std = y.std()
y_std
#### Median
import numpy as np
y_median = np.median(y)
y_median
(y < y_mean).sum()
(y > y_mean).sum()
y_sorted = np.sort(y, axis=0)
y_sorted
y_sorted[15]
y_sorted[16]
(y_sorted[15]+y_sorted[16])/2.
