In [1]:
### Boilerplate and imports

import csv, glob, json, os, math, numbers, pandas, re, scipy, scipy.sparse, shutil
import subprocess, sys, thread, threading, time, urllib2

import pandas as pd

def exec_ipynb(filename_or_url):
    nb = (urllib2.urlopen(filename_or_url) if re.match(r'https?:', filename_or_url) else open(filename_or_url)).read()
    jsonNb = json.loads(nb)
    #check for the modified formatting of Jupyter Notebook v4
    if(jsonNb['nbformat'] == 4):
        exec '\n'.join([''.join(cell['source']) for cell in jsonNb['cells'] if cell['cell_type'] == 'code']) in globals()
    else:
        exec '\n'.join([''.join(cell['input']) for cell in jsonNb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']) in globals()

exec_ipynb('timelapse-utilities.ipynb')

In [2]:
# Wide display
from IPython.core.display import display, HTML
display(HTML("<style>#notebook-container { margin-left:-14px; width:calc(100% + 27px) !important; }</style>"))
pd.set_option('display.max_columns', 500)

In [3]:
# Per-school Common Core Data (CCD) from NCES
# https://nces.ed.gov/ccd/pubschuniv.asp

In [4]:
# Install r
# sudo apt-get install r-base sudo libhdf5-dev
# sudo R
# install.packages("haven")

# library(haven)
# write.csv()


In [5]:
ccd_download_dir = 'allegheny_county/schools/nces'

def uniqpath():
    return '%s-%s' % (thread.get_ident(), os.getpid())

# Very slow conversion ... but pandas read_sas is failing a lot
def convert_sas_to_h5(sas):
    h5 = os.path.splitext(sas)[0] + '.h5'
    if os.path.exists(h5):
        sys.stdout.write('%s already exists\n' % h5)
    else:
        h5tmp = h5 + uniqpath() + ".tmp.h5"
        cmd = 'library(haven); library(rhdf5); h5createFile("%s"); h5write(read_sas("%s"), file="%s", name="data")' % (h5tmp, sas, h5tmp)
        sys.stdout.write('Converting %s to %s\n' % (sas, h5))
        sys.stdout.write('%s\n' % cmd)
        !R_DEFAULT_PACKAGES= Rscript -e '$cmd'
        os.rename(h5tmp, h5)
    return h5

# Reads src_url into table, via intermediate caches of downloading original source and converting to local HDF5
def read_ccd_src(src_url):
    dest = ccd_download_dir + '/' + os.path.basename(src_url)
    download_file(src_url, dest)
    unzip_file(dest)
    sasfile = glob.glob(os.path.splitext(dest)[0] + '/*.sas7bdat')[0]
    
    hdffile = convert_sas_to_h5(sasfile)

    sys.stdout.write('Reading %s\n' % hdffile)
    return pd.read_hdf(hdffile)
    
def ccd_from_year(year):
    src = ccd_src_from_year(year)

In [10]:
def merge_cols(a, b):
    ret = a.copy()
    use_b = a.isna() & ~b.isna()
    ret[use_b] = b[use_b]
    return ret

def merge_ccd_dataframes(frames, join_col):
    m = frames[0]
    for rhs in frames[1:]:
        cols_to_merge = sorted(set(m.columns) & set(rhs.columns) - set([join_col]))
        m = pd.merge(m, rhs, on=join_col, how='outer', validate='1:1', suffixes=('_xxx', '_yyy'))
        for col in cols_to_merge:
            m[col] = merge_cols(m[col + '_xxx'], m[col + '_yyy'])
            m.drop(columns = [col + '_xxx', col + '_yyy'], inplace=True)
    return m

def read_ccd(year):
    srcs = ccd_src_from_year(year)

def read_and_merge_ccd_srcs(srcs, horizontal=False):
    if horizontal:
        [read_ccd_src(src) for src in srcs]
    else:
        return merge_ccd_dataframes([read_ccd_src(src) for src in srcs], 'NCESSCH')


In [11]:
def compute_2017(): return read_ccd_src('https://nces.ed.gov/ccd/Data/zip/ccd_sch_029_1718_w_0a_03302018_sas.zip')

def compute_2016(): return read_ccd_src('https://nces.ed.gov/ccd/Data/zip/ccd_sch_029_1617_w_0e_050317_sas.zip')

def compute_2015(): return read_and_merge_ccd_srcs([
    'https://nces.ed.gov/ccd/Data/zip/ccd_sch_029_1516_w_2a_011717_sas.zip',
    'https://nces.ed.gov/ccd/Data/zip/ccd_sch_052_1516_w_2a_011717_sas.zip',
    'https://nces.ed.gov/ccd/Data/zip/ccd_sch_059_1516_w_2a_011717_sas.zip',
    'https://nces.ed.gov/ccd/Data/zip/ccd_sch_129_1516_w_2a_011717_sas.zip',
    'https://nces.ed.gov/ccd/Data/zip/ccd_sch_033_1516_w_2a_011717_sas.zip'])

def compute_2014(): return read_and_merge_ccd_srcs([
    'https://nces.ed.gov/ccd/Data/zip/ccd_sch_029_1415_w_0216161a_sas.zip',
    'https://nces.ed.gov/ccd/Data/zip/ccd_sch_052_1415_w_0216161a_sas.zip',
    'https://nces.ed.gov/ccd/Data/zip/ccd_sch_059_1415_w_0216161a_sas.zip',
    'https://nces.ed.gov/ccd/Data/zip/ccd_sch_129_1415_w_0216161a_sas.zip',
    'https://nces.ed.gov/ccd/Data/zip/ccd_sch_033_1415_w_0216161a_sas.zip'])

def compute_2013(): return read_ccd_src('https://nces.ed.gov/ccd/Data/zip/sc132a_sas.zip')

def compute_2012(): return read_ccd_src('https://nces.ed.gov/ccd/Data/zip/sc122a_sas.zip')

def compute_2011(): return read_ccd_src('https://nces.ed.gov/ccd/Data/zip/sc111a_supp_sas.zip')

def compute_2010(): return read_ccd_src('https://nces.ed.gov/ccd/Data/zip/sc102a_sas7bdat.zip')

def compute_2009(): return read_ccd_src('https://nces.ed.gov/ccd/data/zip/sc092a_sas.zip')

def compute_2008(): return read_ccd_src('https://nces.ed.gov/ccd/data/zip/sc081b_sas.zip')

def compute_2007(): return read_ccd_src('https://nces.ed.gov/ccd/data/zip/sc071b_sas.zip')

def compute_2006(): return read_ccd_srcs(['https://nces.ed.gov/ccd/data/zip/sc061cai_sas.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc061ckn_sas.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc061cow_sas.zip'],
                                         horizontal=True)
def compute_2005(): return read_ccd_srcs(['https://nces.ed.gov/ccd/data/zip/sc051aai_sas.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc051akn_sas.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc051aow_sas.zip'],
                                         horizontal=True)
                                         
def compute_2004(): return read_ccd_srcs(['https://nces.ed.gov/ccd/data/zip/sc041bai_sas.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc041bkn_sas.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc041bow_sas.zip'],
                                         horizontal=True)
                                         
def compute_2003(): return read_ccd_srcs(['https://nces.ed.gov/ccd/data/zip/sc031aai_sas.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc031akn_sas.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc031aow_sas.zip'],
                                         horizontal=True)
                                         
def compute_2002(): return read_ccd_srcs(['https://nces.ed.gov/ccd/data/zip/sc021aai_sd2.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc021akn_sd2.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc021aow_sd2.zip'],
                                         horizontal=True)
                                         
def compute_2001(): return read_ccd_srcs(['https://nces.ed.gov/ccd/data/zip/sc011aai.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc011akn.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc011aow.zip'],
                                         horizontal=True)
                                         
def compute_2000(): return read_ccd_srcs(['https://nces.ed.gov/ccd/data/zip/sc001aai.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc001akn.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc001aow.zip'],
                                         horizontal=True)
                                         
def compute_1999(): return read_ccd_srcs(['https://nces.ed.gov/ccd/data/zip/sc991bai.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc991bkn.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc991bow.zip'],
                                         horizontal=True)
                                         
def compute_1998(): return read_ccd_srcs(['https://nces.ed.gov/ccd/data/zip/sc981cai.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc981ckn.zip',
                                          'https://nces.ed.gov/ccd/data/zip/sc981cow.zip'],
                                         horizontal=True)
                                         
def compute_1997(): return read_ccd_srcs(['https://nces.ed.gov/ccd/data/zip/psu97ai.zip',
                                          'https://nces.ed.gov/ccd/data/zip/psu97kn.zip',
                                          'https://nces.ed.gov/ccd/data/zip/psu97ow.zip'],
                                         horizontal=True)
                                         

    
    
    


In [15]:
def compute_if_needed(year):
    filename = ccd_download_dir + ('/%d-ccd-computed.h5' % year)
    if os.path.exists(filename):
        print '%s already exists, skipping' % filename
        return
    ccd = eval('compute_%d()' % year)
    tmp = filename + uniqpath() + '.h5'
    ccd.to_hdf(tmp, 'data')
    os.rename(tmp, filename)
    print 'Created %s' % filename

pool = SimpleProcessPoolExecutor(4)

for year in range(2017, 1980, -1):
    try:
        pool.submit(compute_if_needed, year)
        compute_if_needed(year)
    except:
        print '%d failed' % year
        
pool.shutdown()
    

allegheny_county/schools/nces/2017-ccd-computed.h5 already exists, skipping
allegheny_county/schools/nces/2016-ccd-computed.h5 already exists, skipping
allegheny_county/schools/nces/2015-ccd-computed.h5 already exists, skipping
allegheny_county/schools/nces/ccd_sch_029_1415_w_0216161a_sas.zip already downloaded
allegheny_county/schools/nces/ccd_sch_029_1415_w_0216161a_sas.zip already unzipped
allegheny_county/schools/nces/ccd_sch_029_1415_w_0216161a_sas/ccd_sch_029_1415_w_0216161a.h5 already exists
Reading allegheny_county/schools/nces/ccd_sch_029_1415_w_0216161a_sas/ccd_sch_029_1415_w_0216161a.h5
allegheny_county/schools/nces/2017-ccd-computed.h5 already exists, skipping
allegheny_county/schools/nces/2016-ccd-computed.h5 already exists, skipping
allegheny_county/schools/nces/2015-ccd-computed.h5 already exists, skipping
allegheny_county/schools/nces/ccd_sch_029_1415_w_0216161a_sas.zip already downloaded
allegheny_county/schools/nces/ccd_sch_029_1415_w_0216161a_sas.zip already unzipped

Exception caught in SimpleProcessPoolExecutor.shutdown.  Continuing until all are finished.
Exception follows:
Traceback (most recent call last):
  File "<string>", line 215, in shutdown
  File "/home/rsargent/anaconda2/lib/python2.7/site-packages/concurrent/futures/_base.py", line 455, in result
    return self.__get_result()
  File "/home/rsargent/anaconda2/lib/python2.7/site-packages/concurrent/futures/_base.py", line 414, in __get_result
    raise exception_type, self._exception, self._traceback
NameError: global name 'read_ccd_srcs' is not defined
Exception caught in SimpleProcessPoolExecutor.shutdown.  Continuing until all are finished.
Exception follows:
Traceback (most recent call last):
  File "<string>", line 215, in shutdown
  File "/home/rsargent/anaconda2/lib/python2.7/site-packages/concurrent/futures/_base.py", line 455, in result
    return self.__get_result()
  File "/home/rsargent/anaconda2/lib/python2.7/site-packages/concurrent/futures/_base.py", line 414, in __get_re

Exception: SimpleProcessPoolExecutor failed: 27 of 37 raised exception

In [None]:
!file allegheny_county/schools/nces/ccd_sch_052_1415_w_0216161a_sas/*

In [None]:
pd.read_sas('allegheny_county/schools/nces/ccd_sch_052_1415_w_0216161a_sas/ccd_sch_052_1415_w_0216161a.sas7bdat')

In [None]:
pd.read_hdf("bar.h5")