In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
import sys
import os
import io
import shutil

import pudl
import pudl.glue.ferc1_eia
from pudl import init
from pudl import constants as pc
import logging
import yaml

In [2]:
# Initialise the PUDL settings object; it is passed to the packaging calls further down.
pudl_settings = pudl.settings.init()

In [13]:
# Send root-logger records at INFO and above to stdout, printing only the
# message text (no timestamp / level prefix).
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Assign the handler list outright instead of calling addHandler(): this
# replaces whatever handlers were attached before, so re-running the cell
# does not stack up duplicate handlers.
logger.handlers = [handler]

In [None]:
# Reflect the table names out of the PUDL database.
# sqlalchemy.inspect(engine) is the supported way to obtain an Inspector;
# Inspector.from_engine() is deprecated as of SQLAlchemy 1.4.
import sqlalchemy as sa

# Connect with testing=False (i.e. not the testing database).
pudl_engine = init.connect_db(testing=False)
insp = sa.inspect(pudl_engine)
# Names of every table in the connected database.
tbls = insp.get_table_names()
# Only the tables whose name contains "ferc".
ferc_tbls = [s for s in tbls if "ferc" in s]

To generate a package containing only the FERC tables, as an example, use the commented-out FERC options in the cell below instead of the defaults.

In [None]:
# Package identity and table selection for the package built below.
# Default: every table name reflected from the database.
name, title = "pudl-test", "All tables integrated into PUDL."
tbls = tbls  # no-op; kept as the counterpart of the FERC-only option below

# FERC-only alternative: uncomment these three lines to package just the FERC tables.
#name = "pudl-ferc1"
#title =  "FERC Form 1 tables integrated into PUDL."
#tbls = ferc_tbls

In [None]:
#tbls.remove('hourly_emissions_epacems')

In [None]:
# we need this as the main info regarding this iteration of packaging
# Top-level descriptive metadata for this iteration of packaging.
# `name` and `title` come from the selection cell above; the dict is passed
# to pudl.output.export.data_package() together with the table list.
pkg_skeleton = {
    "name": name,
    "title": title,
    # Fixed spelling ("metadat" -> "metadata"): this string is handed to the
    # packaging call as the package description.
    "description": "A full metadata description of all PUDL tables.",
    "keywords": [
        "ferc",
        "form 1",
        "energy",
        "electricity",
        "utility",
        "fuel",
        "expenses",
        "coal",
        "natural gas",
        "generation",
        "regulation"
    ]
}

This will generate metadata and CSVs for every table in PUDL. They will live in `results/data_pkgs`.

In [None]:
# Build the data package for the selected tables, using the skeleton metadata
# and the PUDL settings defined above. dry_run=True is passed straight through
# to data_package().
pkg = pudl.output.export.data_package(
    tbls,
    pkg_skeleton,
    pudl_settings,
    dry_run=True,
)

- Cems_pkg
    - data
        - cems2017.csv.zip
        - cems2016.csv.zip
        - cems2015.csv.zip
        (see "Compression of Resources": http://frictionlessdata.io/specs/patterns/)
    - datapackage.json
        whose resource entry lists all of the data files above in its "path" property
        (see "Data in Multiple Files": https://frictionlessdata.io/specs/data-resource/)


TODO: convert the EPA CEMS ETL process so that it generates the CEMS data one year at a time and writes each year's file out compressed.

In [100]:
# (Re)initialise the PUDL settings for the settings-file-driven packaging cells that follow.
pudl_settings = pudl.settings.init()

In [105]:
# Grab the package settings from the default datapackage settings file.
pkg_settings = pudl.settings.grab_package_settings(
    pudl_settings,
    'settings_datapackage_default.yml',
)

In [126]:
# Run package generation for the packages described in pkg_settings.
# The result is indexed by package name in the next cell (e.g. metas['glue-test']).
metas = pudl.output.export.generate_data_packages(
    pkg_settings,
    pudl_settings,
    debug=True,
)

Loading Glue plants dataframe into CSV
Loading Glue utilities dataframe into CSV
Loading Glue utilities_ferc dataframe into CSV
Loading Glue plants_ferc dataframe into CSV
Loading Glue utility_plant_assn dataframe into CSV
Loading Glue utilities_eia dataframe into CSV
Loading Glue plants_eia dataframe into CSV
Tables are consistent for glue-test package
Not loading FERC1
Tables are consistent for ferc1-test package
Not generating metadata for ferc1-test
Not loading EIA.
Tables are consistent for eia-test package
Not generating metadata for eia-test
Not loading EIA.
Not ingesting EPA CEMS.
Tables are consistent for epacems_eia860 package
Not generating metadata for epacems_eia860


In [127]:
# Pull out the result for the glue package; the other lookups are left disabled.
# NOTE(review): the disabled keys ('ferc1_pkg', 'ferc1-pkg', 'eia_pkg') do not match
# the package names reported in the run log (ferc1-test, eia-test) — confirm the
# keys before re-enabling any of these lines.
#ferc_vali = metas['ferc1_pkg'][1]
#ferc_meta = metas['ferc1-pkg'][0]
#eia_vali = metas['eia_pkg']
#cems_vali = metas['epacems_eia860']
glue_vali = metas['glue-test']

In [25]:
# Display element 1 of the glue result: the validation report dict
# ('valid', 'error-count', 'table-count', 'tables', ...).
glue_vali[1]

{'time': 0.689,
 'valid': False,
 'error-count': 1000,
 'table-count': 5,
 'tables': [{'datapackage': '/Users/christinagosnell/code/pudl/results/datapackage/glue-test/datapackage.json',
   'time': 0.453,
   'valid': True,
   'error-count': 0,
   'row-count': 1000,
   'source': '/Users/christinagosnell/code/pudl/results/datapackage/glue-test/data/plants.csv',
   'headers': ['plant_id_pudl', 'name'],
   'format': 'inline',
   'schema': 'table-schema',
   'errors': []},
  {'datapackage': '/Users/christinagosnell/code/pudl/results/datapackage/glue-test/datapackage.json',
   'time': 0.297,
   'valid': False,
   'error-count': 1000,
   'row-count': 585,
   'source': '/Users/christinagosnell/code/pudl/results/datapackage/glue-test/data/plants_ferc.csv',
   'headers': ['utility_id_ferc1', 'plant_name', 'plant_id_pudl'],
   'format': 'inline',
   'schema': 'table-schema',
   'errors': [{'code': 'required-constraint',
     'row-number': 2,
     'column-number': 1,
     'message': 'Column 1 is a 

- the main coordinating function is pudl.output.export.generate_data_packages()
    - uses settings:
        - yml file (ex: `settings_init_pudl_package.yml`) and settings.py
    - validates settings using `pudl.ETL_pkg._input_validate`... this spits out a new, validated list of package settings
    - then, for each of the packages defined in the settings, run the ETL_pkg function, which runs a data-source-specific function for each dataset. The ETL_pkg function needs the settings/inputs. In a stand-alone version of this function these can be validated or non-validated settings; the settings get re-validated immediately inside the data-source-specific ETL function. These ETL functions generate CSVs in the out_dir/(package name)/data folder.
    - an output of the ETL_pkg function is a list of the tables being generated. This list is piped through to a `test_file_consistency` function, which ensures that the ETL tables, the CSVs, and the dependencies from the metadata are consistent with one another.
    - then the `generate_metadata` function is run. at a high level, this generates and validates the `datapackage.json` file and runs `goodtables.validate`.