In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

import os
from io import StringIO

In [2]:
# Header line for each supported row type, keyed by the row-type tag, i.e. the
# first pipe-delimited field of a data row. The matching header is prepended to
# the rows of that type so each group can be parsed as a self-describing
# pipe-delimited table.
heads = {
    # 'RD' rows; this header appears to mirror the commented '# RD|...' line at the top of the raw files.
    'RD':"RD|Action Code|State Code|County Code|Site ID|Parameter|POC|Sample Duration|Unit|Method|Date|Start Time|Sample Value|Null Data Code|Sampling Frequency|Monitor Protocol (MP) ID|Qualifier - 1|Qualifier - 2|Qualifier - 3|Qualifier - 4|Qualifier - 5|Qualifier - 6|Qualifier - 7|Qualifier - 8|Qualifier - 9|Qualifier - 10|Alternate Method Detectable Limit|Uncertainty",
    # 'RC' rows; this header appears to mirror the commented '# RC|...' line at the top of the raw files.
    'RC':"RC|Action Code|State Code|County Code|Site ID|Parameter|POC|Unit|Method|Year|Period|Number of Samples|Composite Type|Sample Value|Monitor Protocol (MP) ID|Qualifier - 1|Qualifier - 2|Qualifier - 3|Qualifier - 4|Qualifier - 5|Qualifier - 6|Qualifier - 7|Qualifier - 8|Qualifier - 9|Qualifier - 10|Alternate Method Detectable Limit|Uncertainty",
    # Rows whose first field is '1' (site information records, per the header's own label).
    '1':"SITE INFORMATION TYPE = 1|STATE_CODE|STATE_NAME|COUNTY_CODE|COUNTY_NAME|SITE_ID|PARAMETER_CODE|PARAMETER_DESC|CITY_CODE|CITY_NAME|STREET_ADDRESS|AQCR_CODE|AQCR_NAME|CBSA_CODE|CBSA_NAME|CSA_CODE|CSA_NAME|EPA_REGION|UAR_CODE|UAR_NAME|LAND_USE|LOC_SET|LATITUDE|LONGITUDE|UTM_ZONE|UTM_NORTHING|UTM_EASTING|HORIZ_COLLECT|HORIZ_METHOD|HORIZ_DATUM|HORIZ_ACC|HORIZ_SCALE|ELEVATION_MSL|VERT_COLLECT|VERT_METHOD|VERT_DATUM|VERT_ACC",
    # Rows whose first field is '2' (daily value records, per the header's own label).
    '2':"DAILY VALUE TYPE = 2|STATE_CODE|COUNTY_CODE|SITE_ID|PARAMETER_CODE|UNITS|PRIMARY_MONITOR_POC|MONITOR_TYPE|PQAO|PQAO_NAME|SAMPLE_DATE|VALUE|SOURCE|EDT_ID"
}

In [5]:
# Directory that holds the raw pipe-delimited text exports.
basePath='raw_data/'
# os.listdir yields bare entry names (no directory part) in arbitrary order, so
# each name is joined onto basePath to get a path that can be opened directly.
# NOTE: because the order is arbitrary, positions in `files` are not stable
# across machines.
files=[os.path.join(basePath, f) for f in os.listdir(basePath) if f.endswith('.txt')]
# Entries of `files` already include basePath; prepending it a second time
# would produce a non-existent 'raw_data/raw_data/...' path.
file=files[0]
print(files)

['raw_data/Harrisburg_Lancaster_York_MSA_PM25SiteLevel_2014-2016.txt', 'raw_data/Lancaster_Met_2016.txt', 'raw_data/Harrisburg_Lancaster_York_MSA_CO_2014-2016.txt', 'raw_data/Harrisburg_Lancaster_York_MSA_PM10_2014-2016.txt', 'raw_data/Hershey_Met_2014.txt', 'raw_data/York_Met_2016.txt', 'raw_data/Harrisburg_Met_2015.txt', 'raw_data/York_Met_2015.txt', 'raw_data/Harrisburg_Lancaster_York_MSA_SO2_2014-2016.txt', 'raw_data/Harrisburg_Met_2016.txt', 'raw_data/Hershey_Met_2016.txt', 'raw_data/Lancaster_Met_2015.txt', 'raw_data/Lancaster_Met_2014.txt', 'raw_data/York_Met_2014.txt', 'raw_data/Harrisburg_Met_2014.txt', 'raw_data/PerryCounty_Met_2014.txt', 'raw_data/Hershey_Met_2015.txt']


In [6]:
# Select the PM10 export by name rather than by position: os.listdir order is
# arbitrary, so files[3] (the PM10 file in the recorded run) is not guaranteed
# to be the same file on another machine or after the directory changes.
pm10_file = next(p for p in files if 'PM10' in os.path.basename(p))

# Keep every physical line (trailing newline included) as one row of a
# single-column frame so lines can be flagged and filtered before parsing.
with open(pm10_file) as f:
    lines1 = pd.DataFrame(f.readlines(),columns=['raw'])
lines1.head()

Unnamed: 0,raw
0,# RD|Action Code|State Code|County Code|Site I...
1,# RC|Action Code|State Code|County Code|Site I...
2,RD|I|42|043|0401|81102|1|7|001|063|20140111|00...
3,RD|I|42|043|0401|81102|1|7|001|063|20140117|00...
4,RD|I|42|043|0401|81102|1|7|001|063|20140123|00...


In [7]:
# Flag comment lines: those whose very first character is '#'.
lines1['comment'] = lines1['raw'].str.startswith('#')
# Row-type tag: everything before the first '|' (the whole line, newline
# included, when the line contains no '|').
lines1['type'] = lines1['raw'].str.split('|').str[0]
lines1.tail()

Unnamed: 0,raw,comment,type
70154,RD|I|42|133|0008|81102|5|1|001|079|20151231|20...,False,RD
70155,RD|I|42|133|0008|81102|5|1|001|079|20151231|21...,False,RD
70156,RD|I|42|133|0008|81102|5|1|001|079|20151231|22...,False,RD
70157,RD|I|42|133|0008|81102|5|1|001|079|20151231|23...,False,RD
70158,# 70156 records were written\n,True,# 70156 records were written\n


In [8]:
# Drop comment lines
# Rebind to a filtered frame (original index labels are kept) instead of
# mutating in place.
comment_rows = lines1.index[lines1['comment']]
lines1 = lines1.drop(index=comment_rows)
lines1.tail()

Unnamed: 0,raw,comment,type
70153,RD|I|42|133|0008|81102|5|1|001|079|20151231|19...,False,RD
70154,RD|I|42|133|0008|81102|5|1|001|079|20151231|20...,False,RD
70155,RD|I|42|133|0008|81102|5|1|001|079|20151231|21...,False,RD
70156,RD|I|42|133|0008|81102|5|1|001|079|20151231|22...,False,RD
70157,RD|I|42|133|0008|81102|5|1|001|079|20151231|23...,False,RD


In [13]:
# Split into row types
# Build one in-memory text stream per row type: the matching header line from
# `heads` followed by all raw rows of that type in file order.
types = lines1['type'].unique()
outputs = {}
for t in types:
    if t in heads:
        body = ''.join(lines1.loc[lines1['type'] == t, 'raw'])
        outputs[t] = StringIO(heads[t] + '\n' + body)
    else:
        # A tag with no known header means the file has an unsupported layout.
        raise Exception('Unknown row type '+t)

print(outputs)

{'RD': <_io.StringIO object at 0x7f3be9a39a68>}


In [15]:
# Functionize everything!
def load_split_file(file, headers=None):
    """Split a pipe-delimited export into one in-memory text stream per row type.

    Lines starting with '#' are treated as comments and skipped, as are blank
    (whitespace-only) lines. Every remaining line is grouped by its row-type
    tag, i.e. the text before the first '|'. For each tag found, the result
    holds a StringIO containing the matching header line followed by the raw
    rows of that type, in file order.

    Parameters
    ----------
    file : str or path-like
        Path of the text file to read.
    headers : dict, optional
        Mapping of row-type tag to header line. Defaults to the module-level
        ``heads`` mapping.

    Returns
    -------
    dict
        Maps each row-type tag present in the file (in order of first
        appearance) to a ``StringIO`` positioned at the start of
        ``header + '\\n' + rows``. Empty when the file has no data rows.

    Raises
    ------
    ValueError
        If a data row carries a tag that is not a key of ``headers``.
    """
    if headers is None:
        headers = heads

    # Single pass over the file: collect the raw lines of each row type.
    rows_by_type = {}
    with open(file) as f:
        for line in f:
            # Skip comments and blank lines; a blank line would otherwise be
            # mistaken for a row whose tag is '\n'.
            if line.startswith('#') or not line.strip():
                continue
            t = line.split('|')[0]
            if t not in headers:
                raise ValueError('Unknown row type '+t)
            rows_by_type.setdefault(t, []).append(line)

    # Lines keep their own trailing newlines, so a plain join rebuilds the body.
    return {
        t: StringIO(headers[t]+'\n'+''.join(rows))
        for t, rows in rows_by_type.items()
    }

In [19]:
# Run the loader over every discovered file and show which row types each one
# contains (the dict keys); an unsupported row type surfaces as an exception.
for f in files:
    split_streams = load_split_file(f)
    print(split_streams)

{'2': <_io.StringIO object at 0x7f3be9a39dc8>, '1': <_io.StringIO object at 0x7f3be9a39c18>}
{'RD': <_io.StringIO object at 0x7f3be9a39c18>}
{'RD': <_io.StringIO object at 0x7f3be9a39c18>}
{'RD': <_io.StringIO object at 0x7f3be9a39438>}
{'RD': <_io.StringIO object at 0x7f3be9a39438>}
{'RD': <_io.StringIO object at 0x7f3be9a39438>}
{'RD': <_io.StringIO object at 0x7f3be9a39438>}
{'RD': <_io.StringIO object at 0x7f3be9a39c18>}
{'RD': <_io.StringIO object at 0x7f3be9a39c18>}
{'RD': <_io.StringIO object at 0x7f3be9a39c18>}
{'RD': <_io.StringIO object at 0x7f3be9a39c18>}
{'RD': <_io.StringIO object at 0x7f3be9a39438>}
{'RD': <_io.StringIO object at 0x7f3be9a39438>}
{'RD': <_io.StringIO object at 0x7f3be9a39438>}
{'RD': <_io.StringIO object at 0x7f3be9a39c18>}
{'RD': <_io.StringIO object at 0x7f3be9a39c18>}
{'RD': <_io.StringIO object at 0x7f3be9a39c18>}
