# Airnow Common

Constants and functions common to the various Airnow scripts.

In [None]:
import datetime
import os
import shutil
import subprocess
import time

import dateutil
import requests
from dateutil import rrule, tz, parser

In [None]:
class AirnowCommonInstance:
    """Constants and helper functions shared by the Airnow scripts.

    The class attributes are the Airnow server root URL and the local
    directories (relative paths) in which mirrored and derived files are kept.
    The methods mirror files from the server using HTTP conditional requests.
    """

    AIRNOW_ROOT_URL = 'https://files.airnowtech.org/airnow/'
    DATA_DIRECTORY = '../../airnow-data'

    DAILY_AQI_DIRECTORY = DATA_DIRECTORY + '/daily-aqi'
    DAILY_AQI_DAT_DIRECTORY = DAILY_AQI_DIRECTORY + '/dat'
    DAILY_AQI_JSON_DIRECTORY = DAILY_AQI_DIRECTORY + '/json'

    HOURLY_AQI_DIRECTORY = DATA_DIRECTORY + '/hourly-aqi'
    HOURLY_AQI_DAT_DIRECTORY = HOURLY_AQI_DIRECTORY + '/dat'
    HOURLY_AQI_JSON_DIRECTORY = HOURLY_AQI_DIRECTORY + '/json'

    DAILY_VALUES_DIRECTORY = DATA_DIRECTORY + '/daily-values'
    DAILY_VALUES_DAT_DIRECTORY = DAILY_VALUES_DIRECTORY + '/dat'

    HOURLY_VALUES_DIRECTORY = DATA_DIRECTORY + '/hourly-values'
    HOURLY_VALUES_DAT_DIRECTORY = HOURLY_VALUES_DIRECTORY + '/dat'
    HOURLY_VALUES_UPLOADED_DIRECTORY = HOURLY_VALUES_DIRECTORY + '/uploaded-to-esdr'

    HIGHEST_FIVE_AQI_DIRECTORY = DATA_DIRECTORY + '/highest-five-aqi'
    HIGHEST_FIVE_AQI_DAT_DIRECTORY = HIGHEST_FIVE_AQI_DIRECTORY + '/dat'

    HIGHEST_TEN_AQI_DIRECTORY = DATA_DIRECTORY + '/highest-ten-aqi'
    HIGHEST_TEN_AQI_DAT_DIRECTORY = HIGHEST_TEN_AQI_DIRECTORY + '/dat'

    REPORTING_AREA_DIRECTORY = DATA_DIRECTORY + '/reporting-area'
    REPORTING_AREA_DAT_DIRECTORY = REPORTING_AREA_DIRECTORY + '/dat'
    REPORTING_AREA_COOKED_DAT_DIRECTORY = REPORTING_AREA_DIRECTORY + '/dat-cooked'
    REPORTING_AREA_FORECASTS_DIRECTORY = REPORTING_AREA_DIRECTORY + '/forecasts'

    # Field names for the Airnow reportingarea.dat data file. See https://s3-us-west-1.amazonaws.com//files.airnowtech.org/airnow/docs/ReportingAreaFactSheet.pdf
    REPORTING_AREA_DATA_FILE_FIELDNAMES = 'issue_date|valid_date|valid_time|time_zone|record_sequence|data_type|primary|reporting_area|state_code|latitude|longitude|pollutant|aqi_value|aqi_category|action_day|discussion|forecast_source'.split('|')

    def directory_from_date(self, dt):
        """Return the relative directory for `dt`, formatted as 'YYYY/YYYYMMDD'."""
        return dt.strftime('%Y/%Y%m%d')

    def datetime2epoch(self, dt):
        """Return `dt` as (float) seconds since 1970-01-01T00:00:00 UTC.

        `dt` must be timezone-aware: subtracting the aware epoch from a naive
        datetime raises TypeError.
        """
        epoch_start = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
        return (dt - epoch_start).total_seconds()

    def mirror_file_using_modtime(self, src_url, dest, will_also_extract_new_records):
        """Mirror `src_url` to the local file `dest` if the server copy is newer.

        If `dest` already exists, its modification time is sent as an
        If-Modified-Since header so that the server can answer 304 instead of
        re-sending an unchanged file.  After a successful download the
        modification time of `dest` is set to the server's Last-Modified time.

        Args:
            src_url: Absolute URL of the file to fetch.
            dest: Local path to write the file to.
            will_also_extract_new_records: If true, also write a companion
                file holding the lines of the download that were not in the
                previous `dest` (see `_extract_new_records`).

        Returns:
            A 4-tuple (mirrored, message, status_code, new_records_file).
            `mirrored` is True only if the file was downloaded (HTTP 200) and
            written; otherwise False.  `new_records_file` is the path of the
            companion file, or None if none was requested or nothing was
            downloaded.

        Raises:
            KeyError: A 200 response carries no Last-Modified header.
            subprocess.CalledProcessError: awk failed while extracting new
                records; `dest` is left untouched in that case.
        """
        headers = {}
        # If destination already exists, mirror only if newer
        try:
            local_mtime = os.stat(dest).st_mtime
        except OSError:
            # No readable local copy: fall through to an unconditional GET.
            pass
        else:
            headers['If-Modified-Since'] = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime(local_mtime))

        response = requests.get(src_url, headers=headers)
        status_code = response.status_code

        if status_code == 304:
            return (False, 'Local mirror of %s is up to date.  Skipping.' % (src_url), status_code, None)
        if status_code == 404:
            return (False, 'File %s not found (HTTP %d). Skipping.' % (src_url, status_code), status_code, None)
        if status_code != 200:
            return (False, 'Received status code %d while fetching %s.  Skipping.' % (status_code, src_url), status_code, None)

        data = response.content
        server_modtime = dateutil.parser.parse(response.headers['Last-Modified'])
        server_modtime_epoch = self.datetime2epoch(server_modtime)

        # Write to a per-process temporary file first so that dest is only ever replaced by a complete download.
        tmp = dest + '.tmp' + str(os.getpid())
        tmp_directory = os.path.dirname(tmp)
        if tmp_directory:
            # os.makedirs('') raises, so skip it when dest has no directory component
            os.makedirs(tmp_directory, exist_ok=True)
        with open(tmp, 'wb') as tmp_file:
            tmp_file.write(data)

        new_records_file = None
        try:
            if will_also_extract_new_records:
                # Must happen before dest is replaced, since it compares against the previous dest.
                new_records_file = self._extract_new_records(dest, tmp, server_modtime_epoch)
            os.replace(tmp, dest)
        except BaseException:
            # Don't leave a stray temporary file behind if anything above failed.
            try:
                os.remove(tmp)
            except OSError:
                pass
            raise

        os.utime(dest, (server_modtime_epoch, server_modtime_epoch))
        print('Wrote %d bytes to %s' % (len(data), dest))
        return (True, 'Successfully mirrored %s to %s (%d bytes)' % (src_url, dest, len(data)), status_code, new_records_file)

    def _extract_new_records(self, dest, tmp, server_modtime_epoch):
        """Write the lines of `tmp` that are not in `dest` to a new file and return its path.

        The new file is named like `dest`, with '-YYYYMMDDHHMMSS' (the server
        modification time, in UTC) inserted before the extension, and its
        modification time is set to `server_modtime_epoch`.  If `dest` does
        not exist yet, the new file is simply a copy of `tmp`.
        """
        (base_filename, extension) = os.path.splitext(dest)
        # Only the timestamp goes through strftime; the path itself may contain '%' characters.
        timestamp_suffix = time.strftime('-%Y%m%d%H%M%S', time.gmtime(server_modtime_epoch))
        new_records_file = base_filename + timestamp_suffix + extension

        if os.path.isfile(dest):
            # Use awk to pick out lines in file2 which are not in file1.  The general pattern is:
            #
            #   awk 'NR==FNR{lines[$0];next} !($0 in lines)' file1 file2
            #
            # Found at:
            #
            #   https://askubuntu.com/questions/845502/get-the-unique-lines-of-second-file-in-result-of-comparing-two-files/845528#845528
            #
            try:
                with open(new_records_file, 'wb') as out:
                    # check=True: a failed awk must not yield a silently truncated new-records file
                    subprocess.run(['awk', 'NR==FNR{lines[$0];next} !($0 in lines)', dest, tmp], stdout=out, check=True)
            except BaseException:
                # Remove the partial output so it cannot be mistaken for a complete one.
                try:
                    os.remove(new_records_file)
                except OSError:
                    pass
                raise
        else:
            # if the dest file doesn't exist, then just copy tmp to new_records_file so that, initially, dest
            # and new_records_file will have identical contents
            shutil.copy2(tmp, new_records_file)

        # make sure the modification timestamp of the new_records_file matches that of dest
        os.utime(new_records_file, (server_modtime_epoch, server_modtime_epoch))
        return new_records_file

    def mirror_airnow_file(self, src, dest):
        """Mirror an Airnow server file to `dest`.

        `src` is a path relative to `AIRNOW_ROOT_URL`.  Returns a tuple of the
        form (is_new, message, status_code).  The is_new value is True if the
        file was actually mirrored (i.e. both newer than the current version,
        and successfully downloaded), and False otherwise.
        """
        (is_new, message, status_code, _) = self.mirror_file_using_modtime(self.AIRNOW_ROOT_URL + src, dest, False)
        # the fourth element (new_records_file) is dropped since it is always None in this case
        return is_new, message, status_code

    def mirror_airnow_file_and_extract_new_records(self, src, dest):
        """Mirror an Airnow server file to `dest` and extract its new records.

        `src` is a path relative to `AIRNOW_ROOT_URL`.  Returns a tuple of the
        form (is_new, message, status_code, new_records_file).  The is_new
        value is True if the file was actually mirrored (i.e. both newer than
        the current version, and successfully downloaded), and False otherwise.
        """
        return self.mirror_file_using_modtime(self.AIRNOW_ROOT_URL + src, dest, True)


# Module-level instance of AirnowCommonInstance, bound to the name `AirnowCommon`.
# NOTE(review): presumably this is the object the other Airnow scripts use to reach the constants and
# helpers above -- confirm against callers.
AirnowCommon = AirnowCommonInstance()