In [1]:
import pandas as pd
import tarfile
import json
from scipy import stats
import numpy as np

In [2]:
# Open the gzip-compressed archive of scraped JSON payloads for reading.
# The handle is kept in `tar` for the loading cell below and is not closed
# anywhere in this notebook.
tar = tarfile.open(name="./scratch/scraped.tar.gz", mode="r:gz")

In [3]:
def parse(jsn):
    """Flatten one scraped payload into per-station records.

    Walks ``jsn['States']`` -> ``'Regions'`` -> ``'Stations'`` and builds one
    flat dict per station that carries a ``'Lat'`` key.

    Parameters
    ----------
    jsn : dict
        Decoded JSON payload. Each state must provide ``'DateFromTo'``,
        ``'Name'`` and ``'Regions'``; each region ``'Name'`` and ``'Stations'``.

    Returns
    -------
    dict
        Maps ``"<station name>|<DateFromTo>"`` to a record with the keys
        ``time, state_name, region_name, code, name, cls, ix, lat, lon`` plus
        one entry per measured component (``comp['Code'] -> comp['Val']``).

    Raises
    ------
    KeyError
        If a mandatory key of a state, region or located station is missing.
    """
    dump = {}
    for state in jsn['States']:
        time = state['DateFromTo']
        state_name = state['Name']
        for region in state['Regions']:
            region_name = region['Name']
            for station in region['Stations']:
                # Stations without coordinates are left out entirely.
                if 'Lat' not in station:
                    continue
                out = {
                    'time': time,
                    'state_name': state_name,
                    'region_name': region_name,
                    'code': station['Code'],
                    'name': station['Name'],
                    'cls': station['Classif'],
                    'ix': station['Ix'],
                    'lat': station['Lat'],
                    'lon': station['Lon'],
                }
                for comp in station['Components']:
                    # Components without a measured value carry nothing to store.
                    if 'Val' not in comp:
                        continue
                    # Skip the 24h PM10 entry -- presumably so it does not
                    # overwrite the shorter-interval PM10 value stored under
                    # the same key (TODO confirm against the feed format).
                    # `.get` + short-circuit `and`: a component lacking 'Int'
                    # must not abort the whole payload with a KeyError.
                    if comp['Code'] == 'PM10' and comp.get('Int') == '24h':
                        continue
                    out[comp['Code']] = comp['Val']
                # NOTE(review): the key uses the station *name*; two stations
                # sharing a name within one time window would overwrite each other.
                dump[station['Name'] + '|' + time] = out
    return dump

In [4]:
# Parse every JSON member of the archive and merge the per-station records
# into one dict keyed by "<station name>|<time window>".
# NOTE: `jsn` deliberately stays a module-level name -- a later cell reads
# jsn['Legend'] from the last payload decoded here.
dta = {}
for member in tar.getmembers():
    if '.json' in str(member):
        try:
            f = tar.extractfile(member)
            jsn = json.loads(f.read())
            dta.update(parse(jsn))
        except Exception as exc:
            # Best effort: report the unreadable member and carry on.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt
            # working, and the reason is printed so failures can be diagnosed.
            print('ERR ' + str(member) + ': ' + repr(exc))

ERR <TarInfo 'scraped/1455802202939.json' at 0x1fbfb6a9688>
ERR <TarInfo 'scraped/1455802505665.json' at 0x1fbfbd31f20>


In [55]:
# Debug peek at the payload left in `jsn` by the archive-loading loop.
# NOTE(review): this dumps the whole dict and the cell's execution count is
# out of sequence with its neighbours; consider removing it from the final notebook.
jsn

{'Actualized': '28.10.2017 17:47 SELČ',
 'Components': [{'Code': 'SO2', 'Name': 'oxid siřičitý', 'Unit': 'µg/m³'},
  {'Code': 'NO2', 'Name': 'oxid dusičitý', 'Unit': 'µg/m³'},
  {'Code': 'CO', 'Name': 'oxid uhelnatý', 'Unit': 'µg/m³'},
  {'Code': 'O3', 'Name': 'ozon', 'Unit': 'µg/m³'},
  {'Code': 'PM10', 'Name': 'částice PM10', 'Unit': 'µg/m³'},
  {'Code': 'PM2_5', 'Name': 'jemné částice PM2,5', 'Unit': 'µg/m³'}],
 'IsOzoneUsedInIndex': False,
 'Legend': [{'Color': 'C7EAFB',
   'ColorText': '000000',
   'Description': 'velmi dobrá',
   'Ix': 1},
  {'Color': '9BD3AE', 'ColorText': '000000', 'Description': 'dobrá', 'Ix': 2},
  {'Color': 'FFF200',
   'ColorText': '000000',
   'Description': 'uspokojivá',
   'Ix': 3},
  {'Color': 'FAA61A',
   'ColorText': '000000',
   'Description': 'vyhovující',
   'Ix': 4},
  {'Color': 'ED1C24', 'ColorText': 'FFFFFF', 'Description': 'špatná', 'Ix': 5},
  {'Color': '671F20',
   'ColorText': 'FFFFFF',
   'Description': 'velmi špatná',
   'Ix': 6},
  {'Colo

In [5]:
# Build the working frame: orient='index' turns each key of `dta` into a row
# label and each record dict into that row's columns.
frm = pd.DataFrame.from_dict(dta, orient='index')
# Rebind `dta` so the name no longer references the parsed dict.
# NOTE: re-running this cell without re-running the loading cell will fail,
# because `dta` is None afterwards.
dta = None

In [6]:
# Show the index legend as {Ix: {remaining legend fields}}.
# NOTE: relies on `jsn` being left over from the archive-loading loop.
pd.DataFrame.from_dict(jsn['Legend']).set_index('Ix').to_dict(orient='index')

{-1: {'Color': 'CFCFCF',
  'ColorText': '000000',
  'Description': 'Index se na uvedené stanici nestanovuje'},
 0: {'Color': 'FFFFFF', 'ColorText': '000000', 'Description': 'Neúplná data'},
 1: {'Color': 'C7EAFB', 'ColorText': '000000', 'Description': 'velmi dobrá'},
 2: {'Color': '9BD3AE', 'ColorText': '000000', 'Description': 'dobrá'},
 3: {'Color': 'FFF200', 'ColorText': '000000', 'Description': 'uspokojivá'},
 4: {'Color': 'FAA61A', 'ColorText': '000000', 'Description': 'vyhovující'},
 5: {'Color': 'ED1C24', 'ColorText': 'FFFFFF', 'Description': 'špatná'},
 6: {'Color': '671F20', 'ColorText': 'FFFFFF', 'Description': 'velmi špatná'}}

In [7]:
# Keep every third space-separated token of `time` (tokens 0, 3, ...) joined by
# a single space. Presumably this yields "<date> <end time>" from the
# DateFromTo string -- TODO confirm against the feed format.
frm['dtime'] = frm.time.apply(lambda x: ' '.join(x.split(' ')[0::3]))

In [8]:
# `day` is the first space-separated token of `dtime`, taken from its string
# form; it is later used as a grouping key together with the station code.
frm['day'] = frm.dtime.apply(lambda x: str(x).split(' ')[0])

In [9]:
# Parse the `dtime` strings into timestamps, reading the first number as the day.
frm['dtime'] = pd.to_datetime(frm['dtime'], dayfirst=True)

In [47]:
# Mean of each pollutant column per (station code, day).
# as_index=False keeps `code` and `day` as ordinary columns, so every record of
# the resulting {row_label: {column: value}} dict carries both keys alongside
# the six component means.
# NOTE(review): assumes the component columns are numeric -- confirm the type of 'Val'.
grp = frm.groupby(['code', 'day'], as_index=False)[['SO2', 'NO2', 'CO', 'PM10', 'O3', 'PM2_5']].mean().to_dict(orient='index')

In [48]:
# Re-shape the flat per-(code, day) records into
# {station code: {day: {component: mean}}}.
out_dict = {}
for record in grp.values():
    code = record['code']
    day = record['day']
    # Build a filtered copy instead of popping the keys off the record:
    # mutating `grp` made this cell fail with KeyError('code') on a re-run.
    means = {key: val for key, val in record.items() if key not in ('code', 'day')}
    out_dict.setdefault(code, {})[day] = means

In [None]:
def norm(v):
    """Return ``v`` when it is strictly positive, otherwise ``None``.

    Anything for which ``v > 0`` is false (zero, negatives, NaN) maps to ``None``.
    """
    return v if v > 0 else None
    
# Blank out non-positive index values (the legend uses -1 and 0 for
# "not determined" / "incomplete data").
# NOTE(review): this cell has no execution count in the saved notebook, so the
# saved downstream results may not reflect it.
frm['ix'] = frm['ix'].apply(norm)

In [49]:
# Write one file per station: ./data/<station code>.json holding
# {day: {component: mean}}. The ./data directory must already exist.
for station in out_dict:
    # Map missing means (NaN) to None so they serialise as JSON `null`.
    # Doing this on the values -- rather than text-replacing 'NaN' in the
    # serialised string -- cannot corrupt keys or strings that contain "NaN".
    cleaned = {
        day: {comp: (None if pd.isna(val) else val) for comp, val in means.items()}
        for day, means in out_dict[station].items()
    }
    with open('./data/' + station + '.json', 'w', encoding='utf-8') as f:
        json.dump(cleaned, f)

In [50]:
# Most frequent daily value: for every station (grouped by all of its
# descriptive columns) take the mode of `ix` via scipy.stats.mode and store it
# as a float.
# NOTE(review): `stats.mode(x)[0][0]` depends on the shape scipy returns for the
# mode, which changed across scipy releases -- confirm with the installed version.
# NOTE(review): `glob` shadows the name of the stdlib `glob` module (not imported here).
glob = frm.groupby(['state_name', 'region_name', 'code', 'name', 'cls', 'lat', 'lon'], as_index=False)[['ix']].apply(lambda x: float(stats.mode(x)[0][0]))

In [51]:
# Move the index produced by the groupby/apply step back into ordinary columns.
# NOTE: not idempotent -- running this cell twice adds an extra index column.
glob = glob.reset_index()

In [52]:
# Keep only the first row seen for each station code.
glob = glob.drop_duplicates(subset=['code'])

In [53]:
# The aggregated value arrives in a column labelled 0; give it its real name.
glob = glob.rename(columns={0: 'ix'})

In [54]:
# Export the station catalogue keyed by station code, keeping only stations
# whose most frequent index value is positive.
(
    glob.loc[glob['ix'] > 0]
    .set_index('code')
    .to_json('./data/stations.json', orient='index')
)