# L1 Stats Generator
Gets the total number of unique date stamps in all L1 data for a given station within a given time period.

The first cell lists all ocean stations. You can use this to get the ID of the station you want for the second part, where you choose the station and time period.

In [None]:
from icoscp.station import station
from icoscp.sparql.runsparql import RunSparql
from icoscp.cpb.dobj import Dobj
import pandas as pd

def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while updating a notebook progress widget.

    Args:
        sequence: Iterable whose items are passed through unchanged.
        every: Refresh the widget on the first item and then on every
            ``every``-th item. When omitted and the size is known it
            defaults to 1 for up to 200 items, otherwise ``int(size / 200)``.
            It must be supplied when the size cannot be determined.
        size: Total number of items. Taken from ``len(sequence)`` when
            omitted and the sequence supports ``len``.
        name: Text shown in front of the counter in the label.

    Yields:
        Each item of ``sequence``, in order.

    The bar style is set to 'success' once the sequence is exhausted and to
    'danger' if anything is raised while iterating; the exception is
    re-raised in that case.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Try to learn the total; a sequence without len() is treated as an
    # open-ended iterator.
    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        # Small inputs refresh on every item, larger ones about every 0.5%.
        every = 1 if size <= 200 else int(size / 200)

    if is_iterator:
        # Unknown total: show a full bar in the 'info' style.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    display(VBox(children=[label, progress]))

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            refresh_due = index == 1 or index % every == 0
            if refresh_due:
                if is_iterator:
                    label.value = f'{name}: {index} / ?'
                else:
                    progress.value = index
                    label.value = f'{name}: {index} / {size}'
            yield record
    except BaseException:
        # Same reach as a bare ``except``: flag the bar, then propagate.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        # An index of 0 (nothing iterated) is shown as '?'.
        label.value = f"{name}: {index or '?'}"
        
        
# Print one line per station in the 'OS' theme (ID, name, country) so the
# ID needed for the next cell can be looked up.
for icos_station in station.getList(theme=['OS']):
    print(f'{icos_station.stationId} {icos_station.name} {icos_station.country}')

## Now choose your station and timespan

In [None]:
# ID of the station to analyse (as printed by the station listing above).
station_id = 'SD1053'
# Time period of interest; these strings are inserted into the SPARQL
# filter below as xsd:dateTime literals.
start_date = '2020-01-01T00:00:00.000Z'
end_date = '2020-12-31T23:59:59.000Z'

# Look up the station object for the chosen ID.
icos_station = station.get(station_id)
# Bare expression: in a notebook this shows the station's uri value as the
# cell output (the query below uses its first element).
icos_station.uri

In [None]:
# Collects the URIs of every L1 data object to inspect: the latest version of
# each matching dataset plus all of its older versions.
all_datasets = []

# First we find the 'main' datasets, that are the latest versions.
# The query selects L1 product objects acquired by the chosen station whose
# time coverage overlaps [start_date, end_date] and which have no newer
# version. The xsd prefix is declared explicitly because the date filter uses
# xsd:dateTime literals; without it the query relies on the endpoint
# supplying a default namespace.
l1_sparql = (
    'prefix cpmeta: <http://meta.icos-cp.eu/ontologies/cpmeta/>\n'
    'prefix prov: <http://www.w3.org/ns/prov#>\n'
    'prefix xsd: <http://www.w3.org/2001/XMLSchema#>\n'
    'select ?dobj\n'
    'where {'
    'VALUES ?spec {<http://meta.icos-cp.eu/resources/cpmeta/icosOtcL1Product_v2>}\n'
    '?dobj cpmeta:hasObjectSpec ?spec .\n'
    f'VALUES ?station {{<{icos_station.uri[0]}>}}\n'
    '?dobj cpmeta:wasAcquiredBy/prov:wasAssociatedWith ?station .\n'
    '?dobj cpmeta:hasStartTime | (cpmeta:wasAcquiredBy / prov:startedAtTime) ?timeStart .\n'
    '?dobj cpmeta:hasEndTime | (cpmeta:wasAcquiredBy / prov:endedAtTime) ?timeEnd .\n'
    'FILTER NOT EXISTS {[] cpmeta:isNextVersionOf ?dobj}\n'
    f'FILTER( !(?timeStart > "{end_date}"^^xsd:dateTime || ?timeEnd < "{start_date}"^^xsd:dateTime))\n'
    '}'
)

l1_query = RunSparql(sparql_query=l1_sparql, output_format='pandas')
l1_query.run()
main_datasets = l1_query.data()

for _, row in main_datasets.iterrows():
    all_datasets.append(row['dobj'])

    # Follow the isNextVersionOf chain (one or more steps) to pick up every
    # older version of this dataset.
    old_sparql = (
        'prefix cpmeta: <http://meta.icos-cp.eu/ontologies/cpmeta/>\n'
        'prefix prov: <http://www.w3.org/ns/prov#>\n'
        'select ?older where {\n'
        f'bind (<{row["dobj"]}> as ?dobj)\n'
        '?dobj cpmeta:isNextVersionOf/cpmeta:isNextVersionOf* ?older .'
        '}'
    )

    old_query = RunSparql(sparql_query=old_sparql, output_format='pandas')
    old_query.run()
    old_datasets = old_query.data()
    for _, old_row in old_datasets.iterrows():
        all_datasets.append(old_row['older'])

print(f'Found {len(all_datasets)} datasets')

In [None]:
# Count the distinct TIMESTAMP values across every dataset found above.
# Each dataset is reduced to its unique timestamps as soon as it is loaded,
# and the pieces are combined once at the end. This avoids seeding the
# result with a dtype-less empty Series and re-deduplicating the whole
# accumulated series on every iteration.
timestamp_chunks = []

for dataset in log_progress(all_datasets, every=1, size=len(all_datasets), name='Calculating'):
    dobj = Dobj(dataset)
    timestamp_chunks.append(pd.Series(dobj.data['TIMESTAMP'].unique()))

if timestamp_chunks:
    all_times = pd.Series(pd.concat(timestamp_chunks, ignore_index=True).unique())
else:
    # pd.concat rejects an empty list; no datasets means no timestamps.
    all_times = pd.Series(dtype=object)

print(len(all_times))