# read example raw data from package and store to hdf5 file

The following provides a minimal working example of how to use `DataLoader.read_flat_files` to read raw data from the filesystem and store it in an HDF5 file.

### to read in data and store it via the command line, one can use:

```commandline
python -m GPSat.read_and_store /path/to/config.json
```

Ensure an appropriate Python environment is activated, e.g. one satisfying `requirements.txt`.

## package import

In [1]:
# Add the parent of the current working directory to sys.path
# (presumably so the GPSat package can be imported when the notebook is run
# from a sub-directory of the repository - confirm against the repo layout).
import sys
import os

parent_dir = os.path.dirname(os.getcwd())
# guard so that re-running this cell does not keep appending duplicate entries
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
import datetime
import os
import json
import pandas as pd

from GPSat import get_data_path, get_config_path
from GPSat.dataloader import DataLoader

# let pandas display up to 200 columns before truncating wide dataframes
max_display_columns = 200
pd.set_option("display.max_columns", max_display_columns)

In [3]:
import re

# NOTE: this cell only reads the HDF5 file, so the file must already exist
# (it looks like the same file that the "store as hdf5" section below writes -
# confirm, and run that section first on a fresh checkout).
store_path = get_data_path("example", "ABC.h5")

# open read-only so that inspecting the store cannot modify it
with pd.HDFStore(store_path, mode="r") as store:
    # keys are absolute path-names, i.e. they carry a leading "/"
    all_keys = store.keys()
    # load every stored object into memory, keyed by its name without the
    # leading "/"; only the leading slash is stripped so that nested keys
    # (e.g. "/a/b" -> "a/b") stay distinct instead of collapsing to "ab"
    dfs = {re.sub("^/", "", k): store.get(k) for k in all_keys}
    

In [4]:
# display the object that was stored under the 'data' key
dfs['data']

Unnamed: 0,lon,lat,datetime,source,obs
0,59.944790,82.061122,2020-03-01 13:48:50,C,-0.0401
1,59.939555,82.063771,2020-03-01 13:48:50,C,-0.0861
2,59.934316,82.066419,2020-03-01 13:48:50,C,0.0648
3,59.929074,82.069068,2020-03-01 13:48:50,C,0.0516
4,59.923829,82.071716,2020-03-01 13:48:50,C,0.0222
...,...,...,...,...,...
312329,-114.353301,81.370738,2020-03-09 23:59:54,A,0.0640
312330,-114.412538,81.371724,2020-03-09 23:59:56,A,0.1662
312331,-114.432287,81.372051,2020-03-09 23:59:57,A,0.2857
312332,-114.452037,81.372377,2020-03-09 23:59:58,A,0.0000


## read config and specify directory containing data

In [5]:
# directory holding the (example) data - in package
data_dir = get_data_path("example")

# load the JSON configuration describing how the raw data should be read
config_file = get_config_path("example_read_and_store_raw_data.json")
with open(config_file, "r") as f:
    config = json.load(f)

# redirect both the input and the output directory to the package data directory
config['file_dirs'] = data_dir
config['output']['dir'] = data_dir

print(
    "reading raw data and storing to hdf file using config:",
    json.dumps(config, indent=4),
    sep="\n",
)

# split the output settings off from the config;
# the remaining entries are what gets passed to DataLoader.read_flat_files below
output_dict = config.pop("output", None)


reading raw data and storing to hdf file using config:
{
    "output": {
        "dir": "/home/so/Documents/Projects/GPSat/data/example",
        "file": "ABC.h5",
        "table": "data",
        "append": false
    },
    "file_dirs": "/home/so/Documents/Projects/GPSat/data/example",
    "file_regex": ".*_RAW.csv$",
    "sub_dirs": null,
    "read_csv_kwargs": {},
    "col_funcs": {
        "source": {
            "func": "lambda x: re.sub('_RAW.*$', '', os.path.basename(x))",
            "filename_as_arg": true
        },
        "datetime": {
            "func": "lambda x: x.astype('datetime64[s]')",
            "col_args": "datetime"
        },
        "obs": {
            "func": "lambda x: x-0.1",
            "col_args": [
                "z"
            ]
        }
    },
    "row_select": [
        {
            "func": "lambda x: x>=65",
            "col_kwargs": {
                "x": "lat"
            }
        }
    ],
    "col_select": [
        "lon",
        "lat",
    

## read in data, select rows and columns, combine into a single dataframe


In [6]:
# read the raw files as specified by the config (unpacked as keyword arguments)
df = DataLoader.read_flat_files(**config)

print("read in raw data, looks like:")
# preview the first rows - head() defaults to 5 rows
df.head()


----------------------------------------------------------------------------------------------------
reading files from:
/home/so/Documents/Projects/GPSat/data/example/
read in raw data, looks like:


Unnamed: 0,lon,lat,datetime,source,obs
0,59.94479,82.061122,2020-03-01 13:48:50,C,-0.0401
1,59.939555,82.063771,2020-03-01 13:48:50,C,-0.0861
2,59.934316,82.066419,2020-03-01 13:48:50,C,0.0648
3,59.929074,82.069068,2020-03-01 13:48:50,C,0.0516
4,59.923829,82.071716,2020-03-01 13:48:50,C,0.0222


In [7]:
# display the last 3 rows of the dataframe that was just read in
df.tail(3)

Unnamed: 0,lon,lat,datetime,source,obs
349604,-150.798101,79.445092,2020-03-10 23:59:57,A,0.0239
349605,-150.811408,79.443378,2020-03-10 23:59:58,A,-0.0429
349606,-150.824713,79.441664,2020-03-10 23:59:59,A,-0.0526


## store as hdf5

In [8]:
# run information is written together with the data - for auditing / future reference
# - DataLoader.get_run_info() can provide it (including some details from git):
# run_info = DataLoader.get_run_info()
# - here a custom one, holding only the current time, is provided instead
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
run_info = {"run_time": timestamp}

# output dir, file, table name and whether to append or not - taken from the config
output_dir = output_dict['dir']
out_file = output_dict['file']
table = output_dict['table']
append = output_dict.get("append", False)

print("writing to hdf5 file")

# open in append mode only when requested, otherwise in write mode
store_mode = 'a' if append else 'w'
hdf_path = os.path.join(output_dir, out_file)

with pd.HDFStore(path=hdf_path, mode=store_mode) as store:
    DataLoader.write_to_hdf(
        df,
        table=table,
        append=append,
        store=store,
        config=config,
        run_info=run_info,
    )



writing to hdf5 file


In [9]:
# marker printed once execution reaches the end of the notebook
print("complete")

complete
