# Read example raw data from the package and store it in an HDF5 file

The following provides a minimal working example of how to use `DataLoader.read_flat_files` to read raw data from the filesystem and `DataLoader.write_to_hdf` to store the result in an HDF5 file.

### To read in data and store it from the command line, use:

```commandline
python -m PyOptimalInterpolation.read_and_store /path/to/config.json
```

Ensure an appropriate Python environment is activated, e.g. one satisfying `requirements.txt`.

## package import

In [11]:
# Add the parent of the current working directory to sys.path
# (presumably so PyOptimalInterpolation can be imported without being installed - confirm).
import sys
import os

parent_dir = os.path.dirname(os.getcwd())
# guard: re-running this cell must not pile up duplicate sys.path entries
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [13]:
import datetime
import os
import json
import pandas as pd

from PyOptimalInterpolation import get_data_path, get_config_path
from PyOptimalInterpolation.dataloader import DataLoader

# allow up to 200 columns to be shown when displaying wide DataFrames
pd.options.display.max_columns = 200

## read config and specify directory containing data

In [14]:
# locations resolved through the package helpers:
# - the "example" data directory
# - the json configuration used to read the raw data
data_dir = get_data_path("example")
config_file = get_config_path("example_read_and_store_raw_data.json")

with open(config_file, "r") as config_fh:
    config = json.load(config_fh)

# override the directories from the config file: use the package data directory
# both as the output directory and as the directory the files are read from
config["output"]["dir"] = data_dir
config["file_dirs"] = data_dir

print("reading raw data and storing to hdf file using config:")
pretty_config = json.dumps(config, indent=4)
print(pretty_config)

# separate the output settings from the config; the remaining entries are
# what is passed on to DataLoader.read_flat_files
output_dict = config.pop("output", None)


reading raw data and storing to hdf file using config:
{
    "output": {
        "dir": "/home/buddy/workspace/sparse_opt_interp/PyOptimalInterpolation/data/example",
        "file": "ABC.h5",
        "table": "data",
        "append": false
    },
    "file_dirs": "/home/buddy/workspace/sparse_opt_interp/PyOptimalInterpolation/data/example",
    "file_regex": "\\.csv$",
    "sub_dirs": null,
    "read_csv_kwargs": {},
    "col_funcs": {
        "source": {
            "func": "lambda x: re.sub('_RAW.*$', '', os.path.basename(x))",
            "filename_as_arg": true
        },
        "datetime": {
            "func": "lambda x: x.astype('datetime64[s]')",
            "col_args": "datetime"
        },
        "obs": {
            "func": "lambda x,y: x-y",
            "col_args": [
                "z",
                "z_mean"
            ]
        }
    },
    "row_select": [
        {
            "func": "lambda x: x>=65",
            "col_kwargs": {
                "x": "lat"
     

## read in data, select rows and columns, combine into a single dataframe


In [15]:
# the entries left in config are unpacked as keyword arguments of the reader
df = DataLoader.read_flat_files(**config)

print("read in raw data, looks like:")
# last expression of the cell: display the first five rows
df.head()


----------------------------------------------------------------------------------------------------
reading files from:
/home/buddy/workspace/sparse_opt_interp/PyOptimalInterpolation/data/example/
read in raw data, looks like:


Unnamed: 0,lon,lat,datetime,source,obs
420,-61.821182,65.594139,2020-02-10 00:22:20,A,0.2261
421,-61.847626,65.620234,2020-02-10 00:22:23,A,0.2144
422,-61.851806,65.624354,2020-02-10 00:22:24,A,0.09765
423,-61.878307,65.650444,2020-02-10 00:22:27,A,0.2098
424,-61.88529,65.65731,2020-02-10 00:22:28,A,0.2136


## store as hdf5

In [16]:
# run information written alongside the data - for auditing / future reference
# - either let the package collect it (including some details from git):
# run_info = DataLoader.get_run_info()
# - or provide a custom one, as done here: the time this cell was run
run_info = {"run_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}

# output settings popped from the config: directory, file, table name
# and whether to append (defaults to False when not given)
output_dir = output_dict["dir"]
out_file = output_dict["file"]
table = output_dict["table"]
append = output_dict.get("append", False)

print("writing to hdf5 file")
hdf_path = os.path.join(output_dir, out_file)
# open in append mode only when appending was requested, otherwise in write mode
store_mode = "a" if append else "w"

with pd.HDFStore(path=hdf_path, mode=store_mode) as store:
    DataLoader.write_to_hdf(
        df,
        table=table,
        append=append,
        store=store,
        config=config,
        run_info=run_info,
    )



writing to hdf5 file


In [None]:
# end-of-notebook marker: printed once this final cell is executed
print("complete")