In [1]:
import pandas as pd
import pathlib
import json

# run these uploads locally
serv = "local"

# The path and the parsed settings get distinct names, so `configs` only ever
# refers to the parsed dict that later cells index into.
configs_path = pathlib.Path("/database/configs.json")
# JSON is UTF-8 by specification; do not rely on the platform default encoding.
with open(configs_path, encoding="utf-8") as read_in:
    configs = json.load(read_in)

# setting local store path
configs[serv][serv]["database"] = str(pathlib.Path("/database/local_store.db"))

In [2]:
import os
import json
import shutil
import numpy as np
from orator import DatabaseManager
from modelingdbtools import query
from modelingdbtools import ingest
from modelingdbtools.utils import admin
from modelingdbtools.schemas import modeling

# open a connection using the settings stored under the selected server key
connection_settings = configs[serv]
db = DatabaseManager(connection_settings)

# build the tables first, then fill them with their basic data
modeling.create_schema(db)
modeling.add_schema_data(db)

Created table: SourceType
Created table: User
Created table: Iota
Created table: Dataset
Created table: IotaDatasetJunction
Created table: Run


In [3]:
# create test upload dataset
fp_ex = pathlib.Path("/active/fp_example/")
# exist_ok=True makes this safe to re-run and removes the gap between
# "does it exist?" and "create it" that the separate check left open
fp_ex.mkdir(parents=True, exist_ok=True)

# rows are collected under their own name; `test` is only ever the DataFrame
records = []
for i in range(10):
    # every row points at a small json file written to disk
    fp = fp_ex / (str(i) + ".json")
    with open(fp, "w") as write_out:
        json.dump({"hello": "world"}, write_out)

    # one column per kind of value; the random values differ from run to run
    records.append({
        "strings": "foo" + str(i),
        "bools": np.random.rand() < 0.5,
        "floats": np.random.rand() * 100,
        "ndarrays": np.random.rand(2),
        "files": str(fp),
    })

test = pd.DataFrame(records)
# also keep a csv copy next to the json files
test.to_csv(fp_ex / "example.csv")
test

Unnamed: 0,bools,files,floats,ndarrays,strings
0,False,/active/fp_example/0.json,86.473325,"[0.4346217302148405, 0.07436777024984176]",foo0
1,True,/active/fp_example/1.json,19.050157,"[0.9553036338793727, 0.41698016500337876]",foo1
2,True,/active/fp_example/2.json,33.310051,"[0.05526673926072756, 0.5194397668124452]",foo2
3,True,/active/fp_example/3.json,38.178778,"[0.7879324808942262, 0.05626024117071371]",foo3
4,False,/active/fp_example/4.json,34.431737,"[0.9061339391396931, 0.6837142961472406]",foo4
5,False,/active/fp_example/5.json,45.492576,"[0.31394108143052835, 0.3211381823332322]",foo5
6,False,/active/fp_example/6.json,20.439619,"[0.7540712618233792, 0.6004670320872274]",foo6
7,True,/active/fp_example/7.json,83.179313,"[0.5943230916832485, 0.02427039522202612]",foo7
8,True,/active/fp_example/8.json,89.035621,"[0.8602185719803257, 0.4404365604138215]",foo8
9,True,/active/fp_example/9.json,46.609603,"[0.021081394134853437, 0.1668892508329306]",foo9


In [4]:
# Upload the in-memory test dataset to the connected db:
#   - type_map pairs each column with a python class/object
#   - upload_files=False indicates not to upload the files to the fms
#   - filepath_columns indicates which columns hold files to check for existence
column_types = {
    "bools": bool,
    "files": str,
    "floats": float,
    "ndarrays": np.ndarray,
    "strings": str,
}
ds_info = ingest.upload_dataset(
    database=db,
    dataset=test,
    name="test_dataset",
    description="this is the hello world of dataset ingestion",
    type_map=column_types,
    upload_files=False,
    filepath_columns=["files"],
)
ds_info

Unnamed: 0,DatasetId,Description,Name
0,1,this is the hello world of dataset ingestion,test_dataset


If you want to validate a dataframe without uploading the dataset to the database, you can use:

```
from modelingdbtools.utils import checks
checks.validate_dataset(dataset, type_map, filepath_columns)
```

In [5]:
# read the id out of the latest upload info, then fetch that dataset back
uploaded_id = ds_info["DatasetId"][0]
query.get_dataset(db, id=uploaded_id)

Unnamed: 0,bools,bools(Type),files,files(Type),floats,floats(Type),ndarrays,ndarrays(Type),strings,strings(Type)
0,False,<class 'bool'>,/active/fp_example/0.json,<class 'str'>,86.47332532423178,<class 'float'>,[0.43462173 0.07436777],<class 'numpy.ndarray'>,foo0,<class 'str'>
1,True,<class 'bool'>,/active/fp_example/1.json,<class 'str'>,19.050157056763418,<class 'float'>,[0.95530363 0.41698017],<class 'numpy.ndarray'>,foo1,<class 'str'>
2,True,<class 'bool'>,/active/fp_example/2.json,<class 'str'>,33.310050525188295,<class 'float'>,[0.05526674 0.51943977],<class 'numpy.ndarray'>,foo2,<class 'str'>
3,True,<class 'bool'>,/active/fp_example/3.json,<class 'str'>,38.17877843397457,<class 'float'>,[0.78793248 0.05626024],<class 'numpy.ndarray'>,foo3,<class 'str'>
4,False,<class 'bool'>,/active/fp_example/4.json,<class 'str'>,34.43173656426038,<class 'float'>,[0.90613394 0.6837143 ],<class 'numpy.ndarray'>,foo4,<class 'str'>
5,False,<class 'bool'>,/active/fp_example/5.json,<class 'str'>,45.492576137824194,<class 'float'>,[0.31394108 0.32113818],<class 'numpy.ndarray'>,foo5,<class 'str'>
6,False,<class 'bool'>,/active/fp_example/6.json,<class 'str'>,20.439619209731006,<class 'float'>,[0.75407126 0.60046703],<class 'numpy.ndarray'>,foo6,<class 'str'>
7,True,<class 'bool'>,/active/fp_example/7.json,<class 'str'>,83.1793132080237,<class 'float'>,[0.59432309 0.0242704 ],<class 'numpy.ndarray'>,foo7,<class 'str'>
8,True,<class 'bool'>,/active/fp_example/8.json,<class 'str'>,89.03562104490933,<class 'float'>,[0.86021857 0.44043656],<class 'numpy.ndarray'>,foo8,<class 'str'>
9,True,<class 'bool'>,/active/fp_example/9.json,<class 'str'>,46.60960274262774,<class 'float'>,[0.02108139 0.16688925],<class 'numpy.ndarray'>,foo9,<class 'str'>


In [6]:
# A csv filepath can be passed as the dataset as well.
# Typing may be lost this way, so the ndarrays column is mapped to str here.
csv_path = fp_ex / "example.csv"
csv_column_types = {
    "bools": bool,
    "files": str,
    "floats": float,
    "ndarrays": str,
    "strings": str,
}
ds_info = ingest.upload_dataset(
    database=db,
    dataset=csv_path,
    description="this is the hello world of dataset ingestion",
    type_map=csv_column_types,
    upload_files=False,
    filepath_columns=["files"],
)
ds_info

Unnamed: 0,DatasetId,Description,Name
0,2,this is the hello world of dataset ingestion,/active/fp_example/example.csv@@2018-06-12 21:...


We didn't specify a name for the dataset, so the name is set to "{full_path}@@{upload_datetime}".

In [7]:
# the bare minimum for an upload: a database connection and the dataset itself
minimal_upload_args = {"database": db, "dataset": test}
ds_info = ingest.upload_dataset(**minimal_upload_args)
ds_info

Unnamed: 0,DatasetId,Description,Name
0,3,,jacksonb@@2018-06-12 21:58:59.595345


In [8]:
# read the id out of the latest upload info, then fetch that dataset back
uploaded_id = ds_info["DatasetId"][0]
query.get_dataset(db, id=uploaded_id)

Unnamed: 0,bools,bools(Type),files,files(Type),floats,floats(Type),ndarrays,ndarrays(Type),strings,strings(Type)
0,False,<class 'str'>,/active/fp_example/0.json,<class 'str'>,86.47332532423178,<class 'str'>,[0.43462173 0.07436777],<class 'str'>,foo0,<class 'str'>
1,True,<class 'str'>,/active/fp_example/1.json,<class 'str'>,19.050157056763418,<class 'str'>,[0.95530363 0.41698017],<class 'str'>,foo1,<class 'str'>
2,True,<class 'str'>,/active/fp_example/2.json,<class 'str'>,33.310050525188295,<class 'str'>,[0.05526674 0.51943977],<class 'str'>,foo2,<class 'str'>
3,True,<class 'str'>,/active/fp_example/3.json,<class 'str'>,38.17877843397457,<class 'str'>,[0.78793248 0.05626024],<class 'str'>,foo3,<class 'str'>
4,False,<class 'str'>,/active/fp_example/4.json,<class 'str'>,34.43173656426038,<class 'str'>,[0.90613394 0.6837143 ],<class 'str'>,foo4,<class 'str'>
5,False,<class 'str'>,/active/fp_example/5.json,<class 'str'>,45.492576137824194,<class 'str'>,[0.31394108 0.32113818],<class 'str'>,foo5,<class 'str'>
6,False,<class 'str'>,/active/fp_example/6.json,<class 'str'>,20.439619209731006,<class 'str'>,[0.75407126 0.60046703],<class 'str'>,foo6,<class 'str'>
7,True,<class 'str'>,/active/fp_example/7.json,<class 'str'>,83.1793132080237,<class 'str'>,[0.59432309 0.0242704 ],<class 'str'>,foo7,<class 'str'>
8,True,<class 'str'>,/active/fp_example/8.json,<class 'str'>,89.03562104490933,<class 'str'>,[0.86021857 0.44043656],<class 'str'>,foo8,<class 'str'>
9,True,<class 'str'>,/active/fp_example/9.json,<class 'str'>,46.60960274262774,<class 'str'>,[0.02108139 0.16688925],<class 'str'>,foo9,<class 'str'>


Notice that in the minimum upload we lose:

1) value typing

2) filepaths are not checked

3) our dataset is given a name that follows the pattern "{uploader}@@{upload_datetime}".

In [9]:
# Dataset versions:
# uploading under a name that conflicts with a previous dataset name
# gets a datetime added to the name.
versioned_column_types = {
    "bools": bool,
    "files": str,
    "floats": float,
    "ndarrays": np.ndarray,
    "strings": str,
}
ds_info = ingest.upload_dataset(
    database=db,
    dataset=test,
    name="test_dataset",
    description="this is the hello world of dataset ingestion",
    type_map=versioned_column_types,
    upload_files=False,
    filepath_columns=["files"],
)
ds_info

A dataset with that name already exists. Adding new version.


Unnamed: 0,DatasetId,Description,Name
0,4,this is the hello world of dataset ingestion,test_dataset@@2018-06-12 21:59:00.287966


In [10]:
# read the id out of the latest upload info, then fetch that dataset back
uploaded_id = ds_info["DatasetId"][0]
query.get_dataset(db, id=uploaded_id)

Unnamed: 0,bools,bools(Type),files,files(Type),floats,floats(Type),ndarrays,ndarrays(Type),strings,strings(Type)
0,False,<class 'bool'>,/active/fp_example/0.json,<class 'str'>,86.47332532423178,<class 'float'>,[0.43462173 0.07436777],<class 'numpy.ndarray'>,foo0,<class 'str'>
1,True,<class 'bool'>,/active/fp_example/1.json,<class 'str'>,19.050157056763418,<class 'float'>,[0.95530363 0.41698017],<class 'numpy.ndarray'>,foo1,<class 'str'>
2,True,<class 'bool'>,/active/fp_example/2.json,<class 'str'>,33.310050525188295,<class 'float'>,[0.05526674 0.51943977],<class 'numpy.ndarray'>,foo2,<class 'str'>
3,True,<class 'bool'>,/active/fp_example/3.json,<class 'str'>,38.17877843397457,<class 'float'>,[0.78793248 0.05626024],<class 'numpy.ndarray'>,foo3,<class 'str'>
4,False,<class 'bool'>,/active/fp_example/4.json,<class 'str'>,34.43173656426038,<class 'float'>,[0.90613394 0.6837143 ],<class 'numpy.ndarray'>,foo4,<class 'str'>
5,False,<class 'bool'>,/active/fp_example/5.json,<class 'str'>,45.492576137824194,<class 'float'>,[0.31394108 0.32113818],<class 'numpy.ndarray'>,foo5,<class 'str'>
6,False,<class 'bool'>,/active/fp_example/6.json,<class 'str'>,20.439619209731006,<class 'float'>,[0.75407126 0.60046703],<class 'numpy.ndarray'>,foo6,<class 'str'>
7,True,<class 'bool'>,/active/fp_example/7.json,<class 'str'>,83.1793132080237,<class 'float'>,[0.59432309 0.0242704 ],<class 'numpy.ndarray'>,foo7,<class 'str'>
8,True,<class 'bool'>,/active/fp_example/8.json,<class 'str'>,89.03562104490933,<class 'float'>,[0.86021857 0.44043656],<class 'numpy.ndarray'>,foo8,<class 'str'>
9,True,<class 'bool'>,/active/fp_example/9.json,<class 'str'>,46.60960274262774,<class 'float'>,[0.02108139 0.16688925],<class 'numpy.ndarray'>,foo9,<class 'str'>


In [11]:
# upload failure
# a dataset will not be uploaded if any data or filepath validation fails
# The TypeError is the point of this cell, so it is caught and shown instead of
# being left to abort the kernel's "Run All" before the following cells execute.
try:
    ds_info = ingest.upload_dataset(database=db,
                                    dataset=test,
                                    name="test_dataset",
                                    description="this is the hello world of dataset ingestion",
                                    # let's change the types in type_map
                                    type_map={"bools": float,
                                              "files": int,
                                              "floats": bool,
                                              "ndarrays": pd.DataFrame,
                                              "strings": type(None)},
                                    upload_files=False,
                                    filepath_columns=["files"])
except TypeError as validation_error:
    # ds_info is left untouched here: nothing was uploaded
    print("{}: {}".format(type(validation_error).__name__, validation_error))
else:
    # validation unexpectedly passed; show what was uploaded
    print(ds_info)

TypeError: 

Allowed types: <class 'float'>
Given type: <class 'bool'>
Given value: False


In [12]:
# let's remove the files and try to upload the dataset
# Guarded so re-running this cell after the directory is already gone does not
# raise from rmtree itself and get mistaken for the upload's filepath check.
if fp_ex.exists():
    shutil.rmtree(fp_ex)

# The FileNotFoundError is the point of this cell, so it is caught and shown
# instead of being left to abort the kernel's "Run All".
try:
    ds_info = ingest.upload_dataset(database=db,
                                    dataset=test,
                                    name="test_dataset",
                                    description="this is the hello world of dataset ingestion",
                                    type_map={"bools": bool,
                                              "files": str,
                                              "floats": float,
                                              "ndarrays": np.ndarray,
                                              "strings": str},
                                    upload_files=False,
                                    filepath_columns=["files"])
except FileNotFoundError as missing_file_error:
    # ds_info is left untouched here: nothing was uploaded
    print("{}: {}".format(type(missing_file_error).__name__, missing_file_error))
else:
    # the filepath check unexpectedly passed; show what was uploaded
    print(ds_info)

FileNotFoundError: 

The provided filepath does not exist.
Given filepath: /active/fp_example/0.json
