In [1]:
import pandas as pd
import pathlib
import json

# run these uploads locally
serv = "local"

# load the database configs from disk
configs_path = pathlib.Path("/database/configs.json")
with configs_path.open() as read_in:
    configs = json.load(read_in)

# setting local store path
local_store = pathlib.Path("/database/local_store.db")
configs[serv][serv]["database"] = str(local_store)

In [2]:
import os
import json
import shutil
import numpy as np
from orator import DatabaseManager
from modelingdbtools import query
from modelingdbtools import ingest
from modelingdbtools.utils import admin
from modelingdbtools.schemas import modeling

# open the database connection from the selected configs entry
connection_config = configs[serv]
db = DatabaseManager(connection_config)

# build the tables first, then fill them with the basic data
modeling.create_schema(db)
modeling.add_schema_data(db)

Created table: SourceType
Created table: User
Created table: Iota
Created table: Dataset
Created table: IotaDatasetJunction
Created table: Run


In [3]:
# create test upload dataset
fp_ex = pathlib.Path("/active/fp_example/")
# exist_ok makes this safe to re-run and avoids the check-then-create race
fp_ex.mkdir(parents=True, exist_ok=True)

# seeded generator so the example rows are identical on every fresh run;
# a private RandomState is used so numpy's global random state is untouched
rng = np.random.RandomState(0)

rows = []
for i in range(10):
    # write a small json file for this row so the "files" column points at
    # something that exists on disk
    fp = fp_ex / (str(i) + ".json")
    with open(fp, "w") as write_out:
        json.dump({"hello": "world"}, write_out)

    # one value per kind of column the uploads below map in type_map
    rows.append({
        "strings": "foo" + str(i),
        "bools": rng.rand() < 0.5,
        "floats": rng.rand() * 100,
        "ndarrays": rng.rand(2),
        "files": str(fp),
    })

# keep the rows in their own name; `test` is only ever the dataframe
test = pd.DataFrame(rows)
test.to_csv(fp_ex / "example.csv")
test

Unnamed: 0,bools,files,floats,ndarrays,strings
0,False,/active/fp_example/0.json,0.237093,"[0.040184217794615895, 0.43931315346741284]",foo0
1,False,/active/fp_example/1.json,63.360574,"[0.5014803009254608, 0.5144443005137713]",foo1
2,True,/active/fp_example/2.json,25.813012,"[0.21221597424933947, 0.714243912182222]",foo2
3,False,/active/fp_example/3.json,59.312217,"[0.6387362022620658, 0.46287444019627344]",foo3
4,False,/active/fp_example/4.json,54.574072,"[0.17528448292442256, 0.0874613588170785]",foo4
5,True,/active/fp_example/5.json,17.487231,"[0.05435066186973536, 0.3089689446669549]",foo5
6,True,/active/fp_example/6.json,86.643585,"[0.5684258324757682, 0.39324996800166057]",foo6
7,True,/active/fp_example/7.json,61.62328,"[0.2093430236975331, 0.9324218473717544]",foo7
8,False,/active/fp_example/8.json,88.55742,"[0.13709566282158192, 0.026085606026251318]",foo8
9,False,/active/fp_example/9.json,35.527336,"[0.9767734485282303, 0.48459324126648684]",foo9


In [4]:
# upload the test dataset to the connected db
#   type_map         -> map each column to a python class/ object
#   upload_files     -> indicate not to upload the files to the fms
#   filepath_columns -> indicate which files should be checked for existence
column_types = {
    "bools": bool,
    "files": str,
    "floats": float,
    "ndarrays": np.ndarray,
    "strings": str,
}
ds_info = ingest.upload_dataset(
    database=db,
    dataset=test,
    name="test_dataset",
    description="this is the hello world of dataset ingestion",
    type_map=column_types,
    upload_files=False,
    filepath_columns=["files"],
)
ds_info

Unnamed: 0,DatasetId,Description,Name
0,1,this is the hello world of dataset ingestion,test_dataset


If you want to validate a dataframe but do not want to upload the dataset to the database you can use:

```
from modelingdbtools.utils import checks
checks.validate_dataset(dataset, type_map, filepath_columns)
```

In [5]:
# look up the dataset we just uploaded by the id reported in ds_info
dataset_id = ds_info["DatasetId"][0]
query.get_dataset(db, id=dataset_id)

Unnamed: 0,bools,bools(Type),files,files(Type),floats,floats(Type),ndarrays,ndarrays(Type),strings,strings(Type)
0,False,<class 'bool'>,/active/fp_example/0.json,<class 'str'>,0.2370927327093097,<class 'float'>,[0.04018422 0.43931315],<class 'numpy.ndarray'>,foo0,<class 'str'>
1,False,<class 'bool'>,/active/fp_example/1.json,<class 'str'>,63.36057442656018,<class 'float'>,[0.5014803 0.5144443],<class 'numpy.ndarray'>,foo1,<class 'str'>
2,True,<class 'bool'>,/active/fp_example/2.json,<class 'str'>,25.81301209388496,<class 'float'>,[0.21221597 0.71424391],<class 'numpy.ndarray'>,foo2,<class 'str'>
3,False,<class 'bool'>,/active/fp_example/3.json,<class 'str'>,59.31221664357135,<class 'float'>,[0.6387362 0.46287444],<class 'numpy.ndarray'>,foo3,<class 'str'>
4,False,<class 'bool'>,/active/fp_example/4.json,<class 'str'>,54.57407236828379,<class 'float'>,[0.17528448 0.08746136],<class 'numpy.ndarray'>,foo4,<class 'str'>
5,True,<class 'bool'>,/active/fp_example/5.json,<class 'str'>,17.487230506206775,<class 'float'>,[0.05435066 0.30896894],<class 'numpy.ndarray'>,foo5,<class 'str'>
6,True,<class 'bool'>,/active/fp_example/6.json,<class 'str'>,86.64358540411634,<class 'float'>,[0.56842583 0.39324997],<class 'numpy.ndarray'>,foo6,<class 'str'>
7,True,<class 'bool'>,/active/fp_example/7.json,<class 'str'>,61.6232804538523,<class 'float'>,[0.20934302 0.93242185],<class 'numpy.ndarray'>,foo7,<class 'str'>
8,False,<class 'bool'>,/active/fp_example/8.json,<class 'str'>,88.55741987424179,<class 'float'>,[0.13709566 0.02608561],<class 'numpy.ndarray'>,foo8,<class 'str'>
9,False,<class 'bool'>,/active/fp_example/9.json,<class 'str'>,35.52733642900678,<class 'float'>,[0.97677345 0.48459324],<class 'numpy.ndarray'>,foo9,<class 'str'>


In [6]:
# a csv filepath can be passed as the dataset in place of a dataframe
# you may lose typing though (ndarrays -> str)
csv_path = fp_ex / "example.csv"
csv_column_types = {
    "bools": bool,
    "files": str,
    "floats": float,
    "ndarrays": str,
    "strings": str,
}
ds_info = ingest.upload_dataset(
    database=db,
    dataset=csv_path,
    description="this is the hello world of dataset ingestion",
    type_map=csv_column_types,
    upload_files=False,
    filepath_columns=["files"],
)
ds_info

Unnamed: 0,DatasetId,Description,Name
0,2,this is the hello world of dataset ingestion,/active/fp_example/example.csv@@2018-06-12 18:...


We didn't specify a name for the dataset, so the name is set to "{full_path}@@{upload_datetime}".

In [7]:
# the smallest possible upload: only the database and the dataset are passed
ds_info = ingest.upload_dataset(
    database=db,
    dataset=test,
)
ds_info

Unnamed: 0,DatasetId,Description,Name
0,3,,jovyan@@2018-06-12 18:54:09.531052


In [8]:
# look up the dataset we just uploaded by the id reported in ds_info
dataset_id = ds_info["DatasetId"][0]
query.get_dataset(db, id=dataset_id)

Unnamed: 0,bools,bools(Type),files,files(Type),floats,floats(Type),ndarrays,ndarrays(Type),strings,strings(Type)
0,False,<class 'str'>,/active/fp_example/0.json,<class 'str'>,0.2370927327093097,<class 'str'>,[0.04018422 0.43931315],<class 'str'>,foo0,<class 'str'>
1,False,<class 'str'>,/active/fp_example/1.json,<class 'str'>,63.36057442656018,<class 'str'>,[0.5014803 0.5144443],<class 'str'>,foo1,<class 'str'>
2,True,<class 'str'>,/active/fp_example/2.json,<class 'str'>,25.81301209388496,<class 'str'>,[0.21221597 0.71424391],<class 'str'>,foo2,<class 'str'>
3,False,<class 'str'>,/active/fp_example/3.json,<class 'str'>,59.31221664357135,<class 'str'>,[0.6387362 0.46287444],<class 'str'>,foo3,<class 'str'>
4,False,<class 'str'>,/active/fp_example/4.json,<class 'str'>,54.57407236828379,<class 'str'>,[0.17528448 0.08746136],<class 'str'>,foo4,<class 'str'>
5,True,<class 'str'>,/active/fp_example/5.json,<class 'str'>,17.487230506206775,<class 'str'>,[0.05435066 0.30896894],<class 'str'>,foo5,<class 'str'>
6,True,<class 'str'>,/active/fp_example/6.json,<class 'str'>,86.64358540411634,<class 'str'>,[0.56842583 0.39324997],<class 'str'>,foo6,<class 'str'>
7,True,<class 'str'>,/active/fp_example/7.json,<class 'str'>,61.6232804538523,<class 'str'>,[0.20934302 0.93242185],<class 'str'>,foo7,<class 'str'>
8,False,<class 'str'>,/active/fp_example/8.json,<class 'str'>,88.55741987424179,<class 'str'>,[0.13709566 0.02608561],<class 'str'>,foo8,<class 'str'>
9,False,<class 'str'>,/active/fp_example/9.json,<class 'str'>,35.52733642900678,<class 'str'>,[0.97677345 0.48459324],<class 'str'>,foo9,<class 'str'>


Notice that in the minimum upload:

1) value typing is lost (every value is stored as `str`)

2) filepaths are not checked

3) our dataset is given a name that follows the pattern of "{uploader}@@{upload_datetime}".

In [9]:
# dataset versions
# if you are to upload a dataset that conflicts with a previous dataset name a datetime is added
# upload the same dataframe again under the name "test_dataset"
column_types = {
    "bools": bool,
    "files": str,
    "floats": float,
    "ndarrays": np.ndarray,
    "strings": str,
}
ds_info = ingest.upload_dataset(
    database=db,
    dataset=test,
    name="test_dataset",
    description="this is the hello world of dataset ingestion",
    type_map=column_types,
    upload_files=False,
    filepath_columns=["files"],
)
ds_info

A dataset with that name already exists. Adding new version.


Unnamed: 0,DatasetId,Description,Name
0,4,this is the hello world of dataset ingestion,test_dataset@@2018-06-12 18:54:10.138733


In [10]:
# look up the dataset we just uploaded by the id reported in ds_info
dataset_id = ds_info["DatasetId"][0]
query.get_dataset(db, id=dataset_id)

Unnamed: 0,bools,bools(Type),files,files(Type),floats,floats(Type),ndarrays,ndarrays(Type),strings,strings(Type)
0,False,<class 'bool'>,/active/fp_example/0.json,<class 'str'>,0.2370927327093097,<class 'float'>,[0.04018422 0.43931315],<class 'numpy.ndarray'>,foo0,<class 'str'>
1,False,<class 'bool'>,/active/fp_example/1.json,<class 'str'>,63.36057442656018,<class 'float'>,[0.5014803 0.5144443],<class 'numpy.ndarray'>,foo1,<class 'str'>
2,True,<class 'bool'>,/active/fp_example/2.json,<class 'str'>,25.81301209388496,<class 'float'>,[0.21221597 0.71424391],<class 'numpy.ndarray'>,foo2,<class 'str'>
3,False,<class 'bool'>,/active/fp_example/3.json,<class 'str'>,59.31221664357135,<class 'float'>,[0.6387362 0.46287444],<class 'numpy.ndarray'>,foo3,<class 'str'>
4,False,<class 'bool'>,/active/fp_example/4.json,<class 'str'>,54.57407236828379,<class 'float'>,[0.17528448 0.08746136],<class 'numpy.ndarray'>,foo4,<class 'str'>
5,True,<class 'bool'>,/active/fp_example/5.json,<class 'str'>,17.487230506206775,<class 'float'>,[0.05435066 0.30896894],<class 'numpy.ndarray'>,foo5,<class 'str'>
6,True,<class 'bool'>,/active/fp_example/6.json,<class 'str'>,86.64358540411634,<class 'float'>,[0.56842583 0.39324997],<class 'numpy.ndarray'>,foo6,<class 'str'>
7,True,<class 'bool'>,/active/fp_example/7.json,<class 'str'>,61.6232804538523,<class 'float'>,[0.20934302 0.93242185],<class 'numpy.ndarray'>,foo7,<class 'str'>
8,False,<class 'bool'>,/active/fp_example/8.json,<class 'str'>,88.55741987424179,<class 'float'>,[0.13709566 0.02608561],<class 'numpy.ndarray'>,foo8,<class 'str'>
9,False,<class 'bool'>,/active/fp_example/9.json,<class 'str'>,35.52733642900678,<class 'float'>,[0.97677345 0.48459324],<class 'numpy.ndarray'>,foo9,<class 'str'>


In [11]:
# upload failure
# a dataset will not be uploaded if any data or filepath validation fails
# let's change the types in type_map so they no longer match the columns
mismatched_types = {
    "bools": float,
    "files": int,
    "floats": bool,
    "ndarrays": pd.DataFrame,
    "strings": type(None),
}

# the TypeError is the point of this cell: catch and print it so that
# "Restart & Run All" keeps going instead of stopping here
try:
    ds_info = ingest.upload_dataset(
        database=db,
        dataset=test,
        name="test_dataset",
        description="this is the hello world of dataset ingestion",
        type_map=mismatched_types,
        upload_files=False,
        filepath_columns=["files"],
    )
except TypeError as validation_error:
    print("TypeError:", validation_error)
else:
    # validation unexpectedly passed; show what was stored
    print(ds_info)

TypeError: 

Allowed types: <class 'float'>
Given type: <class 'bool'>
Given value: False


In [12]:
# let's remove the files and try to upload the dataset
shutil.rmtree(fp_ex)

column_types = {
    "bools": bool,
    "files": str,
    "floats": float,
    "ndarrays": np.ndarray,
    "strings": str,
}

# the FileNotFoundError is the point of this cell: catch and print it
# rather than leaving an uncaught exception in the notebook
try:
    ds_info = ingest.upload_dataset(
        database=db,
        dataset=test,
        name="test_dataset",
        description="this is the hello world of dataset ingestion",
        type_map=column_types,
        upload_files=False,
        filepath_columns=["files"],
    )
except FileNotFoundError as missing_file_error:
    print("FileNotFoundError:", missing_file_error)
else:
    # validation unexpectedly passed; show what was stored
    print(ds_info)

FileNotFoundError: 

The provided filepath does not exist.
Given filepath: /active/fp_example/0.json
