## Constructing Structured TDA Model Data

In [10]:
from model_selection import TDAModelData

import numpy as np

from pprint import pprint

def rand_dgms(n=10, min_pairs=20, max_pairs=50):
    """Generate a list of random persistence diagrams in birth-death form.

    Parameters
    ----------
    n : int
        Number of diagrams to generate.
    min_pairs : int
        Smallest number of (birth, death) pairs a diagram may contain.
    max_pairs : int
        Exclusive upper bound on the number of pairs per diagram
        (``np.random.randint`` never returns its ``high`` argument).

    Returns
    -------
    list of numpy.ndarray
        ``n`` arrays of shape ``(k, 2)``; column 0 is the birth value and
        column 1 is the death value, with death >= birth.
    """
    diagrams = []
    for _ in range(n):
        # Draw the pair count first, then the coordinates, so the global
        # NumPy random stream is consumed in a fixed order per diagram.
        num_pairs = np.random.randint(min_pairs, max_pairs)
        diagram = np.random.rand(num_pairs, 2)
        # Column 1 starts as a non-negative lifetime; adding the birth value
        # turns it into a death value that is never below the birth.
        diagram[:, 1] += diagram[:, 0]
        diagrams.append(diagram)
    return diagrams

def rand_features(n=10, num_features=20):
    """Return a random feature matrix of shape ``(n, num_features)``.

    Parameters
    ----------
    n : int
        Number of samples (rows).
    num_features : int
        Number of features per sample (columns).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, num_features)`` drawn with ``np.random.rand``.
    """
    shape = (n, num_features)
    return np.random.rand(*shape)

def rand_targets(n=10, classification=False):
    """Return a length-``n`` vector of random target values.

    Parameters
    ----------
    n : int
        Number of samples.
    classification : bool
        If true, draw binary labels from ``{0, 1}``; otherwise draw
        continuous values with ``np.random.rand``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n,)``.
    """
    if classification:
        # Binary class labels, sampled with replacement.
        return np.random.choice([0,1], size=(n,), replace=True)
    # Continuous regression-style targets.
    return np.random.rand(n)
#### TDAModelData() provides simple storage of data samples, features, persistence diagrams, and target values for use with model hyperparameter optimization. 

In [32]:
# Random feature matrix: a (10, 5) NumPy array, one row per sample.
features = rand_features(n=10, num_features=5)

# Random binary class labels, one per sample.
targets = rand_targets(n=10, classification=True)

# Create the container once, then attach the features and the targets.
# (An earlier duplicate instantiation was dropped: its result was overwritten
# before it was ever used.)
tdata = TDAModelData()
tdata.add_features(features)
tdata.add_data(targets=targets)

In [31]:
# Diagrams can be registered for several homological dimensions in one call
# (a dict keyed by dimension) or for a single dimension (a list plus ``dim``).
tdata = TDAModelData()

pers_dgms = {
    dim: rand_dgms(n=10, min_pairs=3, max_pairs=10)
    for dim in (0, 1)
}
tdata.add_diagrams(pers_dgms)
tdata.add_diagrams(
    rand_dgms(n=10, min_pairs=3, max_pairs=10),
    dim=2,
)

# Display the stored diagrams.
tdata.pers_dgms

{0: [array([[0.46274659, 0.58101372],
         [0.44176286, 0.85657043],
         [0.32032551, 0.60691403],
         [0.86786461, 1.01441894],
         [0.38599122, 0.97639172],
         [0.19505765, 0.71042272],
         [0.78375602, 1.75519248]]),
  array([[0.64583576, 1.42757714],
         [0.32473638, 0.56075271],
         [0.77939709, 0.99724269]]),
  array([[0.76158212, 0.90908276],
         [0.19434078, 1.09368717],
         [0.58474618, 0.66704516],
         [0.30045828, 0.4869861 ],
         [0.89588426, 1.63497834],
         [0.78541599, 1.73956136],
         [0.76426566, 0.90337783],
         [0.09624634, 0.64592247],
         [0.88575066, 1.71600533]]),
  array([[0.47411612, 1.10493522],
         [0.46007052, 1.17058741],
         [0.24497332, 0.26553821],
         [0.11562964, 0.87424315],
         [0.55400314, 1.27323142],
         [0.65593714, 1.27908897],
         [0.55539799, 0.65493069],
         [0.98619232, 1.09180693]]),
  array([[0.11025656, 0.240153  ],
         

#### The class validates data types and shapes to ensure downstream methods work seamlessly, and reports informative errors and warnings.
* All data must be of the same length, corresponding to the number of data samples. 
* Persistence diagrams can be a dictionary keyed by homological dimension, with each value an iterable collection of (*,2) numpy arrays.
* Features must be an (N,k) numpy array, where N is the number of samples, k is the number of features.
* Targets must be an (N,) numpy vector, where N is the number of samples. 
* Data may be an iterable collection of arbitrary data.

In [33]:
# Sample counts must agree across dimensions: dimension 1 receives 10
# diagrams and dimension 2 receives 11, so the second call raises.
tdata = TDAModelData()

tdata.add_diagrams(
    rand_dgms(n=10, min_pairs=3, max_pairs=10),
    dim=1,
)
tdata.add_diagrams(
    rand_dgms(n=11, min_pairs=3, max_pairs=10),
    dim=2,
)

ValueError: All data inputs must be iterables of the same size. Expected 10 diagrams.

In [34]:
# Register diagrams for dimension 1 twice on the same container.
# NOTE(review): presumably this shows how re-adding an existing dimension is
# handled (overwrite and/or warning) -- no output is recorded; confirm.
tdata = TDAModelData()

tdata.add_diagrams(
    rand_dgms(n=10, min_pairs=3, max_pairs=10),
    dim=1,
)
tdata.add_diagrams(
    rand_dgms(n=10, min_pairs=3, max_pairs=10),
    dim=1,
)

In [35]:
# Persistence diagrams for 10 samples in homological dimensions 0 and 1.
pers_dgms = {
    dim: rand_dgms(n=10, min_pairs=20, max_pairs=50)
    for dim in (0, 1)
}

# Feature matrix (NumPy array) with 11 rows -- one more than the other inputs.
features = rand_features(n=11, num_features=5)

# Binary class labels for 10 samples.
targets = rand_targets(n=10, classification=True)

# All inputs are handed to the constructor at once; the sample counts differ,
# so construction fails with a size-mismatch error.
tdata = TDAModelData(
    pers_dgms=pers_dgms,
    targets=targets,
    features=features,
)

ValueError: All data inputs must be iterables of the same size.  Expected 10 feature vectors.

In [36]:
# Persistence diagrams for 10 samples in homological dimensions 0 and 1.
pers_dgms = {
    dim: rand_dgms(n=10, min_pairs=20, max_pairs=50)
    for dim in (0, 1)
}

# Feature matrix (NumPy array) with 10 rows.
features = rand_features(n=10, num_features=5)

# Binary class labels for 11 samples -- one more than the other inputs.
targets = rand_targets(n=11, classification=True)

# All inputs are handed to the constructor at once; the sample counts differ,
# so construction fails with a size-mismatch error.
tdata = TDAModelData(
    pers_dgms=pers_dgms,
    targets=targets,
    features=features,
)

ValueError: All data inputs must be iterables of the same size.  Expected 11 feature vectors.

In [38]:
pers_dgms = rand_dgms(n=10, min_pairs=2, max_pairs=10)

# Rewrite every diagram in place from (birth, death) to (birth, persistence)
# by subtracting the birth column from the death column.
for pers_dgm in pers_dgms:
    pers_dgm[:, 1] -= pers_dgm[:, 0]

# A plain list of diagrams is passed together with its homological dimension.
tdata = TDAModelData(pers_dgms=pers_dgms, dim=1)

ValueError: Persistence diagrams are expected to be in birth-death coordinates.