In [None]:
# Install the pinard package with the %pip magic (targets the running kernel's environment).
# NOTE(review): no version is pinned, so a later pinard release may not
# reproduce the snapshot data generated by the cells below.
%pip install pinard

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
## Generate preprocessing test data - Snapshot (22-10-20 - commit:5544c8f119baea5f8a3d99f7ba67c1e914b98b9c)
import numpy as np
import pandas as pd
from sklearn.pipeline import FeatureUnion, Pipeline

from pinard import preprocessor as pp
from pinard.nirs_pipelines import FeatureAugmentation

# Read the source data from a header-less, semicolon-separated CSV and
# convert it to a float32 array.
x_fname = 'test_preprocessing_src.csv'
x_df = pd.read_csv(x_fname, sep=';', header=None)
x = x_df.astype(np.float32).values

# (name, transformer) pairs handed to FeatureAugmentation below; every
# transformer uses its default parameters except Gaussian.
preprocessing = [   ('Id', pp.IdentityTransformer()),
                    ('Baseline', pp.Baseline()),
                    ('StandardNormalVariate', pp.StandardNormalVariate()), 
                    ('RobustNormalVariate', pp.RobustNormalVariate()),
                    ('SavitzkyGolay', pp.SavitzkyGolay()),
                    ('Normalize', pp.Normalize()),
                    ('Detrend', pp.Detrend()),
                    ('MultiplicativeScatterCorrection', pp.MultiplicativeScatterCorrection()),
                    ('Derivate', pp.Derivate()),
                    ('Gaussian', pp.Gaussian(order = 2, sigma = 1)),
                    ('Wavelet', pp.Wavelet()),
                    ('SimpleScale', pp.SimpleScale()),
                ]

# Fit the augmentation pipeline on x and transform it in one step.
# NOTE(review): the swapaxes call below implies a 3-D result; the meaning of
# each axis is defined by pinard and is not visible here - confirm.
pipeline = FeatureAugmentation(preprocessing)
xt = pipeline.fit_transform(x)
# Exchange axes 1 and 2 of the transformed array.
xtt = np.swapaxes(xt, 1, 2)
# NOTE(review): `a` is never read again in this cell - looks like a leftover.
a = pp.baseline(x)[0]
# Join the sub-arrays found along the first axis into a single 2-D array.
xtt = np.concatenate(xtt)
# Keep the first 12 rows only.
# NOTE(review): 12 equals the number of entries in `preprocessing`, so this is
# presumably one row per transformer for the first sample - confirm.
b = xtt[0:12,:]
# Write the expected output and the (float32) input as semicolon-separated files.
np.savetxt("test_preprocessing_validation.csv", b, delimiter=";")
np.savetxt("test_data.csv", x, delimiter=";")

In [1]:
## Generate model selection test data - Snapshot (22-10-20 - commit:5544c8f119baea5f8a3d99f7ba67c1e914b98b9c)
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from pinard import model_selection

x_fname = 'test_split.csv'
x_df = pd.read_csv(x_fname, sep=';', header=None)
x = x_df.astype(np.float32).values
# Column 0 becomes y (as a column vector); the remaining columns become x.
y = np.reshape(x[:,0], (-1,1))
x = x[:,1:]

np.random.seed(42)
random.seed(42)

# One entry per column of the validation file, in output order. Each entry
# holds the keyword arguments that vary between calls; test_size and
# random_state are the same for every split and are supplied in the loop.
split_configs = [
    dict(method="random"),
    dict(method="k_mean", metric="canberra"),
    dict(method="k_mean", pca_components=4, metric="canberra"),
    dict(method="kennard_stone"),
    dict(method="kennard_stone", metric='correlation', pca_components=8),
    dict(method="kennard_stone", metric='correlation'),
    dict(y=y, method="spxy"),
    dict(y=y, method="spxy", pca_components=2),
    dict(y=y, method="spxy", metric='canberra'),
    dict(y=y, method="stratified"),
    dict(y=y, method="stratified", n_bins=4),
    dict(y=y, method="circular"),
    # dict(method="SPlit"),
]

# Run every split and keep its train indices as one column of the output.
train_columns = []
for config in split_configs:
    train_index, test_index = model_selection.train_test_split_idx(
        x, test_size=0.25, random_state=42, **config
    )
    train_columns.append(train_index)

test_data = np.column_stack(train_columns)

np.savetxt("test_split_validation.csv", test_data.astype(np.int32), delimiter=";", fmt='%i')

ImportError: attempted relative import with no known parent package

In [5]:
%load_ext autoreload
%autoreload 2

## Generate augmentation test data - Snapshot (22-10-20 - commit:-----)
import random

import numpy as np
import pandas as pd
from sklearn.pipeline import FeatureUnion

from pinard import augmentation as aug
from pinard.sklearn import SampleAugmentation

x_fname = "test_augmentation.csv"
x_df = pd.read_csv(x_fname, sep=";", header=None)
x = x_df.astype(np.float32).values
y = np.reshape(x[:, 0], (-1, 1))
x = x[:, 1:]

augmentations = [
    ("Id", aug.IdentityAugmenter()),
    (5,"Rotate_Translate", aug.Rotate_Translate(random_state=42, per_sample=False)),
    (3,"Rotate_Translate_per_sample", aug.Rotate_Translate(random_state=42, per_sample=True)),
    ("Random_Y_Shift", aug.Random_Y_Shift(random_state=42, per_sample=False)),
    # ("Random_Y_Shift_per_sample", aug.Random_Y_Shift(random_state=42, per_sample=True)),
    # ("Random_Multiplicative_Shift", aug.Random_Multiplicative_Shift(random_state=42, per_sample=False)),
    # ("Random_Multiplicative_Shift_per_sample", aug.Random_Multiplicative_Shift(random_state=42, per_sample=True)),
    # ("Random_Spline_Addition", aug.Random_Spline_Addition(random_state=42)),
    # ("Random_X_Spline_Deformation", aug.Random_X_Spline_Deformation(random_state=42)),
    # ("Random_X_Spline_Shift", aug.Random_X_Spline_Shift(random_state=42)),
    # ("Monotonous_Spline_Simplification", aug.Monotonous_Spline_Simplification(random_state=42)),
    # ("Dependent_Spline_Simplification", aug.Dependent_Spline_Simplification(random_state=42)),
]


# for augment in augmentations:
#     print(augment[0])
#     augment[1].fit_transform(x,y)

aug = SampleAugmentation(augmentations)
X_train, y_train = aug.transform(x, y)
print(X_train[:,:10])

# print(X_train.shape, y_train.shape)
# print(np.concatenate(y_train))
# print(X_train[2], x[1])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


ValueError: operands could not be broadcast together with shapes (4,234) (4,) 

In [7]:
%load_ext autoreload
%autoreload 2

## Generate augmentation test data - Snapshot (22-10-20 - commit:-----)
import random

import numpy as np
import pandas as pd
from sklearn.pipeline import FeatureUnion

from pinard import augmentation as aug
from pinard.sklearn import SampleAugmentation

import matplotlib.pyplot as plt

x_fname = "test_augmentation.csv"
x_df = pd.read_csv(x_fname, sep=";", header=None)
x = x_df.astype(np.float32).values
y = np.reshape(x[:, 0], (-1, 1))
x = x[:, 1:]


x_axis = np.arange(0,len(x[0]),1)
augmenter = aug.Dependent_Spline_Simplification()
y_vals = augmenter.fit_transform(x)
print(x.shape, y_vals.shape)
plt.plot(x_axis, x[0], color="blue")
plt.plot(x_axis, y_vals, color="red")
plt.show()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
(265.55395736423134, array([[0.00522662, 0.00522662, 0.00522662, ..., 0.00522662, 0.00522662,
        0.00522662],
       [0.00570964, 0.00570964, 0.00570964, ..., 0.00570964, 0.00570964,
        0.00570964],
       [0.01090541, 0.01090541, 0.01090541, ..., 0.01090541, 0.01090541,
        0.01090541],
       ...,
       [0.00384586, 0.00384586, 0.00384586, ..., 0.00384586, 0.00384586,
        0.00384586],
       [0.00381146, 0.00381146, 0.00381146, ..., 0.00381146, 0.00381146,
        0.00381146],
       [0.00380581, 0.00380581, 0.00380581, ..., 0.00380581, 0.00380581,
        0.00380581]]), array([5.22661817e-03, 1.04532363e-02, 1.56798545e-02, ...,
       2.65546346e+02, 2.65550152e+02, 2.65553957e+02])) 234 10
>>> 908.5337543349737
>>> 908.5337543349737
>>> [5.22661817e-03 1.04532363e-02 1.56798545e-02 ... 2.65546346e+02
 2.65550152e+02 2.65553957e+02]


ValueError: zero-size array to reduction operation minimum which has no identity

In [8]:
import numpy as np
import random
import scipy.interpolate as interpolate

def segment_length(x1, y1, x2, y2):
    """Return the Euclidean distance between the points (x1, y1) and (x2, y2)."""
    delta_x = x2 - x1
    delta_y = y2 - y1
    return np.sqrt(delta_x ** 2 + delta_y ** 2)


# Broadcasting, element-wise wrapper around segment_length.
v_segment_length = np.vectorize(segment_length)


def X_length(x, y):
    """Measure the polyline that joins the successive points (x[i], y[i]).

    Parameters
    ----------
    x, y : array-like
        Coordinates of the points. Both are flattened to 1-D float64 arrays
        and must contain the same number of values.

    Returns
    -------
    tuple
        ``(SpecLen, SpecLen_seg, SpecLen_seg_cumsum)`` where ``SpecLen`` is the
        total length, ``SpecLen_seg`` is the 1-D array of the ``n - 1``
        segment lengths and ``SpecLen_seg_cumsum`` is its running sum.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not hold the same number of points.
    """
    x = np.asarray(x, dtype=np.float64).ravel()
    y = np.asarray(y, dtype=np.float64).ravel()
    if x.size != y.size:
        raise ValueError(
            "x and y must have the same number of points, got %d and %d"
            % (x.size, y.size)
        )

    # Segment i joins point i to point i + 1. Both deltas are kept 1-D on
    # purpose: turning the y deltas into a column vector would broadcast them
    # against the x deltas and produce an (n - 1, n - 1) matrix pairing every
    # x delta with every y delta instead of n - 1 segment lengths.
    SpecLen_seg = np.hypot(np.diff(x), np.diff(y))
    SpecLen = np.sum(SpecLen_seg)
    SpecLen_seg_cumsum = np.cumsum(SpecLen_seg)
    return (SpecLen, SpecLen_seg, SpecLen_seg_cumsum)


def segment_pt_coord(x1, y1, x2, y2, fracL, L):
    """Return the point located a fraction ``fracL / L`` of the way from
    (x1, y1) to (x2, y2), as an ``(x, y)`` tuple.
    """
    ratio = fracL / L
    return (
        x1 + ratio * (x2 - x1),
        y1 + ratio * (y2 - y1),
    )


def interval_selection(n_l, CumVect):
    """Bracket the value ``n_l`` inside the vector ``CumVect``.

    Returns a tuple ``(first, last)`` where ``first`` is the smallest index
    whose entry is greater than or equal to ``n_l`` and ``last`` is the
    largest index whose entry is less than or equal to ``n_l``. NumPy raises
    ValueError when either set of indices is empty.
    """
    reached = np.where(CumVect >= n_l)
    not_passed = np.where(CumVect <= n_l)
    first_reached = np.min(reached)
    last_not_passed = np.max(not_passed)
    return (first_reached, last_not_passed)



class Dependent_Spline_Simplification():
    """Simplify a curve by re-drawing it as a cubic spline through a few
    points picked at regular steps of its cumulative length.
    """

    def augment(self, X, apply_on="samples"):
        """Select regularly spaced points on X and fit a spline through them.

        Parameters
        ----------
        X : 1-D array-like
            Values of the curve; ``len(X)`` points are used.
        apply_on : str, optional
            Not used by this implementation.

        Returns
        -------
        numpy.ndarray
            The interpolating cubic spline evaluated at ``len(X)`` evenly
            spaced abscissae.
        """
        nfreq = len(X)
        # The abscissa runs from 0 to the largest VALUE of X.
        # NOTE(review): presumably chosen so that both axes share a scale when
        # measuring the curve length - confirm.
        x0 = np.linspace(0, np.max(X), nfreq)
        # The curve measurements do not change inside the loop: compute once.
        total_length, segment_lengths, cumulative_lengths = X_length(x0, X)
        nb_segments = 10
        step = total_length / nb_segments
        x_samples = []
        y_samples = []

        for s in range(1, nb_segments):
            # Indices bracketing the cumulative length s * step.
            first_reached, last_not_passed = interval_selection(
                s * step, cumulative_lengths
            )

            # NOTE(review): fracL is the bracketing segment's own length
            # modulo the step, not the distance left to reach s * step; kept
            # exactly as before - confirm this is the intended offset.
            P = segment_pt_coord(
                x1=x0[last_not_passed],
                y1=X[last_not_passed],
                x2=x0[first_reached],
                y2=X[first_reached],
                fracL=segment_lengths[last_not_passed] % step,
                L=segment_lengths[last_not_passed],
            )

            x_samples.append(P[0])
            y_samples.append(P[1])

        # Pin both ends of the curve so the spline spans the whole abscissa.
        x = np.array(x_samples)
        x = np.concatenate(([0], x, [np.max(x0)]))
        y = np.array(y_samples)
        y = np.concatenate(([X[0]], y, [X[nfreq - 1]]))

        # s=0 gives an interpolating spline, k=3 makes it cubic.
        t, c, k = interpolate.splrep(x, y, s=0, k=3)
        xmin, xmax = x.min(), x.max()
        xx = np.linspace(xmin, xmax, nfreq)
        spline = interpolate.BSpline(t, c, k, extrapolate=False)

        return spline(xx)

import matplotlib.pyplot as plt
# pandas is needed by read_csv below; without this import the cell only runs
# when an earlier cell happened to leave `pd` in the kernel namespace.
import pandas as pd

x_fname = "test_augmentation.csv"
x_df = pd.read_csv(x_fname, sep=";", header=None)
x = x_df.astype(np.float32).values
# Column 0 becomes y (as a column vector); the remaining columns become x.
y = np.reshape(x[:, 0], (-1, 1))
x = x[:, 1:]


# One abscissa per column of x.
x_axis = np.arange(0,len(x[0]),1)
augmenter = Dependent_Spline_Simplification()
# Only the first row is simplified and plotted.
y_vals = augmenter.augment(x[0])
print(x.shape, y_vals.shape)
plt.plot(x_axis, x[0], color="blue", label="original (row 0)")
plt.plot(x_axis, y_vals, color="red", label="spline simplification")
plt.xlabel("column index")
plt.ylabel("value")
plt.title("Dependent_Spline_Simplification on the first row")
plt.legend()
plt.show()

TypeError: object of type 'numpy.float32' has no len()