In [1]:
from datasets import *
from transforms import *

from __future__ import print_function

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

from ripser import lower_star_img
from ripser import Rips
vr = Rips()
from gtda.homology import VietorisRipsPersistence

import persim
import diagram2vec

from scipy.ndimage import gaussian_filter

from sklearn.datasets import make_circles
from sklearn.manifold import MDS

from gtda.diagrams import PersistenceEntropy, PersistenceImage, BettiCurve

import pickle
from tqdm import tqdm

import torch
from torch.nn import Linear
from torch.nn.functional import relu

from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

from sklearn.model_selection import cross_val_score

POT (Python Optimal Transport) package is not installed. Try to run $ conda install -c conda-forge pot ; or $ pip install POT
Rips(maxdim=1, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)


### Porus: synthetic random-walk images (two classes)

In [6]:
# Module-level copies of generate()'s default parameters. They are kept so that
# any cell relying on these globals keeps working; generate() itself only uses
# its own arguments.
W = 300
sigma1 = 4
sigma2 = 2
t = 0.01

def generate(N, S, W=300, sigma1=4, sigma2=2, t=0.01, bins=64, rng=None):
    """Render one synthetic image from N periodic Gaussian random walks.

    Every walk starts at a uniformly random point of the square
    [0, W) x [0, W) and takes ``S - 1`` steps; the x and y increments of a
    step are independent normal draws with standard deviation ``sigma1``.
    Coordinates wrap around modulo ``W``. All ``N * S`` visited points are
    binned into a 2-D histogram, the histogram is smoothed with
    ``gaussian_filter(..., sigma2)`` and values below ``t`` are set to 0.

    Parameters
    ----------
    N : int
        Number of walks.
    S : int
        Number of points per walk (start point included); must be >= 1.
    W : float
        Side length of the periodic square domain.
    sigma1 : float
        Standard deviation of each coordinate increment.
    sigma2 : float
        Sigma passed to ``scipy.ndimage.gaussian_filter``.
    t : float
        Threshold; smoothed values strictly below it are zeroed.
    bins : int
        Number of histogram bins per axis (the output is ``bins`` x ``bins``).
    rng : optional
        Object exposing ``uniform`` and ``normal`` (e.g. a
        ``numpy.random.Generator``). Defaults to the global ``np.random``
        state, so ``np.random.seed`` still controls the output.

    Returns
    -------
    numpy.ndarray
        The thresholded, smoothed histogram.

    Raises
    ------
    ValueError
        If ``S`` is smaller than 1.
    """
    if S < 1:
        raise ValueError("S must be at least 1 (each walk needs a start point)")

    rng = np.random if rng is None else rng

    # Vectorised walk: cumulative sums of the increments, wrapped once at the
    # end, visit the same positions modulo W as wrapping after every step.
    starts = rng.uniform(0, W, size=(N, 1, 2))
    steps = rng.normal(0, sigma1, size=(N, S - 1, 2))
    offsets = np.concatenate((np.zeros((N, 1, 2)), np.cumsum(steps, axis=1)), axis=1)
    z_r = ((starts + offsets) % W).reshape(N * S, 2)

    # Pin the histogram to the full domain. Without an explicit range the bin
    # edges follow the min/max of each sample, so the pixel grid would differ
    # slightly from image to image (and collapse for very small samples).
    H, _, _ = np.histogram2d(
        z_r[:, 0], z_r[:, 1], bins=bins, range=[[0, W], [0, W]]
    )

    # NOTE(review): the walk is periodic but the smoothing uses
    # gaussian_filter's default boundary mode -- confirm whether mode="wrap"
    # is intended.
    G = gaussian_filter(H, sigma2)
    G[G < t] = 0

    return G

In [12]:
count = 10000
classes_count = 2
image_bins = 64  # histogram resolution; passed to generate() and used for the buffer shape

# Seed the global NumPy RNG so the generated dataset is reproducible across
# kernel restarts, in line with the fixed random_state=42 used for the splits.
np.random.seed(42)

images = np.zeros((classes_count * count, image_bins, image_bins))

# (N, S) = (number of walks, points per walk) for each class. Images of class k
# occupy rows [k * count, (k + 1) * count) of `images`.
class_params = [
    (100, 30),  # class A
    (250, 10),  # class B
]
assert len(class_params) == classes_count

for class_index, (N, S) in enumerate(class_params):
    for n in tqdm(range(count)):
        images[class_index * count + n] = generate(N, S, bins=image_bins)

100%|██████████| 10000/10000 [00:35<00:00, 280.81it/s]
100%|██████████| 10000/10000 [00:29<00:00, 333.62it/s]


In [13]:
from sklearn.model_selection import train_test_split

In [20]:
def push_diagrams(diagrams, name):
    """Split Porus diagrams into train/test datasets and save both to disk.

    Labels are rebuilt from the global ``count``: 0 for the first ``count``
    entries of ``diagrams`` and 1 for the following ``count`` entries. The
    (diagram, label) pairs are split 80/20 with ``random_state=42``, wrapped
    in ``DiagramsDataset`` objects and written with ``torch.save`` to
    ``../data/porus/{name}_train.pt`` and ``../data/porus/{name}_test.pt``.
    """
    class_labels = [0] * count + [1] * count
    labelled = list(zip(diagrams, class_labels))

    train_part, test_part = train_test_split(
        labelled, random_state=42, test_size=0.2
    )

    # Un-zip each split back into parallel tuples of diagrams and labels.
    (train_x, train_y), (test_x, test_y) = zip(*train_part), zip(*test_part)

    train_dataset = DiagramsDataset(train_x, train_y)
    test_dataset = DiagramsDataset(test_x, test_y)

    torch.save(train_dataset, f"../data/porus/{name}_train.pt")
    torch.save(test_dataset, f"../data/porus/{name}_test.pt")

In [19]:
# Run process_baseline (defined outside this notebook) on every flattened
# Porus image, then split and save the results under the name "baseline".
baseline_diagrams = []
for image in tqdm(images):
    flat_pixels = torch.Tensor(image).flatten()
    baseline_diagrams.append(process_baseline(flat_pixels, device="cpu"))
push_diagrams(baseline_diagrams, "baseline")

100%|██████████| 20000/20000 [02:04<00:00, 161.23it/s]


In [22]:
# Run process_cedt (defined outside this notebook) on every flattened Porus
# image, then split and save the results under the name "cedt".
cedt_diagrams = []
for image in tqdm(images):
    flat_pixels = torch.Tensor(image).flatten()
    cedt_diagrams.append(process_cedt(flat_pixels, device="cpu"))
push_diagrams(cedt_diagrams, "cedt")

100%|██████████| 20000/20000 [02:10<00:00, 153.14it/s]


In [23]:
# One saved dataset per thickening value; process_cedt_thickening is defined
# outside this notebook and receives the value as its second argument.
for thickening in [1, 2, 3, 5, 8]:
    cedt_diagrams = []
    for image in tqdm(images):
        flat_pixels = torch.Tensor(image).flatten()
        cedt_diagrams.append(
            process_cedt_thickening(flat_pixels, thickening, device="cpu")
        )
    push_diagrams(cedt_diagrams, f"cedt_thickening_{thickening}")

100%|██████████| 20000/20000 [02:12<00:00, 150.99it/s]
100%|██████████| 20000/20000 [02:12<00:00, 151.21it/s]
100%|██████████| 20000/20000 [02:12<00:00, 150.83it/s]
100%|██████████| 20000/20000 [02:13<00:00, 149.59it/s]
100%|██████████| 20000/20000 [02:13<00:00, 150.09it/s]


In [24]:
# One saved dataset per number of directions. The angles are dir_count evenly
# spaced values in [0, 2*pi); process_image is defined outside this notebook.
for dir_count in [1, 4, 10]:
    dirs = np.arange(dir_count) / dir_count * 2 * np.pi
    cedt_diagrams = []
    for image in tqdm(images):
        flat_pixels = torch.Tensor(image).flatten()
        cedt_diagrams.append(process_image(flat_pixels, dirs, device="cpu"))
    push_diagrams(cedt_diagrams, f"directional_{dir_count}")


100%|██████████| 20000/20000 [02:06<00:00, 157.72it/s]
100%|██████████| 20000/20000 [08:25<00:00, 39.58it/s]
100%|██████████| 20000/20000 [20:49<00:00, 16.00it/s]


In [25]:
# One saved dataset per number of convolution output channels;
# process_by_conv is defined outside this notebook.
for conv_count in [1, 4, 10]:
    # Conv2d weights are randomly initialised. Seed torch first so the filters
    # (and therefore the saved diagrams) are reproducible, as the MNIST
    # convolution cell already does.
    torch.random.manual_seed(42)
    # Use torch.nn explicitly: `nn` is never imported by name in this notebook.
    conv = torch.nn.Conv2d(1, conv_count, kernel_size=3)
    # The variable name is historical; these are convolution-based diagrams.
    cedt_diagrams = [
        process_by_conv(torch.Tensor(t).flatten(), conv, device="cpu") for t in tqdm(images)
    ]
    push_diagrams(cedt_diagrams, f"convolution_{conv_count}")


100%|██████████| 20000/20000 [02:05<00:00, 159.43it/s]
100%|██████████| 20000/20000 [08:35<00:00, 38.82it/s]
100%|██████████| 20000/20000 [20:44<00:00, 16.06it/s]


In [26]:
# Save the raw Porus images as train/test ImagesDataset objects, using the
# same labelling (0 for the first `count` images, 1 for the next `count`)
# and the same split settings (test_size=0.2, random_state=42) as the diagrams.
labels = [0] * count + [1] * count
pairs = list(zip(images, labels))

train_pairs, test_pairs = train_test_split(
    pairs, random_state=42, test_size=0.2
)

# Un-zip each split back into parallel tuples of images and labels.
(train_images, train_labels), (test_images, test_labels) = (
    zip(*train_pairs),
    zip(*test_pairs),
)

train_dataset = ImagesDataset(train_images, train_labels)
test_dataset = ImagesDataset(test_images, test_labels)

torch.save(train_dataset, "../data/porus/images_train.pt")
torch.save(test_dataset, "../data/porus/images_test.pt")

### MNIST

In [4]:
# Download MNIST (train and test use separate root directories) and pull the
# stored images and labels out as NumPy arrays. The arrays come from the
# datasets' `.data` / `.targets` attributes, not from indexing the datasets.
transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
mnist_train = torchvision.datasets.MNIST(
    '../data/mnist/base_mnist_train_data',
    train=True,
    download=True,
    transform=transform,
)
mnist_test = torchvision.datasets.MNIST(
    '../data/mnist/base_mnist_test_data',
    train=False,
    download=True,
    transform=transform,
)
X_train, X_test = mnist_train.data.numpy(), mnist_test.data.numpy()
y_train, y_test = mnist_train.targets.numpy(), mnist_test.targets.numpy()

In [5]:
def push_diagrams_mnist(train_diagrams, test_diagrams, train_labels, test_labels, name):
    """Wrap pre-split MNIST diagrams in DiagramsDataset objects and save them.

    The train dataset is written to ``../data/mnist/{name}_train.pt`` and the
    test dataset to ``../data/mnist/{name}_test.pt`` via ``torch.save``.
    No splitting or shuffling happens here; the inputs are used as given.
    """
    train_path = f"../data/mnist/{name}_train.pt"
    test_path = f"../data/mnist/{name}_test.pt"

    train_split = DiagramsDataset(train_diagrams, train_labels)
    test_split = DiagramsDataset(test_diagrams, test_labels)

    torch.save(train_split, train_path)
    torch.save(test_split, test_path)

In [6]:
# Baseline diagrams for MNIST: process the train images first, then the test
# images, each flattened to 1-D before process_baseline is called.
train_diagrams, test_diagrams = [], []
for split_diagrams, split_images in ((train_diagrams, X_train), (test_diagrams, X_test)):
    for image in tqdm(split_images):
        flat_pixels = torch.Tensor(image).flatten()
        split_diagrams.append(process_baseline(flat_pixels, device="cpu"))
push_diagrams_mnist(train_diagrams, test_diagrams, y_train, y_test, "baseline")

100%|██████████| 60000/60000 [01:16<00:00, 779.34it/s]
100%|██████████| 10000/10000 [00:12<00:00, 782.56it/s]


In [7]:
# "cedt" diagrams for MNIST: process the train images first, then the test
# images, each flattened to 1-D before process_cedt is called.
train_diagrams, test_diagrams = [], []
for split_diagrams, split_images in ((train_diagrams, X_train), (test_diagrams, X_test)):
    for image in tqdm(split_images):
        flat_pixels = torch.Tensor(image).flatten()
        split_diagrams.append(process_cedt(flat_pixels, device="cpu"))
push_diagrams_mnist(train_diagrams, test_diagrams, y_train, y_test, "cedt")

100%|██████████| 60000/60000 [01:20<00:00, 743.83it/s]
100%|██████████| 10000/10000 [00:13<00:00, 750.03it/s]


In [8]:
# One saved MNIST dataset pair per thickening value. Within each iteration the
# train images are processed before the test images.
for thickening in [1, 2, 3, 5, 8]:
    train_diagrams, test_diagrams = [], []
    for split_diagrams, split_images in ((train_diagrams, X_train), (test_diagrams, X_test)):
        for image in tqdm(split_images):
            flat_pixels = torch.Tensor(image).flatten()
            split_diagrams.append(
                process_cedt_thickening(flat_pixels, thickening, device="cpu")
            )
    push_diagrams_mnist(train_diagrams, test_diagrams, y_train, y_test, f"cedt_thickening_{thickening}")

100%|██████████| 60000/60000 [01:23<00:00, 722.19it/s]
100%|██████████| 10000/10000 [00:13<00:00, 717.80it/s]
100%|██████████| 60000/60000 [01:23<00:00, 716.05it/s]
100%|██████████| 10000/10000 [00:13<00:00, 725.56it/s]
100%|██████████| 60000/60000 [01:27<00:00, 685.53it/s]
100%|██████████| 10000/10000 [00:14<00:00, 710.15it/s]
100%|██████████| 60000/60000 [01:26<00:00, 694.54it/s]
100%|██████████| 10000/10000 [00:14<00:00, 682.70it/s]
100%|██████████| 60000/60000 [01:26<00:00, 696.39it/s]
 81%|████████  | 8058/10000 [00:11<00:02, 698.95it/s]

In [None]:
# One saved MNIST dataset pair per number of directions. The angles are
# dir_count evenly spaced values in [0, 2*pi); train images are processed
# before test images.
for dir_count in [1, 4, 10]:
    dirs = np.arange(dir_count) / dir_count * 2 * np.pi
    train_diagrams, test_diagrams = [], []
    for split_diagrams, split_images in ((train_diagrams, X_train), (test_diagrams, X_test)):
        for image in tqdm(split_images):
            flat_pixels = torch.Tensor(image).flatten()
            split_diagrams.append(process_image(flat_pixels, dirs, device="cpu"))
    push_diagrams_mnist(train_diagrams, test_diagrams, y_train, y_test, f"directional_{dir_count}")



In [None]:
# One saved MNIST dataset pair per number of convolution output channels;
# process_by_conv is defined outside this notebook.
for conv_count in [1, 4, 10]:
    # Seed torch so the randomly initialised Conv2d weights are reproducible.
    torch.random.manual_seed(42)
    # Use torch.nn explicitly: `nn` is never imported by name in this notebook.
    conv = torch.nn.Conv2d(1, conv_count, kernel_size=3)
    train_diagrams = [
        process_by_conv(torch.Tensor(t).flatten(), conv, device="cpu") for t in tqdm(X_train)
    ]
    test_diagrams = [
        process_by_conv(torch.Tensor(t).flatten(), conv, device="cpu") for t in tqdm(X_test)
    ]
    # Save the diagrams computed above with the MNIST helper. The previous call,
    # push_diagrams(cedt_diagrams, ...), wrote a stale Porus variable into
    # ../data/porus/ and never stored the MNIST convolution diagrams.
    push_diagrams_mnist(train_diagrams, test_diagrams, y_train, y_test, f"convolution_{conv_count}")


In [None]:
# Save the raw MNIST arrays (already split into train/test) as ImagesDataset
# objects next to the diagram datasets.
train_dataset, test_dataset = (
    ImagesDataset(X_train, y_train),
    ImagesDataset(X_test, y_test),
)

torch.save(train_dataset, "../data/mnist/images_train.pt")
torch.save(test_dataset, "../data/mnist/images_test.pt")

### CIFAR10

In [None]:
# Download CIFAR10 (train and test use separate root directories) and expose
# the stored images and labels as NumPy arrays.
transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
cifar_train = torchvision.datasets.CIFAR10('../data/cifar10/base_cifar10_train_data', train=True, download=True, transform=transform)
cifar_test = torchvision.datasets.CIFAR10('../data/cifar10/base_cifar10_test_data', train=False, download=True, transform=transform)
# Unlike MNIST, torchvision's CIFAR10 already stores `.data` as a NumPy array,
# so calling `.numpy()` on it raises AttributeError. np.asarray works whether
# `.data` is an array or not.
# NOTE(review): CIFAR10 images are colour images with a channel axis -- confirm
# that the process_* helpers accept them once flattened.
X_train = np.asarray(cifar_train.data)
X_test = np.asarray(cifar_test.data)
# `.targets` is converted with np.array, as in the original cell.
y_train = np.array(cifar_train.targets)
y_test = np.array(cifar_test.targets)

In [None]:
def push_diagrams_cifar(train_diagrams, test_diagrams, train_labels, test_labels, name):
    """Wrap pre-split CIFAR10 diagrams in DiagramsDataset objects and save them.

    The train dataset is written to ``../data/cifar10/{name}_train.pt`` and
    the test dataset to ``../data/cifar10/{name}_test.pt`` via ``torch.save``.
    No splitting or shuffling happens here; the inputs are used as given.
    """
    train_path = f"../data/cifar10/{name}_train.pt"
    test_path = f"../data/cifar10/{name}_test.pt"

    train_split = DiagramsDataset(train_diagrams, train_labels)
    test_split = DiagramsDataset(test_diagrams, test_labels)

    torch.save(train_split, train_path)
    torch.save(test_split, test_path)

In [None]:
# Baseline diagrams for CIFAR10: process the train images first, then the test
# images, each flattened to 1-D before process_baseline is called.
train_diagrams, test_diagrams = [], []
for split_diagrams, split_images in ((train_diagrams, X_train), (test_diagrams, X_test)):
    for image in tqdm(split_images):
        flat_pixels = torch.Tensor(image).flatten()
        split_diagrams.append(process_baseline(flat_pixels, device="cpu"))
push_diagrams_cifar(train_diagrams, test_diagrams, y_train, y_test, "baseline")

In [None]:
# "cedt" diagrams for CIFAR10: process the train images first, then the test
# images, each flattened to 1-D before process_cedt is called.
train_diagrams, test_diagrams = [], []
for split_diagrams, split_images in ((train_diagrams, X_train), (test_diagrams, X_test)):
    for image in tqdm(split_images):
        flat_pixels = torch.Tensor(image).flatten()
        split_diagrams.append(process_cedt(flat_pixels, device="cpu"))
push_diagrams_cifar(train_diagrams, test_diagrams, y_train, y_test, "cedt")

In [None]:
# One saved CIFAR10 dataset pair per thickening value. Within each iteration
# the train images are processed before the test images.
for thickening in [1, 2, 3, 5, 8]:
    train_diagrams, test_diagrams = [], []
    for split_diagrams, split_images in ((train_diagrams, X_train), (test_diagrams, X_test)):
        for image in tqdm(split_images):
            flat_pixels = torch.Tensor(image).flatten()
            split_diagrams.append(
                process_cedt_thickening(flat_pixels, thickening, device="cpu")
            )
    push_diagrams_cifar(train_diagrams, test_diagrams, y_train, y_test, f"cedt_thickening_{thickening}")

In [None]:
# One saved CIFAR10 dataset pair per number of directions. The angles are
# dir_count evenly spaced values in [0, 2*pi); train images are processed
# before test images.
for dir_count in [1, 4, 10]:
    dirs = np.arange(dir_count) / dir_count * 2 * np.pi
    train_diagrams, test_diagrams = [], []
    for split_diagrams, split_images in ((train_diagrams, X_train), (test_diagrams, X_test)):
        for image in tqdm(split_images):
            flat_pixels = torch.Tensor(image).flatten()
            split_diagrams.append(process_image(flat_pixels, dirs, device="cpu"))
    push_diagrams_cifar(train_diagrams, test_diagrams, y_train, y_test, f"directional_{dir_count}")

In [None]:
# Save the raw CIFAR10 arrays (already split into train/test) as ImagesDataset
# objects next to the diagram datasets.
train_dataset, test_dataset = (
    ImagesDataset(X_train, y_train),
    ImagesDataset(X_test, y_test),
)

torch.save(train_dataset, "../data/cifar10/images_train.pt")
torch.save(test_dataset, "../data/cifar10/images_test.pt")

### Chinese MNIST

In [None]:
# FIXME: this cell sits under the "Chinese MNIST" heading but is a copy of the
# CIFAR10 loader -- it downloads and exposes CIFAR10, not Chinese MNIST.
# Replace it with the real Chinese MNIST loading code before relying on
# X_train / X_test / y_train / y_test in this section.
transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
cifar_train = torchvision.datasets.CIFAR10('../data/cifar10/base_cifar10_train_data', train=True, download=True, transform=transform)
cifar_test = torchvision.datasets.CIFAR10('../data/cifar10/base_cifar10_test_data', train=False, download=True, transform=transform)
# torchvision's CIFAR10 already stores `.data` as a NumPy array, so calling
# `.numpy()` on it raises AttributeError; np.asarray works either way.
X_train = np.asarray(cifar_train.data)
X_test = np.asarray(cifar_test.data)
y_train = np.array(cifar_train.targets)
y_test = np.array(cifar_test.targets)