In [10]:
from datasets import *
from transforms import *

from __future__ import print_function

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

from ripser import lower_star_img
from ripser import Rips
vr = Rips()
from gtda.homology import VietorisRipsPersistence

import persim
import diagram2vec

from scipy.ndimage import gaussian_filter

from sklearn.datasets import make_circles
from sklearn.manifold import MDS

from gtda.diagrams import PersistenceEntropy, PersistenceImage, BettiCurve

import pickle
from tqdm import tqdm

import torch
from torch.nn import Linear
from torch.nn.functional import relu

from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

import pandas as pd
import os
from PIL import Image

from sklearn.model_selection import cross_val_score

Rips(maxdim=1, thresh=inf, coeff=2, do_cocycles=False, n_perm = None, verbose=True)


### Porus

In [6]:
# Notebook-level copies of the generator settings. They carry the same values
# as the keyword defaults of `generate`, which reads its own parameters rather
# than these globals.
W, sigma1, sigma2, t = 300, 4, 2, 0.01

def generate(N, S, W=300, sigma1=4, sigma2=2, t=0.01, bins=64, rng=None):
    """Render one synthetic image from ``N`` wrapped Gaussian random walks.

    Every walk starts at a uniformly drawn point of the square ``[0, W) x [0, W)``
    and then takes ``S - 1`` steps; both coordinates of a step are drawn
    independently from ``Normal(0, sigma1)`` and positions wrap around modulo
    ``W``.  All ``N * S`` visited points are binned into a 2-D histogram over the
    full ``[0, W] x [0, W]`` domain, the histogram is smoothed with a Gaussian
    filter of standard deviation ``sigma2`` (in bins), and every smoothed value
    below ``t`` is set to zero.

    Parameters
    ----------
    N : int
        Number of walks.
    S : int
        Number of points per walk (start point included).
    W : float
        Side length of the periodic square domain.
    sigma1 : float
        Standard deviation of each step coordinate.
    sigma2 : float
        Standard deviation passed to ``gaussian_filter``.
    t : float
        Threshold; smoothed values strictly below it become 0.
    bins : int
        Number of histogram bins per axis.
    rng : optional
        Source of randomness exposing ``uniform`` and ``normal`` (for example
        ``numpy.random.default_rng(seed)``).  ``None`` keeps drawing from the
        global ``numpy.random`` state, as before.

    Returns
    -------
    numpy.ndarray
        The thresholded, smoothed histogram; shape ``(bins, bins)`` for an
        integer ``bins``.
    """
    if rng is None:
        rng = np.random

    z = np.zeros((N, S, 2))
    for n in range(N):
        z[n, 0] = rng.uniform(0, W, size=2)
        for s in range(S - 1):
            # Draw order (x step, then y step) is kept so that seeded runs of
            # the global generator produce the same walks as before.
            d_1 = rng.normal(0, sigma1)
            d_2 = rng.normal(0, sigma1)
            z[n, s + 1, 0] = (z[n, s, 0] + d_1) % W
            z[n, s + 1, 1] = (z[n, s, 1] + d_2) % W

    z_r = z.reshape(N * S, 2)
    # Pin the histogram to the whole domain.  Without an explicit range NumPy
    # derives the bin edges from the min/max of the sampled points, so the pixel
    # grid would shift and rescale from one image to the next.
    H, _, _ = np.histogram2d(
        z_r[:, 0], z_r[:, 1], bins=bins, range=[[0, W], [0, W]]
    )

    G = gaussian_filter(H, sigma2)
    G[G < t] = 0

    return G

In [12]:
count = 10000
classes_count = 2

# Seed NumPy's global generator (the one `generate` draws from) so that
# re-running the notebook regenerates exactly the same images.
np.random.seed(42)

# (N, S) arguments passed to `generate` for each class, in label order.
class_params = [
    (100, 30),  # class A
    (250, 10),  # class B
]
assert len(class_params) == classes_count

# 64 x 64 matches the default `bins=64` of `generate`.
images = np.zeros((classes_count * count, 64, 64))

# Class k occupies rows [k * count, (k + 1) * count) of `images`.
for class_index, (N, S) in enumerate(class_params):
    for n in tqdm(range(count)):
        images[class_index * count + n] = generate(N, S)

100%|██████████| 10000/10000 [00:35<00:00, 280.81it/s]
100%|██████████| 10000/10000 [00:29<00:00, 333.62it/s]


In [13]:
from sklearn.model_selection import train_test_split

In [20]:
def push_diagrams(diagrams, name):
    """Label, split and save one collection of Porus diagrams.

    Labels are assigned by position using the notebook-level ``count``: the
    first ``count`` items get label 0 and the next ``count`` items get label 1.
    The (diagram, label) pairs are split 80/20 with ``random_state=42``; each
    part is wrapped in a ``DiagramsDataset`` and written with ``torch.save`` to
    ``../data/porus/{name}_train.pt`` and ``../data/porus/{name}_test.pt``.

    Parameters
    ----------
    diagrams : iterable
        Exactly ``2 * count`` items.
    name : str
        File-name prefix of the two saved datasets.

    Raises
    ------
    ValueError
        If the number of diagrams differs from ``2 * count``.  ``zip`` would
        otherwise silently drop the unmatched tail.
    """
    diagrams = list(diagrams)
    labels = [0] * count + [1] * count
    if len(diagrams) != len(labels):
        raise ValueError(
            f"expected {len(labels)} diagrams (2 * count), got {len(diagrams)}"
        )
    pairs = list(zip(diagrams, labels))

    train_pairs, test_pairs = train_test_split(pairs, test_size=0.2, random_state=42)

    train_diagrams, train_labels = zip(*train_pairs)
    test_diagrams, test_labels = zip(*test_pairs)

    train_dataset = DiagramsDataset(train_diagrams, train_labels)
    test_dataset = DiagramsDataset(test_diagrams, test_labels)

    # No earlier cell creates this folder, and torch.save does not create
    # missing parent directories.
    os.makedirs("../data/porus", exist_ok=True)
    torch.save(train_dataset, f"../data/porus/{name}_train.pt")
    torch.save(test_dataset, f"../data/porus/{name}_test.pt")

In [19]:
# Baseline diagrams for every Porus image: each image is flattened into a 1-D
# tensor and passed to `process_baseline` with device="cpu"; the resulting list
# is split and saved under the name "baseline".
baseline_diagrams = list(
    map(
        lambda image: process_baseline(torch.Tensor(image).flatten(), device="cpu"),
        tqdm(images),
    )
)
push_diagrams(baseline_diagrams, "baseline")

100%|██████████| 20000/20000 [02:04<00:00, 161.23it/s]


In [22]:
# Same pipeline as the baseline cell, with `process_cedt` as the transform;
# saved under the name "cedt".
cedt_diagrams = list(
    process_cedt(torch.Tensor(image).flatten(), device="cpu")
    for image in tqdm(images)
)
push_diagrams(cedt_diagrams, "cedt")

100%|██████████| 20000/20000 [02:10<00:00, 153.14it/s]


In [23]:
# One saved dataset per thickening value, named "cedt_thickening_<value>".
for thickening in (1, 2, 3, 5, 8):
    cedt_diagrams = list(
        map(
            lambda image: process_cedt_thickening(
                torch.Tensor(image).flatten(), thickening, device="cpu"
            ),
            tqdm(images),
        )
    )
    push_diagrams(cedt_diagrams, f"cedt_thickening_{thickening}")

100%|██████████| 20000/20000 [02:12<00:00, 150.99it/s]
100%|██████████| 20000/20000 [02:12<00:00, 151.21it/s]
100%|██████████| 20000/20000 [02:12<00:00, 150.83it/s]
100%|██████████| 20000/20000 [02:13<00:00, 149.59it/s]
100%|██████████| 20000/20000 [02:13<00:00, 150.09it/s]


In [24]:
# One saved dataset per number of directions, named "directional_<dir_count>".
for dir_count in (1, 4, 10):
    # dir_count evenly spaced values k / dir_count * 2*pi, k = 0 .. dir_count - 1
    # (presumably direction angles in radians -- confirm against `process_image`).
    dirs = np.arange(dir_count) / dir_count * 2 * np.pi
    cedt_diagrams = list(
        process_image(torch.Tensor(image).flatten(), dirs, device="cpu")
        for image in tqdm(images)
    )
    push_diagrams(cedt_diagrams, f"directional_{dir_count}")


100%|██████████| 20000/20000 [02:06<00:00, 157.72it/s]
100%|██████████| 20000/20000 [08:25<00:00, 39.58it/s]
100%|██████████| 20000/20000 [20:49<00:00, 16.00it/s]


In [25]:
# One saved dataset per number of convolution output channels, named
# "convolution_<conv_count>".
for conv_count in [1, 4, 10]:
    # Seed torch before creating the layer so that its random initial weights,
    # and therefore the saved diagrams, are reproducible. The MNIST, CIFAR10 and
    # Chinese MNIST convolution cells of this notebook already do the same.
    torch.random.manual_seed(42)
    conv = nn.Conv2d(1, conv_count, kernel_size=3)
    cedt_diagrams = [
        process_by_conv(torch.Tensor(t).flatten(), conv, device="cpu") for t in tqdm(images)
    ]
    push_diagrams(cedt_diagrams, f"convolution_{conv_count}")


100%|██████████| 20000/20000 [02:05<00:00, 159.43it/s]
100%|██████████| 20000/20000 [08:35<00:00, 38.82it/s]
100%|██████████| 20000/20000 [20:44<00:00, 16.06it/s]


In [26]:
# Raw Porus images: label by position (first `count` -> 0, next `count` -> 1),
# split 80/20 with the same random_state as `push_diagrams`, and save the two
# parts as ImagesDataset objects.
labels = [0] * count + [1] * count
pairs = list(zip(images, labels))

train_pairs, test_pairs = train_test_split(pairs, test_size=0.2, random_state=42)

(train_images, train_labels), (test_images, test_labels) = (
    zip(*train_pairs),
    zip(*test_pairs),
)

train_dataset, test_dataset = (
    ImagesDataset(train_images, train_labels),
    ImagesDataset(test_images, test_labels),
)

torch.save(train_dataset, f"../data/porus/images_train.pt")
torch.save(test_dataset, f"../data/porus/images_test.pt")

### MNIST

In [2]:
# MNIST is fetched through torchvision; the train and test splits use separate
# root folders.
transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
mnist_train = torchvision.datasets.MNIST(
    '../data/mnist/base_mnist_train_data',
    train=True,
    download=True,
    transform=transform,
)
mnist_test = torchvision.datasets.MNIST(
    '../data/mnist/base_mnist_test_data',
    train=False,
    download=True,
    transform=transform,
)

# Images are taken from `.data` as NumPy arrays and divided by 255; labels come
# from `.targets`.
X_train, y_train = mnist_train.data.numpy() / 255, mnist_train.targets.numpy()
X_test, y_test = mnist_test.data.numpy() / 255, mnist_test.targets.numpy()

In [3]:
def push_diagrams_mnist(train_diagrams, test_diagrams, train_labels, test_labels, name):
    """Wrap the MNIST train/test diagrams in ``DiagramsDataset`` objects and save them.

    Both datasets are built first and then written with ``torch.save`` to
    ``../data/mnist/{name}_train.pt`` and ``../data/mnist/{name}_test.pt``.
    Returns ``None``.
    """
    datasets = (
        DiagramsDataset(train_diagrams, train_labels),
        DiagramsDataset(test_diagrams, test_labels),
    )
    targets = (
        f"../data/mnist/{name}_train.pt",
        f"../data/mnist/{name}_test.pt",
    )
    for dataset, target in zip(datasets, targets):
        torch.save(dataset, target)

In [4]:
# Baseline diagrams for both MNIST splits (train is processed first, then test),
# saved under the name "baseline".
train_diagrams, test_diagrams = (
    [process_baseline(torch.Tensor(image).flatten(), device="cpu") for image in tqdm(split)]
    for split in (X_train, X_test)
)
push_diagrams_mnist(train_diagrams, test_diagrams, y_train, y_test, "baseline")

100%|██████████| 60000/60000 [01:14<00:00, 806.87it/s]
100%|██████████| 10000/10000 [00:12<00:00, 802.34it/s]


In [5]:
# `process_cedt` diagrams for both MNIST splits (train first, then test),
# saved under the name "cedt".
train_diagrams, test_diagrams = (
    [process_cedt(torch.Tensor(image).flatten(), device="cpu") for image in tqdm(split)]
    for split in (X_train, X_test)
)
push_diagrams_mnist(train_diagrams, test_diagrams, y_train, y_test, "cedt")

100%|██████████| 60000/60000 [01:17<00:00, 778.39it/s]
100%|██████████| 10000/10000 [00:12<00:00, 776.57it/s]


In [6]:
# One pair of saved MNIST datasets per thickening value.
for thickening in (1, 2, 3, 5, 8):
    train_diagrams, test_diagrams = (
        [
            process_cedt_thickening(torch.Tensor(image).flatten(), thickening, device="cpu")
            for image in tqdm(split)
        ]
        for split in (X_train, X_test)
    )
    push_diagrams_mnist(train_diagrams, test_diagrams, y_train, y_test, f"cedt_thickening_{thickening}")

100%|██████████| 60000/60000 [01:19<00:00, 755.49it/s]
100%|██████████| 10000/10000 [00:13<00:00, 768.50it/s]
100%|██████████| 60000/60000 [01:19<00:00, 750.52it/s]
100%|██████████| 10000/10000 [00:13<00:00, 743.57it/s]
100%|██████████| 60000/60000 [01:20<00:00, 747.35it/s]
100%|██████████| 10000/10000 [00:13<00:00, 742.49it/s]
100%|██████████| 60000/60000 [01:21<00:00, 738.02it/s]
100%|██████████| 10000/10000 [00:13<00:00, 741.49it/s]
100%|██████████| 60000/60000 [01:20<00:00, 745.55it/s]
100%|██████████| 10000/10000 [00:13<00:00, 727.36it/s]


In [7]:
# One pair of saved MNIST datasets per number of directions.
for dir_count in (1, 4, 10):
    # dir_count evenly spaced values in [0, 2*pi), starting at 0.
    dirs = np.arange(dir_count) / dir_count * 2 * np.pi
    train_diagrams, test_diagrams = (
        [process_image(torch.Tensor(image).flatten(), dirs, device="cpu") for image in tqdm(split)]
        for split in (X_train, X_test)
    )
    push_diagrams_mnist(train_diagrams, test_diagrams, y_train, y_test, f"directional_{dir_count}")



100%|██████████| 60000/60000 [01:14<00:00, 808.48it/s]
100%|██████████| 10000/10000 [00:12<00:00, 824.59it/s]
100%|██████████| 60000/60000 [04:59<00:00, 200.21it/s]
100%|██████████| 10000/10000 [00:51<00:00, 195.57it/s]
100%|██████████| 60000/60000 [12:04<00:00, 82.84it/s]
100%|██████████| 10000/10000 [02:00<00:00, 82.73it/s]


In [8]:
# One pair of saved MNIST datasets per number of convolution output channels.
for conv_count in (1, 4, 10):
    # Reseed before building the layer so each Conv2d starts from the same
    # random state.
    torch.random.manual_seed(42)
    conv = nn.Conv2d(1, conv_count, kernel_size=3)
    train_diagrams, test_diagrams = (
        [process_by_conv(torch.Tensor(image).flatten(), conv, device="cpu") for image in tqdm(split)]
        for split in (X_train, X_test)
    )
    push_diagrams_mnist(train_diagrams, test_diagrams, y_train, y_test, f"convolution_{conv_count}")


100%|██████████| 60000/60000 [01:10<00:00, 854.89it/s]
100%|██████████| 10000/10000 [00:11<00:00, 841.26it/s]
100%|██████████| 60000/60000 [04:38<00:00, 215.40it/s]
100%|██████████| 10000/10000 [00:48<00:00, 208.14it/s]
100%|██████████| 60000/60000 [11:40<00:00, 85.62it/s]
100%|██████████| 10000/10000 [01:58<00:00, 84.70it/s]


In [9]:
# Save the raw MNIST images (scaled arrays and their labels) as ImagesDataset
# objects next to the diagram datasets.
train_dataset, test_dataset = (
    ImagesDataset(X_train, y_train),
    ImagesDataset(X_test, y_test),
)

torch.save(train_dataset, f"../data/mnist/images_train.pt")
torch.save(test_dataset, f"../data/mnist/images_test.pt")

### CIFAR10

In [10]:
# CIFAR10 is fetched through torchvision; the train and test splits use
# separate root folders.
transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
cifar_train = torchvision.datasets.CIFAR10(
    '../data/cifar10/base_cifar10_train_data',
    train=True,
    download=True,
    transform=transform,
)
cifar_test = torchvision.datasets.CIFAR10(
    '../data/cifar10/base_cifar10_test_data',
    train=False,
    download=True,
    transform=transform,
)

# Images are `.data` divided by 255; `.targets` is converted to a NumPy array.
X_train, y_train = cifar_train.data / 255, np.array(cifar_train.targets)
X_test, y_test = cifar_test.data / 255, np.array(cifar_test.targets)

Files already downloaded and verified
Files already downloaded and verified


In [11]:
# average on last channel
# Guarded on the number of dimensions so the cell is idempotent: once the
# channel axis has been averaged away, running the cell again must not average
# over the image width as well.
if X_train.ndim == 4:
    X_train = np.mean(X_train, axis=-1)
if X_test.ndim == 4:
    X_test = np.mean(X_test, axis=-1)

In [12]:
def push_diagrams_cifar(train_diagrams, test_diagrams, train_labels, test_labels, name):
    """Wrap the CIFAR10 train/test diagrams in ``DiagramsDataset`` objects and save them.

    Both datasets are built first and then written with ``torch.save`` to
    ``../data/cifar10/{name}_train.pt`` and ``../data/cifar10/{name}_test.pt``.
    Returns ``None``.
    """
    train_set, test_set = (
        DiagramsDataset(diagrams, labels)
        for diagrams, labels in (
            (train_diagrams, train_labels),
            (test_diagrams, test_labels),
        )
    )

    torch.save(train_set, f"../data/cifar10/{name}_train.pt")
    torch.save(test_set, f"../data/cifar10/{name}_test.pt")

In [13]:
# Baseline diagrams for the channel-averaged CIFAR10 images, train then test,
# saved under the name "baseline".
train_diagrams = list(
    process_baseline(torch.Tensor(image).flatten(), device="cpu") for image in tqdm(X_train)
)
test_diagrams = list(
    process_baseline(torch.Tensor(image).flatten(), device="cpu") for image in tqdm(X_test)
)
push_diagrams_cifar(train_diagrams, test_diagrams, y_train, y_test, "baseline")

100%|██████████| 50000/50000 [01:29<00:00, 558.49it/s]
100%|██████████| 10000/10000 [00:17<00:00, 558.13it/s]


In [14]:
# `process_cedt` diagrams for CIFAR10, train then test, saved under the name "cedt".
train_diagrams = list(
    process_cedt(torch.Tensor(image).flatten(), device="cpu") for image in tqdm(X_train)
)
test_diagrams = list(
    process_cedt(torch.Tensor(image).flatten(), device="cpu") for image in tqdm(X_test)
)
push_diagrams_cifar(train_diagrams, test_diagrams, y_train, y_test, "cedt")

100%|██████████| 50000/50000 [01:26<00:00, 578.34it/s]
100%|██████████| 10000/10000 [00:16<00:00, 611.02it/s]


In [15]:
# One pair of saved CIFAR10 datasets per thickening value.
for thickening in (1, 2, 3, 5, 8):
    train_diagrams = list(
        process_cedt_thickening(torch.Tensor(image).flatten(), thickening, device="cpu")
        for image in tqdm(X_train)
    )
    test_diagrams = list(
        process_cedt_thickening(torch.Tensor(image).flatten(), thickening, device="cpu")
        for image in tqdm(X_test)
    )
    push_diagrams_cifar(train_diagrams, test_diagrams, y_train, y_test, f"cedt_thickening_{thickening}")

100%|██████████| 50000/50000 [01:28<00:00, 566.49it/s]
100%|██████████| 10000/10000 [00:17<00:00, 561.19it/s]
100%|██████████| 50000/50000 [01:26<00:00, 580.78it/s]
100%|██████████| 10000/10000 [00:17<00:00, 581.21it/s]
100%|██████████| 50000/50000 [01:25<00:00, 582.27it/s]
100%|██████████| 10000/10000 [00:16<00:00, 589.99it/s]
100%|██████████| 50000/50000 [01:26<00:00, 579.47it/s]
100%|██████████| 10000/10000 [00:16<00:00, 597.38it/s]
100%|██████████| 50000/50000 [01:23<00:00, 599.88it/s]
100%|██████████| 10000/10000 [00:16<00:00, 595.46it/s]


In [16]:
# One pair of saved CIFAR10 datasets per number of directions.
for dir_count in (1, 4, 10):
    # dir_count evenly spaced values in [0, 2*pi), starting at 0.
    dirs = np.arange(dir_count) / dir_count * 2 * np.pi
    train_diagrams = list(
        process_image(torch.Tensor(image).flatten(), dirs, device="cpu") for image in tqdm(X_train)
    )
    test_diagrams = list(
        process_image(torch.Tensor(image).flatten(), dirs, device="cpu") for image in tqdm(X_test)
    )
    push_diagrams_cifar(train_diagrams, test_diagrams, y_train, y_test, f"directional_{dir_count}")

100%|██████████| 50000/50000 [01:27<00:00, 570.15it/s]
100%|██████████| 10000/10000 [00:17<00:00, 563.01it/s]
100%|██████████| 50000/50000 [05:49<00:00, 143.04it/s]
100%|██████████| 10000/10000 [01:09<00:00, 144.65it/s]
100%|██████████| 50000/50000 [14:06<00:00, 59.10it/s]
100%|██████████| 10000/10000 [02:48<00:00, 59.35it/s]


In [17]:
# One pair of saved CIFAR10 datasets per number of convolution output channels.
for conv_count in (1, 4, 10):
    # Reseed before building the layer so each Conv2d starts from the same
    # random state.
    torch.random.manual_seed(42)
    conv = nn.Conv2d(1, conv_count, kernel_size=3)
    train_diagrams = list(
        process_by_conv(torch.Tensor(image).flatten(), conv, device="cpu") for image in tqdm(X_train)
    )
    test_diagrams = list(
        process_by_conv(torch.Tensor(image).flatten(), conv, device="cpu") for image in tqdm(X_test)
    )
    push_diagrams_cifar(train_diagrams, test_diagrams, y_train, y_test, f"convolution_{conv_count}")


100%|██████████| 50000/50000 [01:17<00:00, 647.26it/s]
100%|██████████| 10000/10000 [00:15<00:00, 649.17it/s]
100%|██████████| 50000/50000 [05:15<00:00, 158.66it/s]
100%|██████████| 10000/10000 [01:01<00:00, 161.44it/s]
100%|██████████| 50000/50000 [13:24<00:00, 62.13it/s]
100%|██████████| 10000/10000 [02:45<00:00, 60.45it/s]


In [18]:
# Save the channel-averaged CIFAR10 images and their labels as ImagesDataset
# objects next to the diagram datasets.
train_dataset, test_dataset = (
    ImagesDataset(X_train, y_train),
    ImagesDataset(X_test, y_test),
)

torch.save(train_dataset, f"../data/cifar10/images_train.pt")
torch.save(test_dataset, f"../data/cifar10/images_test.pt")

### Chinese MNIST

In [20]:
# Index table of the Chinese MNIST samples. Later cells read its `suite_id`,
# `sample_id`, `code` and `value` columns to locate each image and its label.
df = pd.read_csv('../data/chinese-mnist/chinese_mnist.csv')

In [29]:
# Map every distinct entry of the `value` column to a class index 0..K-1,
# numbered in ascending order of the values.
label_mapping = {value: i for i, value in enumerate(sorted(df.value.unique()))}

def get_sample(row):
    """Load one Chinese MNIST sample.

    Parameters
    ----------
    row
        Any object exposing ``suite_id``, ``sample_id``, ``code`` and ``value``
        attributes (for example a row of ``df``).

    Returns
    -------
    tuple
        ``(image, label)`` where ``image`` is the picture at
        ``../data/chinese-mnist/data/data/input_{suite_id}_{sample_id}_{code}.jpg``
        as a NumPy array divided by 255, and ``label`` is
        ``label_mapping[row.value]``.
    """
    path = os.path.join('../data/chinese-mnist/data/data', f'input_{row.suite_id}_{row.sample_id}_{row.code}.jpg')
    # Read the pixels inside the context manager so the underlying file handle
    # is released deterministically; this function runs once per table row.
    with Image.open(path) as image:
        pixels = np.array(image) / 255
    label = label_mapping[row.value]
    return pixels, label

In [30]:
def push_diagrams_chinese_mnist(train_diagrams, test_diagrams, train_labels, test_labels, name):
    """Wrap the Chinese MNIST train/test diagrams in ``DiagramsDataset`` objects and save them.

    Both datasets are built first and then written with ``torch.save`` to
    ``../data/chinese-mnist/{name}_train.pt`` and
    ``../data/chinese-mnist/{name}_test.pt``.  Returns ``None``.
    """
    train_part = DiagramsDataset(train_diagrams, train_labels)
    test_part = DiagramsDataset(test_diagrams, test_labels)

    outputs = (
        (train_part, f"../data/chinese-mnist/{name}_train.pt"),
        (test_part, f"../data/chinese-mnist/{name}_test.pt"),
    )
    for dataset, target in outputs:
        torch.save(dataset, target)

In [31]:
# Load every (image, label) pair listed in `df`. `itertuples` keeps each
# column's own dtype and avoids building a Series per row (as `iterrows` does);
# `get_sample` only uses attribute access, which the named tuples support.
pairs = [get_sample(row) for row in df.itertuples(index=False)]
# 80/20 split with a fixed random_state so the partition is reproducible.
train_pairs, test_pairs = train_test_split(pairs, test_size=0.2, random_state=42)

X_train, y_train = zip(*train_pairs)
X_test, y_test = zip(*test_pairs)

In [32]:
# Baseline diagrams for Chinese MNIST, train then test, saved under the name "baseline".
train_diagrams = list(
    map(lambda image: process_baseline(torch.Tensor(image).flatten(), device="cpu"), tqdm(X_train))
)
test_diagrams = list(
    map(lambda image: process_baseline(torch.Tensor(image).flatten(), device="cpu"), tqdm(X_test))
)
push_diagrams_chinese_mnist(train_diagrams, test_diagrams, y_train, y_test, "baseline")

100%|██████████| 12000/12000 [01:18<00:00, 153.81it/s]
100%|██████████| 3000/3000 [00:19<00:00, 150.12it/s]


In [33]:
# `process_cedt` diagrams for Chinese MNIST, train then test, saved under the name "cedt".
train_diagrams = list(
    map(lambda image: process_cedt(torch.Tensor(image).flatten(), device="cpu"), tqdm(X_train))
)
test_diagrams = list(
    map(lambda image: process_cedt(torch.Tensor(image).flatten(), device="cpu"), tqdm(X_test))
)
push_diagrams_chinese_mnist(train_diagrams, test_diagrams, y_train, y_test, "cedt")

100%|██████████| 12000/12000 [01:10<00:00, 171.24it/s]
100%|██████████| 3000/3000 [00:17<00:00, 167.32it/s]


In [34]:
# One pair of saved Chinese MNIST datasets per thickening value.
for thickening in (1, 2, 3, 5, 8):
    train_diagrams = list(
        map(
            lambda image: process_cedt_thickening(torch.Tensor(image).flatten(), thickening, device="cpu"),
            tqdm(X_train),
        )
    )
    test_diagrams = list(
        map(
            lambda image: process_cedt_thickening(torch.Tensor(image).flatten(), thickening, device="cpu"),
            tqdm(X_test),
        )
    )
    push_diagrams_chinese_mnist(train_diagrams, test_diagrams, y_train, y_test, f"cedt_thickening_{thickening}")

100%|██████████| 12000/12000 [01:11<00:00, 167.18it/s]
100%|██████████| 3000/3000 [00:18<00:00, 166.64it/s]
100%|██████████| 12000/12000 [01:12<00:00, 166.41it/s]
100%|██████████| 3000/3000 [00:18<00:00, 161.60it/s]
100%|██████████| 12000/12000 [01:13<00:00, 163.31it/s]
100%|██████████| 3000/3000 [00:18<00:00, 164.42it/s]
100%|██████████| 12000/12000 [01:12<00:00, 164.89it/s]
100%|██████████| 3000/3000 [00:18<00:00, 160.54it/s]
100%|██████████| 12000/12000 [01:13<00:00, 163.55it/s]
100%|██████████| 3000/3000 [00:18<00:00, 163.14it/s]


In [35]:
# One pair of saved Chinese MNIST datasets per number of directions.
for dir_count in (1, 4, 10):
    # dir_count evenly spaced values in [0, 2*pi), starting at 0.
    dirs = np.arange(dir_count) / dir_count * 2 * np.pi
    train_diagrams = list(
        map(lambda image: process_image(torch.Tensor(image).flatten(), dirs, device="cpu"), tqdm(X_train))
    )
    test_diagrams = list(
        map(lambda image: process_image(torch.Tensor(image).flatten(), dirs, device="cpu"), tqdm(X_test))
    )
    push_diagrams_chinese_mnist(train_diagrams, test_diagrams, y_train, y_test, f"directional_{dir_count}")

100%|██████████| 12000/12000 [01:12<00:00, 165.59it/s]
100%|██████████| 3000/3000 [00:17<00:00, 168.98it/s]
100%|██████████| 12000/12000 [04:51<00:00, 41.17it/s]
100%|██████████| 3000/3000 [01:13<00:00, 40.81it/s]
100%|██████████| 12000/12000 [12:09<00:00, 16.46it/s] 
100%|██████████| 3000/3000 [02:59<00:00, 16.75it/s]


In [36]:
# One pair of saved Chinese MNIST datasets per number of convolution output channels.
for conv_count in (1, 4, 10):
    # Reseed before building the layer so each Conv2d starts from the same
    # random state.
    torch.random.manual_seed(42)
    conv = nn.Conv2d(1, conv_count, kernel_size=3)
    train_diagrams = list(
        map(lambda image: process_by_conv(torch.Tensor(image).flatten(), conv, device="cpu"), tqdm(X_train))
    )
    test_diagrams = list(
        map(lambda image: process_by_conv(torch.Tensor(image).flatten(), conv, device="cpu"), tqdm(X_test))
    )
    push_diagrams_chinese_mnist(train_diagrams, test_diagrams, y_train, y_test, f"convolution_{conv_count}")

100%|██████████| 12000/12000 [01:24<00:00, 142.41it/s]
100%|██████████| 3000/3000 [00:22<00:00, 135.28it/s]
100%|██████████| 12000/12000 [05:26<00:00, 36.70it/s]
100%|██████████| 3000/3000 [01:23<00:00, 36.04it/s]
100%|██████████| 12000/12000 [13:45<00:00, 14.53it/s] 
100%|██████████| 3000/3000 [03:23<00:00, 14.75it/s]


In [37]:
# Save the Chinese MNIST images and their labels as ImagesDataset objects next
# to the diagram datasets.
train_dataset, test_dataset = (
    ImagesDataset(X_train, y_train),
    ImagesDataset(X_test, y_test),
)

torch.save(train_dataset, f"../data/chinese-mnist/images_train.pt")
torch.save(test_dataset, f"../data/chinese-mnist/images_test.pt")