# Digits dataset experiment

### 1. Import libraries

In [None]:
from sklearn.datasets import load_digits
from sklearn.preprocessing import normalize
import numpy as np
import auxiliary_fun as a
from keras import utils
from keras.models import Sequential
from keras.layers import Dense, Activation

### 2. Load the digits dataset and normalize it.

In [None]:
# Load the scikit-learn handwritten-digits dataset.
digits = load_digits()
# Class labels, one per sample.
y = digits.target
# Feature matrix after sklearn's `normalize` with its default settings.
data = normalize(digits.data)

### 3. Obtaining the dominating dataset with $\varepsilon = 0.2$

In [None]:
import time

# Time the extraction of the dominating subset, one digit class at a time.
start = time.time()

# a.dominatingSet is called once per class, receiving only that class's samples
# and labels. Each result keeps its original name (d0 ... d9) and is used below
# to index into the samples of the same class.
d0, d1, d2, d3, d4, d5, d6, d7, d8, d9 = (
    a.dominatingSet(data[y == digit], y[y == digit], epsilon=0.2)
    for digit in range(10)
)
dominating = (d0, d1, d2, d3, d4, d5, d6, d7, d8, d9)

# Stack the selected samples and their labels for all classes, in class order.
domdata = np.concatenate(
    [data[y == digit][selected] for digit, selected in enumerate(dominating)]
)
domy = np.concatenate(
    [y[y == digit][selected] for digit, selected in enumerate(dominating)]
)

end = time.time()
print(abs(start-end))

### 4. Generation of a random subset with the same size as the dominating dataset.

In [None]:
# `random` is imported here because no earlier cell brings it into scope;
# without this import the cell fails with a NameError.
import random

# Draw, without replacement, as many sample positions as the dominating
# dataset has rows, so that both subsets have the same size.
random_index = random.sample(range(len(data)), len(domdata))
Xrand = data[random_index]
yrand = y[random_index]

### 5. Perceptron training.

In [None]:
# One-hot encode the labels over 10 classes, as required by the
# categorical cross-entropy loss used below.
# Both names are overwritten in place: re-running this cell would encode the
# already-encoded arrays a second time.
domy = utils.to_categorical(domy, num_classes=10)
y = utils.to_categorical(y, num_classes=10)

In [None]:
# Network with one hidden layer: 64 inputs -> 32 sigmoid units -> 10-way softmax.
model = Sequential([
    Dense(units=32, activation='sigmoid', input_shape=(64,)),
    Dense(units=10, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer="sgd", metrics=['accuracy'])

# Full-batch training on the whole dataset; the elapsed wall-clock time is printed.
start = time.time()
model.fit(data, y, epochs=100000, batch_size=len(data), verbose=False)
end = time.time()
print(abs(end - start))

In [None]:
# Same architecture as the first model, trained on the dominating dataset only.
model2 = Sequential([
    Dense(units=32, activation='sigmoid', input_shape=(64,)),
    Dense(units=10, activation='softmax'),
])
model2.compile(loss='categorical_crossentropy', optimizer="sgd", metrics=['accuracy'])

# Full-batch training on the dominating dataset; the elapsed wall-clock time is printed.
start = time.time()
model2.fit(domdata, domy, epochs=100000, batch_size=len(domdata), verbose=False)
end = time.time()
print(abs(end - start))

#### 100 iterations of the training (takes a lot of time).

With the original dataset...

In [None]:
# `random` is not imported by any earlier cell; import it here so the cell runs.
import random

# Accuracy of a model trained on the full dataset, measured over 100 runs on:
#   l1 - the original dataset, l2 - the dominating dataset,
#   l3 - a random subset with as many rows as the dominating dataset.
l1 = []
l2 = []
l3 = []
for i in range(100):
    # Fresh random subset for this run (y is already one-hot encoded here).
    random_index = random.sample(range(len(data)), len(domdata))
    Xrand = data[random_index]
    yrand = y[random_index]

    # A new, untrained network for every run.
    model = Sequential()
    model.add(Dense(units=32, activation='sigmoid', input_shape=(64,)))
    model.add(Dense(units=10, activation='softmax'))
    model.compile(optimizer="sgd", loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(data, y, batch_size=1, epochs=20, verbose=False)

    # evaluate() returns [loss, accuracy]; only the accuracy is stored.
    # verbose=0 suppresses the per-call progress bar, matching the
    # random-dataset experiment further down.
    l1.append(model.evaluate(data, y, verbose=0)[1])
    l2.append(model.evaluate(domdata, domy, verbose=0)[1])
    l3.append(model.evaluate(Xrand, yrand, verbose=0)[1])

Metrics of the training over the different datasets

In [None]:
# Best and then average accuracy over the runs, each reported for
# l1, l2 and l3 in that order.
for summarise in (np.max, np.mean):
    for accuracies in (l1, l2, l3):
        print(summarise(accuracies))

In [None]:
# Loss and metrics of the last trained `model` on each of the three datasets.
for caption, features, targets in (
    ("Evaluation over original dataset: ", data, y),
    ("Evaluation over dominating dataset: ", domdata, domy),
    ("Evaluation over random dataset; ", Xrand, yrand),
):
    print(caption, model.evaluate(features, targets))

With the dominating dataset...

In [None]:
# Accuracy of a model trained on the dominating dataset, measured over 100 runs on:
#   l1 - the dominating dataset itself, l2 - the original dataset.
l1 = []
l2 = []
for i in range(100):
    # A new, untrained network for every run.
    model2 = Sequential()
    model2.add(Dense(units=32, activation='sigmoid', input_shape=(64,)))
    model2.add(Dense(units=10, activation='softmax'))
    model2.compile(optimizer="sgd", loss='categorical_crossentropy', metrics=['accuracy'])
    model2.fit(domdata, domy, batch_size=1, epochs=9*20, verbose=False)

    # evaluate() returns [loss, accuracy]; only the accuracy is stored.
    # verbose=0 suppresses the per-call progress bar, matching the
    # random-dataset experiment further down.
    l1.append(model2.evaluate(domdata, domy, verbose=0)[1])
    l2.append(model2.evaluate(data, y, verbose=0)[1])

Metrics of the training over the different datasets

In [None]:
# l1: accuracy of model2 on the dominating dataset (its training data).
# l2: accuracy of model2 on the original dataset.
# The dominating-dataset experiment fills only l1 and l2. Any l3 still in
# memory belongs to the earlier original-dataset experiment, so it is not
# reported here.
# Best accuracies first, then the averages.
for summarise in (np.max, np.mean):
    print(summarise(l1))
    print(summarise(l2))

In [None]:
# Loss and metrics of the last trained `model2` on each of the three datasets.
for caption, features, targets in (
    ("Evaluation over original dataset: ", data, y),
    ("Evaluation over dominating dataset: ", domdata, domy),
    ("Evaluation over random dataset; ", Xrand, yrand),
):
    print(caption, model2.evaluate(features, targets))

With the random dataset...

In [None]:
# `random` is not imported by any earlier cell; import it here so the cell runs.
import random

# Accuracy of a model trained on a random subset, measured over 100 runs on:
#   l1 - the random subset itself, l2 - the original dataset.
l1 = []
l2 = []
for i in range(100):
    # Fresh random subset with as many rows as the dominating dataset.
    random_index = random.sample(range(len(data)), len(domdata))
    Xrand = data[random_index]
    yrand = y[random_index]

    # A new, untrained network for every run.
    model3 = Sequential()
    model3.add(Dense(units=32, activation='sigmoid', input_shape=(64,)))
    model3.add(Dense(units=10, activation='softmax'))
    model3.compile(optimizer="sgd", loss='categorical_crossentropy', metrics=['accuracy'])
    model3.fit(Xrand, yrand, batch_size=1, epochs=9*20, verbose=False)

    # evaluate() returns [loss, accuracy]; only the accuracy is stored.
    l1.append(model3.evaluate(Xrand, yrand, verbose=0)[1])
    l2.append(model3.evaluate(data, y, verbose=0)[1])
print("Mean accuracy over itself:", np.mean(np.array(l1)))
print("Mean accuracy over full dataset:", np.mean(np.array(l2)))

In [None]:
# l1: accuracy of model3 on the random subset it was trained on.
# l2: accuracy of model3 on the original dataset.
# The random-dataset experiment fills only l1 and l2. Any l3 still in memory
# belongs to the earlier original-dataset experiment, so it is not reported here.
# Best accuracies first, then the averages.
for summarise in (np.max, np.mean):
    print(summarise(l1))
    print(summarise(l2))

In [None]:
# Loss and metrics of the last trained `model3` on each of the three datasets.
for caption, features, targets in (
    ("Evaluation over original dataset: ", data, y),
    ("Evaluation over dominating dataset: ", domdata, domy),
    ("Evaluation over random dataset; ", Xrand, yrand),
):
    print(caption, model3.evaluate(features, targets))

### 6. Persistent homology

#### Persistence diagrams

In [None]:
from ripser import ripser, plot_dgms

# Persistence diagrams of the original, dominating and random point clouds.
# maxdim=0 limits the computation to homology dimension 0.
diagrams_or, diagrams_Sub, diagrams_Rand = (
    ripser(points, maxdim=0)['dgms'] for points in (data, domdata, Xrand)
)

Plot of the persistence diagrams

In [None]:
# Show each persistence diagram under a heading naming its dataset.
for heading, diagrams in (
    ("Original dataset", diagrams_or),
    ("Dominating dataset", diagrams_Sub),
    ("Random dataset", diagrams_Rand),
):
    print(heading)
    plot_dgms(diagrams, show=True)

#### Bottleneck distance

In [None]:
import gudhi as g

# Each entry: (message prefix, diagrams compared against the original dataset's
# diagrams, homology dimension).
comparisons = [
    ("Bottleneck distance for dominating dataset and dimension 0 =", diagrams_Sub, 0),
    ("Bottleneck distance for dominating dataset and dimension 1 =", diagrams_Sub, 1),
    ("Bottleneck distance for Dominating dataset and dimension 2 =", diagrams_Sub, 2),
    ("Bottleneck distance for Random dataset and dimension 0 =", diagrams_Rand, 0),
    ("Bottleneck distance for Random dataset and dimension 1 =", diagrams_Rand, 1),
    ("Bottleneck distance for Random dataset and dimension 2 =", diagrams_Rand, 2),
]

for prefix, diagrams, dim in comparisons:
    # The diagram lists only contain the dimensions ripser was asked for
    # (maxdim=0 above yields dimension 0 only). Indexing a dimension that was
    # not computed used to abort the cell with an IndexError; report it instead.
    if dim >= min(len(diagrams_or), len(diagrams)):
        message = prefix + "n/a (dimension not computed; raise maxdim in ripser)"
    else:
        message = prefix + '%.2f' % g.bottleneck_distance(diagrams_or[dim], diagrams[dim])
    print(message)

#### Hausdorff distance

In [None]:
from scipy.spatial.distance import directed_hausdorff

# The symmetric Hausdorff distance is the larger of the two directed distances.
for caption, subset in (
    ("Hausdorff distance between the original dataset and the dominating dataset: ", domdata),
    ("Hausdorff distance between the original dataset and the random dataset: ", Xrand),
):
    forward = directed_hausdorff(data, subset)[0]
    backward = directed_hausdorff(subset, data)[0]
    print(caption, max(forward, backward))

### 7. T-SNE plots

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
import seaborn as sns

# Seed value kept alongside the plotting configuration.
RS = 123

# Global seaborn appearance used by the scatter plots below.
sns.set_style('darkgrid')
sns.set_palette('muted')
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
def fashion_scatter(x, colors):
    """Draw a 2-D scatter plot coloured by class, with a label on each class.

    Args:
        x: 2-D array; columns 0 and 1 are used as the plot coordinates.
        colors: 1-D array of class labels, one per row of ``x``. The labels
            index directly into a palette with one colour per distinct label,
            so they are expected to be the integers
            ``0 .. number_of_distinct_labels - 1``.

    Returns:
        Tuple ``(figure, axes, scatter, texts)`` where ``texts`` is the list
        of text artists, one per class.
    """
    # One "hls" colour per distinct label.
    num_classes = len(np.unique(colors))
    palette = np.array(sns.color_palette("hls", num_classes))

    # Create the scatter plot.
    f = plt.figure(figsize=(8, 8))
    ax = plt.subplot(aspect='equal')
    # The builtin `int` replaces `np.int`, an alias removed in NumPy 1.24
    # that now raises AttributeError.
    sc = ax.scatter(x[:, 0], x[:, 1], lw=0, s=40, c=palette[colors.astype(int)])
    plt.xlim(-25, 25)
    plt.ylim(-25, 25)
    ax.axis('off')
    ax.axis('tight')

    # Put each class label at the median position of that class's points,
    # outlined in white so it stays readable on top of the markers.
    txts = []
    for i in range(num_classes):
        xtext, ytext = np.median(x[colors == i, :], axis=0)
        txt = ax.text(xtext, ytext, str(i), fontsize=24)
        txt.set_path_effects([
            PathEffects.Stroke(linewidth=5, foreground="w"),
            PathEffects.Normal()])
        txts.append(txt)

    return f, ax, sc, txts

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE embedding of the full dataset. The RS seed defined with the
# plotting setup is passed so the embedding is reproducible.
Xs_tsne_2D_or = TSNE(n_components=2, perplexity=55, random_state=RS).fit_transform(data)

# `y` has been one-hot encoded for training by this point, but fashion_scatter
# needs one integer label per sample, so the dataset's own labels are used.
labels = digits.target

# `index` was never defined. It is rebuilt here as the positions, within the
# full dataset, of the dominating samples: for every class, the positions of
# that class's samples are indexed with the class's dominating-set result.
# NOTE(review): assumes d0..d9 are index arrays or boolean masks, as their use
# in `data[y==k][dk]` suggests - confirm against auxiliary_fun.dominatingSet.
index = np.concatenate([
    np.flatnonzero(labels == digit)[selected]
    for digit, selected in enumerate((d0, d1, d2, d3, d4, d5, d6, d7, d8, d9))
])

fashion_scatter(Xs_tsne_2D_or, labels)
fashion_scatter(Xs_tsne_2D_or[index], labels[index])
fashion_scatter(Xs_tsne_2D_or[random_index], labels[random_index])