In [1]:
# --- Import Libraries
import copy
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model, Sequential, layers, losses, optimizers
from sklearn import random_projection
from sklearn.metrics import accuracy_score

from data import Dataset

**Notes:** 
* https://link.springer.com/article/10.1007/s10044-018-0697-0
* https://keras.io/examples/vision/grad_cam/

In [2]:
# --- Autoselect GPU
# Per this cell's recorded output, autoselect() sets CUDA_VISIBLE_DEVICES for the kernel process.
# NOTE(review): presumably this has to run before TensorFlow first initialises its devices — confirm.
from jarvis.utils.general import gpus
gpus.autoselect()

[ 2021-04-27 12:44:59 ] CUDA_VISIBLE_DEVICES automatically set to: 1           


In [3]:
# --- Model Blocks
def conv(x, features, dr=1, name=None):
    """Apply a new Conv1D layer (kernel size 1, stride 1, 'same' padding) to `x`.

    `features` is the number of filters, `dr` the dilation rate and `name`
    an optional layer name.
    """
    layer = layers.Conv1D(
        filters=features,
        kernel_size=1,
        strides=1,
        dilation_rate=dr,
        padding='same',
        name=name,
    )
    return layer(x)


def elu(x):
    """Apply a new ELU activation layer to `x`."""
    return layers.ELU()(x)


def norm(x):
    """Apply a new BatchNormalization layer to `x`."""
    return layers.BatchNormalization()(x)


def mlp(x, features, dr=1):
    """Chain conv -> batch norm -> ELU on `x` and return the activated tensor."""
    projected = conv(x, features, dr)
    normalized = norm(projected)
    return elu(normalized)

# --- Model Architecture
def encoder(x, features):
    """Run `x` through one `mlp` block per entry of `features`, in the given order.

    Each entry is passed to `mlp` as its `features` argument. The output of
    the last block is returned; with an empty `features`, `x` comes back as is.
    """
    hidden = x
    for width in features:
        hidden = mlp(hidden, width)
    return hidden

def decoder(x, features):
    """Run `x` through one `mlp` block per entry of `features`, last entry first.

    The caller's `features` sequence is not modified; a reversed copy is
    iterated instead.
    """
    hidden = x
    for width in features[::-1]:
        hidden = mlp(hidden, width)
    return hidden

def network(dataset):
    """Build the two-output Keras model: a classifier and a reconstruction head.

    The input tensor is `dataset.Input`. It is encoded through mlp blocks of
    128 and 64 features, squeezed to a 32-feature embedding, then decoded
    through mlp blocks of 64 and 128 features. Two `conv` heads are attached:

    * `dataset.l1`: 2 channels computed from the embedding (classification)
    * `dataset.l2`: as many channels as the input's last dimension, computed
      from the decoder output (reconstruction)

    Returns a `Model` whose outputs are a dict keyed by `dataset.l1` and
    `dataset.l2`. Note that this also resets TensorFlow's global seed to 0.
    """
    tf.random.set_seed(0)

    inputs = dataset.Input
    n_inputs = inputs.shape[-1]
    hidden_sizes = [128, 64]
    embed_size = 32

    # Encoder -> bottleneck -> decoder.
    encoded = encoder(inputs, hidden_sizes)
    embedding = mlp(encoded, embed_size)
    decoded = decoder(embedding, hidden_sizes)

    # Output heads; the classification head is created before the reconstruction head.
    class_head = conv(embedding, 2, name=dataset.l1)
    recon_head = conv(decoded, n_inputs, name=dataset.l2)

    return Model(inputs, {dataset.l1: class_head, dataset.l2: recon_head})
    
# --- Model Compile
def compile_(model, dataset, lr=1e-4):
    """Compile `model` with Adam and per-output losses/metrics, then return it.

    The `dataset.l1` output uses sparse categorical cross-entropy on logits
    and reports accuracy; the `dataset.l2` output uses MSE and reports mean
    absolute error. `lr` is the Adam learning rate.
    """
    adam = optimizers.Adam(learning_rate=lr)
    per_output_loss = {
        dataset.l1: losses.SparseCategoricalCrossentropy(from_logits=True),
        dataset.l2: losses.MSE,
    }
    per_output_metrics = {
        dataset.l1: 'accuracy',
        dataset.l2: 'mean_absolute_error',
    }

    model.compile(
        optimizer=adam,
        loss=per_output_loss,
        metrics=per_output_metrics,
        experimental_run_tf_function=False,
    )
    return model

# --- Model Trainer
def train(model, dataset):
    """Fit `model` on the training split of `dataset` and return the fit history.

    Targets are keyed by output name: `dataset.l1` is trained against the
    labels (`ytr`) and `dataset.l2` against the inputs themselves (`xtr`).
    The test split (`xte`, `yte`) is used as validation data, evaluated every
    10th epoch. `dataset.weights` is passed as the class weighting for the
    `dataset.l1` output only.
    """
    train_targets = {dataset.l1: dataset.ytr, dataset.l2: dataset.xtr}
    valid_targets = {dataset.l1: dataset.yte, dataset.l2: dataset.xte}

    return model.fit(
        x=dataset.xtr,
        y=train_targets,
        batch_size=4,
        epochs=80,
        validation_data=(dataset.xte, valid_targets),
        validation_freq=10,
        class_weight={dataset.l1: dataset.weights},
        verbose=0,
    )

In [4]:
def summarize(history):
    """Print the last recorded train/validation metrics from a fit history.

    `history.history` must map each of the four metric names below to a
    non-empty sequence; the final element of each is printed on its own line.
    The key names are hard-coded for outputs called 'classification' and
    'reconstruction'.
    """
    report = (
        ('train acc: {}', 'classification_accuracy'),
        ('train reconstruction error: {}', 'reconstruction_mean_absolute_error'),
        ('valid acc: {}', 'val_classification_accuracy'),
        ('valid reconstruction error: {}', 'val_reconstruction_mean_absolute_error'),
    )
    recorded = history.history
    for template, key in report:
        print(template.format(recorded[key][-1]))

def pred_one_class_acc(y):
    """Return the accuracy of always predicting the most frequent label in `y`.

    This is the baseline reported as "Dominant class acc". The previous
    implementation returned the mean of the labels, which equals this value
    only when labels are 0/1 and class 1 is the majority; it under-reported
    the baseline whenever class 0 dominated.

    Args:
        y: array-like of class labels (any shape; it is flattened).

    Returns:
        float in [0, 1]: share of samples carrying the most common label,
        or 0.0 for empty input.
    """
    labels = np.asarray(y).ravel()
    if labels.size == 0:
        return 0.0
    # Count every distinct label so the result does not depend on which class dominates.
    _, counts = np.unique(labels, return_counts=True)
    return float(counts.max() / labels.size)
    
def evaluate(model, data, labels):
    """Predict on `data` and print the baseline and model accuracy against `labels`.

    When `model.predict` returns a list, only its first entry is scored.
    Predicted classes are the argmax over the last axis.
    """
    # test on whole dataset
    raw = model.predict(data)

    # NOTE(review): only a plain list return is unpacked here; a dict return is not handled — confirm
    # what this TensorFlow version returns for a model with dict outputs.
    scores = raw[0] if type(raw) == list else raw
    predicted = np.argmax(scores, axis=-1)

    print('Dominant class acc: {}'.format(pred_one_class_acc(labels)))
    print('Model pred acc: {}'.format(accuracy_score(labels, predicted)))

In [8]:
def run_pipeline(path, mode):
    """Load the data at `path`, select features with `mode`, train and report.

    Steps: build a `Dataset` with a 70% training split, run feature selection
    (percentile=10), build and compile the network, train it, then print the
    last-epoch history metrics followed by accuracy on the train and test
    splits. Nothing is returned; the model and history stay local.
    """
    # --- Prepare Data + Feature Selection
    dataset = Dataset(path, train_size=0.7)
    dataset.feature_selection(percentile=10, mode=mode)

    # --- Prepare + Train Model
    model = compile_(network(dataset), dataset)
    history = train(model, dataset)

    # --- Summarize Results
    print()
    print('Training results from {}'.format(mode))
    summarize(history)
    print('Evaluating results from {}'.format(mode))
    splits = (
        ('Train', dataset.xtr, dataset.ytr),
        ('Test', dataset.xte, dataset.yte),
    )
    for split_name, split_x, split_y in splits:
        print(split_name)
        evaluate(model, split_x, split_y)
    print()

In [9]:
# --- Train Models
# Run the whole pipeline once per feature-selection mode on the same CSV.
path = 'data/ctrl_vs_case.csv'
# NOTE(review): per the recorded output, 'default' is not one of the selector's known modes
# ({chi, mutual_info}) and falls back to the default features.
modes = ['default', 'chi', 'mutual_info']

for mode in modes:
    run_pipeline(path, mode)

default not in {chi, mutual_info} so using default features

Training results from default
train acc: 0.4646017551422119
train reconstruction error: 518.4947509765625
valid acc: 0.4818652868270874
valid reconstruction error: 513.9679565429688
Evaluating results from default
Train
Dominant class acc: 0.8157894736842123
Model pred acc: 0.5701754385964912
Test
Dominant class acc: 0.8163265306122455
Model pred acc: 0.3673469387755102


Training results from chi
train acc: 0.4247787594795227
train reconstruction error: 3193.3564453125
valid acc: 0.5336787700653076
valid reconstruction error: 3175.160400390625
Evaluating results from chi
Train
Dominant class acc: 0.8157894736842123
Model pred acc: 0.5263157894736842
Test
Dominant class acc: 0.8163265306122455
Model pred acc: 0.5918367346938775


Training results from mutual_info
train acc: 0.47345131635665894
train reconstruction error: 471.717041015625
valid acc: 0.6165803074836731
valid reconstruction error: 470.0350646972656
Evaluating re

# Notes

In [None]:
# Nonlinear [1024, 512, 256] embed 128
# NOTE(review): `history` is not assigned at notebook scope in any visible cell (run_pipeline keeps
# it local), so this cell presumably relied on an earlier notebook state — confirm before re-running.
# Prints the last recorded value of: train acc, train reconstruction MAE, val acc, val reconstruction MAE.
print(history.history['classification_accuracy'][-1])
print(history.history['reconstruction_mean_absolute_error'][-1])

print(history.history['val_classification_accuracy'][-1])
print(history.history['val_reconstruction_mean_absolute_error'][-1])


# Hand-recorded results. Each group of four numbers is presumably in the print order above.
# (pred best class 0.8163265306122455)

# no class_weights 
# 0.7920354
# 489.65833
# 0.8134715
# 476.4897

# class_weights
# 0.53539824
# 490.59412
# 0.74093264
# 476.9029

# [256] class_weights
# 0.5132743
# 514.46576
# 0.54404145
# 513.9966

# [128, 64] class_weights embed 32
# 0.53097343
# 518.819
# 0.7098446
# 515.60516

# Deeper is better than wider!

In [17]:
# Linear [1024, 512, 256] embed 128
# NOTE(review): `history` is not assigned at notebook scope in any visible cell (run_pipeline keeps
# it local), so this cell presumably relied on an earlier notebook state — confirm before re-running.
# Prints the last recorded value of: train acc, train reconstruction MAE, val acc, val reconstruction MAE.
print(history.history['classification_accuracy'][-1])
print(history.history['reconstruction_mean_absolute_error'][-1])

print(history.history['val_classification_accuracy'][-1])
print(history.history['val_reconstruction_mean_absolute_error'][-1])

# Hand-recorded result, presumably in the print order above. The cell's recorded output differs
# from these numbers, so it presumably comes from a different configuration — confirm.
# no class_weights 
# 0.8097345
# 506.56668
# 0.7512953
# 497.98688

0.5176991
506.6457
0.3834197
500.44925


# Classical machine-learning baselines: random projection, LLE, PCA, KNN and random forest

In [12]:
# Gaussian random projection of the squeezed feature matrix.
# n_components is left at its default, so the estimator chooses the output width itself
# (4366 columns for 163 rows according to the recorded output).
# No random_state is passed, so the projection presumably differs between runs.
# NOTE(review): `dataset` is not defined at notebook scope in any visible cell — confirm where it comes from.
transformer = random_projection.GaussianRandomProjection()
X_new = transformer.fit_transform(dataset.features.squeeze())
print(X_new.shape)

(163, 4366)


In [13]:
# Show the projected matrix as the cell's output (NumPy elides the middle of large arrays).
X_new

array([[  7794.5112722 ,  25466.99115922,  50625.08150119, ...,
        -18741.85192469, -14975.12450545, -24716.42935759],
       [ 11071.49525594,  20976.92404719,  39328.86054669, ...,
        -14292.56014547,  -6918.82888456, -19860.47128973],
       [  8011.61620477,  17182.79149171,  40489.7180404 , ...,
        -12251.25835009,  -8394.20052692, -15792.59517772],
       ...,
       [ 13475.77548306,  21790.79481885,  44833.57616194, ...,
        -12127.87087676, -15508.98270711, -16103.80338758],
       [  7025.02032196,  16959.86900662,  42436.73625363, ...,
        -14074.24514158, -10336.02692565, -16806.70632879],
       [ 10728.48657413,  19286.38915573,  44174.01361244, ...,
        -17721.62286345, -15242.08650982, -21161.04696759]])

In [74]:
# Reference (manifold learning, Python Data Science Handbook): https://jakevdp.github.io/PythonDataScienceHandbook/05.10-manifold-learning.html

In [5]:
# Modified locally linear embedding of the squeezed features into 3 components
# (30 neighbours, dense eigen-solver).
# NOTE(review): `dataset` is not defined at notebook scope in any visible cell — confirm where it comes from.
from sklearn.manifold import LocallyLinearEmbedding
model = LocallyLinearEmbedding(n_neighbors=30, n_components=3, method='modified',
                               eigen_solver='dense')

# `out` holds the embedded coordinates returned by fit_transform.
out = model.fit_transform(dataset.features.squeeze())

In [None]:
# 3-D scatter of the first three columns of `out`.
# NOTE(review): neither `plt` nor `Axes3D` is imported in any visible cell, and this cell has no
# execution count — confirm the imports (matplotlib.pyplot / mpl_toolkits.mplot3d presumably) before running.
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(out[:, 0], out[:, 1], out[:, 2])


plt.show()

In [7]:
# Squeezed copy of the full feature matrix; no visible cell reads `X` afterwards.
# NOTE(review): `dataset` is not defined at notebook scope in any visible cell — confirm where it comes from.
X = dataset.features.squeeze()

In [13]:
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


# Squeeze each train/test split and convert it to a NumPy array for the scikit-learn baselines.
# NOTE(review): `dataset` is not defined at notebook scope in any visible cell — confirm where it comes from.
Xtr, Xte, Ytr, Yte = [np.array(data.squeeze()) for data in [dataset.xtr, dataset.xte, dataset.ytr, dataset.yte]]


def pca(x, n=10, verbose=False):
    """Fit a PCA with `n` components on `x` and return the fitted estimator.

    With `verbose=True`, the cumulative explained-variance ratio is also
    plotted against the component index before returning.
    """
    fitted = PCA(n).fit(x)
    if not verbose:
        return fitted

    # NOTE(review): `plt` is not imported in any visible cell — confirm before using verbose=True.
    cumulative = np.cumsum(fitted.explained_variance_ratio_)
    plt.plot(cumulative)
    plt.xlabel('n components')
    plt.ylabel('cumulative variance')
    return fitted


def results(model, xtr, ytr, xte, yte):
    """Print accuracy of a fitted `model` on the train and test splits.

    For each split two lines are printed: the accuracy of `model.predict`
    against the labels, and `sum(labels) / len(labels)` under the heading
    "pick als". The latter is presumably the accuracy of always predicting
    the positive (ALS) class, assuming 0/1 labels — confirm.
    """
    splits = (
        ('train: {}', 'train pick als: {}', xtr, ytr),
        ('test: {}', 'test pick als: {}', xte, yte),
    )
    for acc_template, baseline_template, features, labels in splits:
        print(acc_template.format(accuracy_score(labels, model.predict(features))))
        print(baseline_template.format(sum(labels)/len(labels)))
    
    

def knn(xtr, ytr, xte, yte, n=3):
    """Fit a k-nearest-neighbours classifier and print its train/test accuracy.

    `n` is the number of neighbours. The classifier is fitted on
    (`xtr`, `ytr`) and reported through `results`; nothing is returned.
    """
    classifier = KNeighborsClassifier(n_neighbors=n).fit(xtr, ytr)
    results(classifier, xtr, ytr, xte, yte)
    
    
def rf(xtr, ytr, xte, yte, n=1):
    """Fit a random forest and print its train/test accuracy.

    The forest is fixed at 500 trees of depth 2 with balanced class weights
    and `random_state=0`. It is fitted on (`xtr`, `ytr`) and reported through
    `results`; nothing is returned. The `n` parameter is accepted but not used.
    """
    forest = RandomForestClassifier(
        n_estimators=500,
        max_depth=2,
        random_state=0,
        class_weight='balanced',
    ).fit(xtr, ytr)
    results(forest, xtr, ytr, xte, yte)

In [14]:
# pca
# Fit PCA on the training split only, then project both splits with the same fitted transform.
pc = pca(Xtr)
xtr = pc.transform(Xtr)
xte = pc.transform(Xte)

# Each classifier is run twice: on the raw features (upper-case X*), then on the PCA projection (lower-case x*).
# Per the recorded output, every test accuracy equals the "pick als" baseline printed next to it.
print('- KNN')
knn(Xtr, Ytr, Xte, Yte)
print('- PCA')
knn(xtr, Ytr, xte, Yte)
print()
print('- RF')
rf(Xtr, Ytr, Xte, Yte)
print('- PCA')
rf(xtr, Ytr, xte, Yte)

- KNN
train: 0.8561643835616438
train pick als: 0.815068493150685
test: 0.8235294117647058
test pick als: 0.8235294117647058
- PCA
train: 0.8424657534246576
train pick als: 0.815068493150685
test: 0.8235294117647058
test pick als: 0.8235294117647058

- RF
train: 1.0
train pick als: 0.815068493150685
test: 0.8235294117647058
test pick als: 0.8235294117647058
- PCA
train: 0.8767123287671232
train pick als: 0.815068493150685
test: 0.8235294117647058
test pick als: 0.8235294117647058
