In [None]:
# --- Import Libraries
import copy
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model, Sequential, layers, losses, optimizers
from sklearn import random_projection
from sklearn.metrics import accuracy_score

from op import *
from ae import *
from utils import *
from metrics import *
from trainer import *
from data import Dataset

# Limit NumPy's floating-point display to 3 digits of precision (affects printing only).
np.set_printoptions(precision=3)

**Notes:** 

**Deeper is better than wider!**
**|**
**Nonlinear > Linear!**
**|**
**Small batch size = smaller loss!**
* https://link.springer.com/article/10.1007/s10044-018-0697-0
* https://keras.io/examples/vision/grad_cam/

In [None]:
# --- Autoselect GPU
# NOTE(review): `gpus.autoselect()` comes from the external `jarvis` package; presumably it
# picks an available GPU for this kernel — confirm against the jarvis documentation.
from jarvis.utils.general import gpus
gpus.autoselect()

In [None]:
def run_pipeline(dataset, mode, batch_size=64, epochs=60, n_folds=1, seed=0):
    """Run feature selection on `dataset` and train a model with `learn`.

    Parameters
    ----------
    dataset : Dataset
        Object exposing `feature_selection(...)`, `features_` and `features`.
        It is modified in place by the feature-selection call.
    mode : str
        Feature-selection mode forwarded to `dataset.feature_selection`
        (the notebook uses 'no', 'chi' and 'mutual_info').
    batch_size : int, default 64
        Forwarded to `learn`.
    epochs : int, default 60
        Forwarded to `learn`.
    n_folds : int, default 1
        Forwarded to `learn`; pass e.g. 5 for a multi-fold run instead of
        editing the function body.
    seed : int, default 0
        Seed applied to both TensorFlow's and NumPy's global generators.

    Returns
    -------
    tuple
        `(history, model)` exactly as returned by `learn`.
    """
    # --- Reproducibility
    tf.random.set_seed(seed)
    np.random.seed(seed)

    # --- Feature Selection
    dataset.feature_selection(norm=False, percentile=10, mode=mode)
    # Shapes of both feature attributes, printed for a quick sanity check.
    print(dataset.features_.shape)
    print(dataset.features.shape)

    # --- Train Model
    print('Training using {} feature selection'.format(mode))
    history, model = learn(dataset, batch_size=batch_size, epochs=epochs, n_folds=n_folds)

    return history, model

In [None]:
# --- Prepare Data
# The guard skips construction when a global named `dataset` already exists in the kernel.
# NOTE(review): because of this, editing `path` or `train_size` and re-running the cell has
# no effect until `dataset` is deleted or the kernel is restarted.
if 'dataset' not in globals():
    # One CSV per classification task; all three are handed to a single Dataset.
    path = ['data/ctrl_vs_case.csv', 'data/bulbar_vs_limb.csv', 'data/median_low_vs_high.csv']
    dataset = Dataset(path, train_size=0.7)

In [None]:
# --- Train Models
# Other modes that can be enabled:
# modes = ['no', 'chi', 'mutual_info']
modes = ['no']
# `history` and `model` are rebound on every iteration, so only the results of the
# last mode in `modes` remain available to later cells.
for mode in modes:
    history, model = run_pipeline(dataset, mode)

In [None]:
def recon(model, dataset, i, n):
    """Print the first `n` values of test row `i`: model output, then the input.

    The first printed line comes from the second element (`[1]`) of
    `model.predict(dataset.xte)` — presumably the reconstruction output of
    the model; the second line comes from `dataset.xte` itself. Both are
    squeezed before indexing. Nothing is returned.
    """
    predicted = model.predict(dataset.xte)[1].squeeze()
    print(predicted[i, :n])

    observed = dataset.xte.squeeze()
    print(observed[i, :n])
    
# Compare the first 8 values of test row 9: model output vs. the original input.
recon(model, dataset, 9, 8)

In [None]:

# Build a frame without the first column (presumably the patient/sample ID, given the
# name `pid`) and without any of the label columns listed in `dataset.label_names`.
pid = dataset.data.columns[0]
lbls = list(dataset.label_names.keys())
data = dataset.data.drop([pid], axis=1).drop(lbls, axis=1)

# Machine Learning Stuff...

In [None]:
def default_dataset():
    """Build a Dataset from 'data/ctrl_vs_case.csv' with feature selection mode 'no'.

    Returns
    -------
    Dataset
        Constructed with `train_size=0.7`, after `feature_selection(mode='no')`
        has been called on it.
    """
    ds = Dataset('data/ctrl_vs_case.csv', train_size=0.7)
    ds.feature_selection(mode='no')
    return ds

In [None]:
# Uncomment to rebuild the single-task dataset before extracting the splits:
# dataset = default_dataset()

# Convert every split to a NumPy array after squeezing out singleton axes.
Xtr, Xte, Ytr, Yte = (
    np.array(split.squeeze())
    for split in (dataset.xtr, dataset.xte, dataset.ytr, dataset.yte)
)

In [None]:
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold


def cross_valid(model, dataset, n_folds=5):
    """Call `model` once per stratified fold of `dataset`.

    Parameters
    ----------
    model : callable
        Invoked as `model(xtr, ytr, xte, yte)` for each fold. In this
        notebook these are the `knn` / `rf` / `dt` helpers, which fit a
        classifier and print its scores.
    dataset : object
        Must expose `features_` (inputs) and `labels` (targets), both
        indexable with integer index arrays.
    n_folds : int, default 5
        Number of splits given to `StratifiedKFold`.

    Returns
    -------
    callable
        The `model` argument, unchanged.
    """
    inputs, targets = dataset.features_, dataset.labels
    splitter = StratifiedKFold(n_splits=n_folds)

    for fit_idx, eval_idx in splitter.split(inputs, targets):
        # Only the inputs are squeezed; labels are passed through as indexed.
        model(
            inputs[fit_idx].squeeze(),
            targets[fit_idx],
            inputs[eval_idx].squeeze(),
            targets[eval_idx],
        )

    return model


def pca(x, n=100, verbose=False):
    """Fit a PCA on `x` and return the fitted estimator.

    Parameters
    ----------
    x : array-like
        Data passed to `PCA.fit`.
    n : default 100
        Passed as the first argument of `PCA` (its number of components).
    verbose : bool, default False
        When true, also plot the cumulative explained-variance ratio.

    Returns
    -------
    PCA
        The fitted estimator.
    """
    fitted = PCA(n).fit(x)
    if not verbose:
        return fitted

    cumulative = np.cumsum(fitted.explained_variance_ratio_)
    plt.plot(cumulative)
    plt.xlabel('n components')
    plt.ylabel('cumulative variance')
    return fitted


def hard_predict(model, x):
    """Return `model.predict(x)` — the model's class predictions for `x`."""
    predictions = model.predict(x)
    return predictions


def soft_predict(model, x):
    """Return column 1 of `model.predict_proba(x)`.

    For a binary classifier this is the score of the second class; the
    result is used as the confidence score for the ROC/PRC computations
    in this notebook.
    """
    probabilities = model.predict_proba(x)
    return probabilities[:, 1]


def plot_curves(model, x, y):
    """Plot the 'roc' and 'prc' curves of `model` on `(x, y)` via `plot_auc`.

    Parameters
    ----------
    model : estimator
        Must provide `predict_proba`; scores come from `soft_predict`.
    x : array-like
        Inputs to score.
    y : array-like
        Ground-truth labels handed to `plot_auc`.

    Notes
    -----
    The earlier version also computed hard predictions into an unused local,
    costing an extra `model.predict` pass; that call has been removed.
    """
    conf_scores = soft_predict(model, x)
    # Same scores feed both plots; 'roc' is drawn first, then 'prc'.
    for curve in ('roc', 'prc'):
        plot_auc(y, conf_scores, mode=curve, lw=2)
    
    
def results(model, xtr, ytr, xte, yte):
    """Print accuracy and ROC-AUC for a fitted `model`, then plot test curves.

    Output order: baseline accuracies from `class_one_acc` (presumably the
    accuracy of always predicting class one — confirm in `metrics`), model
    accuracy on train/test from hard predictions, model AUC on train/test
    from soft predictions, and finally the test-set curves via `plot_curves`.

    Parameters
    ----------
    model : fitted estimator with `predict` and `predict_proba`.
    xtr, ytr : training inputs and labels.
    xte, yte : test inputs and labels.
    """
    rule = '=================='

    print(rule)
    print('baseline train acc: {}'.format(class_one_acc(ytr)))
    print('baseline test acc : {}'.format(class_one_acc(yte)))
    print()

    labels_train = hard_predict(model, xtr)
    labels_test = hard_predict(model, xte)
    print('model train acc: {}'.format(acc(ytr, labels_train)))
    print('model test acc: {}'.format(acc(yte, labels_test)))
    print()

    scores_train = soft_predict(model, xtr)
    scores_test = soft_predict(model, xte)
    print('model train auc: {}'.format(roc_auc(ytr, scores_train)))
    print('model test auc: {}'.format(roc_auc(yte, scores_test)))
    print(rule)

    print('Test-set')
    plot_curves(model, xte, yte)
    

def knn(xtr, ytr, xte, yte, n=3):
    """Fit a k-nearest-neighbours classifier and report it with `results`.

    Parameters
    ----------
    xtr, ytr : training inputs and labels used for fitting.
    xte, yte : test inputs and labels used for reporting.
    n : int, default 3
        Number of neighbours (`n_neighbors`).
    """
    classifier = KNeighborsClassifier(n_neighbors=n).fit(xtr, ytr)
    results(classifier, xtr, ytr, xte, yte)
    
    
def rf(xtr, ytr, xte, yte, d=2):
    """Fit a random-forest classifier and report it with `results`.

    Parameters
    ----------
    xtr, ytr : training inputs and labels used for fitting.
    xte, yte : test inputs and labels used for reporting.
    d : int, default 2
        Maximum tree depth. Previously this argument was ignored and the
        depth was hard-coded to 2; it is now forwarded as `max_depth`, so
        the default behaviour is unchanged.
    """
    # 500 balanced-class trees with a fixed seed so repeated runs agree.
    model = RandomForestClassifier(n_estimators=500, max_depth=d, random_state=0, class_weight='balanced')
    model.fit(xtr, ytr)

    results(model, xtr, ytr, xte, yte)
    
def dt(xtr, ytr, xte, yte):
    """Fit a default-configured decision tree and report it with `results`.

    Parameters
    ----------
    xtr, ytr : training inputs and labels used for fitting.
    xte, yte : test inputs and labels used for reporting.
    """
    tree = DecisionTreeClassifier().fit(xtr, ytr)
    results(tree, xtr, ytr, xte, yte)

In [None]:
def demo(Xtr, Xte, Ytr, Yte):
    """Run KNN, RF and DT on raw features and on PCA-projected features.

    The PCA is fitted on `Xtr` only and then applied to both splits. For
    each classifier the raw-feature report is printed first, followed by
    the report on the projected features under a '- PCA' heading.
    """
    # pca
    projector = pca(Xtr)
    xtr, xte = projector.transform(Xtr), projector.transform(Xte)
    print(Xtr.shape, xtr.shape)

    runs = (('- KNN', knn), ('- RF', rf), ('- DT', dt))
    for position, (heading, fit_and_report) in enumerate(runs):
        if position:
            # Blank separator between classifier groups (none before the first).
            print()
        print(heading)
        fit_and_report(Xtr, Ytr, Xte, Yte)
        print('- PCA')
        fit_and_report(xtr, Ytr, xte, Yte)

demo(Xtr, Xte, Ytr, Yte)

In [None]:
# Fit and report a decision tree (`dt`) on each of 5 stratified folds of `dataset`.
cross_valid(dt, dataset, n_folds=5)

In [None]:
# TODO: check representational power of PCA features

# NOTE(review): `dataset.pca` is defined outside this notebook; it presumably fits a
# 10-component PCA and stores it as `dataset.pca_` — confirm in `data.Dataset`.
dataset.pca(n_components=10, verbose=True)
# Sum of the stored explained-variance ratios, shown as the cell's output.
np.sum(dataset.pca_.explained_variance_ratio_)

In [None]:
# NOTE(review): `pca_importance` is not defined in this notebook (it arrives through one of
# the star imports); presumably it reports feature importance from `dataset`'s PCA — confirm.
pca_importance(dataset)

In [None]:
# One pass through training split

# NOTE(review): `demo` is also defined in an earlier cell with the same behaviour; this
# definition shadows it. Consider keeping only one.
def demo(Xtr, Xte, Ytr, Yte):
    """Compare KNN, RF and DT on raw features versus PCA-projected features.

    A PCA fitted on `Xtr` projects both splits; each classifier helper is
    then run on the raw features and again on the projected ones.
    """
    reducer = pca(Xtr)
    Ztr = reducer.transform(Xtr)
    Zte = reducer.transform(Xte)
    print(Xtr.shape, Ztr.shape)

    def compare(heading, fit_and_report):
        # Raw features first, then the PCA projection under its own heading.
        print(heading)
        fit_and_report(Xtr, Ytr, Xte, Yte)
        print('- PCA')
        fit_and_report(Ztr, Ytr, Zte, Yte)

    compare('- KNN', knn)
    print()
    compare('- RF', rf)
    print()
    compare('- DT', dt)

demo(Xtr, Xte, Ytr, Yte)