In [1]:
import glmnet_python
from glmnet import glmnet

# Import relevant modules and setup for calling glmnet
%reset -f
%matplotlib inline

import sys
import os
import re

import tensorflow as tf # data is in TFRecord format

import scipy, importlib, pprint, matplotlib.pyplot as plt, warnings
import numpy as np
from glmnet import glmnet; from glmnetPlot import glmnetPlot
from glmnetPrint import glmnetPrint; from glmnetCoef import glmnetCoef; from glmnetPredict import glmnetPredict
from cvglmnet import cvglmnet; from cvglmnetCoef import cvglmnetCoef
from cvglmnetPlot import cvglmnetPlot; from cvglmnetPredict import cvglmnetPredict

# glmnet has many deprecation warnings from using scipy.* instad of numpy.*
warnings.filterwarnings("ignore", category=DeprecationWarning) 


In [13]:
# Define constants
names = ['DA', '5HT', 'pH', 'NE']
ncores = 28 # 56
prefix = os.path.join('/mnt/nfs/proj/in-vitro/Mark/four_analyte/slow/allin')
good_probes = ['CF025', 'CF027']
# good_probes = ['CF025', 'CF027', 'CF057', 'CF064', 'CF066', 'CF078', 'CF081', 'CF082']
val_ratio = .1
# hold_probe = 0
hold_probe = -1 # split data randomly


In [3]:
# Load data

def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    def atoi(text):
        return int(text) if text.isdigit() else text
    return [atoi(c) for c in re.split(r'(\d+)', text)]

def preprocess(serialized_example):
    features = tf.io.parse_example(
        serialized_example,
        features={
            'gram': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.string)
        })
    data = tf.io.decode_raw(features['gram'], tf.float32)
    label = tf.io.decode_raw(features['label'], tf.float32)
    data.set_shape((None, 999))
    label.set_shape((None, 4))
    return data, label

def merge_datasets(vfiles, projy=lambda x: x, asnumpy=False):
    yv = []
    yl = []
    for filename in vfiles:
        ds = tf.data.TFRecordDataset(filename)
        ds = ds.batch(batch_size=2**13)
        ds = ds.map(map_func=preprocess)
        for v,l in ds:
            v = np.array(v).astype(np.float64)
            l = np.array(l).astype(np.float64)
            yv.append(v)
            l = np.apply_along_axis(projy, axis=1, arr=l) 
            yl.append(l)
        
    x = np.vstack(yv)
    y = np.vstack(yl)

    if asnumpy:
        return x,y
    else:
        d = tf.data.Dataset.from_tensor_slices((x, y))
        return d

if hold_probe < 0:
    all_files = sum([
        sorted(tf.io.gfile.glob(os.path.join(prefix, probe, 'total_records', '*')),
               key=natural_keys) for probe in good_probes
    ], [])

    x, y = merge_datasets(all_files, asnumpy=True)
    
    idxs = np.random.permutation(x.shape[0])
    lim = int(x.shape[0]*(1-val_ratio))
    d1idx = idxs[idxs[:lim]]
    d2idx = idxs[idxs[lim:]]
    x_train, y_train, x_val, y_val = x[d1idx,:], y[d1idx,:], x[d2idx,:], y[d2idx,:]
else:
    hold_probe = good_probes.pop(hold_probe)
    train_files = sum([
        sorted(tf.io.gfile.glob(os.path.join(prefix, probe, 'total_records', '*')),
               key=natural_keys) for probe in good_probes
    ], [])
    x_train, y_train = merge_datasets(train_files, asnumpy=True)
    
    val_files = sum([
        sorted(tf.io.gfile.glob(os.path.join(prefix, probe, 'total_records', '*')),
               key=natural_keys) for probe in [hold_probe]
    ], [])
    x_val, y_val = merge_datasets(val_files, asnumpy=True)

print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)  


(110970, 999) (110970, 4)
(12330, 999) (12330, 4)


In [4]:
ncores

28

In [5]:
# fit GLMNET in parallel (ncores) with cross validation to find lambda
fit = cvglmnet(x = x_train.copy(), y = y_train.copy(), family='mgaussian', parallel=ncores, ptype = 'mse', nfolds = 20)

[status]	Parallel glmnet cv with 28 cores


In [6]:
fit['lambda_min']

array([0.10870285])

In [7]:
y_hat = cvglmnetPredict(fit, newx = x_val, s='lambda_min')

In [14]:
for (error, name) in zip(np.mean(np.sqrt((y_hat[:,:,0]-y_val)**2),axis=0),names):
    print('%s: %4.5f'%(name,error))

DA: 340.70265
5HT: 134.71918
pH: 0.02452
NE: 399.73640
