In [1]:
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd 
import sys
import pickle
# Ask git for the repository root, drop the last character of the command output
# (presumably the trailing newline), then make that directory both importable and
# the current working directory for the rest of the notebook.
workingdirectory = os.popen('git rev-parse --show-toplevel').read()[:-1]
sys.path.append(workingdirectory)
os.chdir(workingdirectory)

import allensdk.core.json_utilities as ju
#import mcmodels
from mcmodels.core import VoxelModelCache#.voxel_model_cache import VoxelModelCache
from mcmodels.core import Mask
from mcmodels.core.utils import get_structure_id, get_ordered_summary_structures
from mcmodels.utils import nonzero_unique, unionize
from mcmodels.core.utils import get_minorstructures
from mcmodels.models.crossvalidation import get_loocv_predictions
from mcmodels.core.utils import get_loss_paper
from mcmodels.core.experiment import get_voxeldata_msvd
from mcmodels.core.utils import get_loss_paper

from mcmodels.core import ModelData
#from VoxelModel import VoxelModel
#from HomogeneousModel import HomogeneousModel

#from NadarayaWatson import NadarayaWatson



In [2]:
# --- Configuration and input data -------------------------------------------
# The data locations default to the original author's local paths but can be
# overridden without editing the notebook by setting the MCM_TOP_DIR /
# MCM_FILE_DIR environment variables before starting the kernel.
TOP_DIR = os.environ.get(
    'MCM_TOP_DIR', '/Users/samsonkoelle/alleninstitute/mcm_2020/mcm_updated/')
INPUT_JSON = os.path.join(TOP_DIR, 'input_011520.json')
EXPERIMENTS_EXCLUDE_JSON = os.path.join(TOP_DIR, 'experiments_exclude.json')
FILE_DIR = os.environ.get('MCM_FILE_DIR', TOP_DIR)
OUTPUT_DIR = os.path.join(FILE_DIR, 'output')

# The input JSON supplies the manifest location (relative to TOP_DIR) and the
# list of major structures to process.
input_data = ju.read(INPUT_JSON)
manifest_file = input_data.get('manifest_file')
manifest_file = os.path.join(TOP_DIR, manifest_file)
experiments_exclude = ju.read(EXPERIMENTS_EXCLUDE_JSON)

# NOTE: it is unclear why the hyperparameters are loaded from the output directory.
cache = VoxelModelCache(manifest_file=manifest_file)
major_structures = input_data.get('structures')
major_structure_ids = [get_structure_id(cache, s) for s in major_structures]

# Regionalize voxel model: compare with regional model.
# Regional parameters (only `cre` is passed on by the cells visible below).
cre = None
eid_set = None
high_res = False
threshold_injection = False

In [3]:
# Structures belonging to the coarse structure set, looked up through the
# cache's structure tree.  Structure id 934 is dropped from the list
# (reason not recorded here -- TODO confirm).
COARSE_STRUCTURE_SET_ID = 2
DEFAULT_STRUCTURE_SET_IDS = (COARSE_STRUCTURE_SET_ID,)
tree = cache.get_structure_tree()
default_structures = tree.get_structures_by_set_id(DEFAULT_STRUCTURE_SET_IDS)
default_structure_ids = [
    structure_id
    for structure_id in (st['id'] for st in default_structures)
    if structure_id != 934
]
#cre= True

In [4]:
# Build one voxel-data object per major structure, keyed by structure id.
msvds = {}
#gammas = np.asarray([0.1])
for sid in major_structure_ids:
    print(sid)
    voxel_data = ModelData(cache, sid)
    print(cre)
    # Experiment ids for this structure after applying the exclusion list and the
    # cre filter; kept as an array (not consumed further inside this cell).
    experiment_ids = np.asarray(list(
        voxel_data.get_experiment_ids(experiments_exclude=experiments_exclude, cre=cre)
    ))
    msvd = get_voxeldata_msvd(
        cache, sid, experiments_exclude, default_structure_ids, cre
    )
    #msvd.l2losses, msvd.paperlosses,msvd.normspredict,msvd.normtrue = single_region_cv(msvd, gammas)
    msvds[sid] = msvd

512
None
703
None
1089
None
1097
None
315
None
313
None
354
None
698
None
771
None
803
None
477
None
549
None


In [6]:
# Curation spreadsheet with one row per image series, indexed by experiment id.
data_info = pd.read_excel(
    '/Users/samsonkoelle/alleninstitute/Whole Brain Cre Image Series_curation only.xlsx',
    sheet_name='all datasets curated_070919pull',
).set_index("id")
ontological_order = get_ordered_summary_structures(cache)

# Experiment ids present in the spreadsheet.  The builtin `int` is used because
# the `np.int` alias was removed in NumPy 1.24 (it was the same type).
exps = np.asarray(data_info.index.values, dtype=int)

# creline[sid][i] is the transgenic line of the i-th experiment of structure `sid`,
# in the iteration order of `msvds[sid].experiments`.
creline = {}
for sid in major_structure_ids:
    msvd = msvds[sid]
    experiment_ids = np.asarray(list(msvd.experiments.keys()))
    nexp = len(experiment_ids)
    creline[sid] = np.zeros(nexp, dtype=object)
    for i in range(nexp):
        # Position of the first spreadsheet row matching this experiment id;
        # raises IndexError if the experiment is missing from the spreadsheet.
        index = np.where(exps == experiment_ids[i])[0][0]
        creline[sid][i] = data_info['transgenic-line'].iloc[index]

# Minor-structure annotation of every experiment, per major structure.
experiments_minor_structures = {}
for sid in major_structure_ids:
    msvd = msvds[sid]
    eids = np.asarray(list(msvd.experiments.keys()))
    experiments_minor_structures[sid] = get_minorstructures(eids, data_info)

In [7]:
# Build the voxel -> region keys from the projection mask of a single experiment
# (assumes every experiment shares the same projection-mask layout -- TODO confirm).
# The reference structure is named explicitly rather than relying on whichever
# `msvd` the previous cell's loop happened to leave behind.
msvd = msvds[major_structure_ids[-1]]
key = list(msvd.experiments.keys())[0]
contra_key = msvd.experiments[key].projection_mask.get_key(structure_ids=ontological_order, hemisphere_id=1)
ipsi_key = msvd.experiments[key].projection_mask.get_key(structure_ids=ontological_order, hemisphere_id=2)

cre = None
l2losses = {}
paperlosses = {}
normspredict = {}
normtrue = {}
for sid in major_structure_ids:
    msvd = msvds[sid]
    nexp = msvd.projections.shape[0]
    ngam = 1

    # Result containers, zero-initialised here and not filled in this cell.
    l2losses[sid] = np.zeros((ngam, nexp))
    paperlosses[sid] = np.zeros((ngam, nexp))
    normspredict[sid] = np.zeros((ngam, nexp))
    normtrue[sid] = np.zeros(nexp)

    minor_structures = np.unique(experiments_minor_structures[sid])
    nmins = len(minor_structures)

    # Regional projections: ipsilateral targets first, then contralateral ones.
    projections = msvd.projections
    ipsi_proj = unionize(projections, ipsi_key)
    contra_proj = unionize(projections, contra_key)
    reg_proj = np.hstack([ipsi_proj, contra_proj])
    msvd.reg_proj = reg_proj

    # Divide every regional column by the number of key entries of that region
    # (presumably its voxel count), in the same ipsi-then-contra column order.
    ipsi_target_regions, ipsi_target_counts = nonzero_unique(ipsi_key, return_counts=True)
    contra_target_regions, contra_target_counts = nonzero_unique(contra_key, return_counts=True)
    target_counts = np.concatenate([ipsi_target_counts, contra_target_counts])
    reg_proj_vcount_norm = np.divide(reg_proj, target_counts[np.newaxis, :])
    msvd.reg_proj_vcount_norm = reg_proj_vcount_norm

    # Same treatment for the injections, using a mask restricted to this
    # structure in hemisphere 2.
    source_mask = Mask.from_cache(cache, structure_ids=[sid], hemisphere_id=2)
    source_key = source_mask.get_key(structure_ids=ontological_order)
    # The two return values are unpacked into distinct names; the original
    # bound both to `source_target_counts`, which only worked because the
    # counts are assigned last.
    source_regions, source_target_counts = nonzero_unique(source_key, return_counts=True)

    injections = msvd.injections
    reg_ipsi_inj = unionize(injections, source_key)
    msvd.reg_inj = reg_ipsi_inj
    reg_inj_vcount_norm = np.divide(reg_ipsi_inj, source_target_counts[np.newaxis, :])
    msvd.reg_inj_vcount_norm = reg_inj_vcount_norm

In [8]:
# Scale each row of the count-normalised regional projections to unit L2 norm
# and store the result on the per-structure object.
# The number of structures is derived from the data instead of being fixed at 12.
# NOTE(review): a row whose norm is 0 turns into NaN here; nothing in this cell
# cleans that up.
nms = len(major_structure_ids)
for m in range(nms):
    print(m,'m')
    sid = major_structure_ids[m]
    msvd = msvds[sid]
    projections = msvd.reg_proj_vcount_norm
    projections = projections / np.expand_dims(np.linalg.norm(projections, axis = 1),1)
    msvds[sid].reg_proj_vcount_norm_renorm = projections

0 m
1 m
2 m
3 m
4 m
5 m
6 m
7 m
8 m
9 m
10 m
11 m


In [9]:
# Experiment ids present in the spreadsheet.  The builtin `int` is used because
# the `np.int` alias was removed in NumPy 1.24 (it was the same type).
exps = np.asarray(data_info.index.values, dtype=int)

# creline[sid][i] is the transgenic line of the i-th experiment of structure `sid`.
creline = {}
for sid in major_structure_ids:
    msvd = msvds[sid]
    experiment_ids = np.asarray(list(msvd.experiments.keys()))
    nexp = len(experiment_ids)
    creline[sid] = np.zeros(nexp, dtype=object)
    for i in range(nexp):
        index = np.where(exps == experiment_ids[i])[0][0]
        creline[sid][i] = data_info['transgenic-line'].iloc[index]

# inds_bad are where there is no injection:
# we can use these to train but shouldn't use them to evaluate.
# (the 'wt_ind' should really just be 'eval_ind')
inds_bad = {}
inds_good = {}
for sid in major_structure_ids:
    injections = msvds[sid].injections
    injection_totals = injections.sum(axis=1)  # computed once, used for both masks
    inds_bad[sid] = np.where(injection_totals == 0.)[0]
    inds_good[sid] = np.where(injection_totals > 0.)[0]

nms = len(major_structure_ids)
#ngam = len(gammas)
wt_2ormore = {}
# `ngam` is whatever value an earlier cell left in the namespace.
losses_reg_norm = np.zeros((nms,ngam))
inds_good_wt = {}
inds_good_wtsub = {}
for m in range(nms):
    sid = major_structure_ids[m]
    wt_inds = np.where(creline[sid] == 'C57BL/6J')[0]
    # wt_2ormore holds the wild-type indices only when the structure has two or
    # more wild-type experiments; otherwise it is an empty integer array.
    wt_2ormore[sid] = np.asarray(wt_inds if len(wt_inds) > 1 else [], dtype=int)
    # Wild-type experiments that also have a non-zero injection ...
    inds_good_wt[sid] = np.intersect1d(wt_2ormore[sid], inds_good[sid])
    # ... and their positions inside wt_2ormore[sid].
    inds_good_wtsub[sid] = np.where(np.isin(wt_2ormore[sid], inds_good[sid]))[0]

In [15]:
from mcmodels.models.crossvalidation import get_loocv_predictions_nnlinear_number_inj

In [20]:
# Leave-one-out cross-validated regional predictions for every major structure.
# The loop bound follows the number of structures instead of a hard-coded 12.
for m in range(len(major_structure_ids)):
#for m in np.asarray([4]):
    print(m,'m')
    sid = major_structure_ids[m]
    msvd = msvds[sid]

    # Unit-L2-norm rows of the count-normalised regional projections.
    projections = msvd.reg_proj_vcount_norm
    projections = projections / np.expand_dims(np.linalg.norm(projections, axis = 1),1)
    msvds[sid].reg_proj_vcount_norm_renorm = projections

    # Same normalisation for the regional injections; NaN entries (e.g. from rows
    # with zero norm) are reset to 0.
    injections = msvd.reg_inj_vcount_norm / np.expand_dims(np.linalg.norm(msvd.reg_inj_vcount_norm, axis = 1),1)
    injections[np.where(np.isnan(injections))] = 0.
    nreg = projections.shape[1]
    nexp = projections.shape[0]
    nfeat = injections.shape[1]
    wt_inds = np.where(creline[sid] == 'C57BL/6J')[0]

    #nexp rather than nwtexp to retain indexing
    #msvds[sid].loocv_predictions_wt = np.zeros((ngam, nexp, nreg))

    #msvds[sid].loocv_predictions_all[g] = get_loocv_predictions(projections, centroids, gammas[g])

#    thresh = 0.000001
    #hyperparameters are number needed to be present above thresh
    #treat thresh as fixed and low for now
#     msvds[sid].loocv_predictions_all_in = np.zeros((5, nexp, nreg))
#     for g in range(5):
#         print(g,'g')
#         msvds[sid].loocv_predictions_all_in[g] = get_loocv_predictions_nnlinear_number_inj(projections, injections,1e-10,g)
    #msvds[sid].loocv_predictions_all_in = np.zeros((nexp, nreg))
    #for g in range(5):
    #    print(g,'g')

    # Single setting of the sweep sketched above: threshold 1e-10, count 0.
    msvds[sid].loocv_predictions_all_in = get_loocv_predictions_nnlinear_number_inj(projections, injections,1e-10,0)
    # "_pn": the predictions with every row rescaled to unit L2 norm.
    msvds[sid].loocv_predictions_all_in_pn = msvds[sid].loocv_predictions_all_in / np.expand_dims(np.linalg.norm(msvds[sid].loocv_predictions_all_in, axis = 1),1)

    #lpca can project onto principal components in log space then transform back.
    #alternatively, find pcas in log space and factor design matrix onto the retransformed components in the original space
    #variability will be concentrated in machine-epsilon-low components
    #set to zero and true nonzeros will drive variability
    #msvds[sid].loocv_predictions_all_lpca[0] = get_loocv_predictions_nnlinear_lpca

    #projection on nmf, fit, and back
    #hyp is n components
#     ncomps = np.asarray(np.linspace(nfeat / 2, nfeat-1, 5), dtype = int)
#     msvds[sid].loocv_predictions_all_nmf_in = np.zeros((5, nexp, nreg))
#     nmf_recon_err_in = np.zeros((5,nexp))
#     for g in range(5):
#         print(g)
#         msvds[sid].loocv_predictions_all_nmf_in[g],nmf_recon_err_in[g] = get_loocv_predictions_nnlinear_nmf(projections, injections,ncomps[g])


0 m




1 m
2 m
3 m
4 m
5 m
6 m




7 m
8 m
9 m
10 m
11 m


In [18]:
# Rescale the LOOCV predictions of the structure `sid` currently refers to so that
# every row has unit L2 norm.  `sid` is whatever the most recently run loop left behind.
msvds[sid].loocv_predictions_all_in_pn = (
    msvds[sid].loocv_predictions_all_in
    / np.linalg.norm(msvds[sid].loocv_predictions_all_in, axis=1, keepdims=True)
)

In [11]:
# Inspection only: shape of the per-row L2 norms of the raw projections for the
# structure `sid` currently refers to (left over from the last loop that ran).
np.linalg.norm(msvds[sid].projections, axis = 1).shape

(83,)

In [19]:
# Inspection only: shape of the row-normalised LOOCV predictions for the structure
# `sid` currently refers to (left over from the last loop that ran).
msvds[sid].loocv_predictions_all_in_pn.shape

(1128, 577)