In [None]:
import os
import sys
import subprocess
import collections
import time
import nbformat
import socket
import re
import pickle

import numpy as np
import sklearn.metrics

import torch

# Probe the known per-machine library locations in order and stop at the first
# one that exists; if none exists, `lib_path` is left at the last candidate.
for lib_path in ('I:/code',
                 '/media/6T/.tianle/.lib',
                 '/projects/academic/azhang/tianlema/lib'):
  if os.path.exists(lib_path):
    break
# Put the folder on the import path once (re-running the cell adds no duplicate).
if lib_path not in sys.path and os.path.exists(lib_path):
  sys.path.append(lib_path)
  
from dl.utils.visualization.visualization import *
from dl.utils.train import eval_classification, get_label_prob
from dl.utils.utils import *

%load_ext autoreload
%autoreload 2

In [None]:
def submit_job(model_type='nn', dense=False, residual=True, hidden_dim=[100, 100], 
               train_portion=0.7, val_portion=0.1, test_portion=0.2, 
               num_sets=10, num_folds=10, sel_set_idx=0,
               num_train_types=-1, 
               num_val_types=-1,
               num_test_types=-1,
               cv_type='instance-shuffle',
               sel_disease_types='all', 
               min_num_samples_per_type_cls=[100, 0],
               predefined_sample_set_file='auto-search',
               target_variable='PFI',
               target_variable_type='discrete',
               target_variable_range=[0, 1],
               data_type=['gene', 'mirna', 'methy', 'rppa'], 
               additional_vars=[],#['age_at_initial_pathologic_diagnosis', 'gender']
               additional_var_types=[],#['continuous', 'discrete']
               additional_var_ranges=[],
               normal_transform_feature=True, 
               randomize_labels=False,
               lr=5e-4,
               weight_decay=1e-4,
               num_epochs=1000,
               reduce_every=500,
               show_results_in_notebook=True, 
               idx_folder='results/data_split_idx', # no longer used
               notebook_folder='.', 
               template_file='exp_template.ipynb', 
               slurm_script='../gpu-slurm', 
               new_file=True, submit=True,
               cell_idx=2, gpu_id=3):
  """Create an experiment notebook from a template and optionally execute it.

  The template `{notebook_folder}/{template_file}` is read, cell 0 is replaced
  with a snippet that pins the GPU when the host is 'dlm', and cell `cell_idx`
  is replaced with one `name = value` assignment per experiment setting.
  The output file name is derived from the settings.

  Args:
    model_type: model name; when it is 'nn' it is rewritten to 'resnet'
      (residual=True) or 'densenet' (dense=True). residual and dense must not
      both be set in that case.
    hidden_dim ... show_results_in_notebook: experiment settings that are
      written verbatim into the parameter cell (their meaning is defined by the
      template notebook, which is not visible here).
    idx_folder: unused.
    notebook_folder: folder holding the template and receiving the new notebook.
    template_file: template notebook name inside notebook_folder.
    slurm_script: script handed to `sbatch` on ccr.buffalo.edu hosts.
    new_file: write the generated notebook to disk.
    submit: run the notebook (sbatch on ccr.buffalo.edu hosts, otherwise
      `jupyter nbconvert --execute` locally).
    cell_idx: index of the template cell that receives the settings.
    gpu_id: value written to CUDA_VISIBLE_DEVICES in cell 0.

  Returns:
    The notebook file name without the '.ipynb' extension.
  """
  # Strings that only feed the generated file name.
  if isinstance(sel_disease_types, (list, tuple)):
    sel_disease_type_str = '-'.join(sorted(sel_disease_types))
  else:
    sel_disease_type_str = sel_disease_types
  if isinstance(data_type, str):
    data_type_str = data_type
  else:
    data_type_str = '-'.join(sorted(data_type))
  if model_type == 'nn': # model_type, dense, residual are dependent
    assert not (residual and dense)
    if residual:
      model_type = 'resnet'
    if dense:
      model_type = 'densenet'

  # Insertion order is the order of the assignments in the generated cell.
  args = {'model_type': model_type, # model_type may be different from the argument
          'dense': dense,
          'residual': residual,
          'hidden_dim': hidden_dim,
          'train_portion': train_portion,
          'val_portion': val_portion,
          'test_portion': test_portion,
          'num_sets': num_sets,
          'num_folds': num_folds,
          'num_train_types': num_train_types,
          'num_val_types': num_val_types,
          'num_test_types': num_test_types,
          'cv_type': cv_type,
          'sel_set_idx': sel_set_idx,
          'sel_disease_types': sel_disease_types,
          'min_num_samples_per_type_cls': min_num_samples_per_type_cls,
          'predefined_sample_set_file': predefined_sample_set_file,
          'target_variable': target_variable,
          'target_variable_type': target_variable_type,
          'target_variable_range': target_variable_range,
          'data_type': data_type,
          'additional_vars': additional_vars,#['age_at_initial_pathologic_diagnosis', 'gender']
          'additional_var_types': additional_var_types,#['continuous', 'discrete']
          'additional_var_ranges': additional_var_ranges,
          'normal_transform_feature': normal_transform_feature,
          'randomize_labels': randomize_labels,
          'lr': lr,
          'weight_decay': weight_decay,
          'num_epochs': num_epochs,
          'reduce_every': reduce_every,
          'show_results_in_notebook': show_results_in_notebook
         }

  # Build the file name piece by piece from the settings.
  predefined_sample_set_filename = (target_variable if isinstance(target_variable,str)
                                else '-'.join(target_variable))
  predefined_sample_set_filename += f'_{cv_type}'
  if len(additional_vars) > 0:
    predefined_sample_set_filename += f"_{'-'.join(sorted(additional_vars))}"
  predefined_sample_set_filename += (f"_{data_type_str}_{sel_disease_type_str}_"
                                     f"{'-'.join(map(str, min_num_samples_per_type_cls))}")
  predefined_sample_set_filename += f"_{'-'.join(map(str, [train_portion, val_portion, test_portion]))}"
  if cv_type == 'group-shuffle' and num_train_types > 0:
    predefined_sample_set_filename += f"_{'-'.join(map(str, [num_train_types, num_val_types, num_test_types]))}"
  predefined_sample_set_filename += f'_{num_sets}sets'
  filename_prefix = f"{predefined_sample_set_filename}_{sel_set_idx}_{'-'.join(map(str, hidden_dim))}_{model_type}"
  filename = f'{filename_prefix}.ipynb'
  notebook_path = f'{notebook_folder}/{filename}'

  nb = nbformat.read(f'{notebook_folder}/{template_file}', 4)
  nb['cells'][0]['source'] = ("import socket\nif socket.gethostname() == 'dlm':\n"
                              "  %env CUDA_DEVICE_ORDER=PCI_BUS_ID\n"
                              f"  %env CUDA_VISIBLE_DEVICES={gpu_id}")
  # repr() yields a valid Python literal even when the string contains quotes or
  # backslashes (e.g. Windows paths); wrapping the raw text in '...' does not.
  nb['cells'][cell_idx]['source'] = '\n'.join(
    f'{k} = {v!r}' if isinstance(v, str) else f'{k} = {v}' for k, v in args.items())
  if os.path.exists(notebook_path):
    print(f'To overwrite file {notebook_path}')
  else:
    print(f'To create file {notebook_path}')
  if new_file:
    nbformat.write(nb, notebook_path)

  if submit: # sometimes I just want to create files
    # Literal suffix test: in the previous regex the unescaped dots matched any character.
    if socket.gethostname().endswith('ccr.buffalo.edu'):
      command = f'sbatch {slurm_script} {notebook_path} {filename}'
      result = subprocess.run(command, shell=True)
      print(command)
    else:
      # NOTE(review): the command goes through the shell, so a notebook path with
      # spaces or shell metacharacters would need quoting.
      command = ' '.join(['jupyter nbconvert', '--ExecutePreprocessor.timeout=360000',
                          '--ExecutePreprocessor.allow_errors=True', '--to notebook',
                          '--execute', f'{notebook_path} --output {filename}'])
      start_time = time.time()
      result = subprocess.run(command, shell=True)
      end_time = time.time()
      print(f'Time spent: {end_time-start_time:.2f}')
    if result.returncode != 0:
      # Report the failure instead of dropping the exit status silently.
      print(f'Command exited with code {result.returncode}: {command}')
  return filename_prefix

def load_results(disease_type_str = '0', #0-1-6-8-10-11-16-17
                  model_name = 'ml',
                  sel_set_idx = 0,
                  data_type_str = 'gene-mirna-rppa-methy',
                  data_split_str = '70-10-20',
                  hidden_dim_str = '100-100',
                  filefolder = 'results',
                  target_variable = 'pfi',
                  return_variable='metric_all',
                  filename=None, plot_acc=True, plot_loss=True):
  """Load a pickled result file and optionally plot its accuracy/loss histories.

  Args:
    disease_type_str, data_type_str, sel_set_idx, data_split_str,
      target_variable, hidden_dim_str, model_name, filefolder: pieces of the
      default file name, used only when `filename` is None.
    return_variable: key to return from the pickled dict. If the key is
      present, its value is returned immediately and nothing is plotted.
      Use 'all' to run the sanity checks and plots and get everything back.
    filename: explicit path of the pickle file.
    plot_acc: plot accuracy histories (per split, then per model).
    plot_loss: plot loss histories per model.

  Returns:
    np.ndarray of data[return_variable] when that key exists; otherwise, when
    return_variable == 'all', the tuple (metric, confusion_mat, model_names,
    split_names, metric_names, acc_his, loss_his); otherwise None.

  Raises:
    ValueError: a model's loss history is neither 2-D nor 3-D.
    AssertionError: metric/confusion-matrix shapes disagree with the name lists.
  """
  if filename is None:
    filename = (f'{filefolder}/{disease_type_str}_{data_type_str}_set{sel_set_idx}' 
                f'_{data_split_str}_{target_variable}_{hidden_dim_str}_{model_name}.pkl')

  # NOTE: unpickling can execute arbitrary code; only open trusted result files.
  with open(filename, 'rb') as f:
    data = pickle.load(f)
  if return_variable in data:
    return np.array(data[return_variable])
  metric = np.array(data['metric_all'])
  confusion_mat = np.array(data['confusion_mat_all'])
  model_names, split_names, metric_names = (data['model_names'], data['split_names'], 
                                            data['metric_names'])
  # sanity check
  assert metric.shape == (len(model_names), len(split_names), len(metric_names))
  assert confusion_mat.shape[:2] == (len(model_names), len(split_names))
  loss_his = data['loss_his_all']
  acc_his = np.array(data['acc_his_all'])
  # Short ids are looked up in the optional global `disease_stats` table. Fall
  # back to the raw string when the table is not defined, has no such entry, or
  # the id is not numeric (e.g. 'OV'), instead of failing before any plot.
  title = disease_type_str
  if len(disease_type_str) <= 2:
    try:
      title = disease_stats[int(disease_type_str)]
    except (NameError, LookupError, ValueError):
      pass
  if plot_acc and len(acc_his)>0:
    # One figure per split (curves = models), then one per model (curves = splits).
    for i, n in enumerate(split_names):
      plot_history(acc_his[:, i].T, title=f'{title} {n} acc', 
                   indices=None, colors='rgbkmc', markers='ov+*,<',
                       labels=model_names, linestyles=['']*6, markersize=3)
    for i, n in enumerate(model_names):
      plot_history(acc_his[i].T, title=f'{title} {n} acc', 
                   indices=None, colors='rgbkmc', markers='ov+*,<',
                       labels=split_names, linestyles=['']*6, markersize=3)
  if plot_loss and len(loss_his)>0:
    for i, n in enumerate(model_names):
      history = np.array(loss_his[i])
      if history.ndim == 2:
        plot_history(history.T, title=f'{title} {n} loss', indices=None, colors='rgbkmc', 
                     markers='ov+*,<',
                       labels=split_names, linestyles=['']*6, markersize=3)
      elif history.ndim == 3:
        # A 3-D history gets one figure per slice along its last axis.
        for j in range(history.shape[2]):
           plot_history(history[:,:,j].T, title=f'{title} {n} loss{j}', indices=None, 
                        colors='rgbkmc', markers='ov+*,<',
                       labels=split_names, linestyles=['']*6, markersize=3)
      else:
        raise ValueError(f'{filename} {n} loss has unexpected shape')
  if return_variable == 'all':
    return metric, confusion_mat, model_names, split_names, metric_names, acc_his, loss_his

def new_notebook(kwargs, filename='notebook.ipynb', notebook_folder='.', 
               template_file='template.ipynb', 
               slurm_script='../gpu-slurm', 
               new_file=False, submit=False, run_local=True,
               cell_idx=2, gpu_id=3):
  """Create a notebook from a template and optionally execute it.

  The template is read from `template_file` (not relative to notebook_folder),
  cell 0 is replaced with a snippet that pins the GPU when the host is 'dlm',
  and cell `cell_idx` is replaced with one `name = value` line per kwargs item.

  Args:
    kwargs: mapping of variable name -> value for the parameter cell. Strings
      are written as quoted literals, lists/tuples as a bracketed list, and
      everything else via str().
    filename: name of the generated notebook.
    notebook_folder: destination folder; created when new_file is True.
    template_file: path of the template notebook.
    slurm_script: script handed to `sbatch`.
    new_file: write the generated notebook to disk.
    submit: run the notebook after (optionally) writing it.
    run_local: when False and the host name ends with ccr.buffalo.edu, submit
      through sbatch; otherwise execute locally with `jupyter nbconvert`.
    cell_idx: index of the template cell that receives the assignments.
    gpu_id: value written to CUDA_VISIBLE_DEVICES in cell 0.

  Returns:
    `filename`, unchanged.
  """
  def _literal(value):
    # Render one value as Python source. Strings go through repr() so that
    # quotes/backslashes survive; this also applies to strings inside a
    # list/tuple, which were previously emitted as bare, unquoted names.
    if isinstance(value, str):
      return repr(value)
    if isinstance(value, (list, tuple)):
      items = (repr(item) if isinstance(item, str) else str(item) for item in value)
      return '[{}]'.format(', '.join(items))
    return str(value)

  nb = nbformat.read(f'{template_file}', 4)
  nb['cells'][0]['source'] = ("import socket\nif socket.gethostname() == 'dlm':\n"
                              "  %env CUDA_DEVICE_ORDER=PCI_BUS_ID\n"
                              f"  %env CUDA_VISIBLE_DEVICES={gpu_id}")
  nb['cells'][cell_idx]['source'] = '\n'.join(
    f'{k} = {_literal(v)}' for k, v in kwargs.items())
  notebook_path = f'{notebook_folder}/{filename}'
  if os.path.exists(notebook_path):
    print(f'To overwrite file {notebook_path}')
  else:
    print(f'To create file {notebook_path}')
  if new_file:
    os.makedirs(notebook_folder, exist_ok=True)
    nbformat.write(nb, notebook_path)

  if submit: # sometimes I just want to create files
    # Literal suffix test: in the previous regex the unescaped dots matched any character.
    if socket.gethostname().endswith('ccr.buffalo.edu') and not run_local:
      command = f'sbatch {slurm_script} {notebook_path} {filename}'
      result = subprocess.run(command, shell=True)
      print(command)
    else:
      # NOTE(review): the command goes through the shell, so a notebook path with
      # spaces or shell metacharacters would need quoting.
      command = ' '.join(['jupyter nbconvert', '--ExecutePreprocessor.timeout=360000',
                          '--ExecutePreprocessor.allow_errors=True', '--to notebook',
                          '--execute', f'{notebook_path} --output {filename}'])
      start_time = time.time()
      result = subprocess.run(command, shell=True)
      end_time = time.time()
      print(f'Time spent: {end_time-start_time:.2f}')
    if result.returncode != 0:
      # Report the failure instead of dropping the exit status silently.
      print(f'Command exited with code {result.returncode}: {command}')
  return filename

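# Disabled: the commented-out block below builds `disease_stats` (disease index ->
# 'name: counts'), the table `load_results` indexes for plot titles when it is
# given a short numeric disease id. It is kept for reference only.
# data_folder = '/media/6T/.Trash-1014/pan-can-atlas/data/processed'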
# if not os.path.exists(data_folder):
#   data_folder = ('/projects/academic/azhang/tianlema/deeplearning/'
#     'pan-can-atlas/data/processed')
# if not os.path.exists(data_folder):
#   data_folder = 'F:/TCGA/Pan-Cancer-Atlas/data/processed'
# with open(f'{data_folder}/sel_patient_clinical.pkl', 'rb') as f:
#   data = pickle.load(f)
#   disease_types = data['disease_types']
#   disease_type_dict = data['disease_type_dict']
#   pfi = data['pfi']
# disease_stats = {}
# for idx, name in disease_type_dict.items():
#   cnt = list(collections.Counter(pfi[disease_types==idx]).values())
#   if cnt[0] > 100 and cnt[1] > 100:
#     disease_stats[idx] = f'{name}: {cnt}'
#     print(name, idx, cnt)

In [None]:
# Settings forwarded unchanged, by name, into the parameter cell of every
# generated notebook (see the kwargs dict in the launch cell).

# -- data / label options
use_jia_data = False
randomize_labels = False
init_num_gene = 5000
min_num_gene_per_go = 5

# -- training options
lr = 1e-3
weight_decay = 1e-4
num_epochs = 100
reduce_every = 100
batch_size = None
return_best_val = True

# -- reporting cadence
print_every = 1
eval_every = 1

# -- output location
result_folder = 'results'

In [None]:
# Prediction target for this batch of runs, and one run per seed.
target_name = 'overall_survival'
seeds = range(10)
# 'sample_type' uses a 1:1:8 split; every other target uses 6:2:2.
split_portion = [1, 1, 8] if target_name == 'sample_type' else [6, 2, 2]

In [None]:
# Cancer types to run for each target (the per-branch comments record the
# sample-count criteria noted by the author).
if target_name == 'sample_type':
  # num['01'] >= 100 and num['11'] >= 50
  cancer_type_done = ['LUAD', 'LIHC', 'BRCA', 'KIRC', 'PRAD', 'THCA']
elif target_name == 'tumor_stage':
  # min_num_per_cls = 100, min_num_per_type = 200
  cancer_type_done = ['LUSC', 'LUAD', 'BLCA', 'BRCA', 'KIRC', 'THCA']
elif target_name == 'pfi':
  # num[0] >= 100, num[1] >= 100
  cancer_type_done = ['BLCA', 'BRCA', 'COAD', 'HNSC', 'KIRC', 'LGG', 'LIHC', 'LUAD', 
                      'LUSC', 'OV', 'SARC', 'SKCM', 'STAD', 'UCEC']
elif target_name == 'overall_survival':
  # num[0] >= 100, num[1] >= 100
  cancer_type_done = ['BLCA', 'BRCA', 'COAD', 'HNSC', 'KIRC', 'LGG', 'LIHC', 'LUAD',
                      'LUSC', 'OV', 'SKCM', 'STAD']
else:
  # Fail loudly rather than hit a NameError below or silently reuse a list
  # left in the kernel by an earlier run.
  raise ValueError(f'Unknown target_name: {target_name!r}')

# Split expressed as truncated integer percentages for the file name,
# e.g. [6, 2, 2] -> '60-20-20'. It does not depend on the loop variables.
split_percent = np.array(split_portion) * 100 / np.sum(split_portion)
split_portion_str = '-'.join(str(int(p)) for p in split_percent)

for sel_proj_id in cancer_type_done:
  # File-name tag only. The loop variable itself is left untouched so that
  # kwargs['sel_proj_id'] is the same for every seed (overwriting it inside the
  # seed loop made it differ between the first seed and the rest).
  # NOTE(review): with use_jia_data=True every cancer type maps to the same
  # 'JIA_*' file names, so later runs overwrite earlier ones - confirm intent.
  file_tag = 'JIA' if use_jia_data else sel_proj_id
  for seed in seeds:
    # Insertion order is the order of the assignments in the generated cell.
    kwargs = {'use_jia_data': use_jia_data,
              'sel_proj_id': sel_proj_id,
              'target_name': target_name,
              'split_portion': split_portion,
              'seed': seed,
              'randomize_labels': randomize_labels,
              'init_num_gene': init_num_gene,
              'min_num_gene_per_go': min_num_gene_per_go,
              'lr': lr,
              'weight_decay': weight_decay,
              'num_epochs': num_epochs,
              'reduce_every': reduce_every,
              'batch_size': batch_size,
              'print_every': print_every,
              'eval_every': eval_every,
              'return_best_val': return_best_val,
              'result_folder': result_folder}
    res_filename_prefix = f'{file_tag}_{split_portion_str}_seed{seed}_{target_name}'
    filename = f'{res_filename_prefix}.ipynb'
    new_notebook(kwargs, filename=filename, notebook_folder='results', 
             template_file='GeneNet.ipynb', 
             slurm_script='gpu-slurm', 
             new_file=True, submit=True, run_local=True,
             cell_idx=2, gpu_id=2)