In [14]:
from __future__ import print_function 

%load_ext autoreload
%autoreload 2

from matplotlib import pyplot as plt
%matplotlib inline

import os, sys
import numpy as np
import time

import tensorflow as tf
from tensorflow.keras import backend as K
K.clear_session()

import pandas as pd
import pickle
import gc, re, copy
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.layers import deserialize, serialize
from tensorflow.python.keras.saving import saving_utils

# Project imports 
from data import mnist_m as mnistm
from data import mnist
from data.label_shift import label_shift_linear, plot_labeldist, plot_splitbars
from data.tasks import load_task
from experiments.training import *
from bounds.bounds import *
from util.kl import *
from util.misc import *
from util.batch import *
# 
# ---------------------------------------------------------------------------
# Hyper-parameters for the sweep (everything a reader might want to tune)
# ---------------------------------------------------------------------------

# Experiment identity
task = 7
seeds = [1]  # alternative: [1, 2, 3, 4, 5]

# Model / data settings
architecture = 'resnet'
image_size = 64
batch_size = 32
binary = True

# Bound settings
bound = 'germain'
n_classifiers = 5
# Forwarded to the job as `delta`.
# NOTE(review): presumably the confidence parameter of the bound
# (bound holds with probability 1 - delta) -- TODO confirm.
delta = 0.05

# Swept values; full alpha grid: [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
# (an `epsilons = [0.1]` setting was used previously and is currently disabled)
alphas = [0.3]
sigmas = [[3, 3], [3, 4]]

# Root folder for logs on the cluster.
# Previous location: "/cephyr/users/adambre/Alvis/"
project_folder = '/cephyr/NOBACKUP/groups/snic2021-23-538/mnist_transfer/'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Alvis params

In [15]:
# SLURM submission settings: `project` is passed to sbatch via -A, `username`
# is used to filter squeue, and `job` names the array job script.
username = "adambre"
project = "SNIC2022-5-244"
job = "batch_bound_array.sbatch"

In [16]:
!squeue -u adambre


             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
          436812_1     alvis paramete  adambre  R       3:35      1 alvis2-02
          436812_2     alvis paramete  adambre  R       3:35      1 alvis2-02
          436812_3     alvis paramete  adambre  R       3:35      1 alvis2-02
          436812_4     alvis paramete  adambre  R       3:35      1 alvis2-02
          436812_5     alvis paramete  adambre  R       3:35      1 alvis2-02
          436812_6     alvis paramete  adambre  R       3:35      1 alvis2-02
          436812_7     alvis paramete  adambre  R       3:35      1 alvis2-02
          436812_8     alvis paramete  adambre  R       3:35      1 alvis2-03
          436812_9     alvis paramete  adambre  R       3:35      1 alvis2-03
         436812_10     alvis paramete  adambre  R       3:35      1 alvis2-03
         436812_11     alvis paramete  adambre  R       3:35      1 alvis2-03
         436812_12     alvis paramete  adambr

In [17]:
!cat $job

#!/usr/bin/bash

#SBATCH -t 1-00:00:00
#SBATCH -N 1 --gpus-per-node=T4:1
#SBATCH -p alvis 
num_parameter_combos=$1

eval `head -n $SLURM_ARRAY_TASK_ID $num_parameter_combos | tail -1`



if [ -z "$task" ]
then
    task=2
fi
if [ -z "$seed" ]
then 
    seed=69105
fi
if [ -z "$alpha" ]
then 
    alpha=0.0
fi
if [ -z "$sigma" ]
then 
    sigma='3,3'
fi
if [ -z "$delta" ]
then 
    delta=0.05
fi
if [ -z "$binary" ]
then 
    binary=0
fi
if [ -z "$n_classifiers" ]
then 
    n_classifiers=2
fi
if [ -z "$bound" ]
then 
    bound='germain'
fi
if [ -z "$prior_path" ]
then 
    prior_path=''
fi
if [ -z "$posterior_path" ]
then 
    posterior_path=''
fi
if [ -z "$architecture" ]
then 
    architecture='lenet'
fi
if [ -z "$image_size" ]
then 
    image_size=32
fi
if [ -z "$batch_size" ]
then 
    batch_size=128
fi
. load_modules.sh

python batch_bound_single.py -t $task -r $seed -a $alpha -s $sigma -d $delta -b $binary -n $n_classifi

In [18]:
print('Iterating over experiments...\n')

fids = []

os.makedirs(project_folder+'logs', exist_ok=True)
logdir=project_folder+'logs'
parameter_set_name="parameter_sets"
experiments=1

with open(parameter_set_name, 'w') as input_file:
  # all the same loops as before
    for seed in seeds:
        np.random.seed(seed)

        for alpha in alphas:

            print("alpha:"+str(alpha))

            for sigma in sigmas:    
                print("    sigma:"+str(sigma))
                arg_list = get_job_args(task, bound=bound, alpha=alpha, sigma=sigma,
                                        binary=binary, n_classifiers=n_classifiers,architecture=architecture,
                                        seed=seed,image_size=image_size, batch_size=batch_size)
                for a in arg_list:   

                    ckpt = os.path.splitext(os.path.basename(a['posterior_path']))[0]
                    fid = project_folder+'logs/batch_t-%d_r-%d_a-%.4f_s-%d%d_d-%.4f_b-%d_n-%d_B-%s_c-%s_A-%s_I-%d_F-%d' % \
                        (task, seed, alpha, sigma[0], sigma[1], delta, binary, n_classifiers, bound, ckpt, architecture,image_size,batch_size)

                    sigstr = '"%d.%d"' % (sigma[0], sigma[1])
                    exp = 'task=%d,seed=%d,alpha=%.4f,sigma="%s",delta=%.4f,'% (task, seed, alpha, sigstr, delta)\
                            +'binary=%d,nclassifiers=%d,bound=%s,prior=%s,posterior=%s,architecture=%s,image_size=%d,batch_size=%d' % (binary, n_classifiers, bound, a['prior_path'], a['posterior_path'],architecture,image_size,batch_size)

                    prior_path = '"%s"' % a['prior_path']
                    posterior_path = '"%s"' % a['posterior_path']
                    if binary:
                        binary=1
                    else:
                        binary=0
                    #architecture = '"%s"' % architecture
                    #bound = '"%s"' % bound
                    
                    input_file.write(f'task={task} seed={seed} alpha={alpha:.4f} sigma={sigstr} delta={delta:.4f} binary={binary} n_classifiers={n_classifiers} bound={bound} prior_path={prior_path} posterior_path={posterior_path} architecture={architecture} image_size={image_size} batch_size={batch_size}\n')  # space separated for bash
                    
                    experiments += 1
# print(experiments)                                     
# print(input_file)
                                                     
output = !sbatch --array 1-$experiments -J "$parameter_set_name" -o "$logdir"/"$parameter_set_name".%A_%a.out -A $project batch_bound_array.sbatch $parameter_set_name 
jobid = int(output[0].split(' ')[-1])       


#### OLD WAY
# for seed in seeds:
#     np.random.seed(seed)

#     for alpha in alphas:

#         print("alpha:"+str(alpha))

#         for sigma in sigmas:    
#             print("    sigma:"+str(sigma))
#             arg_list = get_job_args(task, bound=bound, alpha=alpha, sigma=sigma,
#                                     binary=binary, n_classifiers=n_classifiers,architecture=architecture,
#                                     seed=seed,image_size=image_size, batch_size=batch_size)
#             for a in arg_list:   

#                 ckpt = os.path.splitext(os.path.basename(a['posterior_path']))[0]
#                 fid = project_folder+'logs/batch_t-%d_r-%d_a-%.4f_s-%d%d_d-%.4f_b-%d_n-%d_B-%s_c-%s_A-%s_I-%d_F-%d' % \
#                     (task, seed, alpha, sigma[0], sigma[1], delta, binary, n_classifiers, bound, ckpt, architecture,image_size,batch_size)

#                 sigstr = '"%d.%d"' % (sigma[0], sigma[1])
#                 exp = 'task=%d,seed=%d,alpha=%.4f,sigma="%s",delta=%.4f,'% (task, seed, alpha, sigstr, delta)\
#                         +'binary=%d,nclassifiers=%d,bound=%s,prior=%s,posterior=%s,architecture=%s,image_size=%d,batch_size=%d' % (binary, n_classifiers, bound, a['prior_path'], a['posterior_path'],architecture,image_size,batch_size)

#                 prior_path = a['prior_path']
#                 output = !sbatch -o "$fid"-%j.out -e "$fid"-%j.err -A $project --export="$exp" $job

#                 jobid = int(output[0].split(' ')[-1])
#                 fid = fid+'-%s' % jobid

#                 fids.append(fid)

Iterating over experiments...

alpha:0.3
    sigma:[3, 3]
    sigma:[3, 4]


In [13]:
#print(fids)
#print(output)
!tail -n 20 "$logdir"/"$parameter_set_name"."$jobid"_1.out
#pd.read_pickle("/cephyr/NOBACKUP/groups/snic2021-23-538/mnist_transfer/results/task6/Binary/fc/32_0_32_4_1_0_results.pkl")
#"$jobid"_1.out

coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.61GiB deviceMemoryBandwidth: 298.08GiB/s
2022-06-21 16:39:37.318663: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0
2022-06-21 16:39:37.318725: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:
2022-06-21 16:39:37.318742: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264]      0 
2022-06-21 16:39:37.318757: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0:   N 
2022-06-21 16:39:37.324071: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 13693 MB memory) -> physical GPU (device: 0, name: Tesla T4, pci bus id: 0000:06:00.0, compute capability: 7.5)
2022-06-21 16:39:37.344545: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:06:00.0 name: Tesla T4 computeCapability:

In [26]:
!squeue -u $username

"""
to remove all jobs queued
bash_jobid=214746
squeue -u adambre | awk '{if ($1!=JOBID && $1!=bash_jobid) {print $1}}' | tail -n +2 |xargs scancel


"""
#!squeue -u adambre | awk '{if ($1!=JOBID && $1!=373627) {print $1}}' | tail -n +2 |xargs scancel
#!scancel 398381

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            435145     alvis     bash  adambre  R 4-23:48:45      1 alvis2-01


"\nto remove all jobs queued\nbash_jobid=214746\nsqueue -u adambre | awk '{if ($1!=JOBID && $1!=bash_jobid) {print $1}}' | tail -n +2 |xargs scancel\n\n\n"

In [18]:
for fid in fids: 
    print('------')
    print(fid)
    !tail -n 10 "$fid".out
    !tail -n 5 "$fid".err
    print(' \n\n')