# Importing libraries

In [1]:
%reload_ext autoreload

%autoreload 2

import os
import funcs 
import load_data
import tensorflow as tf
import mlflow
import subprocess
import git
import numpy as np
import pandas as pd
from tqdm import tqdm
from time import time

# NOTE(review): %reload_ext is meant for IPython extensions. load_data and funcs are
# imported above as regular modules and `%autoreload 2` is already active, so these
# two lines are presumably redundant — confirm before removing.
%reload_ext load_data
%reload_ext funcs

# Set up the GPU

In [2]:
# Make only GPU 0 visible to TensorFlow.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Session configuration: device counts, on-demand GPU memory growth and
# device-placement logging are all set at construction time.
config = tf.compat.v1.ConfigProto(
    device_count={"GPU": 1, "CPU": 10},
    gpu_options=tf.compat.v1.GPUOptions(allow_growth=True),
    log_device_placement=True,
)

# Create the session and register it with the TF1-compat Keras backend.
sess = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(sess)

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device

  and should_run_async(code)


# Creating an SSH tunnel to the server in the background

In [3]:
# Open an SSH tunnel that forwards local port 5000 to localhost:5432 on the remote host.
# ssh is started directly (no shell, no trailing '&'): Popen already returns immediately,
# and this way ssh_session is the ssh process itself, so it can be stopped at the end of
# the notebook. With "exec ... &" under shell=True the handle referred to the shell, which
# exits right away and leaves ssh running unmanaged.
command = 'ssh -N -L 5000:localhost:5432 artinmajdi@data7-db1.cyverse.org'
ssh_session = subprocess.Popen(command.split(), stdout=subprocess.PIPE)

# MLflow set up

In [4]:
# Tracking server URI and artifact location come from the project helper.
server, artifact = funcs.mlflow_settings()
mlflow.set_tracking_uri(server)

# Experiment selection.
# create_experiment is only needed the first time, to register the experiment with the
# artifact location above. If it stays commented out on the first run of a new
# experiment, set_experiment creates the experiment itself with local artifact storage.
experiment_name = 'label_inter_dependence'
# mlflow.create_experiment(name=experiment_name, artifact_location=artifact)
mlflow.set_experiment(experiment_name=experiment_name)

# Let MLflow log the Keras optimization parameters automatically.
mlflow.keras.autolog()

# Start the MLflow run.
ADD_RUN_NAME = False
if not ADD_RUN_NAME:
    mlflow.start_run()
else:
    # A run name replaces the run id shown at the top of the UI page,
    # so the id is stored as a tag to keep it visible.
    run = mlflow.start_run(run_name='Uncertainty Measurement')
    mlflow.set_tag('run_id', run.info.run_id)

## Saving the Git commit  (only in Jupyter notebook)
This step is only needed when the code is run from a Jupyter notebook.

You can annotate runs with arbitrary tags. Tag keys that start with `mlflow.` are reserved for internal use. Some of these tags are set automatically by MLflow when appropriate; the cell below sets one of them, `mlflow.source.git.commit`, by hand:

In [5]:
# Locate the enclosing git repository (searching parent directories) and read
# the hash of the commit that HEAD points to.
repo = git.Repo(search_parent_directories=True)
head_object = repo.head.object
git_commit_hash = head_object.hexsha

# Record the commit on the active MLflow run, then echo it in the notebook.
mlflow.set_tag('mlflow.source.git.commit', git_commit_hash)
print('git commit hash', git_commit_hash)


git commit hash 58b97dd53b8db7c68f13edbb52d6806d5dc34fee


### Writing a note at the top of the run page


In [6]:
# Free-text description of the run, stored under the 'mlflow.note.content' tag.
run_note = 'This simulation does \n this \n that'
mlflow.set_tag('mlflow.note.content', run_note)

  and should_run_async(code)


# Model optimization

## Reading Terminal Inputs

In [14]:
# Run parameters. In a script these come from the command line:
# epochs, batch_size, max_sample = funcs.reading_terminal_inputs()
# In the notebook they are fixed by hand.
epochs = 1
batch_size = 32
max_sample = 1000

### Selecting the dataset 

In [8]:
# Dataset to use: 'chexpert' (alternative: 'nih').
dataset = 'chexpert'

# Root folder of the selected dataset. The name `dir` shadows the builtin but is
# kept because the following cells refer to it.
dir = f'/groups/jjrodrig/projects/chest/dataset/{dataset}/'

# Record the dataset choice on the active run.
mlflow.log_param('dataset', dataset)

### Loading the data

In [15]:
%%time
(train_dataset, valid_dataset), (train_generator, valid_generator), Info= load_data.load(dir=dir, dataset=dataset, batch_size=30, mode='train_val', max_sample=max_sample)

mlflow.log_param('train count',len(train_generator.filenames))
mlflow.log_param('valid count',len(valid_generator.filenames))
mlflow.log_param('max_sample',max_sample)

before sample-pruning
train: (223414, 19)
test: (234, 19)

after sample-pruning
train (certain): (567, 20)
train (uncertain): (291, 20)
valid: (142, 20)
test: (169, 20) 

Found 567 validated image filenames.
Found 142 validated image filenames.


MlflowException: Changing param values is not allowed. Param with key='train count' was already logged with value='2710' for run ID='a952cf776e0f4411853331ab7791f1e6'. Attempted logging new value '567'.

### Optimization

In [16]:
%%time
funcs.optimize(dir, train_dataset, valid_dataset, epochs, Info)

  and should_run_async(code)
  tensor_proto.tensor_content = nparray.tostring()
  try_mlflow_log(mlflow.log_param, param_name, kwargs[param_name])
  try_mlflow_log(mlflow.log_param, param_name, kwargs[param_name])
  try_mlflow_log(mlflow.log_param, param_name, kwargs[param_name])
CPU times: user 14min 13s, sys: 2min 2s, total: 16min 15s
Wall time: 1min 19s


# Evaluation

In [17]:
# Toggle for the evaluation step.
EVALUATE = True
if EVALUATE:
    # Only the scores are computed here. Converting them to a DataFrame and logging
    # them as an MLflow artifact is handled by the two cells that follow; doing both
    # here as well wrote and logged a second, identical results file on every run.
    score = funcs.evaluate(dir=dir, dataset=dataset, batch_size=200)

before sample-pruning
train: (223414, 19)
test: (234, 19)

after sample-pruning
train (certain): (54951, 20)
train (uncertain): (31311, 20)
valid: (13738, 20)
test: (169, 20) 

Found 169 validated image filenames.
  tensor_proto.tensor_content = nparray.tostring()
100%|██████████| 169/169 [00:26<00:00,  6.39it/s]


### Converting the outputs to a dataframe

In [17]:
# Build a DataFrame from the evaluation scores and transpose it.
score_frame = pd.DataFrame.from_dict(score)
df = score_frame.transpose()

  and should_run_async(code)


### Save as mlflow artifact

In [18]:
# Timestamp (whole seconds) used in the results file name.
tm = str(int(time()))

# Write the results as JSON under the dataset's model folder and attach the
# file to the active MLflow run.
results_path = dir + 'model/test_results_' + tm + '.json'
df.to_json(results_path)
mlflow.log_artifact(results_path)

## Closing the mlflow & ssh sessions

In [19]:
# closing the mlflow session
mlflow.end_run()

# closing the ssh session
ssh_session.kill()

print('Finished')

Finished


### Guide: Viewing the results:

        
    >> ssh -N -L 5000:localhost:5432 artinmajdi@data7-db1.cyverse.org &
    >> mlflow ui --backend-store-uri postgresql://artinmajdi:1234@localhost:5000/chest_db --port 6789             
