Notes:
1. projects must be launched from iris working directory
9. integrate tfds into loops
13. find more stats for regression analysis
14. learn how to use compilers, hooks and registers to speed up processing
15. use sampler to increase information quality
16. need a missed-image adder func for the image runs that logs the image, label and value to tensorboard
need a decision boundary function for 2d sanity tests; these two can be added when appropriate
    
    

In [49]:
# imports

import torch
import torch.utils.tensorboard as tb
from torch.profiler import profile, ProfilerActivity, record_function
import general_torch as gt
import display
import os


In [None]:
# todo initialize dataframe and exp directory here, pass it down to the next part

In [None]:
# todo use itertools to generate hparam dictionaries

In [50]:
# init exp from scratch
# each dictionary represents an individual experiment

# Pick the compute device first: CUDA when torch reports it as available,
# otherwise CPU. Half precision is only selected together with CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float

# The whole experiment is described by this single dictionary.
exp = {
    'model': 'model0',
    'model_init_file': 'model0_0.pth',
    'optim': 'sgd',
    # if a value is part of the experiment, leave it empty and init it where appropriate
    'optim_dict': {'lr': 0.1, 'momentum': 0.4},
    'error_f': 'bce',
    'stop': 'epoch',
    'stop_val': None,
    'max_epoch': 100,
    # put the tensors into one dict and this will be the file extension!
    'data_file': 'iris_tensor_dict.pt',
    # other candidate keys: 'data_batch_size': None, 'random_batch': False
    'dl_dict': {'batch_size': 1, 'shuffle': True},
    'model_type': 'classification',  # regression, class
    'seed': 42,
    # todo, expand this to include everything, build the optim and dl dicts after
    'hparams': {'lr': 0.1, 'optimizer': 'sgd'},
    'device': device,
    'data_type': dtype,
    'exp_file': 'experiments/troubleshoot_dump',
    'run': '4',
    'test_error': 0,
}


In [51]:
# init profiler log paths
# Resolve the experiment directory under the current working directory,
# then derive the per-run training log folder and the test log folder from it.
cwd = os.getcwd()
exp_dir = os.path.join(cwd, exp['exp_file'])
train_log, test_log = (
    os.path.join(exp_dir, leaf) for leaf in (f"run{exp['run']}", 'test')
)

In [52]:
# raw data loader, change device, dtype and if needed shape
# this needs to change, take exp file, load the dict, use dict to distribute to tensors

# Path of the saved tensor dictionary, relative to the working directory.
file_tensor = os.path.join(cwd, f'data/{exp["data_file"]}')

# map_location remaps the stored tensors onto the experiment device while
# loading, so a file written from a CUDA session still loads on a CPU-only
# machine (and the other way round).
dict_tensor = torch.load(file_tensor, map_location=exp['device'])

# Move/cast every split in a single .to() call.
# assumes each of the six entries is a torch.Tensor — TODO confirm against the
# script that writes the data file
train_data = dict_tensor['train_data'].to(device=exp['device'], dtype=exp['data_type'])
train_label = dict_tensor['train_label'].to(device=exp['device'], dtype=exp['data_type'])

valid_data = dict_tensor['valid_data'].to(device=exp['device'], dtype=exp['data_type'])
valid_label = dict_tensor['valid_label'].to(device=exp['device'], dtype=exp['data_type'])

test_data = dict_tensor['test_data'].to(device=exp['device'], dtype=exp['data_type'])
test_label = dict_tensor['test_label'].to(device=exp['device'], dtype=exp['data_type'])



In [53]:
# init data set

from torch.utils.data import Dataset

class Data(Dataset):
    """Map-style dataset that pairs each sample with its label by position.

    Args:
        train: sized, indexable collection of samples.
        label: sized, indexable collection of labels, one per sample.

    Raises:
        ValueError: if ``train`` and ``label`` do not have the same length.
    """

    def __init__(self, train, label):
        # Fail at construction time: a mismatch would otherwise only surface
        # as an IndexError mid-epoch, or silently drop trailing labels.
        if len(train) != len(label):
            raise ValueError(
                f"data and label lengths differ: {len(train)} != {len(label)}"
            )
        self.data = train
        self.label = label

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        return self.data[idx], self.label[idx]
        
# Wrap each (data, label) split in its own dataset object.
train_set, valid_set, test_set = (
    Data(features, labels)
    for features, labels in (
        (train_data, train_label),
        (valid_data, valid_label),
        (test_data, test_label),
    )
)

In [54]:
# always start by resetting seeds before initializing anything
# (data loaders, model and optimizer are all constructed after this call)
# NOTE(review): gt.init_seeds is project code not visible here; presumably it
# seeds torch for the given device — confirm in general_torch
gt.init_seeds(exp['seed'], exp['device'])

In [55]:
# prep dataloader, use kwargs to init
from torch.utils.data import DataLoader

# Build one loader per split; all three receive the same keyword arguments
# from exp['dl_dict'].
train_loader, valid_loader, test_loader = (
    DataLoader(split, **exp['dl_dict'])
    for split in (train_set, valid_set, test_set)
)

In [56]:
# build model / load state dict / load whole model / explicitly init param weights
import models

# Training model: class looked up by name, weights restored from the init
# file, then cast to the experiment dtype and moved to the experiment device.
model = models.get_model(exp['model'])()
init_state = models.load_init_model(exp['model_init_file'])
model.load_state_dict(init_state)
model.to(device=exp['device'], dtype=exp['data_type'])

# Training schedule, optimizer and loss, all selected by name from the
# experiment dictionary.
epochs = exp['max_epoch']
optim_cls = gt.get_optim(exp['optim'])
optim = optim_cls(model.parameters(), **exp['optim_dict'])
error_cls = gt.get_error(exp['error_f'])
error = error_cls()

# Second model built the same way from the same init file, plus its own
# loader over the training set, kept separate for profiling.
profile_model = models.get_model(exp['model'])()
profile_init_state = models.load_init_model(exp['model_init_file'])
profile_model.load_state_dict(profile_init_state)
profile_model.to(device=exp['device'], dtype=exp['data_type'])
profile_train_loader = DataLoader(train_set, **exp['dl_dict'])


In [57]:
# check for runtime errors / init predictions should be part of data collection loop
'''for i in model.parameters():
    print(i.dtype)
    break
print(test_data.dtype)'''

# Smoke test for runtime errors (shape / dtype / device mismatches) before any
# training happens. No gradients are needed, so run under inference mode.
with torch.inference_mode():
    # One forward pass over the full test tensor; this input does not depend
    # on the loop, so it is run once instead of once per training batch.
    model(test_data)
    # Then push every batch the loader actually produces through the model, so
    # the loader -> model path is exercised with real batch shapes.
    for data, target in train_loader:
        model(data)

  return torch.nn.functional.softmax(x)


In [58]:
# init tensorboard
# writer needs independent var, trial num, model
# write model, params, flops, train time, inference time, memory usage, profiler, can be used here
# write trial num, model and independent var

# Open the tensorboard writer on this run's log directory and record the
# experiment's hyperparameter dictionary (no metrics are attached yet).
train_writer = tb.SummaryWriter(log_dir=train_log)
hparam_record = exp['hparams']
train_writer.add_hparams(hparam_record, metric_dict={})  # todo add all params!!!



In [None]:
# profile model
# todo this all was added

# Profile the dedicated profile_model / profile_train_loader pair rather than
# the real model: the profiled optimizer steps would otherwise change the
# weights that the actual training loop is supposed to start from.
profile_optim = gt.get_optim(exp['optim'])(profile_model.parameters(), **exp['optim_dict'])

# Only request CUDA activity when a GPU is actually present.
profile_activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    profile_activities.append(ProfilerActivity.CUDA)

# One schedule cycle is wait (5) + warmup (4) + active (1) = 10 steps.
profile_steps = 10

with profile(activities=profile_activities,
             # repeat=2: the first cycle covers the 10 training steps, the
             # second the 10 inference steps. With repeat=1 the schedule is
             # finished before inference starts and nothing is recorded for it.
             schedule=torch.profiler.schedule(wait=5, warmup=4, active=1, repeat=2),
             on_trace_ready=torch.profiler.tensorboard_trace_handler(os.path.join(train_log, 'model_profiles')),
             record_shapes=True,
             with_stack=True,
             profile_memory=True) as prof:

    for batch_idx, (data, target) in enumerate(profile_train_loader):
        # Label each step on its own so the range is opened inside the step
        # that is actually being recorded.
        with record_function('training'):
            pred = profile_model(data)
            loss = error(pred, target)
            profile_optim.zero_grad()
            loss.backward()
            profile_optim.step()
        prof.step()
        if batch_idx + 1 == profile_steps:
            break

    # Inference is timed without autograd bookkeeping.
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(profile_train_loader):
            with record_function('inference'):
                profile_model(data)
            prof.step()
            if batch_idx + 1 == profile_steps:
                break

# Log the model graph once profiling is finished, using the last profiled
# batch as the example input.
train_writer.add_graph(profile_model, data)


In [59]:
# init test predictions, to test overall improvement, only needed if model reuse

'''with torch.inference_mode():  
    pred_list = []
    target_list = []
    total = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        preds = model(data)
        loss = error(preds, target)
        pred_list.append(preds)
        target_list.append(target)
        total += loss
    avg_error = total / len(test_data)
    gt.write_test_info(exp, train_writer, target_list, pred_list, avg_error)'''
    

In [60]:
# modify hyperparams, only for model reuse

'''modify_hyper = False
if modify_hyper:
    epochs = 10
    optim = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.4)
    error = torch.nn.MSELoss()'''

In [None]:
# for train and valid, log every epoch

# Train for up to `epochs` epochs. After every epoch the training and the
# validation results are written through display.write_train_info, and the
# optional 'error_change' stop rule is evaluated on the validation error.
error_last = None
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total = 0
    target_list = []
    pred_list = []
    for batch_idx, (data, target) in enumerate(train_loader):
        pred = model(data)
        loss = error(pred, target)
        optim.zero_grad()
        loss.backward()
        optim.step()
        # Store detached tensors only: keeping the live loss / prediction
        # would retain every batch's autograd graph until the epoch ends.
        total += loss.detach()
        target_list.append(target)
        pred_list.append(pred.detach())
    # NOTE(review): dividing by the sample count equals the mean of per-batch
    # losses only while batch_size is 1 — confirm the reduction used by `error`.
    avg_error = total / len(train_data)
    display.write_train_info(exp, train_writer, model, target_list, pred_list, avg_error, 'train', epoch)

    # ---- validation pass (no gradients needed) ----
    model.eval()
    total = 0
    target_list = []
    pred_list = []
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(valid_loader):
            pred = model(data)
            total += error(pred, target)
            target_list.append(target)
            pred_list.append(pred)
    avg_error = total / len(valid_data)
    display.write_train_info(exp, train_writer, model, target_list, pred_list, avg_error, 'valid', epoch)

    # ---- optional early stop on the change in validation error ----
    if exp['stop'] == 'error_change':
        # Compare epoch averages; the loss of the last (shuffled) batch alone
        # is too noisy to stop on.
        error_new = avg_error
        # The threshold lives under 'stop_val' in the experiment dictionary.
        # NOTE(review): this stops when new/last falls BELOW the threshold —
        # confirm that is the intended direction.
        if error_last is not None and error_new / error_last < exp['stop_val']:
            break
        error_last = error_new



In [None]:
# run final inference

# init test predictions, to test overall improvement

# Final evaluation on the held-out test split.
# Make sure the model is in eval mode even if the training cell was skipped.
model.eval()
with torch.inference_mode():
    pred_list = []
    target_list = []
    total = 0
    # Iterate the TEST loader: the collected predictions/targets and the
    # average below (divided by len(test_data)) all describe the test split.
    for batch_idx, (data, target) in enumerate(test_loader):
        preds = model(data)
        loss = error(preds, target)
        pred_list.append(preds)
        target_list.append(target)
        total += loss
    avg_error = total / len(test_data)

In [None]:
# todo test confusion matrix and para coord func here, 

In [None]:
"""
write confusion matrix with custom func
"""
# NOTE(review): `df` is not defined in any visible cell (the dataframe init is
# still a todo), so this call raises NameError as written — confirm what
# display.write_test_info expects as its second argument.
display.write_test_info(exp, df, target_list, pred_list, avg_error)
# add_pr_curve converts its labels/predictions to numpy and does not accept a
# Python list of per-batch tensors, so join the batches into one tensor each.
train_writer.add_pr_curve(
    'test/precision_recall_curve',
    torch.cat(target_list),
    torch.cat(pred_list),
    0,
)  # todo just added
train_writer.close()

In [63]:
#mkdown for notes

In [64]:
# save the trial folder: needs the pred list, the model and the above mkdown, written to exp_dir (built from exp['exp_file'])

import pickle
import pprint

# Bundle the final predictions, their targets and the experiment settings.
# NOTE: the targets are stored under the key 'lables' (sic); anything reading
# the pickle back has to use that exact spelling.
file_dir = dict(preds=pred_list, lables=target_list, exp=exp)

# 1) pickled predictions bundle
filename = os.path.join(exp_dir, 'pred_dir.pkl')
with open(filename, "wb") as pkl_out:
    pickle.dump(file_dir, pkl_out)

# 2) pretty-printed experiment settings
# todo store everything as json together in exp dir along with df
# todo save df in exp dir
filename = os.path.join(exp_dir, 'info.txt')
with open(filename, 'w') as txt_out:
    pprint.pprint(exp, stream=txt_out)

# 3) trained weights, named after the model
weights = model.state_dict()
filename = os.path.join(exp_dir, f'{exp["model"]}.pth')
torch.save(weights, filename)

In [None]:
# this ends the trial loop

In [None]:
'''write parallel coordinate graph using dictionaries
write seaborn test bar graphs using pandas df'''

# todo final write

In [None]:
# Complete! All data is viewable in tensorboard