Notes:
1. projects must be launched from the iris working directory (data and experiment paths are resolved relative to the current working directory)
9. integrate tfds into loops
13. find more stats for regression analysis
14. learn how to use compilers, hooks and registers to speed up processing
15. use sampler to increase information quality

In [None]:
import pprint
# jupyter notebook magic

%load_ext tensorboard

In [2]:
# imports

import torch
import torch.utils.tensorboard as tb
from torch.profiler import profile, ProfilerActivity
import general_torch as gt
import os


In [None]:
# init exp from scratch
# Each dictionary represents one individual experiment.  # todo important point here
# todo use itertools to generate a dictionary for each experiment

# Prefer the GPU when one is present; half precision is only selected on CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float

exp = {
    'model': 'model0',
    'model_init_file': 'model0_0.pth',
    'optim': 'sgd',
    # if a value is part of the experiment, leave it empty and init it where appropriate
    'optim_dict': {'lr': 0.001, 'momentum': 0.4},
    'error_f': 'mse',
    'stop': 'epoch',
    'stop_val': None,
    'max_epoch': 100,
    # put the tensors into one dict and this will be the file extension!
    'data_file': 'iris_tensor_dict.pt',
    # further candidate keys: 'data_batch_size': None, 'random_batch': False,
    'dl_dict': {'batch_size': 1, 'shuffle': True},
    # regression, class
    'model_type': 'classification',
    'seed': 42,
    'independent_var': 'lr',
    'hparams': {'lr': 0.001, 'optimizer': 'sgd'},
    'device': device,
    'data_type': dtype,
    'exp_file': 'experiments/troubleshoot_dump/run1',
}


In [None]:
# init profiler log paths
# Everything for this run lives under <cwd>/<exp['exp_file']>, with one
# sub-directory per phase for the profiler traces.
cwd = os.getcwd()
exp_dir = os.path.join(cwd, exp['exp_file'])
train_log, test_log = (os.path.join(exp_dir, phase) for phase in ('train', 'test'))

In [172]:
# raw data loader, change device, dtype and if needed shape
# todo: this needs to change - take exp file, load the dict, use dict to distribute to tensors

file_tensor = os.path.join(cwd, f'data/{exp["data_file"]}')

# NOTE(review): torch.load unpickles the file, so only load data files you trust.
dict_tensor = torch.load(file_tensor)


def _to_exp_tensor(name):
    """Return dict_tensor[name] moved to exp['device'] and then cast to exp['data_type']."""
    return dict_tensor[name].to(exp['device']).to(exp['data_type'])


train_data, train_label = _to_exp_tensor('train_data'), _to_exp_tensor('train_label')
valid_data, valid_label = _to_exp_tensor('valid_data'), _to_exp_tensor('valid_label')
test_data, test_label = _to_exp_tensor('test_data'), _to_exp_tensor('test_label')



In [228]:
# init data set

from torch.utils.data import Dataset

class Data(Dataset):
    """Pair a collection of samples with their labels for index-based access.

    Args:
        train: sized, indexable collection of input samples.
        label: sized, indexable collection of labels aligned with ``train``.

    Raises:
        ValueError: if ``train`` and ``label`` do not have the same length.
    """

    def __init__(self, train, label):
        # Fail fast: a length mismatch would otherwise surface as a late
        # IndexError (or silently ignore the surplus labels).
        if len(train) != len(label):
            raise ValueError(
                f"samples and labels differ in length: {len(train)} != {len(label)}"
            )
        self.data = train
        self.label = label

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at ``idx``."""
        return self.data[idx], self.label[idx]
        
# One Data wrapper per split, each pairing that split's inputs with its labels.
train_set = Data(train=train_data, label=train_label)
valid_set = Data(train=valid_data, label=valid_label)
test_set = Data(train=test_data, label=test_label)

In [None]:
# Always start by resetting the seeds before initializing anything that follows
# (the data loaders, model and optimizer are all created in later cells).
# NOTE(review): gt.init_seeds is defined outside this notebook; presumably it
# seeds the RNGs relevant to the given device - confirm.
gt.init_seeds(exp['seed'], exp['device'])

In [229]:
# prep dataloader, use kwargs to init
from torch.utils.data import DataLoader

# One loader per split, all configured from the same exp['dl_dict'] kwargs.
# NOTE(review): exp['dl_dict'] enables shuffle for the valid/test loaders too;
# confirm whether evaluation order matters to the logging helpers.
train_loader = DataLoader(train_set, **exp['dl_dict'])
valid_loader = DataLoader(valid_set, **exp['dl_dict'])
# The test loader must wrap the test split (it previously wrapped valid_set).
test_loader = DataLoader(test_set, **exp['dl_dict'])

In [231]:
# build model / load state dict / load whole model / explicitly init param weights
import models

# Look up the model class by name, instantiate it, and restore its initial weights.
model_cls = models.get_model(exp['model'])
model = model_cls()
init_state = models.load_trained_model(exp['model_init_file'])
model.load_state_dict(init_state)

# Cast to the experiment dtype first, then move to the experiment device.
model.to(exp['data_type'])
model.to(exp['device'])

# Training settings resolved from the experiment description.
epochs = exp['max_epoch']
optim_cls = gt.get_optim(exp['optim'])
optim = optim_cls(model.parameters(), **exp['optim_dict'])
error_cls = gt.get_error(exp['error_f'])
error = error_cls()



Model0_2(
  (lin1): Linear(in_features=1, out_features=50, bias=True)
  (lin2): Linear(in_features=50, out_features=50, bias=True)
  (lin3): Linear(in_features=50, out_features=1, bias=True)
)

In [None]:
# check for runtime errors / init predictions should be part of data collection loop
# Debug hint: if the forward pass fails on a dtype mismatch, compare
# next(model.parameters()).dtype with the dtype of the batch.

# Smoke test: push every training batch through the model exactly as the
# training loop will, so shape/dtype/device problems surface before training.
# (The loop previously ignored its batch and re-ran the whole test tensor on
# every iteration.)
with torch.inference_mode():
    for batch_idx, (data, target) in enumerate(train_loader):
        model(data)

In [None]:
# init tensorboard
# todo: the writer still needs the independent var, trial num and model.
# todo: candidates to log here - model, params, flops, train time, inference
#       time, memory usage, profiler output.

# All events for this run are written under exp_dir.
writer = tb.SummaryWriter(log_dir=exp_dir)
# Register the run's hyper-parameters; no metrics are attached at this point.
writer.add_hparams(hparam_dict=exp['hparams'], metric_dict={})

In [None]:
# init test predictions, to test overall improvement
# Baseline: evaluate the model on the test loader before any training so the
# final test results can be compared against it.

with torch.inference_mode():
    pred_list = []
    target_list = []
    total = 0
    # Iterate the test loader and score each batch against its own targets
    # (the loop previously walked train_loader while predicting on the whole
    # test tensor, pairing predictions with unrelated training labels).
    for batch_idx, (data, target) in enumerate(test_loader):
        preds = model(data)
        loss = error(preds, target)
        pred_list.append(preds)
        target_list.append(target)
        total += loss
    # Normalise by the number of samples behind the loader that was iterated.
    avg_error = total / len(test_loader.dataset)
    gt.write_test_info(exp, writer, target_list, pred_list, avg_error, 'test')
    

In [None]:
# modify hyperparams
# Manual override switch: flip to True to replace the epoch count, optimizer
# and loss taken from the experiment dict with the hard-coded values below.

modify_hyper = False
if modify_hyper:
    epochs, error = 10, torch.nn.MSELoss()
    optim = torch.optim.SGD(params=model.parameters(), momentum=0.4, lr=0.001)

In [235]:
# need separate data collection and examination loops here, also switch to dataloader, use skinny flag to fork collection

# for train and valid, log every epoch

'''
test  - accuracy, precision, f1, recall, error, gradients and weights
valid - accuracy, precision, f1, recall, error

there are two
online stats are collected after a complete epoch
batch stats are after each batch
mini_batch, after whole batch

'''

# Train for up to `epochs` epochs under the profiler. Each epoch runs one
# optimisation pass over train_loader and one evaluation pass over
# valid_loader, logging both through gt.write_train_info.
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             schedule=torch.profiler.schedule(wait=2, warmup=2, active=1),
             on_trace_ready=torch.profiler.tensorboard_trace_handler(os.path.join(train_log, 'profile')),
             record_shapes=True,
             with_stack=True) as prof:

    error_last = None  # average validation error of the previous epoch
    for _ in range(epochs):
        # ---- training pass ----
        model.train()
        total = 0
        target_list = []
        pred_list = []
        for batch_idx, (data, target) in enumerate(train_loader):
            pred = model(data)
            loss = error(pred, target)
            optim.zero_grad()
            loss.backward()
            optim.step()
            # Detach before storing: keeping the raw loss/prediction would hold
            # every batch's autograd graph in memory until the epoch ends.
            total += loss.detach()
            target_list.append(target)
            pred_list.append(pred.detach())
            prof.step()  # advance the profiler schedule once per batch
        # NOTE(review): dividing by the sample count matches the batch count
        # only while exp['dl_dict']['batch_size'] is 1 - confirm the intended
        # normalisation before using larger batches.
        avg_error = total / len(train_data)
        gt.write_train_info(exp, writer, model, target_list, pred_list, avg_error, 'train')

        # ---- validation pass ----
        model.eval()
        error_new = None
        total = 0
        target_list = []
        pred_list = []
        # No parameters are updated here, so skip gradient tracking entirely.
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(valid_loader):
                pred = model(data)
                total += error(pred, target)
                target_list.append(target)
                pred_list.append(pred)
        avg_error = total / len(valid_data)
        gt.write_train_info(exp, writer, model, target_list, pred_list, avg_error, 'valid')

        # Early stopping compares whole-epoch validation error (it previously
        # compared only the last batch's loss) and reads the threshold from
        # exp['stop_val'], the key the experiment dict actually defines
        # ('stop_value' does not exist and raised KeyError).
        error_new = avg_error
        if exp['stop'] == 'error_change':
            if error_last is not None and error_new / error_last < exp['stop_val']:
                break
            error_last = error_new


In [237]:
# run final inference

# Final evaluation on the test loader after training, profiled and logged in
# the same way as the pre-training baseline so the two can be compared.
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             schedule=torch.profiler.schedule(wait=2, warmup=2, active=3),
             on_trace_ready=torch.profiler.tensorboard_trace_handler(os.path.join(test_log, 'profile')),
             record_shapes=True,
             with_stack=True) as prof:

    with torch.inference_mode():
        pred_list = []
        target_list = []
        total = 0
        # Iterate the test loader and score each batch against its own targets
        # (the loop previously walked train_loader while predicting on the
        # whole test tensor, pairing predictions with unrelated train labels).
        for batch_idx, (data, target) in enumerate(test_loader):
            preds = model(data)
            loss = error(preds, target)
            pred_list.append(preds)
            target_list.append(target)
            total += loss
            prof.step()  # advance the profiler schedule once per batch
        # Normalise by the number of samples behind the loader that was iterated.
        avg_error = total / len(test_loader.dataset)
        gt.write_test_info(exp, writer, target_list, pred_list, avg_error, 'test')

# Flush and release the TensorBoard event file now that logging is finished.
writer.close()

In [None]:
%tensorboard --logdir='experiments/troubleshoot_dump/run1'  # this has to be set manually as this is a terminal 

'''if skinny_run:
    pass
    # skiny eval with no data collection, but still with tensor board
else:
    pass
    # eval model -- tensorboard, events, profiler, take predictions overtime and create a seaborn graph, build a suite!'''

In [None]:
#mkdown for notes

In [None]:
# save trial folder: needs the pred list, the model and the markdown notes above, all written to exp_dir

import pickle

# Persist the run's artifacts into exp_dir: the pickled predictions/labels
# together with the experiment dict, a readable dump of the experiment dict,
# and the model weights.
file_dir = dict(preds=pred_list, lables=target_list, exp=exp)

with open(os.path.join(exp_dir, 'pred_dir.pkl'), "wb") as file:
    pickle.dump(file_dir, file)

with open(os.path.join(exp_dir, 'info.txt'), 'w') as file:
    pprint.pprint(exp, stream=file)

# The weights file is named after the model identifier in the experiment dict.
filename = os.path.join(exp_dir, f'{exp["model"]}.pth')
torch.save(model.state_dict(), filename)

In [None]:
# this ends the trial loop