# GitLab setup

In [None]:
token = 
email = 
username = 
repo = 
id = 
branch = 
experiment =

!git config --global user.email {email}
!git config --global user.name {username}

import os
os.environ['MLFLOW_TRACKING_URI'] = f'https://gitlab.com/api/v4/projects/{id}/ml/mlflow'
os.environ['MLFLOW_TRACKING_TOKEN'] = token

In [None]:
# !git clone -b {branch} https://oauth2:{token}@gitlab.com/{username}/{repo}.git

# Data setup

In [None]:
# Paths
# Detect Google Colab by attempting the import. Only a failed import means
# "not on Colab"; a failing or interrupted Drive mount is no longer swallowed,
# so the user sees the real error instead of a missing-file error later on.
try:
    import google.colab
except ImportError:
    # Local / non-Colab run: use the local project folder.
    root = 'path_to_main_folder/'
else:
    # Colab run: mount Google Drive, then point at the project folder.
    google.colab.drive.mount('/content/drive')
    root = 'path_to_main_folder/'

# All paths are built by plain string concatenation, so `root` must end with '/'.
images_folder = root+'data/images/'
save_path = root+'models/'
other_paths = root+'declare_other_paths/'

# Task configuration read by the DataBlock / learner cells below.
task_type = 'classification'   # 'classification' or 'regression'
task_names = ['a', 'b']        # one name per task; its length defines the number of tasks

In [4]:
from fastai.vision.all import *
import albumentations
from DLOlympus.training.transforms import AlbumentationsTransform
from DLOlympus.training.utils import get_model
from DLOlympus.training.unbalanced import get_weights, oversampled_epoch

In [None]:
# Hyperparameters
# Single configuration dict read by the cells below and passed to `mlflow_log` at the end.

# Target image size; also used to parameterise the final random crop.
h, w = 224, 224

hyperparameters = {
    # Free-text description; not read elsewhere in this notebook (presumably logged with the run).
    'MODEL_DESCRIPTION': '',
    # Batch size handed to `block.dataloaders`.
    'BS': 16,
    # Number of epochs handed to `learn.fine_tune`.
    'EPOCHS': 30,
    # Size handed to the `Resize` item transform.
    'IMG_SIZE': (h, w),      # (height, width)
    # Weight decay handed to `vision_learner`.
    'WD': 0.0,
    # Albumentations augmentations, wrapped in `albumentations.Compose` when the DataBlock is built.
    # NOTE(review): `RandomSizedCrop` is called with positional (min_max_height, height, width);
    # confirm this matches the signature of the installed albumentations version.
    'TRANSFORMS': [
        albumentations.HorizontalFlip(p=0.5),
        albumentations.VerticalFlip(p=0.5),
        albumentations.Rotate(p=0.5),
        albumentations.Sharpen(p=0.5),
        albumentations.ColorJitter(brightness=0.3, contrast=0.5, saturation=0.5, hue=0.0, p=0.5),
        albumentations.RGBShift(p=0.5),
        albumentations.GaussianBlur(p=0.5),
        albumentations.GaussNoise(p=0.5),
        albumentations.RandomSizedCrop((int(0.75*h),h), h, w, p=1.0)
        ],
    # Architecture selection; both keys are consumed by `get_model(hyperparameters)`.
    'ARCH': 'resnet50',
    'ARCH_TYPE': 'torchvision',
    # Names looked up at runtime with getattr on the notebook's namespace,
    # so they must match objects that are in scope (e.g. via the fastai star import).
    'LOSS_FUNC': 'LabelSmoothingCrossEntropyFlat',
    'OPT_FUNC': 'Adam',
    # Enables the oversampling cell (patches the training loader's `get_idxs`).
    'USE_OVERSAMPLING': False,
    # Enables class weights from `get_weights(dls)` in the loss.
    'USE_LOSS_WEIGHTS': False,
    # Seed passed to `set_seed` and to `dls.rng.seed`.
    'SEED': 18,
}

In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold

# Template stub: fill in the three assignments for your dataset.
# Returns three parallel sequences (one entry per image) that are fed to `create_df`:
#   image_files -> stored in the 'file_path' column
#   labels      -> stored in the 'label' column and used to stratify the folds
#                  (the DataBlock splits each label on ', ' to obtain one value per task)
#   groups      -> stored in the 'groups' column and passed as `groups` to the fold splitter
def get_data():
    image_files = 
    labels = 
    groups = 
    return image_files, labels, groups

def create_df(image_files, labels, groups, n_splits=10, n_valid=2):
    """Build the training dataframe with a stratified, group-aware train/valid split.

    The samples are partitioned into `n_splits` folds with `StratifiedGroupKFold`
    (stratified on `labels`, using `groups` as the grouping variable). Samples
    falling in the first `n_valid` folds are marked as validation data.

    Args:
        image_files: sequence of image paths, one per sample.
        labels: sequence of labels, parallel to `image_files`.
        groups: sequence of group identifiers, parallel to `image_files`.
        n_splits: number of folds to create.
        n_valid: number of folds (0 .. n_valid-1) assigned to validation.

    Returns:
        pandas.DataFrame with columns 'file_path', 'label', 'groups' and the
        boolean column 'is_valid'.
    """
    df = pd.DataFrame()
    df['file_path'] = image_files
    df['label'] = labels
    df['groups'] = groups

    # The splitter yields *positional* indices. Keep the fold assignment in a plain
    # list addressed by position instead of writing through `df.loc`, which is
    # label-based and mis-assigns (or raises) when the inputs carry a non-default index.
    fold_of_row = [-1] * len(df)
    cv = StratifiedGroupKFold(n_splits=n_splits)
    for fold, (_, valid_idxs) in enumerate(cv.split(image_files, labels, groups)):
        for pos in valid_idxs:
            fold_of_row[pos] = fold

    # Rows whose fold is one of 0 .. n_valid-1 form the validation set.
    df['is_valid'] = [0 <= fold < n_valid for fold in fold_of_row]
    return df

In [None]:
# Dataframe
# Load the raw file/label/group lists and build the dataframe with the 'is_valid' split column.
image_files, labels, groups = get_data()
df = create_df(image_files, labels, groups)

# Displayed as the cell output: number of samples per label, to check class balance.
df['label'].value_counts()

In [None]:
# Seed the RNGs before anything random happens (second argument is fastai's `reproducible` flag).
set_seed(hyperparameters['SEED'], True)

# Determine the number of tasks
n_tasks = len(task_names)

# Determine block types: one image input, then either one CategoryBlock per task
# (classification) or a single RegressionBlock with one output per task (regression).
if task_type == 'classification':
    blocks = (ImageBlock,) + (CategoryBlock,) * n_tasks
elif task_type == 'regression':
    blocks = (ImageBlock, RegressionBlock(n_out=n_tasks))
else:
    # Fail fast instead of hitting a NameError on `blocks` further down.
    raise ValueError(f"Unsupported task_type {task_type!r}: expected 'classification' or 'regression'")

# Datablock
# Each target getter picks the i-th entry of the ', '-separated label string;
# `i=i` binds the loop value at definition time (avoids the late-binding closure pitfall).
# NOTE(review): in the regression case this still builds `n_tasks` string-valued getters
# for a single RegressionBlock target -- confirm the intended label format / getter count.
block = DataBlock(
    blocks=blocks,
    n_inp=1,
    get_x=ColReader('file_path'),
    get_y=[lambda x, i=i: x['label'].split(', ')[i] for i in range(n_tasks)],
    splitter=ColSplitter(col='is_valid'),
    item_tfms=[
        Resize(hyperparameters['IMG_SIZE'], method='squish'),
        AlbumentationsTransform(albumentations.Compose(hyperparameters['TRANSFORMS']))])

# Dataloaders
dls = block.dataloaders(df, bs=hyperparameters['BS'], shuffle=True)
dls.rng.seed(hyperparameters['SEED'])

# Sanity check
if task_type == 'classification':
    # Normalise to a list so single- and multi-task cases are handled uniformly downstream.
    n_classes = [dls.c] if isinstance(dls.c, int) else dls.c
    print('Number of clases: ', n_classes)
    classes = dls.vocab if n_tasks>1 else [dls.vocab]
    print('Names of classes: ', classes)
elif task_type == 'regression':
    n_out = dls.c
    print('Number of outputs: ', n_out)

In [None]:
# Show batch
# Visual check of up to 16 training samples with their labels, after the item transforms.
dls.train.show_batch(max_n=16, figsize=(15,12))

In [None]:
# Show transforms
# `unique=True` asks fastai to repeat a single sample, so the grid shows the
# variation produced by the random augmentations rather than different images.
dls.train.show_batch(max_n=16, unique=True, figsize=(15,12))

In [None]:
# Loss weights
# Class weights are only computed when enabled in the hyperparameters;
# otherwise the loss is built without weighting (weight=None).
if hyperparameters['USE_LOSS_WEIGHTS']:
    loss_weights = get_weights(dls)
else:
    loss_weights = None

In [None]:
from DLOlympus.training.metrics import AccuracyMetric, F1ScoreMetric, MSEMetric
from DLOlympus.training.multitask import create_loss_func

# Resolve the loss class by name from the notebook namespace (e.g. a fastai loss
# made available by the star import).
loss_cls = getattr(sys.modules[__name__], hyperparameters['LOSS_FUNC'])

if task_type == 'classification':
    # The model emits the logits of all tasks concatenated along one axis.
    model_n_out = sum(n_classes)
    if n_tasks == 1:
        loss = loss_cls(weight=loss_weights)
        metrics = [AccuracyMetric(), F1ScoreMetric(average='macro')]
        callbacks = [SaveModelCallback(monitor='f1_score', with_opt=True), ShowGraphCallback]
    else:
        # One loss per task, each created with the start/end offsets of that task's
        # slice of the concatenated output and the task index.
        loss_functions = [
            create_loss_func(loss_cls(weight=loss_weights), sum(n_classes[:i]), sum(n_classes[:i+1]), i)
            for i in range(n_tasks)]
        def combined_loss(inp, *args):
            # Total loss is the unweighted sum of the per-task losses.
            return sum(f(inp, *args) for f in loss_functions)
        loss = combined_loss
        acc_functions = [AccuracyMetric(axis=i, metric_name='acc_'+t) for i,t in enumerate(task_names)]
        f1_functions = [F1ScoreMetric(axis=i, metric_name='f1_'+t) for i,t in enumerate(task_names)]
        metrics = acc_functions + f1_functions + [AccuracyMetric(multi=True, metric_name='acc_multi')]
        callbacks = [SaveModelCallback(monitor='acc_multi', with_opt=True), ShowGraphCallback]
elif task_type == 'regression':
    # `n_classes` only exists for classification; use the output count reported
    # by the dataloaders (`n_out`, set in the sanity-check cell) instead.
    model_n_out = n_out
    loss = loss_cls()
    callbacks = [SaveModelCallback(monitor='valid_loss', comp=np.less, with_opt=True), ShowGraphCallback]
    if n_tasks == 1:
        metrics = [MSEMetric(metric_name='rmse', root=True)]
    else:
        metrics = [MSEMetric(metric_name='rmse_'+t, axis=i, root=True) for i,t in enumerate(task_names)]
else:
    # Fail fast instead of hitting a NameError on `loss` further down.
    raise ValueError(f"Unsupported task_type {task_type!r}: expected 'classification' or 'regression'")

# Learner
learn = vision_learner(dls,
                        get_model(hyperparameters),
                        normalize=True,
                        pretrained=True,
                        n_out=model_n_out,
                        loss_func=loss,
                        opt_func=getattr(sys.modules[__name__], hyperparameters['OPT_FUNC']),
                        metrics=metrics,
                        wd=hyperparameters['WD']).to_fp16()

# Fix issue with pickling while calling learn.export
# Only loss objects that wrap an inner `.func` need (and support) this patch;
# the multi-task `combined_loss` is a plain function without that attribute.
import typing, functools
if hasattr(learn.loss_func, 'func'):
    learn.loss_func.func.__annotations__ = typing.get_type_hints(learn.loss_func.func, globalns=globals(), localns=locals())
    functools.update_wrapper(learn.loss_func, learn.loss_func.func)

In [None]:
# Oversampling
if hyperparameters['USE_OVERSAMPLING']:
    # Weight each class by 1/sqrt(frequency), keyed by the label rendered as a string.
    # Built straight from the value_counts Series so the result does not depend on the
    # column name pandas assigns to it ('count' only exists from pandas 2.0 onwards).
    label_counts = learn.dls.items.label.value_counts()
    class_weights = {str(label): weight for label, weight in (1 / np.sqrt(label_counts)).items()}
    # Replace the training loader's index sampler with the oversampling version, bound as a method.
    learn.dls.train.get_idxs = types.MethodType(partial(oversampled_epoch, class_weights=class_weights), learn.dls.train)

# Training

In [None]:
# Find LR
# Runs fastai's learning-rate finder; use the resulting plot to choose the value set in the next cell.
learn.lr_find()

In [None]:
# Set LR
# Template placeholder: fill in the learning rate chosen from the finder above.
# Stored in `hyperparameters` so it is used for training and logged with the other settings.
hyperparameters['LR'] = 

In [None]:
# Train
# Fine-tune the pretrained model for EPOCHS epochs at the chosen base LR;
# `callbacks` (best-model checkpointing + live loss graph) were built with the learner.
learn.fine_tune(hyperparameters['EPOCHS'], base_lr=hyperparameters['LR'], cbs=callbacks)

# Results and logs

In [None]:
# Persist the trained learner: an exported pickle (serialised with dill) plus a regular checkpoint.
# NOTE(review): dill is presumably needed because the DataBlock getters are lambdas -- confirm.
# NOTE(review): `save_path` already ends with '/', so these two paths contain a double slash (harmless on POSIX).
import dill
learn.export(f'{save_path}/model.pkl', pickle_module=dill)
learn.save(f'{save_path}/model')

# Loss / metric curves are plotted first; see the warning on the get_preds line below.
from DLOlympus.training.plots import plot_losses, plot_metrics
_ = plot_losses(learn, save_path)
_ = plot_metrics(learn, save_path)

if task_type == 'classification':
	import itertools
	from DLOlympus.training.plots import plot_confusion_matrix
	# Predictions on dataset index 1 (the validation set in fastai's convention).
	probs, ground_truths = learn.get_preds(ds_idx=1)        # DO NOT PREDICT BEFORE PLOTTING LOSSES AND METRICS
	# Normalise to a list with one ground-truth tensor per task.
	ground_truths = ground_truths if n_tasks>1 else [ground_truths]
	# For each task, take the argmax over that task's slice of the concatenated outputs.
	predictions = [np.argmax(probs[:,sum(n_classes[:i]):sum(n_classes[:i+1])], axis=1) for i in range(n_tasks)]
	# Decode per-sample index tuples into space-joined class names (one name per task).
	decoded_preds = [' '.join([classes[i][p] for i, p in enumerate(tensor(g))]) for g in zip(*predictions)]
	decoded_gts = [' '.join([classes[i][p] for i, p in enumerate(tensor(g))]) for g in zip(*ground_truths)]
	# Confusion-matrix vocabulary: every combination of class names across tasks.
	new_vocab = [' '.join(i) for i in list(itertools.product(*classes))]
	_ = plot_confusion_matrix(decoded_gts, decoded_preds, new_vocab, save_path)

# Per-sample prediction tables are only produced for single-task runs.
if n_tasks==1:
    from DLOlympus.training.tables import get_predictions_table
    train_table = get_predictions_table(learn, learn.dls.train)
    valid_table = get_predictions_table(learn, learn.dls.valid)
    train_table.to_csv(f'{save_path}train_table.csv', index=False)
    valid_table.to_csv(f'{save_path}valid_table.csv', index=False)

# Final metrics (without test-time augmentation), later passed to `mlflow_log`.
from DLOlympus.training.utils import get_metrics
results = get_metrics(learn, with_tta=False)

In [None]:
# Replace the transform list with a short marker when the textual list of
# transform class names exceeds 250 characters.
# NOTE(review): 250 is presumably a length limit on logged parameter values -- confirm.
transform_names = [type(t).__name__ for t in hyperparameters['TRANSFORMS']]
names_repr = str(transform_names)
if len(names_repr) > 250:
    hyperparameters['TRANSFORMS'] = 'Too many transforms to log'

In [None]:
from DLOlympus.training.mlflow import mlflow_log

# Log the run to the MLflow tracking server configured in the setup cell:
# the output folder, the hyperparameters, the final metrics and the experiment name.
mlflow_log(save_path, hyperparameters, results, experiment)