# Execution notebook
This notebook serves as a walk-through of the code to execute training of the ViT keypoint tracker

In [1]:
from utils.train_test_split import test_train_split
import utils.data_augmentation

In [2]:
#Prepare paths

root = "/Users/annastuckert/Documents/GitHub/ViT_facemap/ViT-pytorch"


## Data Preprocessing

### Test-train split (incl. dropping NAs)

In [3]:
# Define paths
csv_path = f"{root}/data/facemap/CollectedData_AVS.csv"
dest_folder = f"{root}/data/facemap/data_No_NaN"
source_folder = f"{root}/data/facemap"


# Call the function to process data
test_train_split(csv_path, source_folder, dest_folder)

Invalid image name: nan
Invalid image name: nan
Invalid image name: nan
Invalid image name: nan


(           scorer   Unnamed: 1   Unnamed: 2                AVS  \
 3    labeled-data  cam1_G7c1_1  img0068.png  589.0232336327339   
 43   labeled-data  cam1_G7c1_1  img2997.png  590.5093017792906   
 48   labeled-data  cam1_G7c1_1  img3575.png  589.0232336327339   
 68   labeled-data  cam1_G7c1_1  img5379.png  581.5928928999506   
 79   labeled-data  cam1_G7c1_1  img6392.png  586.0510973396206   
 89   labeled-data  cam1_G7c1_1  img7197.png  583.0789610465073   
 105  labeled-data  cam1_G7c1_1  img8629.png  587.5371654861773   
 
                  AVS.1               AVS.2               AVS.3  \
 3    67.63932048311811  502.83128113244777  106.27709229359114   
 43   67.63932048311811  507.28948557211777  106.27709229359114   
 48   64.66718419000479  507.28948557211777  101.81888785392117   
 68   64.66718419000479   498.3730766927779  122.62384190571437   
 79   67.63932048311811  507.28948557211777  109.24922858670446   
 89   67.63932048311811   504.3173492790045  104.79102414703

## Data augmentation

In [4]:
from torchvision import transforms, utils

rotate_rescale = transforms.Compose([Rotate(20), ZeroPadHeight(846), Rescale(224)])

flip_rescale = transforms.Compose([HorizontalFlip(), ZeroPadHeight(846), Rescale(224)])

pad_rescale = transforms.Compose(
    [
        ZeroPadHeight(846),  # Set the desired height
        Rescale(224),
    ]
)

rotate_flip_rescale = transforms.Compose(
    [HorizontalFlip(), Rotate(20), ZeroPadHeight(846), Rescale(224)]
)

blur = transforms.Compose([GaussianBlur(), ZeroPadHeight(846), Rescale(224)])

NameError: name 'Rotate' is not defined

In [None]:
# Example usage
transforms_list = [rotate_rescale, flip_rescale, pad_rescale, rotate_flip_rescale, blur]
face_dataset = AugmentedFaceDataset(
    csv_file="f{dest_folder}/train/train_data.csv",
    root_dir="f{dest_folder}/train/",
    output_dir="f{dest_folder}/train/augmented_data/",
)
# face_dataset = AugmentedFaceDataset(csv_file='data/facemap/LabeledData/Test/CollectedDataTest.csv', root_dir='data/facemap/LabeledData/Test/', output_dir='augmented_data_test/')
face_dataset.apply_transforms_and_save(transforms_list)

NameError: name 'rotate_rescale' is not defined

# Running Training

In order to train the ViT, the following sections are run. Subprocess is used in order to run train.py from within a python script.

For reference, the following arguments are to be specified for the training model.

"--name", default="test" 
--> "Name of this run. Used for monitoring."

"--dataset", default="facemap" 
--> "Which downstream task and dataset to use"

"--model_type", choices=["ViT-B_16", "ViT-B_32", "ViT-L_16", "ViT-L_32", "ViT-H_14", "R50-ViT-B_16"], default="ViT-B_16"
-->help="Which variant to use."

"--pretrained_dir", type=str, default="ViT-B_16.npz"
--> "Where to search for pretrained ViT models. If not modified, will search in the directory where .ipynb project execution file is placed."

"--output_dir", default="output", type=str
-->"The output directory where checkpoints will be written."

"--img_size", default=224, type=int
--> ="Resolution size for image"

"--train_batch_size", default=20, type=int
--> "Batch size for training."

"--eval_batch_size", default=20, type=int
h--> "Total batch size for eval."

"--eval_every", default=100, type=int,
--> "Run prediction on validation set every so many steps. Will always run one evaluation at the end of training."

"--learning_rate", default=2e-4, type=float,
--> "The initial learning rate for the optimizer."

"--weight_decay", default=1e-2, type=float,
--> "Weight deay if we apply some."

"--num_steps", default=3000, type=int,
--> "Total number of training epochs to perform."

"--decay_type", choices=["cosine", "linear"], default="linear", #changed from cosine as I believe this is what Yichen did
--> "How to decay the learning rate."

"--warmup_steps", default=500, type=int,
--> "Step of training to perform learning rate warmup for."

"--max_grad_norm", default=1.0, type=float,
--> "Max gradient norm."

"--local_rank", type=int, default=-1,
--> "local_rank for distributed training on gpus" - I think this might be if you have more than one GPU available, you can distribute training. Or if one GPU has more than one core

'--seed', type=int, default=42,
--> "random seed for initialization"

'--gradient_accumulation_steps', type=int, default=1, # tried adjusting this from 1 to 25 to match Yichen
--> "Number of updates steps to accumulate before performing a backward/update pass."

('--fp16', action='store_true',
--> "Whether to use 16-bit float precision instead of 32-bit")

'--fp16_opt_level', type=str, default='O2',
-->"For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")

'--loss_scale', type=float, default=0,
-->"Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True. 0 (default value): dynamic loss scaling. Positive power of 2: static loss scaling value.


In [1]:
import subprocess

# Define the command to run the script with arguments
command = [
    "python", "train.py",
    "--name", "experiment_20240813",
    "--dataset", "facemap",
    "--model_type", "ViT-B_16",
    "--pretrained_dir", "ViT-B_16.npz",
    "--output_dir", "model_checkpoints",  # Added missing comma here
    "--train_batch_size", str(20),
    "--eval_batch_size", str(20),
    "--eval_every", str(5), 
    "--num_steps", str(5),
]

# Run the script
result = subprocess.run(command, capture_output=True, text=True)

# Print the output and errors (if any)
print("Output:", result.stdout)
print("Errors:", result.stderr)


Output: load_pretrained: grid-size from 24 to 14
85.817112
Input tensor size: torch.Size([20, 197, 768])
Input tensor size: torch.Size([20, 197, 768])
Input tensor size: torch.Size([20, 197, 768])
Input tensor size: torch.Size([20, 197, 768])
Input tensor size: torch.Size([20, 197, 768])
Input tensor size: torch.Size([20, 197, 768])
Input tensor size: torch.Size([20, 197, 768])
Input tensor size: torch.Size([20, 197, 768])
Input tensor size: torch.Size([20, 197, 768])
Input tensor size: torch.Size([20, 197, 768])
Input tensor size: torch.Size([20, 197, 768])
Input tensor size: torch.Size([20, 197, 768])
Data: {'steps': [0, 1, 2, 3, 4, 5, 5], 'metric': ['training_loss', 'training_loss', 'training_loss', 'training_loss', 'training_loss', 'validation_loss', 'validation_acc'], 'training_loss': [10727.318359375, 10490.2548828125, 10947.5283203125, 10596.41015625, 11004.0400390625, 10698.757358871764, 0.0], 'validation_loss': [None, None, None, None, None, None, None], 'validation_acc': [Non