# Execution notebook
This notebook walks through the full workflow for the ViT keypoint tracker: data preprocessing, data augmentation, model training and video inference.

In [None]:
from utils.train_test_split import test_train_split
from utils.data_utils import FaceLandmarksDataset

In [None]:
# Prepare paths
import os

# Repository root from which the data paths in this notebook are derived.
# Set the VIT_FACEMAP_ROOT environment variable to run the notebook from a
# different checkout; when it is unset the original location is used.
root = os.environ.get(
    "VIT_FACEMAP_ROOT",
    "/Users/annastuckert/Documents/GitHub/ViT_facemap/ViT-pytorch",
)


## Data Preprocessing

### Test-train split (incl. dropping NAs)

In [None]:
# Everything used by the split lives under the facemap data directory
facemap_dir = f"{root}/data/facemap"

source_folder = facemap_dir
csv_path = f"{facemap_dir}/CollectedData_AVS.csv"
dest_folder = f"{facemap_dir}/data_No_NaN"

# Run the test/train split with the paths defined above
test_train_split(csv_path, source_folder, dest_folder)

## Data augmentation

Arguments:

- `rotation`: how much rotation (in degrees) should be applied to the image
- `img_height`: input image height (consider deriving this automatically from the metadata files if it is not expected to be uniform)
- `img_size`: size in pixels (ViT expects 224)

In [None]:
from utils.Dataaugmentation import AugmentedFaceDataset

# Names of the transformations to apply
transforms_list = [
    "rotate_rescale",
    "flip_rescale",
    "pad_rescale",
    "rotate_flip_rescale",
    "blur",
]

# Parameters for each transformation, keyed by the names in transforms_list.
# NOTE(review): a later cell reassigns transform_params to a single flat dict;
# confirm which of the two layouts AugmentedFaceDataset expects.
transform_params = {
    "rotate_rescale": {"rotation": 30, "img_height": 256, "img_size": (224, 224)},
    "flip_rescale": {"img_height": 256, "img_size": (224, 224)},
    "pad_rescale": {"img_height": 300, "img_size": (224, 224)},
    "rotate_flip_rescale": {
        "rotation": 30,
        "img_height": 256,
        "img_size": (224, 224),
    },
    "blur": {"img_height": 256, "img_size": (224, 224)},
}

# This cell only defines the augmentation configuration. It used to end with
# `face_dataset.apply_transforms_and_save()`, but `face_dataset` is not created
# until a later cell, so the call raised NameError on a fresh kernel
# (Restart Kernel -> Run All). The later cell builds the dataset and applies
# the transformations.



# # Example usage
# transforms_list = [rotate_rescale, flip_rescale, pad_rescale, rotate_flip_rescale, blur]
# face_dataset = AugmentedFaceDataset(
#     csv_file=f"{dest_folder}/train/train_data.csv",
#     root_dir=f"{dest_folder}/train/",
#     output_dir=f"{dest_folder}/train/augmented_data/",
# )
# # face_dataset = AugmentedFaceDataset(csv_file='data/facemap/LabeledData/Test/CollectedDataTest.csv', root_dir='data/facemap/LabeledData/Test/', output_dir='augmented_data_test/')
# face_dataset.apply_transforms_and_save(transforms_list)

In [2]:


# Import necessary components from the Dataaugmentation module
from utils.Dataaugmentation import AugmentedFaceDataset, rotate_rescale, flip_rescale, pad_rescale, rotate_flip_rescale, blur

# Define the transformation list and parameters
transforms_list = [
    "rotate_rescale",
    "flip_rescale",
    "pad_rescale",
    "rotate_flip_rescale",
    "blur",
]
# A single parameter set that is passed to the dataset alongside transforms_list
transform_params = {
    "rotation": 10,
    "img_height": 256,
    "img_size": (224, 224),
}

# Locations of the training split. They are derived from `dest_folder` (set in
# the "Test-train split" cell) instead of repeating the same absolute path
# three times, so the notebook only has one place where the checkout location
# is configured.
train_dir = f"{dest_folder}/train"

# Set up dataset
face_dataset = AugmentedFaceDataset(
    csv_file=f"{train_dir}/train_data.csv",
    root_dir=train_dir,
    output_dir=f"{train_dir}/augmented_data/",
    output_size=(224, 224),
    transform_list=transforms_list,
    transform_params=transform_params,
)

# Apply transformations and save augmented data
face_dataset.apply_transforms_and_save()

# Print output directory to confirm
print("Augmented data saved to:", face_dataset.output_dir)


Augmented data saved to: /Users/annastuckert/Documents/GitHub/ViT_facemap/ViT-pytorch/data/facemap/data_No_NaN/train/augmented_data/


# Running Training

In order to train the ViT, the following sections are run. `subprocess` is used to run `train.py` from within this notebook.

For reference, the following arguments are to be specified for the training model.

"--name", default="test" 
--> "Name of this run. Used for monitoring."

"--dataset", default="facemap" 
--> "Which downstream task and dataset to use"

"--model_type", choices=["ViT-B_16", "ViT-B_32", "ViT-L_16", "ViT-L_32", "ViT-H_14", "R50-ViT-B_16"], default="ViT-B_16"
--> "Which variant to use."

"--pretrained_dir", type=str, default="ViT-B_16.npz"
--> "Where to search for pretrained ViT models. If not modified, will search in the directory where .ipynb project execution file is placed."

"--output_dir", default="output", type=str
-->"The output directory where checkpoints will be written."

"--img_size", default=224, type=int
--> "Resolution size for image"

"--train_batch_size", default=20, type=int
--> "Batch size for training."

"--eval_batch_size", default=20, type=int
--> "Total batch size for eval."

"--eval_every", default=100, type=int,
--> "Run prediction on validation set every so many steps. Will always run one evaluation at the end of training."

"--learning_rate", default=2e-4, type=float,
--> "The initial learning rate for the optimizer."

"--weight_decay", default=1e-2, type=float,
--> "Weight decay if we apply some."

"--num_steps", default=3000, type=int,
--> "Total number of training epochs to perform."

"--decay_type", choices=["cosine", "linear"], default="linear", #changed from cosine as I believe this is what Yichen did
--> "How to decay the learning rate."

"--warmup_steps", default=500, type=int,
--> "Step of training to perform learning rate warmup for."

"--max_grad_norm", default=1.0, type=float,
--> "Max gradient norm."

"--local_rank", type=int, default=-1,
--> "local_rank for distributed training on gpus" - relevant when training is distributed across more than one GPU (typically one process per GPU, each with its own rank). The default of -1 presumably means distributed training is not used; confirm in train.py.

'--seed', type=int, default=42,
--> "random seed for initialization"

'--gradient_accumulation_steps', type=int, default=1, # tried adjusting this from 1 to 25 to match Yichen
--> "Number of updates steps to accumulate before performing a backward/update pass."

'--fp16', action='store_true',
--> "Whether to use 16-bit float precision instead of 32-bit"

'--fp16_opt_level', type=str, default='O2',
--> "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details at https://nvidia.github.io/apex/amp.html"

'--loss_scale', type=float, default=0,
--> "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True. 0 (default value): dynamic loss scaling. Positive power of 2: static loss scaling value."


In [None]:
import subprocess

import sys

# Define the command to run the script with arguments.
# sys.executable is used instead of a bare "python" so that train.py runs with
# the same interpreter (and therefore the same installed packages) as this
# notebook's kernel, rather than whichever "python" happens to be first on PATH.
command = [
    sys.executable, "train.py",
    "--name", "experiment_20240825",
    "--dataset", "facemap",
    "--model_type", "ViT-B_16",
    "--pretrained_dir", "ViT-B_16.npz",
    "--output_dir", "model_checkpoints",
    "--train_batch_size", str(20),
    "--eval_batch_size", str(20),
    "--eval_every", str(5),
    "--num_steps", str(5),
]

# Run the script, capturing stdout and stderr as text
result = subprocess.run(command, capture_output=True, text=True)

# Show the exit status first so that a failed run is not mistaken for a
# successful one, then print the output and errors (if any)
print("Return code:", result.returncode)
print("Output:", result.stdout)
print("Errors:", result.stderr)


In [None]:
import subprocess

import sys

# Define the command to run the script with arguments.
# sys.executable is used instead of a bare "python" so that train_epochs.py
# runs with the same interpreter (and therefore the same installed packages)
# as this notebook's kernel, rather than whichever "python" is first on PATH.
command = [
    sys.executable, "train_epochs.py",
    "--name", "experiment_20240825",
    "--dataset", "facemap",
    "--model_type", "ViT-B_16",
    "--pretrained_dir", "ViT-B_16.npz",
    "--output_dir", "model_checkpoints",
    "--train_batch_size", str(20),
    "--eval_batch_size", str(20),
    "--eval_every", str(2),
    "--num_epochs", str(2),  # this script is given --num_epochs rather than --num_steps
]

# Run the script, capturing stdout and stderr as text
result = subprocess.run(command, capture_output=True, text=True)

# Show the exit status first so that a failed run is not mistaken for a
# successful one, then print the output and errors (if any)
print("Return code:", result.returncode)
print("Output:", result.stdout)
print("Errors:", result.stderr)


# Video inference

In [None]:
# Create 10 second video for testing purpose

import cv2

def create_one_second_video(input_video_path, output_video_path, duration_seconds=10):
    """Copy the first ``duration_seconds`` seconds of a video into a new file.

    The function name is kept for backward compatibility; by default the clip
    that is written is 10 seconds long, not one second.

    Args:
        input_video_path: Path of the video to read.
        output_video_path: Path of the clip to write. Frames are encoded with
            the 'mp4v' codec at the frame rate and frame size of the input.
        duration_seconds: Length of the clip in seconds. Defaults to 10.

    Returns:
        None. Problems (unreadable input, unknown frame rate, output that
        cannot be opened) are reported with a printed message and nothing is
        written.
    """
    if duration_seconds <= 0:
        print("Error: duration_seconds must be positive.")
        return

    # Open the input video file
    cap = cv2.VideoCapture(input_video_path)
    out = None
    try:
        # Check if the video opened successfully
        if not cap.isOpened():
            print("Error: Could not open video file.")
            return

        # Get the video's frames per second (fps) and size information
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Without a usable frame rate the clip length cannot be computed
        if fps <= 0:
            print("Error: Could not determine the frame rate of the video.")
            return

        # Multiply before converting to an integer so fractional frame rates
        # (e.g. 29.97) are not truncated first, which would shorten the clip
        frames_to_extract = int(round(fps * duration_seconds))

        # Define the codec and create a VideoWriter object to save the output
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
        if not out.isOpened():
            print("Error: Could not open output video file for writing.")
            return

        # Read and write frames until enough were copied or the input ends
        frame_count = 0
        while cap.isOpened() and frame_count < frames_to_extract:
            ret, frame = cap.read()
            if not ret:
                break
            out.write(frame)
            frame_count += 1
    finally:
        # Release resources even if reading or writing raised an exception
        cap.release()
        if out is not None:
            out.release()

    if frame_count < frames_to_extract:
        print(f"Warning: input ended after {frame_count} of {frames_to_extract} frames.")
    print(f"{duration_seconds}-second video saved to {output_video_path}")

# Example usage: cut a short test clip from one of the recorded videos.
# The locations are derived from `root` (set in the "Prepare paths" cell)
# instead of hard-coding the absolute checkout path again.
videos_dir = f"{root}/Facemap_videos"
input_video_path = f"{videos_dir}/cam1_G7c1_1.avi"
video_path = f"{videos_dir}/cam1_G7c1_1_10seconds.avi"
create_one_second_video(input_video_path, video_path)

In [None]:
# Import necessary functions and libraries
import torch
from utils.video_inference import load_model, run_inference_on_video, overlay_keypoints_on_video_and_save_csv

import os

# Define paths and configuration.
# Project files are located relative to `root` (set in the "Prepare paths"
# cell) instead of repeating the absolute checkout path.
video_path = f"{root}/Facemap_videos/cam1_G7c1_1_10seconds.avi"  # Path to your input video
# NOTE(review): the training cells use --name experiment_20240825; confirm that
# this checkpoint file name matches the run you want to load.
checkpoint_path = f"{root}/model_checkpoints/test_checkpoint.pth"  # Path to your model checkpoint file
output_video_path = 'output/keypoints.mp4'  # Path to save the output video
output_csv_path = 'output/keypoints.csv'  # Path to save the keypoints CSV file
config_name = 'ViT-B_16'  # Use the appropriate configuration name for your model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Select device

# Make sure the output folder(s) exist before anything is written into them;
# the paths above are relative, so the folder may be missing on a fresh checkout
for _output_path in (output_video_path, output_csv_path):
    os.makedirs(os.path.dirname(_output_path) or ".", exist_ok=True)

# Load the model
model = load_model(checkpoint_path, config_name, device)

# Run inference on the video to get predicted keypoints
keypoints_list = run_inference_on_video(video_path, model, device)

# Overlay the predicted keypoints on the video frames and save the output
overlay_keypoints_on_video_and_save_csv(video_path, keypoints_list, output_video_path, output_csv_path)

# Output paths and check files
print(f"Output video saved to: {output_video_path}")
print(f"Output CSV saved to: {output_csv_path}")
