# **Fine-tuning a text-to-video model**
Please refer to the respective sections in the book for further details.

## **Step 1. Installing Libraries**

In [None]:
# Mount Google Drive so that datasets, cloned repositories and trained weights
# persist between Colab sessions.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the Drive root the working directory so relative paths used by the
# following shell commands resolve inside Google Drive.
%cd /content/drive/MyDrive

/content/drive/MyDrive


In [None]:
!pip install opencv-python-headless transformers
!pip install -U -q autotrain-advanced
!apt-get install git-lfs
!git lfs install
!git clone https://github.com/hotshotco/Hotshot-XL.git
!git clone https://huggingface.co/hotshotco/SDXL-512

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.4/259.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.4/150.4 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.6/174.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m92

In [None]:
cd /content/drive/MyDrive/Hotshot-XL/Hotshot-XL
!pip install -r requirements.txt

In [None]:
import os
from getpass import getpass

import cv2
from PIL import Image

## **Step 2. Data loading & pre-processing**

In [None]:
# Destination for the extracted training frames; created on first run and
# reused afterwards.
output_folder_path = "/content/drive/MyDrive/fine_tunedataset"
os.makedirs(output_folder_path, exist_ok=True)

# Folder containing the source video clips.
video_folder_path = "/content/drive/MyDrive/sampe_videos_for_fine_tuning_384_256"

def extract_first_frame(video_path, output_image_path, size=(512, 512)):
    """Save the first frame of a video as a resized image.

    Args:
        video_path: Path of the video file to read.
        output_image_path: Path the resized frame is written to.
        size: ``(width, height)`` handed to ``cv2.resize``. The default
            ``(512, 512)`` stretches non-square frames to a square.

    Returns:
        ``True`` when a frame was read and written, ``False`` when the video
        could not be opened, yielded no frame, or the image could not be saved.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return False
        ret, frame = cap.read()
        if not ret:
            return False
        frame_resized = cv2.resize(frame, size)
        # imwrite reports failure through its return value rather than raising.
        return bool(cv2.imwrite(output_image_path, frame_resized))
    finally:
        # Release the capture handle on every path, including early returns.
        cap.release()

# File extensions treated as videos; any other entry in the folder is ignored
# so that stray files or sub-folders do not consume a frame number.
VIDEO_EXTENSIONS = (".mp4", ".mov", ".avi", ".mkv", ".webm")

video_files = sorted(
    name
    for name in os.listdir(video_folder_path)
    if name.lower().endswith(VIDEO_EXTENSIONS)
    and os.path.isfile(os.path.join(video_folder_path, name))
)
for index, video in enumerate(video_files):
    video_path = os.path.join(video_folder_path, video)
    frame_name = f"frame_{index + 1:03d}.jpg"
    output_image_path = os.path.join(output_folder_path, frame_name)
    status = extract_first_frame(video_path, output_image_path)
    # Only an explicit False counts as failure, so an extractor that returns
    # no status at all is still reported as before.
    if status is False:
        print(f"Could not extract a frame from {video}; skipped")
        continue
    print(f"Extracted first frame from {video} as {frame_name}")

Extracted first frame from processed_VID-20240214-WA0007.mp4 as frame_001.jpg
Extracted first frame from processed_VID-20240214-WA0008.mp4 as frame_002.jpg
Extracted first frame from processed_VID-20240214-WA0009.mp4 as frame_003.jpg
Extracted first frame from processed_VID-20240214-WA0010.mp4 as frame_004.jpg
Extracted first frame from processed_VID-20240214-WA0011.mp4 as frame_005.jpg
Extracted first frame from processed_VID-20240214-WA0012.mp4 as frame_006.jpg
Extracted first frame from processed_VID-20240214-WA0013.mp4 as frame_007.jpg
Extracted first frame from processed_VID-20240214-WA0014.mp4 as frame_008.jpg
Extracted first frame from processed_VID-20240214-WA0015.mp4 as frame_009.jpg
Extracted first frame from processed_VID-20240214-WA0016.mp4 as frame_010.jpg
Extracted first frame from processed_VID-20240214-WA0017.mp4 as frame_011.jpg
Extracted first frame from processed_VID-20240214-WA0018.mp4 as frame_012.jpg
Extracted first frame from processed_VID-20240214-WA0019.mp4 as 

## **Step 3. Model training (fine-tuning)**

In [None]:
# --- Project / model settings ---
project_name = 'sdxlfinetuned'
model_name = 'stabilityai/stable-diffusion-xl-base-1.0'
prompt = 'colorful flowers in pot with green background'
# Boolean like the other flags below; str(True) yields the "True" that the
# training command compares against.
push_to_hub = True

# Never hardcode a Hugging Face token in a notebook: it leaks through version
# control and shared copies. Any token that was ever pasted into a notebook
# should be revoked. The token is read from the HF_TOKEN environment variable
# when present, otherwise requested interactively without echoing it.
hf_token = os.environ.get("HF_TOKEN", "")
if push_to_hub and not hf_token:
    hf_token = getpass("Hugging Face write token: ")

# --- Training hyperparameters ---
learning_rate = 1e-4
num_steps = 500
batch_size = 1
gradient_accumulation = 4
resolution = 512
use_8bit_adam = False
use_xformers = False
mixed_precision = "fp16"
train_text_encoder = False
disable_gradient_checkpointing = False

# Export everything as environment variables so the shell command in the
# training cell can read them. Environment values must be strings.
os.environ.update({
    "PROJECT_NAME": project_name,
    "MODEL_NAME": model_name,
    "PROMPT": prompt,
    "PUSH_TO_HUB": str(push_to_hub),
    "HF_TOKEN": hf_token,
    "LEARNING_RATE": str(learning_rate),
    "NUM_STEPS": str(num_steps),
    "BATCH_SIZE": str(batch_size),
    "GRADIENT_ACCUMULATION": str(gradient_accumulation),
    "RESOLUTION": str(resolution),
    "USE_8BIT_ADAM": str(use_8bit_adam),
    "USE_XFORMERS": str(use_xformers),
    "MIXED_PRECISION": str(mixed_precision),
    "TRAIN_TEXT_ENCODER": str(train_text_encoder),
    "DISABLE_GRADIENT_CHECKPOINTING": str(disable_gradient_checkpointing),
})

In [None]:
!autotrain dreambooth \
--model ${MODEL_NAME} \
--project-name ${PROJECT_NAME} \
--image-path fine_tunedataset/ \
--prompt "${PROMPT}" \
--resolution ${RESOLUTION} \
--batch-size ${BATCH_SIZE} \
--num-steps ${NUM_STEPS} \
--gradient_accumulation ${GRADIENT_ACCUMULATION} \
--lr ${LEARNING_RATE} \
--mixed-precision ${MIXED_PRECISION} \
$( [[ "$USE_XFORMERS" == "True" ]] && echo "--xformers" ) \
$( [[ "$TRAIN_TEXT_ENCODER" == "True" ]] && echo "--train-text-encoder" ) \
$( [[ "$USE_8BIT_ADAM" == "True" ]] && echo "--use-8bit-adam" ) \
$( [[ "$DISABLE_GRADIENT_CHECKPOINTING" == "True" ]] && echo "--disable_gradient-checkpointing" ) \
$( [[ \"$PUSH_TO_HUB\" == True ]] && echo \" --push-to-hub --token ${HF_TOKEN}\" )

[1mINFO    [0m | [32m2024-04-24 23:51:54[0m | [36mautotrain.cli.run_dreambooth[0m:[36mrun[0m:[36m388[0m - [1mRunning DreamBooth Training[0m
[1mINFO    [0m | [32m2024-04-24 23:51:54[0m | [36mautotrain.preprocessor.dreambooth[0m:[36m_save_concept_images[0m:[36m62[0m - [1mSaving concept images[0m
[1mINFO    [0m | [32m2024-04-24 23:51:54[0m | [36mautotrain.preprocessor.dreambooth[0m:[36m_save_concept_images[0m:[36m63[0m - [1mfine_tunedataset/frame_001.jpg[0m
[1mINFO    [0m | [32m2024-04-24 23:51:54[0m | [36mautotrain.preprocessor.dreambooth[0m:[36m_save_concept_images[0m:[36m62[0m - [1mSaving concept images[0m
[1mINFO    [0m | [32m2024-04-24 23:51:54[0m | [36mautotrain.preprocessor.dreambooth[0m:[36m_save_concept_images[0m:[36m63[0m - [1mfine_tunedataset/frame_002.jpg[0m
[1mINFO    [0m | [32m2024-04-24 23:51:54[0m | [36mautotrain.preprocessor.dreambooth[0m:[36m_save_concept_images[0m:[36m62[0m - [1mSaving concept images

## **Step 4. Model Inference**

In [None]:
!python inference.py \
  --prompt="a close up of a pink flower in a vase against a yellow background" \
  --output="fine_tune_output.gif" \
  --spatial_unet_base="/content/drive/MyDrive/Hotshot-XL/Hotshot-XL/SDXL-512/unet" \
  --lora="/content/drive/MyDrive/Hotshot-XL/Hotshot-XL/sdxlfinetuned/pytorch_lora_weights.safetensors"

Downloading unet/config.json: 100% 1.73k/1.73k [00:00<00:00, 8.82MB/s]
Downloading (…)ch_model.safetensors: 100% 11.2G/11.2G [03:45<00:00, 49.8MB/s]
The config attributes {'attention_type': 'default', 'dropout': 0.0} were passed to UNet3DConditionModel, but are not expected and will be ignored. Please verify your config.json configuration file.
Downloading model_index.json: 100% 579/579 [00:00<00:00, 3.43MB/s]
Fetching 16 files:   0% 0/16 [00:00<?, ?it/s]
Downloading (…)_encoder/config.json: 100% 613/613 [00:00<00:00, 4.39MB/s]

Downloading (…)okenizer_config.json: 100% 737/737 [00:00<00:00, 3.42MB/s]

Downloading (…)cial_tokens_map.json: 100% 472/472 [00:00<00:00, 3.37MB/s]

Downloading (…)ncoder_2/config.json: 100% 623/623 [00:00<00:00, 3.48MB/s]

Downloading tokenizer/merges.txt:   0% 0.00/525k [00:00<?, ?B/s][A

Downloading tokenizer/vocab.json:   0% 0.00/1.06M [00:00<?, ?B/s][A[A


Downloading (…)cial_tokens_map.json: 100% 460/460 [00:00<00:00, 2.78MB/s]



Downloading (…)chedu