In [None]:
import os
from src.video_preprocessing.download_videos.youtube_download import preprocess_video
from src.video_preprocessing.scene_detection.scene_detect import detect_scenes
from src.video_preprocessing.scene_detection.ocr import extract_text_from_slide
from src.video_preprocessing.download_videos.download_utils import (
    transcribe_audio_files,
    extract_and_store_audio,
)
from src.ocr.pytesseract_image_to_text import extract_text_from_image

In [None]:
# Configuration for downloading a video from YouTube.

# INSERT video name here. The download cell also uses it to build the local
# media file names (<name>.mp3 / <name>.mp4).
name = "biology_chapter_3_3"
# INSERT video URL here
url = "https://youtu.be/DZSEErNZ1d4?si=f6YxKQ9rP6iqgTfk"
# INSERT chunk length in seconds (e.g. 30 for 30-second chunks); use None to
# disable splitting.
chunks = None


# Download options passed to preprocess_video as aud_opts / vid_opts.
# NOTE(review): these look like yt-dlp / youtube-dl option dicts. If they are
# forwarded unchanged, the Python API key is presumably "keepvideo" (the CLI flag
# is --keep-video), so "keep-video" may be silently ignored — confirm against
# preprocess_video.
opts_aud = {"format": "mp3/bestaudio/best", "keep-video": True}
opts_vid = {"format": "mp4/bestvideo/best"}

In [None]:
# Download the video, create the relevant data folders and transcribe the video.
# The returned value is kept as data_path; later cells build every per-video
# sub-folder path from it.
data_path = preprocess_video(
    # What to fetch
    url=url,
    name=name,
    download=True,
    uploaded_vid="ignore",  # path to local file
    # How to fetch it
    aud_opts=opts_aud,
    vid_opts=opts_vid,  # Video download settings
    # File names derived from the video name
    input_file=f"{name}.mp4",
    audio_file=f"{name}.mp3",
    output="output.mp4",
    # Optional chunking (None = no splitting)
    split_length=chunks,
)

In [None]:
# Now that the video has been downloaded, run scene detection on it.
detect_scenes(data_path)

In [None]:
# Extract the audio per detected scene.
# NOTE(review): presumably the first argument is the folder of scene video
# snippets (input) and the second is where the audio files are written — confirm
# against extract_and_store_audio.
extract_and_store_audio(
    os.path.join(data_path, "scene_snippets"),
    os.path.join(data_path, "audio_chunks"),
)

In [None]:
# Transcribe the individual audio snippets:
audio_dir = os.path.join(data_path, "audio_chunks")
transcriptions_dir = os.path.join(data_path, "transcriptions")

model_type = "tiny"  # change to 'large' if you want more accurate results,
# change to 'medium.en' or 'large.en' for all english language tasks,
# and change to 'small' or 'base' for faster inference
# NOTE(review): 'tiny' is presumably already the smallest/fastest Whisper model,
# so 'small'/'base' would be slower rather than faster, and upstream Whisper does
# not appear to ship a 'large.en' checkpoint — confirm against the Whisper model
# list before relying on the hints above.
lang = "en"

# Run whisper on all .wav files in audio_dir
transcribe_audio_files(audio_dir, transcriptions_dir, model_type=model_type, lang=lang)

In [None]:
import os

import glob

# Derive the keyframe folder from data_path instead of hard-coding a
# machine-specific absolute path, so the notebook runs on any checkout.
# NOTE(review): assumes the keyframes live in <data_path>/extracted_keyframes,
# i.e. that data_path is the per-video folder the old absolute path pointed to —
# confirm against detect_scenes.
directory = os.path.join(data_path, "extracted_keyframes")
file_pattern = "*.jpg"

# glob returns files in arbitrary order; sort so the output follows scene order
# and is reproducible between runs.
for file_path in sorted(glob.glob(os.path.join(directory, file_pattern))):
    text = extract_text_from_image(file_path)
    print(f"Text for the following path:{file_path} is: {text}")


# extract_text_from_slide(extracted_keyframe_dir)

In [None]:
from llm.ollama_implementation.ollama_experiment import generate_response

import pandas as pd
import time

# Locate the transcription CSV and the keyframe of the scene under test relative
# to data_path instead of hard-coding machine-specific absolute paths.
# NOTE(review): assumes data_path is the per-video folder the old absolute paths
# pointed to (.../data/raw/<name>) — confirm.
scene_id = f"{name}-Scene-055"
transcription_csv = os.path.join(data_path, "transcriptions", f"{scene_id}.csv")
keyframe_path = os.path.join(data_path, "extracted_keyframes", f"{scene_id}-01.jpg")

# Read the .csv file
df = pd.read_csv(transcription_csv)

# Combine all content from the "text" column into one string
transcription = " ".join(df["text"].astype(str))
# OCR the slide that belongs to the same scene
text = extract_text_from_image(keyframe_path)
start_time = time.time()
response = generate_response(slide_content=text, transcription=transcription)
# Calculate the elapsed time
elapsed_time = time.time() - start_time

# Print the elapsed time
print("Elapsed Time:", elapsed_time, "seconds")

print(response)

In [None]:
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
import torch


def get_model_info(model_ID, device):
    """Load the CLIP model, processor and tokenizer for a checkpoint.

    Args:
        model_ID: Identifier passed to each ``from_pretrained`` call.
        device: Device the model is moved to (e.g. ``"cuda"`` or ``"cpu"``).

    Returns:
        A ``(model, processor, tokenizer)`` tuple; only the model is moved to
        ``device``.
    """
    clip_model = CLIPModel.from_pretrained(model_ID)
    clip_model = clip_model.to(device)
    # Tuple elements are evaluated left to right: processor first, then tokenizer.
    return (
        clip_model,
        CLIPProcessor.from_pretrained(model_ID),
        CLIPTokenizer.from_pretrained(model_ID),
    )


# Pick the compute device: use CUDA when torch reports it, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Checkpoint identifier handed to get_model_info
model_ID = "openai/clip-vit-base-patch32"
# Load the model (moved to `device`), its processor and its tokenizer
model, processor, tokenizer = get_model_info(model_ID, device)

In [None]:
from PIL import Image


def get_image(image_path):
    """Open an image file and return it converted to RGB.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The result of ``convert("RGB")`` on the opened image.
    """
    # Use the image as a context manager so the underlying file handle is closed
    # as soon as the RGB copy exists, instead of leaking until garbage collection.
    with Image.open(image_path) as image:
        # Convert the image to RGB
        return image.convert("RGB")


def get_single_image_embedding(text, my_image, processor, model, device):
    """Compute the image embedding of a single image as a NumPy array.

    Args:
        text: Unused. Kept so existing callers keep working; the image embedding
            only depends on the pixel values, so the text is no longer run
            through the processor.
        my_image: Image handed to ``processor`` via its ``images`` argument.
        processor: Callable returning a mapping with a ``"pixel_values"`` tensor
            when called with ``images=...`` and ``return_tensors="pt"``.
        model: Object exposing ``get_image_features(pixel_values)``.
        device: Device the pixel values are moved to before the forward pass.

    Returns:
        The output of ``model.get_image_features`` moved to the CPU and
        converted with ``.numpy()``.
    """
    # Only the image is preprocessed: the original call also passed `text`, whose
    # tokenized output was discarded straight away.
    pixel_values = processor(images=my_image, return_tensors="pt")["pixel_values"]
    pixel_values = pixel_values.to(device)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        embedding = model.get_image_features(pixel_values)
    # convert the embeddings to numpy array
    return embedding.cpu().detach().numpy()


# Load the keyframe of the scene under test. The path is derived from data_path
# and name instead of a machine-specific absolute path.
# NOTE(review): assumes the keyframes live in <data_path>/extracted_keyframes,
# as the previously hard-coded location suggests — confirm.
one_image = get_image(
    image_path=os.path.join(
        data_path, "extracted_keyframes", f"{name}-Scene-055-01.jpg"
    )
)


one_vector = get_single_image_embedding(
    response, one_image, processor, model, device
)  # Simple test

In [None]:
# Print the embedding stored in one_vector as a quick sanity check.
print(one_vector)