In [13]:
%load_ext autoreload
%autoreload 2

import os
from src.video_preprocessing.download_videos.youtube_download import preprocess_video
from src.video_preprocessing.scene_detection.scene_detect import detect_scenes
from src.video_preprocessing.download_videos.download_utils import (
    transcribe_audio_files,
    extract_and_store_audio,
    transcription_to_text,
    create_metadata,
)
from src.ocr.pytesseract_image_to_text import extract_text_from_image
from src.llm.ollama_implementation.ollama_experiment import (
    prompt_llm_summary,
    generate_caption_using_llava,
    prompt_llm_extensive_summary,
    extract_json
)

from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from PIL import Image

import torch
from loguru import logger
import pickle

from src.clip.clip_model import CLIPEmbeddingsModel

import tqdm
from pathlib import Path

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Download Pipeline
Download a video from a specific URL on YouTube, then run:
- Scene detection
- Keyframe detection

The resulting data will be stored under `/data/raw/<NAME>`

In [8]:
# Define options and input for downloading a video from youtube

# INSERT video name here
name = "biology_chapter_3_3_treshhold_5"
# INSERT video URL here
url = "https://youtu.be/DZSEErNZ1d4?si=f6YxKQ9rP6iqgTfk"
# INSERT chunk length in seconds 30s --> 30, no splitting: None
chunks = None

opts_aud = {"format": "mp3/bestaudio/best", "keep-video": True}
opts_vid = {"format": "mp4/bestvideo/best"}

In [3]:
# Downloads the video creates the relevant datafolders and transcribes the video
data_path = preprocess_video(
    download=True,
    uploaded_vid="ignore",  # path to local file
    url=url,
    name=name,
    aud_opts=opts_aud,
    vid_opts=opts_vid,  # Video download settings
    audio_file=name + ".mp3",
    input_file=name + ".mp4",
    output="output.mp4",
    split_length=chunks,
)

[32m2024-06-27 12:23:19.131[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m49[0m - [1mStarting AutoCaptioning...[0m
[32m2024-06-27 12:23:19.132[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m50[0m - [1mResults will be stored in data/raw/biology_chapter_3_3_treshhold_5[0m
[32m2024-06-27 12:23:19.133[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m58[0m - [1mCreated chunks folders[0m


[youtube] Extracting URL: https://youtu.be/DZSEErNZ1d4?si=f6YxKQ9rP6iqgTfk
[youtube] DZSEErNZ1d4: Downloading webpage
[youtube] DZSEErNZ1d4: Downloading ios player API JSON
[youtube] DZSEErNZ1d4: Downloading player a95aa57a
[youtube] DZSEErNZ1d4: Downloading m3u8 information
[info] DZSEErNZ1d4: Downloading 1 format(s): 18
[download] Destination: /Users/haseeb/Desktop/Prak_New/afm-vlm/data/raw/biology_chapter_3_3_treshhold_5/biology_chapter_3_3_treshhold_5.mp4
[download] 100% of   85.73MiB in 00:00:23 at 3.68MiB/s     


[32m2024-06-27 12:23:44.811[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m91[0m - [1mVideo is not splitted:[0m
[32m2024-06-27 12:23:44.812[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m95[0m - [1mVideo downloaded successfully![0m


In [None]:
#  Now that we have downloaded the video we want to perform scene_Detection:
detect_scenes(data_path)

In [None]:
# Extract the audio per detected scene
extract_and_store_audio(
    os.path.join(data_path, "scene_snippets"),
    os.path.join(data_path, "audio_chunks"),
)

# Audio Transcription using Whisper

For Faster Inference Please Use Tiny!

In [None]:
# Transcribe the different snippets:
audio_dir = os.path.join(data_path, "audio_chunks")
transcriptions_dir = os.path.join(data_path, "transcriptions")

model_type = "tiny"  # change to 'large' if you want more accurate results,
# change to 'medium.en' or 'large.en' for all english language tasks,
# and change to 'small' or 'base' for faster inference
lang = "en"

# Run whisper on all .wav files in audio_dir
transcribe_audio_files(audio_dir, transcriptions_dir, model_type=model_type, lang=lang)

# Load the CLIP Model

In [9]:
# create instance
clip_model = CLIPEmbeddingsModel()

<Figure size 800x2000 with 0 Axes>

In [10]:
# get current directory
# Get the path of the current notebook
notebook_path = Path().resolve()
image_path = os.path.join(notebook_path, "data", "raw", name, "extracted_keyframes")

images = []
# make a list out of the images
for image in os.listdir(image_path):
    if image.endswith(".jpg"):
        images.append(os.path.join(image_path, image))

# load and process the dataset
image_dataset = clip_model.load_and_process_dataset(images)

logger.info(f"Image_dataset: {image_dataset}")

[32m2024-06-28 11:08:16.937[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mImage_dataset: [<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x16B3EC340>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x16B3EDF90>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x16B3EDFF0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x16B3EC3A0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x16B3EDFC0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x16B3EDEA0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x16B3EE020>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x16B3EE080>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x16B3EE0E0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x16B3EE140>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x16B3EE1A

# Starting the Analysis of the Information Contained in the Video

### Inputs

* **Transcriptions**: [insert description or link to transcription]
* **Extraction from Slides using OCR**: [insert description or link to extracted content]
* **Textual Interpretation of Visual Information using LLAVA**: [insert description or link to 
interpreted information]

In [14]:
import json
import tqdm

# Transform transcription file
keyframes = {}
ocr_extracted_text = []

timestamp_file_path = os.path.join(
    os.path.dirname(image_path), "extracted_keyframes", name + "-Scenes.csv"
)

for filename in tqdm.tqdm(os.listdir(image_path)):
    # Check if the file ends with the specified extension
    print('running...')
    if filename.endswith(".jpg"):
        filepath = os.path.join(image_path, filename)
        keyframe_num = int(filename.split("-")[2])

        transcription_file_path = os.path.join(
            os.path.dirname(image_path),
            "transcriptions",
            filename.replace("-01.jpg", ".csv"),
        )

        transcription, timestamps = transcription_to_text(
            keyframe_num, transcription_file_path, timestamp_file_path
        )
        # logger.info(f"Transcription_text: {transcription}")

        # Extract text using OCR:
        ocr_text = extract_text_from_image(filepath)
        # logger.info(f"OCR_results: {ocr_text}")

       #Extract textual understanding of Visual features using LLAVA:
        #llava_results = generate_caption_using_llava(filepath)
        # logger.info(f"LLava_results: {llava_results}")

        
        #DUMMY Llava results
        llava_results = 'this is talking about cell structure'
        
        try:
            clip_llm_summary = prompt_llm_summary(
                slide_content=ocr_text,
                transcription=transcription,
                llava_output=llava_results,
            )
            clip_llm_summary = extract_json(clip_llm_summary)

            extensive_summary = prompt_llm_extensive_summary(
                slide_content=ocr_text,
                transcription=transcription,
                llava_output=llava_results,
            )
            extensive_summary = extract_json(extensive_summary)
        except KeyError:
            print('Output summaries not in proper format')
        
        
        break

        # # Alternative that goes faster.
        # ocr_text = "ocr_text"
        # llava_results = "llava_results"
        # clip_llm_summary = "ontrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum"
        # extensive_summary = "extensive_summary"

        # generate embeddings
        opened_image = Image.open(filepath)
        

        # embeddings = clip_model.generate_image_embeddings(
        #     clip_llm_summary, opened_image
        # )
        # clip_text_embedding = embeddings["text_embeds"]
        # clip_image_embedding = embeddings["image_embeds"]

        # keyframe, keyframe_metadata = create_metadata(
        #     keyframe_num,
        #     filepath,
        #     timestamps,
        #     transcription,
        #     ocr_text,
        #     llava_results,
        #     clip_llm_summary,
        #     extensive_summary,
        #     clip_text_embedding,
        #     clip_image_embedding,
        # )
        # keyframes[keyframe] = keyframe_metadata
        # print(keyframes)

    # Save keyframes dictionary as Pickle

# Save with pickle
# with open("bio_3_3_th5.pickle", "wb") as file:
#     pickle.dump(keyframes, file)

  0%|          | 0/87 [00:00<?, ?it/s]

running...
transcriptions...

Transport Across Membranes

+ Exocytosis: a membrane-bound vesicle fuses with the
membrane and expels the large molecule outside the cell

(2) Exocytosis


this is talking about cell structure


  0%|          | 0/87 [04:00<?, ?it/s]

extensive: 

Here is the combined summary in JSON format:

{
"Summary": "Transport Across Membranes: The lecture discussed transport across membranes, specifically exocytosis, where a membrane-bound vesicle fuses with the membrane and expels large molecules outside the cell. This process is important for cellular communication and waste removal."
}
Transport Across Membranes: The lecture discussed transport across membranes, specifically exocytosis, where a membrane-bound vesicle fuses with the membrane and expels large molecules outside the cell. This process is important for cellular communication and waste removal.





# Test 1: Search for exact similar Text.

In [None]:
test_image_path = os.path.join(
    base_dir,
    "data",
    "raw",
    "biology_chapter_3_3",
    "extracted_keyframes",
    "biology_chapter_3_3-Scene-097-01.jpg",
)

test_text_description = extract_text_from_image(test_image_path)
# Search for similar images in database
clip_model.search_similar_images(test_text_description)

# Test 2: Search for for a slightly different Text.

In [None]:
query_text = "plasma membrane and stuff going on"

clip_model.search_similar_images(query_text)

In [None]:
# #######OLD


# # Generated one embedding?
# def get_model_info(model_ID, device):
#     # Save the model to device
#     model = CLIPModel.from_pretrained(model_ID).to(device)
#     # Get the processor
#     processor = CLIPProcessor.from_pretrained(model_ID)
#     # Get the tokenizer
#     tokenizer = CLIPTokenizer.from_pretrained(model_ID)
#     # Return model, processor & tokenizer
#     return model, processor, tokenizer


# # Set the device
# device = "cuda" if torch.cuda.is_available() else "cpu"
# # Define the model ID
# model_ID = "openai/clip-vit-base-patch32"
# # Get model, processor & tokenizer
# model, processor, tokenizer = get_model_info(model_ID, device)


# def get_image(image_path):
#     image = Image.open(image_path)
#     # Convert the image to RGB
#     rgb_image = image.convert("RGB")
#     return rgb_image


# def get_single_image_embedding(text, my_image, processor, model, device):
#     image = processor(text=text, images=my_image, return_tensors="pt")[
#         "pixel_values"
#     ].to(device)
#     embedding = model.get_image_features(image)
#     # convert the embeddings to numpy array
#     return embedding.cpu().detach().numpy()


# one_image = get_image(
#     image_path="/Users/magic-rabbit/Documents/AFM/afm-vlm/data/raw/biology_chapter_3_3/extracted_keyframes/biology_chapter_3_3-Scene-055-01.jpg"
# )

# one_vector = get_single_image_embedding(
#     response, one_image, processor, model, device
# )  # Simple test
# print(one_vector)