# This pipeline is here for extensive data generation. Data Analysis is done in the other notebook.

In [20]:
%load_ext autoreload
%autoreload 2

import os
from src.video_preprocessing.download_videos.youtube_download import preprocess_video
from src.video_preprocessing.scene_detection.scene_detect import detect_scenes
from src.video_preprocessing.download_videos.download_utils import (
    transcribe_audio_files,
    extract_and_store_audio,
)
from src.ocr.pytesseract_image_to_text import extract_text_from_image

from src.llm.ollama_implementation.ollama_experiment import (
    prompt_llm_summary,
    prompt_llm_extensive_summary,
    generate_caption_using_llava
)
from src.video_preprocessing.download_videos.download_utils import (
    transcription_to_text,
    create_metadata,
)

from PIL import Image

from loguru import logger
import pickle

from src.clip.clip_model import CLIPEmbeddingsModel

import tqdm
from pathlib import Path
import os

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Download Pipeline
Download a video from a specific URL on YouTube, then run:
- Scene detection
- Keyframe detection

The resulting data will be stored under `/data/raw/<NAME>`

In [17]:
# Define options and input for downloading a video from youtube

# INSERT video name here
name = "biology_chapter_3_3"
# INSERT video URL here
url = "https://youtu.be/DZSEErNZ1d4?si=f6YxKQ9rP6iqgTfk"
# INSERT chunk length in seconds 30s --> 30, no splitting: None
chunks = None

opts_aud = {"format": "mp3/bestaudio/best", "keep-video": True}
opts_vid = {"format": "mp4/bestvideo/best"}

In [None]:
# Downloads the video creates the relevant datafolders and transcribes the video
data_path = preprocess_video(
    download=True,
    uploaded_vid="ignore",  # path to local file
    url=url,
    name=name,
    aud_opts=opts_aud,
    vid_opts=opts_vid,  # Video download settings
    audio_file=name + ".mp3",
    input_file=name + ".mp4",
    output="output.mp4",
    split_length=chunks,
)

# Scene Detection and Extaction

In [None]:
detect_scenes(data_path)

In [None]:
# Extract the audio per detected scene
extract_and_store_audio(
    os.path.join(data_path, "scene_snippets"),
    os.path.join(data_path, "audio_chunks"),
)

# Audio Transcription using Whisper

For Faster Inference Please Use Tiny!

In [None]:
# Transcribe the different snippets:
audio_dir = os.path.join(data_path, "audio_chunks")
transcriptions_dir = os.path.join(data_path, "transcriptions")

model_type = "tiny"  # change to 'large' if you want more accurate results,
# change to 'medium.en' or 'large.en' for all english language tasks,
# and change to 'small' or 'base' for faster inference
lang = "en"

# Run whisper on all .wav files in audio_dir
transcribe_audio_files(audio_dir, transcriptions_dir, model_type=model_type, lang=lang)

# Starting the Analysis of the Information Contained in the Video

### Inputs

* **Transcriptions**: [insert description or link to transcription]
* **Extraction from Slides using OCR**: [insert description or link to extracted content]
* **Textual Interpretation of Visual Information using LLAVA**: [insert description or link to 
interpreted information]

In [23]:
# Transform transcription file
keyframes = {}
ocr_extracted_text = []

notebook_path = Path().resolve()
image_path = os.path.join(
    notebook_path, "data", "raw", "biology_chapter_3_3", "extracted_keyframes"
)

timestamp_file_path = os.path.join(
    os.path.dirname(image_path), "extracted_keyframes", name + "-Scenes.csv"
)


# create instance
clip_model = CLIPEmbeddingsModel()
i = 0
for filename in tqdm.tqdm(os.listdir(image_path)):
    # Check if the file ends with the specified extension
    print(filename)
    if i ==2:
        break
    
    if filename.endswith(".jpg"):
        filepath = os.path.join(image_path, filename)
        keyframe_num = int(filename.split("-")[2])

        # Extract text using OCR:
        ocr_text = extract_text_from_image(filepath)

        # logger.info(f"OCR_results: {ocr_text}")

        transcription_file_path = os.path.join(
            os.path.dirname(image_path),
            "transcriptions",
            filename.replace("-01.jpg", ".csv"),
        )

        transcription, timestamps = transcription_to_text(
            keyframe_num, transcription_file_path, timestamp_file_path
        )
        logger.info(f"Transcription_text: {transcription}")
        
        # Extract textual understanding of Visual features using LLAVA:

        # llava_results = generate_caption_using_llava(filepath)
        # #llava_results = "llava_results"
        # logger.info(f"LLava_results: {llava_results}")

        # short_llm_summary = prompt_llm_summary(
        #     slide_content=ocr_text,
        #     transcription=transcription,
        #     llava_output=llava_results,
        # )
        # print('short...\n')
        # print(short_llm_summary)
    
        # extensive_llm_summary = prompt_llm_extensive_summary(
        #     slide_content=ocr_extracted_text,
        #     transcription=transcription,
        #     llava_output=llava_results,
        # )
        
        # print('long...\n')
        # print(extensive_llm_summary)

        # Alternative that goes faster.
        ocr_text = "ocr_text"
        llava_results = "llava_results"
        clip_llm_summary = "clip_llm_summary"
        extensive_summary = "extensive_summary"
        short_llm_summary = 'short'
        extensive_llm_summary = 'extensive_llm_summary'

        # generate embeddings
        opened_image = Image.open(filepath)

        embeddings = clip_model.generate_image_embeddings(
            short_llm_summary, opened_image
        )
        
        # these are the embeddings with a standard  tokenizer
        # P.S: This is still in the clip model class, but will be of course moved to a separate class
        standard_text_embedding = clip_model.generate_dataset_embeddings_standard_tokenizer(
                short_llm_summary
        )
        extensive_text_embedding = clip_model.generate_dataset_embeddings_standard_tokenizer(
                extensive_llm_summary
        )
        
        ocr_text_embedding = clip_model.generate_dataset_embeddings_standard_tokenizer(
                ocr_text
        )
        
        transcription_text_embedding = clip_model.generate_dataset_embeddings_standard_tokenizer(
                transcription
        )
        
        llava_text_embedding = clip_model.generate_dataset_embeddings_standard_tokenizer(
                llava_results
        )
        
        ocr_transcription_embedding = clip_model.generate_dataset_embeddings_standard_tokenizer(
                str(ocr_text) + str(transcription)
        )
        
        ocr_transcription_llava_embedding = clip_model.generate_dataset_embeddings_standard_tokenizer(
                str(ocr_text) + str(transcription) + str(llava_results)
        )
        
        clip_text_embedding = embeddings["text_embeds"]
        clip_image_embedding = embeddings["image_embeds"]

        keyframe, keyframe_metadata = create_metadata(
            keyframe_num,
            filepath,
            timestamps,
            transcription,
            ocr_extracted_text,
            llava_results,
            short_llm_summary,
            extensive_llm_summary,
            clip_text_embedding,
            clip_image_embedding,
            standard_text_embedding,
            extensive_text_embedding,
            ocr_text_embedding,
            transcription_text_embedding,
            llava_text_embedding,
            ocr_transcription_embedding,
            ocr_transcription_llava_embedding
        )
        keyframes[keyframe] = keyframe_metadata
        print(keyframes)
        break

    # Save keyframes dictionary as Pickle

# Save with pickle
with open("data_generation_pipeline.pickle", "wb") as file:
    pickle.dump(keyframes, file)

  0%|          | 0/87 [00:00<?, ?it/s]python(52117) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


biology_chapter_3_3-Scene-015-01.jpg


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[32m2024-06-30 11:56:28.054[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m42[0m - [1mTranscription_text:  Carbohydrates are a macronutrient and our main energy source.  It's the main energy source that we use to quickly produce ATP, our own body's energy molecule.  It's relatively easy to get the amount of carbohydrates that we need each day in our diet.  They come in food sources that tend to be relatively easy to prepare and to obtain,  such as bread, cereals, rice, pasta, fruits and vegetables.[0m
[32m2024-06-30 11:58:20.774[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m48[0m - [1mLLava_results:  The slide appears to be from an academic lecture, possibly rela

Exception 'Summary'
short...

Carbohydrates main energy source, food sources bread cereal rice pasta fruits vegetables, easy to prepare obtain.
long...

Carbohydrates are a macronutrient and our main energy source. They're relatively easy to get from food sources like bread, cereals, rice, pasta, fruits, and vegetables. The body needs a variety of nutrients to function properly, including carbohydrates, proteins, fats, vitamins, minerals, and water. Each of these macronutrients plays a crucial role in maintaining health. Meeting daily nutritional targets, or 'macros,' is important for optimal health.


[32m2024-06-30 12:07:45.919[0m | [1mINFO    [0m | [36msrc.clip.clip_model[0m:[36mprocess_clip_tensors[0m:[36m115[0m - [1mImage embeddings shape: torch.Size([1, 512])[0m
[32m2024-06-30 12:07:45.931[0m | [1mINFO    [0m | [36msrc.clip.clip_model[0m:[36mprocess_clip_tensors[0m:[36m116[0m - [1mText embeddings shape: torch.Size([1, 512])[0m
[32m2024-06-30 12:07:52.139[0m | [1mINFO    [0m | [36msrc.clip.clip_model[0m:[36mgenerate_dataset_embeddings_standard_tokenizer[0m:[36m93[0m - [1mText embeddings with standard tokenizer: tensor([-2.8992e-01, -1.8542e-01, -1.0617e-02,  4.3979e-01,  1.4515e-01,
         2.5133e-01,  9.6185e-02, -1.7294e-01, -3.4280e-01, -4.3420e-02,
        -1.2435e-01, -3.7903e-01, -2.8439e-01, -2.0356e-01,  2.3523e-01,
        -2.8651e-02,  7.2277e-01,  3.3137e-02, -1.3040e-01, -2.0986e-01,
         4.2924e-02, -6.9515e-02,  2.3026e-01, -2.5335e-01,  2.1338e-01,
        -1.2715e-01, -3.3277e-02,  9.0647e-02, -1.1528e-01, -2.9312e-01,
   

TypeError: create_metadata() takes 12 positional arguments but 16 were given

<Figure size 800x2000 with 0 Axes>

In [None]:
# Assuming keyframes is your dictionary
keyframes_count = len(keyframes)
print("Number of keyframes:", keyframes_count)