In [20]:
%load_ext autoreload
%autoreload 2

from src.video_preprocessing.download_videos.youtube_download import preprocess_video
from src.video_preprocessing.scene_detection.scene_detect import detect_scenes
from src.video_preprocessing.download_videos.download_utils import (
    transcribe_audio_files,
    extract_and_store_audio,
)
from src.ocr.pytesseract_image_to_text import extract_text_from_image

from src.llm.ollama_implementation.ollama_experiment import (
    prompt_llm_summary,
    prompt_llm_extensive_summary,
)
from src.video_preprocessing.download_videos.download_utils import (
    transcription_to_text,
    create_metadata,
)

from PIL import Image

from loguru import logger
import pickle

from src.clip.clip_model import CLIPEmbeddingsModel

import tqdm
from pathlib import Path
import os

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Download Pipeline
Download a video from a specific URL on YouTube, then run:
- Scene detection
- Keyframe detection

The resulting data is stored under `data/raw/<NAME>`, where `<NAME>` is the `name` set in the next cell.

In [21]:
# --- User inputs for the YouTube download step ---

# Name of the video (EDIT ME); the download cell also derives the
# audio/video file names from it.
name = "biology_chapter_3_3"

# YouTube URL of the video to download (EDIT ME).
url = "https://youtu.be/DZSEErNZ1d4?si=f6YxKQ9rP6iqgTfk"

# Chunk length in seconds (e.g. 30 for 30 s chunks); None disables splitting.
chunks = None

# Option dictionaries handed to preprocess_video as `aud_opts` / `vid_opts`.
opts_aud = {
    "format": "mp3/bestaudio/best",
    "keep-video": True,
}
opts_vid = dict(format="mp4/bestvideo/best")

In [22]:
# Download the video and set up its data folders (the original note says this
# step also transcribes the video). The returned path is kept in `data_path`
# for the cells that follow.
data_path = preprocess_video(
    url=url,
    name=name,
    download=True,
    uploaded_vid="ignore",  # path to a local file ("ignore" is presumably a placeholder when downloading)
    split_length=chunks,
    aud_opts=opts_aud,  # audio download settings
    vid_opts=opts_vid,  # video download settings
    audio_file=f"{name}.mp3",
    input_file=f"{name}.mp4",
    output="output.mp4",
)

[32m2024-06-30 17:49:08.521[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m49[0m - [1mStarting AutoCaptioning...[0m
[32m2024-06-30 17:49:08.522[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m50[0m - [1mResults will be stored in data/raw/biology_chapter_3_3[0m
[32m2024-06-30 17:49:08.522[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m58[0m - [1mCreated chunks folders[0m


[youtube] Extracting URL: https://youtu.be/DZSEErNZ1d4?si=f6YxKQ9rP6iqgTfk
[youtube] DZSEErNZ1d4: Downloading webpage
[youtube] DZSEErNZ1d4: Downloading ios player API JSON
[youtube] DZSEErNZ1d4: Downloading m3u8 information
[info] DZSEErNZ1d4: Downloading 1 format(s): 18
[download] C:\Users\baatout\PycharmProjects\afm-vlm\data\raw\biology_chapter_3_3\biology_chapter_3_3.mp4 has already been downloaded
[download] 100% of   85.73MiB


[32m2024-06-30 17:49:10.864[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m91[0m - [1mVideo is not splitted:[0m
[32m2024-06-30 17:49:10.865[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m95[0m - [1mVideo downloaded successfully![0m


In [None]:
# With the video downloaded, run scene detection on the data folder that
# preprocess_video returned.
detect_scenes(data_path)

[32m2024-06-25 20:21:09.819[0m | [1mINFO    [0m | [36msrc.video_preprocessing.scene_detection.scene_detect[0m:[36mdetect_scenes[0m:[36m29[0m - [1mFound file[0m
[32m2024-06-25 20:21:09.835[0m | [1mINFO    [0m | [36msrc.video_preprocessing.scene_detection.scene_detect[0m:[36mdetect_scenes[0m:[36m33[0m - [1mName:biology_chapter_3_3.mp4,dirname:C:\Users\baatout\PycharmProjects\afm-vlm\data/raw\biology_chapter_3_3\biology_chapter_3_3.mp4[0m
[32m2024-06-25 20:21:09.837[0m | [1mINFO    [0m | [36msrc.video_preprocessing.scene_detection.scene_detect[0m:[36mdetect_scenes[0m:[36m35[0m - [1mRunning scene_detection:[0m


In [None]:
# Extract the audio per detected scene.
# First argument: <data_path>/scene_snippets, second: <data_path>/audio_chunks
# (presumably input and output folders respectively — confirm against
# extract_and_store_audio).
extract_and_store_audio(
    os.path.join(data_path, "scene_snippets"),
    os.path.join(data_path, "audio_chunks"),
)

# Audio Transcription using Whisper

For faster inference, use the `tiny` Whisper model (as set in the cell below).

In [12]:
# --- Whisper settings ---
# model_type: "tiny" is the fastest choice used here; "small" or "base" are
# still fast, "large" gives more accurate results, and the ".en" variants
# (e.g. "medium.en") target English-only tasks.
# NOTE(review): the original note also suggested "large.en"; as far as I know
# Whisper ships no such checkpoint — confirm before using it.
lang = "en"
model_type = "tiny"

# --- Folders ---
# Transcriptions are written next to the per-scene audio chunks.
transcriptions_dir = os.path.join(data_path, "transcriptions")
audio_dir = os.path.join(data_path, "audio_chunks")

# Transcribe every audio snippet found in audio_dir
# (the original note says the .wav files).
transcribe_audio_files(
    audio_dir,
    transcriptions_dir,
    lang=lang,
    model_type=model_type,
)

[32m2024-06-25 23:57:07.443[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.download_utils[0m:[36mtranscribe_audio_files[0m:[36m361[0m - [1mStarting pooling:[0m
100%|██████████| 141/141 [04:53<00:00,  2.08s/it]


# Load the CLIP Model

In [23]:
# Instantiate the project's CLIP embeddings model; the keyframe loop further
# down calls clip_model.generate_image_embeddings on each keyframe.
clip_model = CLIPEmbeddingsModel()

<Figure size 800x2000 with 0 Axes>

In [24]:
# Path().resolve() is the absolute current working directory (not
# necessarily the folder that contains the notebook file).
notebook_path = Path().resolve()

# Folder holding the extracted keyframe images, kept as a plain string.
# NOTE(review): the folder name is hard-coded and differs from the
# data/raw/<name> folder used by the download step — confirm this is intended.
image_path = str(
    notebook_path
    / "data"
    / "raw"
    / "biology_chapter_3_3_treshhold_5"
    / "extracted_keyframes"
)

# Starting the Analysis of the Information Contained in the Video

### Inputs

* **Transcriptions**: [insert description or link to transcription]
* **Extraction from Slides using OCR**: [insert description or link to extracted content]
* **Textual Interpretation of Visual Information using LLaVA**: [insert description or link to interpreted information]

In [8]:
# Build the per-keyframe metadata: for every extracted keyframe image, gather
# its OCR text, transcription, LLM summaries and CLIP embeddings, store them
# in `keyframes`, and finally pickle the dictionary to "data.pickle".
keyframes = {}
# OCR text of every processed keyframe, in processing order.
ocr_extracted_text = []

# Sort the listing so keyframes are processed in a reproducible order
# (os.listdir returns entries in arbitrary order).
for filename in tqdm.tqdm(sorted(os.listdir(image_path))):
    # Only keyframe images are processed.
    if not filename.endswith(".jpg"):
        continue

    filepath = os.path.join(image_path, filename)
    # The third "-"-separated token of the file name is parsed as the keyframe number.
    keyframe_num = int(filename.split("-")[2])

    # Extract the slide text using OCR and keep a copy in the running list.
    ocr_text = extract_text_from_image(filepath)
    ocr_extracted_text.append(ocr_text)

    # logger.info(f"OCR_results: {ocr_text}")

    # The transcription CSV sits in the sibling "transcriptions" folder and
    # shares the keyframe's base name: "<stem>-01.jpg" -> "<stem>.csv".
    transcription_file_path = os.path.join(
        os.path.dirname(image_path),
        "transcriptions",
        filename.replace("-01.jpg", ".csv"),
    )

    # NOTE(review): the recorded output of this cell is a TypeError stating
    # that transcription_to_text() also requires 'timestamp_file_path'. The
    # location of that file is not visible from this notebook, so the call is
    # left unchanged — confirm the current signature in download_utils and
    # pass the timestamp path here.
    transcription, timestamps = transcription_to_text(transcription_file_path)
    logger.info(f"Transcription_text: {transcription}")

    # Textual understanding of the visual features using LLaVA is currently
    # disabled; a placeholder string is used instead of
    # generate_caption_using_llava(filepath).
    llava_results = "llava_results"
    logger.info(f"LLava_results: {llava_results}")

    clip_llm_summary = prompt_llm_summary(
        slide_content=ocr_text,
        transcription=transcription,
        llava_output=llava_results,
    )

    # Fix: pass this keyframe's OCR text. The original passed the
    # `ocr_extracted_text` list, which was never filled and therefore always
    # empty.
    extensive_summary = prompt_llm_extensive_summary(
        slide_content=ocr_text,
        transcription=transcription,
        llava_output=llava_results,
    )

    # Faster alternative for dry runs (skips OCR and the LLM calls):
    # ocr_text = "ocr_text"
    # llava_results = "llava_results"
    # clip_llm_summary = "clip_llm_summary"
    # extensive_summary = "extensive_summary"

    # Generate the CLIP embeddings. The image is opened in a context manager
    # so its file handle is released as soon as the embeddings are computed.
    with Image.open(filepath) as opened_image:
        embeddings = clip_model.generate_image_embeddings(
            clip_llm_summary, opened_image
        )
    clip_text_embedding = embeddings["text_embeds"]
    clip_image_embedding = embeddings["image_embeds"]

    # Fix: store this keyframe's OCR text in the metadata (the original
    # passed the always-empty list here as well).
    keyframe, keyframe_metadata = create_metadata(
        keyframe_num,
        filepath,
        timestamps,
        transcription,
        ocr_text,
        llava_results,
        clip_llm_summary,
        extensive_summary,
        clip_text_embedding,
        clip_image_embedding,
    )
    keyframes[keyframe] = keyframe_metadata

# Persist the keyframes dictionary with pickle (written to the current
# working directory).
with open("data.pickle", "wb") as file:
    pickle.dump(keyframes, file)

  0%|          | 0/87 [00:00<?, ?it/s]


TypeError: transcription_to_text() missing 2 required positional arguments: 'transcription_file_path' and 'timestamp_file_path'

In [9]:
# Report how many entries the `keyframes` dictionary built above contains.
keyframes_count = len(keyframes)
print(f"Number of keyframes: {keyframes_count}")

Number of keyframes: 0
