In [1]:
%load_ext autoreload
%autoreload 2

import os
from src.video_preprocessing.download_videos.youtube_download import preprocess_video
from src.video_preprocessing.scene_detection.scene_detect import detect_scenes
from src.video_preprocessing.download_videos.download_utils import (
    transcribe_audio_files,
    extract_and_store_audio,
)
from src.ocr.pytesseract_image_to_text import extract_text_from_image

from PIL import Image

import torch
from loguru import logger
import pickle

from src.clip.clip_model import CLIPEmbeddingsModel

import tqdm
from pathlib import Path
import os

# Download Pipeline
Download a video from a specific URL on YouTube, then run:
- Scene detection
- Keyframe detection

The resulting data will be stored under `/data/raw/<NAME>`

In [2]:
# Define options and input for downloading a video from youtube

# INSERT video name here
name = "biology_chapter_3_3"
# INSERT video URL here
url = "https://youtu.be/DZSEErNZ1d4?si=f6YxKQ9rP6iqgTfk"
# INSERT chunk length in seconds 30s --> 30, no splitting: None
chunks = None

opts_aud = {"format": "mp3/bestaudio/best", "keep-video": True}
opts_vid = {"format": "mp4/bestvideo/best"}

In [3]:
# Downloads the video creates the relevant datafolders and transcribes the video
data_path = preprocess_video(
    download=True,
    uploaded_vid="ignore",  # path to local file
    url=url,
    name=name,
    aud_opts=opts_aud,
    vid_opts=opts_vid,  # Video download settings
    audio_file=name + ".mp3",
    input_file=name + ".mp4",
    output="output.mp4",
    split_length=chunks,
)

[32m2024-06-26 11:12:32.228[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m49[0m - [1mStarting AutoCaptioning...[0m
[32m2024-06-26 11:12:32.243[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m50[0m - [1mResults will be stored in data/raw/biology_chapter_3_3[0m
[32m2024-06-26 11:12:32.243[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m58[0m - [1mCreated chunks folders[0m


[youtube] Extracting URL: https://youtu.be/DZSEErNZ1d4?si=f6YxKQ9rP6iqgTfk
[youtube] DZSEErNZ1d4: Downloading webpage
[youtube] DZSEErNZ1d4: Downloading ios player API JSON
[youtube] DZSEErNZ1d4: Downloading m3u8 information
[info] DZSEErNZ1d4: Downloading 1 format(s): 18
[download] C:\Users\baatout\PycharmProjects\afm-vlm\data\raw\biology_chapter_3_3\biology_chapter_3_3.mp4 has already been downloaded
[download] 100% of   85.73MiB


[32m2024-06-26 11:12:35.519[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m91[0m - [1mVideo is not splitted:[0m
[32m2024-06-26 11:12:35.519[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m95[0m - [1mVideo downloaded successfully![0m


In [None]:
#  Now that we have downloaded the video we want to perform scene_Detection:
detect_scenes(data_path)

[32m2024-06-25 20:21:09.819[0m | [1mINFO    [0m | [36msrc.video_preprocessing.scene_detection.scene_detect[0m:[36mdetect_scenes[0m:[36m29[0m - [1mFound file[0m
[32m2024-06-25 20:21:09.835[0m | [1mINFO    [0m | [36msrc.video_preprocessing.scene_detection.scene_detect[0m:[36mdetect_scenes[0m:[36m33[0m - [1mName:biology_chapter_3_3.mp4,dirname:C:\Users\baatout\PycharmProjects\afm-vlm\data/raw\biology_chapter_3_3\biology_chapter_3_3.mp4[0m
[32m2024-06-25 20:21:09.837[0m | [1mINFO    [0m | [36msrc.video_preprocessing.scene_detection.scene_detect[0m:[36mdetect_scenes[0m:[36m35[0m - [1mRunning scene_detection:[0m


In [None]:
# Extract the audio per detected scene
extract_and_store_audio(
    os.path.join(data_path, "scene_snippets"),
    os.path.join(data_path, "audio_chunks"),
)

# Audio Transcription using Whisper

For Faster Inference Please Use Tiny!

In [12]:
# Transcribe the different snippets:
audio_dir = os.path.join(data_path, "audio_chunks")
transcriptions_dir = os.path.join(data_path, "transcriptions")

model_type = "tiny"  # change to 'large' if you want more accurate results,
# change to 'medium.en' or 'large.en' for all english language tasks,
# and change to 'small' or 'base' for faster inference
lang = "en"

# Run whisper on all .wav files in audio_dir
transcribe_audio_files(audio_dir, transcriptions_dir, model_type=model_type, lang=lang)

[32m2024-06-25 23:57:07.443[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.download_utils[0m:[36mtranscribe_audio_files[0m:[36m361[0m - [1mStarting pooling:[0m
100%|██████████| 141/141 [04:53<00:00,  2.08s/it]


# Load the CLIP Model

In [4]:
# create instance
clip_model = CLIPEmbeddingsModel()

<Figure size 800x2000 with 0 Axes>

In [5]:
# get current directory
# Get the path of the current notebook
notebook_path = Path().resolve()
image_path = os.path.join(
    notebook_path, "data", "raw", "biology_chapter_3_3", "extracted_keyframes"
)

images = []
# make a list out of the images
for image in os.listdir(image_path):
    if image.endswith(".jpg"):
        images.append(os.path.join(image_path, image))

# load and process the dataset
image_dataset = clip_model.load_and_process_dataset(images)

logger.info(f"Image_dataset: {image_dataset}")

[32m2024-06-26 11:12:42.635[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m17[0m - [1mImage_dataset: [<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x237CAB1BCE0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x237CAB1BCB0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x237CAAFDDC0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x237CAAFDD30>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x237CAAFDCA0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x237CAAFDC10>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x237CAAFDB80>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x237CAAFDAC0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x237CAAFDA00>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=482x360 at 0x237CAAFD910>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=4

# Starting the Analysis of the Information Contained in the Video

### Inputs

* **Transcriptions**: [insert description or link to transcription]
* **Extraction from Slides using OCR**: [insert description or link to extracted content]
* **Textual Interpretation of Visual Information using LLAVA**: [insert description or link to 
interpreted information]

In [None]:
from src.llm.ollama_implementation.ollama_experiment import (
    prompt_llm_summary,
    generate_caption_using_llava,
    prompt_llm_extensive_summary,
)
from src.video_preprocessing.download_videos.download_utils import (
    transcription_to_text,
    create_metadata,
)

# Transform transcription file
keyframes = {}
ocr_extracted_text = []

for filename in tqdm.tqdm(os.listdir(image_path)):
    # Check if the file ends with the specified extension
    if filename.endswith(".jpg"):
        filepath = os.path.join(image_path, filename)
        keyframe_num = int(filename.split("-")[2])

        # Extract text using OCR:
        ocr_text = extract_text_from_image(filepath)

        # logger.info(f"OCR_results: {ocr_text}")

        transcription_file_path = os.path.join(
            os.path.dirname(image_path),
            "transcriptions",
            filename.replace("-01.jpg", ".csv"),
        )

        transcription, timestamps = transcription_to_text(transcription_file_path)
        logger.info(f"Transcription_text: {transcription}")

        # Extract textual understanding of Visual features using LLAVA:

        #llava_results = generate_caption_using_llava(filepath)
        llava_results = "llava_results"
        logger.info(f"LLava_results: {llava_results}")

        clip_llm_summary = prompt_llm_summary(
            slide_content=ocr_text,
            transcription=transcription,
            llava_output=llava_results,
        )

        extensive_summary = prompt_llm_extensive_summary(
            slide_content=ocr_extracted_text,
            transcription=transcription,
            llava_output=llava_results,
        )

        # Alternative that goes faster.
        # ocr_text = "ocr_text"
        # llava_results = "llava_results"
        # clip_llm_summary = "clip_llm_summary"
        # extensive_summary = "extensive_summary"

        # generate embeddings
        opened_image = Image.open(filepath)

        embeddings = clip_model.generate_image_embeddings(
            clip_llm_summary, opened_image
        )
        clip_text_embedding = embeddings["text_embeds"]
        clip_image_embedding = embeddings["image_embeds"]

        keyframe, keyframe_metadata = create_metadata(
            keyframe_num,
            filepath,
            timestamps,
            transcription,
            ocr_extracted_text,
            llava_results,
            clip_llm_summary,
            extensive_summary,
            clip_text_embedding,
            clip_image_embedding,
        )
        keyframes[keyframe] = keyframe_metadata
        # print(keyframes)

    # Save keyframes dictionary as Pickle

# Save with pickle
with open("data.pickle", "wb") as file:
    pickle.dump(keyframes, file)

  0%|          | 0/143 [00:00<?, ?it/s][32m2024-06-26 11:06:04.504[0m | [1mINFO    [0m | [36msrc.ocr.pytesseract_image_to_text[0m:[36mextract_text_from_image[0m:[36m17[0m - [1mExtracted text: Chapter 3:

Nutrients & Membrane Transport
**Is It Possible to Supplement Your
Way to Better Performance & Health?

Biology 1020:
CURRENT TOPICS IN
BIOLOGY

[0m
[32m2024-06-26 11:06:04.519[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mTranscription_text:  Welcome to Chapter 3, Nutrients, Membranes and Trane.[0m
[32m2024-06-26 11:06:04.527[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m42[0m - [1mLLava_results: llava_results[0m


In [3]:
# Assuming keyframes is your dictionary
keyframes_count = len(keyframes)
print("Number of keyframes:", keyframes_count)

NameError: name 'keyframes' is not defined

# Test 1: Search for exact similar Text.

In [64]:
# print pickle file 
with open("data.pickle", "rb") as file:
    data = pickle.load(file)
    
logger.info(f"Data: {data}")

[32m2024-06-27 09:46:44.145[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mData: {115: {'img_path': '/Users/magic-rabbit/Documents/AFM/afm-vlm/data/raw/biology_chapter_3_3/extracted_keyframes/biology_chapter_3_3-Scene-115-01.jpg', 'timestamps': [0, 4], 'transcription': ' Here we see a closer look at that.', 'ocr_extracted_text': '3.3 Transport Across Membranes ( of9)\n\n+ Diffusion: passive transport from area of high\nconcentration to low concentration\n\n— Very small, hydrophobic molecules\n\n', 'llava_result': ' The slide appears to be from an academic lecture, possibly related to biology or chemistry. It features a title "3. Transport Across Membranes" and a subtitle "Diffusion: passive transport from area of high concentration to low concentration."\n\nThe slide includes two main figures:\n\n1. A diagram labeled "Figure 2," which illustrates the process of diffusion. The diagram shows a membrane with molecules on one side, indicating that they are c

In [None]:
# set image paths for the experiments
extracted_data_path = [data[key]['img_path'] for key in data.keys() if
                       'img_path' in data[key]]
clip_model.img_paths = extracted_data_path

In [None]:
# Embedded with standard Tokenizer: Only OCR



In [None]:
# Embedded with standard Tokenizer: OCR * Transcriptions 



In [None]:
# Embedded with standard Tokenizer: OCR * LLAVA



In [None]:
# Embedded with standard Tokenizer - clip_llm_summary: OCR * Transcriptions * LLAVA 



In [None]:
# Embedded with standard Tokenizer - extensive_summary: OCR * Transcriptions * LLAVA



In [None]:
# Embedded with CLIP - clip_llm_summary: OCR * Transcriptions * LLAVA 

extracted_data_text = [data[key]['clip_text_embedding'] for key in data.keys() if
                       'clip_text_embedding' in data[key]]

clip_text_embeddings = [data[0] for data in extracted_data_text]

clip_model.text_embeddings = clip_text_embeddings

if isinstance(clip_model.text_embeddings, list):
    for i, text_embedding in enumerate(clip_model.text_embeddings):
        clip_model.text_embeddings[i] = torch.tensor(text_embedding)

# create one single torch for sim search 
clip_model.text_embeddings = torch.stack(clip_model.text_embeddings, dim=0)
print(clip_model.text_embeddings.shape)

In [None]:
####################TEST 1: Search for exact similar Text.####################
base_dir = "../.."

test_image_path = os.path.join(
    base_dir,
    "data",
    "raw",
    "biology_chapter_3_3",
    "extracted_keyframes",
    "biology_chapter_3_3-Scene-097-01.jpg",
)

test_text_description = extract_text_from_image(test_image_path)

prompt = "subcellular structures"

print("Test Text Description: ", test_text_description)

clip_model.search_similar_images(prompt)

In [78]:
# Embedded with CLIP - extensive_summary: OCR * Transcriptions * LLAVA 
extracted_data_extensive_summary = [data[key]['llm_long_summary'] for key in data.keys() if
                       'llm_long_summary' in data[key]]
logger.info(f"Extracted_data llm_long_summary: {extracted_data_extensive_summary}")

#clip_text_embeddings_extensive = [data[0] for data in extracted_data_extensive_summary]

#logger.info(f"Clip_text_embeddings_extensive: {clip_text_embeddings_extensive}")

clip_model.text_embeddings = extracted_data_extensive_summary

logger.info(f"Clip_model.text_embeddings: {clip_model.text_embeddings}")

if isinstance(clip_model.text_embeddings, list):
    for i, text_embedding in enumerate(clip_model.text_embeddings):
        clip_model.text_embeddings[i] = torch.tensor(text_embedding)

# create one single torch for sim search 
clip_model.text_embeddings = torch.stack(clip_model.text_embeddings, dim=0)


[32m2024-06-27 10:31:10.379[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mExtracted_data llm_long_summary: ['Here is a summary of the lecture content:\n\nTransport Across Membranes: Diffusion\nDiffusion is a passive transport process where molecules move from an area of high concentration to an area of low concentration without any external force or energy input. This process occurs in very small, hydrophobic molecules that can easily cross cell membranes. The movement of molecules is driven by the concentration gradient, with molecules moving down their concentration gradient until equilibrium is reached.', 'The lecture discussed how many liters of water are lost by an average human each day. The correct answer is three liters, which is likely an estimate based on various factors such as food production, transportation, and other human activities that consume water resources. The slide featured a humorous image of Yoda with the caption "Fight fatigue w

TypeError: new(): invalid data type 'str'

In [65]:
import pandas as pd

# Load csv file
filename = "employees.csv"
df = pd.read_csv(filename)

In [66]:
df['Name']

0         John Doe
1       Jane Smith
2      Emily Jones
3    Michael Brown
4      Chris Davis
Name: Name, dtype: object

In [67]:
for i, row in df.iterrows():
    print(row['Name'])
    # get prompt 
    prompt = row['Name']
    # search for similar images
    clip_model.search_similar_images(prompt)

John Doe


AttributeError: 'list' object has no attribute 'shape'