In [1]:
import os
from src.video_preprocessing.download_videos.youtube_download import preprocess_video
from src.video_preprocessing.scene_detection.scene_detect import detect_scenes
from src.video_preprocessing.scene_detection.ocr import extract_text_from_slide
from src.video_preprocessing.download_videos.download_utils import (
    transcribe_audio_files,
    extract_and_store_audio,
    transcription_to_text,
)
from src.ocr.pytesseract_image_to_text import extract_text_from_image
from src.llm.ollama_implementation.ollama_experiment import (
    prompt_llm_summary,
    generate_caption_using_llava,
)
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from PIL import Image

import torch
import pandas as pd
import time
from loguru import logger

[32m2024-06-10 09:49:18.352[0m | [1mINFO    [0m | [36msrc.ocr.pytesseract_image_to_text[0m:[36m<module>[0m:[36m17[0m - [1mExtracted text: Lecture overview

1 R programming basics

1. Get 2 Data wrangling
3 Tidy data
2. Look 4 Low dimensional visualization
5 High dimensi i
3. Conclude 7 Empirical Statistical Assessment

8 Analytical Statistical Assessment
9 Statistical Assessment for Big Data
Case Study
10 Linear regression
11 Classification
12 Supervised Learning

Julien Gagneur Graphically supported hypotheses 3/70

[0m


# Download Pipeline
Download a video from a specific URL on YouTube, then run:
- Scene detection
- Keyframe detection

The resulting data will be stored under `/data/raw/<NAME>`

In [3]:
# Define options and input for downloading a video from youtube

# INSERT video name here
name = "biology_chapter_3_3"
# INSERT video URL here
url = "https://youtu.be/DZSEErNZ1d4?si=f6YxKQ9rP6iqgTfk"
# INSERT chunk length in seconds 30s --> 30, no splitting: None
chunks = None

opts_aud = {"format": "mp3/bestaudio/best", "keep-video": True}
opts_vid = {"format": "mp4/bestvideo/best"}

In [4]:
# Downloads the video creates the relevant datafolders and transcribes the video
data_path = preprocess_video(
    download=True,
    uploaded_vid="ignore",  # path to local file
    url=url,
    name=name,
    aud_opts=opts_aud,
    vid_opts=opts_vid,  # Video download settings
    audio_file=name + ".mp3",
    input_file=name + ".mp4",
    output="output.mp4",
    split_length=chunks,
)

[32m2024-06-10 09:49:31.063[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m49[0m - [1mStarting AutoCaptioning...[0m
[32m2024-06-10 09:49:31.063[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m50[0m - [1mResults will be stored in data/raw/biology_chapter_3_3[0m
[32m2024-06-10 09:49:31.063[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m58[0m - [1mCreated chunks folders[0m


[youtube] Extracting URL: https://youtu.be/DZSEErNZ1d4?si=f6YxKQ9rP6iqgTfk
[youtube] DZSEErNZ1d4: Downloading webpage
[youtube] DZSEErNZ1d4: Downloading ios player API JSON
[youtube] DZSEErNZ1d4: Downloading m3u8 information
[info] DZSEErNZ1d4: Downloading 1 format(s): 22
[download] C:\Users\baatout\PycharmProjects\afm-vlm\data\raw\biology_chapter_3_3\biology_chapter_3_3.mp4 has already been downloaded
[download] 100% of  126.17MiB


[32m2024-06-10 09:49:34.224[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m91[0m - [1mVideo is not splitted:[0m
[32m2024-06-10 09:49:34.224[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m95[0m - [1mVideo downloaded successfully![0m


In [None]:
#  Now that we have downloaded the video we want to perform scene_Detection:
detect_scenes(data_path)

[32m2024-06-10 09:49:42.322[0m | [1mINFO    [0m | [36msrc.video_preprocessing.scene_detection.scene_detect[0m:[36mdetect_scenes[0m:[36m29[0m - [1mFound file[0m
[32m2024-06-10 09:49:42.322[0m | [1mINFO    [0m | [36msrc.video_preprocessing.scene_detection.scene_detect[0m:[36mdetect_scenes[0m:[36m33[0m - [1mName:biology_chapter_3_3.mp4,dirname:C:\Users\baatout\PycharmProjects\afm-vlm\data/raw\biology_chapter_3_3\biology_chapter_3_3.mp4[0m
[32m2024-06-10 09:49:42.322[0m | [1mINFO    [0m | [36msrc.video_preprocessing.scene_detection.scene_detect[0m:[36mdetect_scenes[0m:[36m35[0m - [1mRunning scene_detection:[0m


In [None]:
# Extract the audio per detected scene
extract_and_store_audio(
    os.path.join(data_path, "scene_snippets"),
    os.path.join(data_path, "audio_chunks"),
)

# Audio Transcription using Whisper

For Faster Inference Please Use Tiny!

In [None]:
# Transcribe the different snippets snippets:
audio_dir = os.path.join(data_path, "audio_chunks")
transcriptions_dir = os.path.join(data_path, "transcriptions")

model_type = "tiny"  # change to 'large' if you want more accurate results,
# change to 'medium.en' or 'large.en' for all english language tasks,
# and change to 'small' or 'base' for faster inference
lang = "en"

# Run whisper on all .wav files in audio_dir
transcribe_audio_files(audio_dir, transcriptions_dir, model_type=model_type, lang=lang)

# Starting the Analysis of the Information Contained in the Video

### Inputs

* **Transcriptions**: [insert description or link to transcription]
* **Extraction from Slides using OCR**: [insert description or link to extracted content]
* **Textual Interpretation of Visual Information using LLAVA**: [insert description or link to 
interpreted information]

In [6]:
transcription_file_path = "/Users/magic-rabbit/Documents/AFM/afm-vlm/data/raw/biology_chapter_3_3/transcriptions/biology_chapter_3_3-Scene-055.csv"
image_path = "/Users/magic-rabbit/Documents/AFM/afm-vlm/data/raw/biology_chapter_3_3/extracted_keyframes/biology_chapter_3_3-Scene-055-01.jpg"

start_time = time.time()
# Transform transcription file
transcription = transcription_to_text(transcription_file_path)
logger.info(f"Transcription_text: {transcription}")

# Extract text using OCR:
ocr_extracted_text = extract_text_from_image(image_path)
logger.info(f"OCR_results: {ocr_extracted_text}")

# Extract textual understanding of Visual features using LLAVA:

llava_results = generate_caption_using_llava(image_path)
logger.info(f"LLava_results: {llava_results}")

response = prompt_llm_summary(
    slide_content=ocr_extracted_text,
    transcription=transcription,
    llava_output=llava_results,
)
# Calculate the elapsed time
elapsed_time = time.time() - start_time

# Print the elapsed time
logger.info(f"Elapsed Time: {elapsed_time} seconds")

# print the resposne of the Slide:
logger.info(f"LLM_Summary: {response}")

[32m2024-06-07 19:28:50.613[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mElapsed Time: 42.34112215042114 seconds[0m
[32m2024-06-07 19:28:50.617[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mLLM_Summary: Here is a summary of the lecture content:

**Slide Summary:**
The slide discusses micronutrients, specifically vitamins, which are organic substances that help speed up chemical reactions in the body. Most vitamins cannot be synthesized by the body and must be obtained from food. Vitamin D is an exception, as it can be synthesized with sunlight. The slide highlights the importance of micronutrients for maintaining bodily functions.

**Key Topics:**
Vitamins, micronutrients, organic substances, coenzymes, vitamin deficiencies, cancer prevention, heart disease prevention, aging process, sunlight, vitamin D synthesis, supplementation.

**Queryable Information:**
Tags: Vitamins, Micronutrients, Organic Substances, Coenzym

Here is a summary of the lecture content:

**Slide Summary:**
The slide discusses micronutrients, specifically vitamins, which are organic substances that help speed up chemical reactions in the body. Most vitamins cannot be synthesized by the body and must be obtained from food. Vitamin D is an exception, as it can be synthesized with sunlight. The slide highlights the importance of micronutrients for maintaining bodily functions.

**Key Topics:**
Vitamins, micronutrients, organic substances, coenzymes, vitamin deficiencies, cancer prevention, heart disease prevention, aging process, sunlight, vitamin D synthesis, supplementation.

**Queryable Information:**
Tags: Vitamins, Micronutrients, Organic Substances, Coenzymes, Vitamin Deficiencies, Cancer Prevention, Heart Disease Prevention, Aging Process, Sunlight, Vitamin D Synthesis, Supplementation.
Categories: Nutrition, Health, Biology
Specific Concepts: Vitamin Functionality, Chemical Reactions, Body Functions, Nutrient Deficiencies.

In [10]:
def get_model_info(model_ID, device):
    # Save the model to device
    model = CLIPModel.from_pretrained(model_ID).to(device)
    # Get the processor
    processor = CLIPProcessor.from_pretrained(model_ID)
    # Get the tokenizer
    tokenizer = CLIPTokenizer.from_pretrained(model_ID)
    # Return model, processor & tokenizer
    return model, processor, tokenizer


# Set the device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Define the model ID
model_ID = "openai/clip-vit-base-patch32"
# Get model, processor & tokenizer
model, processor, tokenizer = get_model_info(model_ID, device)


def get_image(image_path):
    image = Image.open(image_path)
    # Convert the image to RGB
    rgb_image = image.convert("RGB")
    return rgb_image


def get_single_image_embedding(text, my_image, processor, model, device):
    image = processor(text=text, images=my_image, return_tensors="pt")[
        "pixel_values"
    ].to(device)
    embedding = model.get_image_features(image)
    # convert the embeddings to numpy array
    return embedding.cpu().detach().numpy()


one_image = get_image(
    image_path="/Users/magic-rabbit/Documents/AFM/afm-vlm/data/raw/biology_chapter_3_3/extracted_keyframes/biology_chapter_3_3-Scene-055-01.jpg"
)

one_vector = get_single_image_embedding(
    response, one_image, processor, model, device
)  # Simple test

Token indices sequence length is longer than the specified maximum sequence length for this model (196 > 77). Running this sequence through the model will result in indexing errors


In [11]:
# Generated one embedding?

print(one_vector)

[[-1.92160636e-01  1.17445782e-01  2.37807691e-01  3.56596202e-01
  -1.15230247e-01  3.00290510e-02  3.14957350e-02  1.24661297e-01
   7.92421460e-01  2.04853714e-01 -1.33029491e-01  2.43656605e-01
   1.50548339e-01 -6.20990276e-01 -6.07537255e-02  3.02564174e-01
   2.77734697e-01  4.29132402e-01  1.79494053e-01  4.43889290e-01
  -1.29297864e+00  4.07451212e-01 -1.65697247e-01  3.57256830e-03
   4.37120080e-01 -7.21258521e-02  2.38586500e-01  2.18698323e-01
  -3.34979206e-01  1.16147801e-01  7.78390914e-02 -5.59364915e-01
   1.27671987e-01 -1.49906814e-01  4.41960812e-01 -1.45538524e-01
  -2.98688948e-01 -1.57914698e-01  1.23107433e-03 -1.05231392e+00
  -1.07995301e-01  4.40774336e-02 -5.25035799e-01 -1.05685741e-01
  -6.65739998e-02 -2.97406018e-01  2.07281530e-01  1.82302266e-01
  -6.35890126e-01 -8.69800568e-01  7.43384540e-01 -6.68020666e-01
   3.35767508e-01 -4.33679044e-01  3.02345365e-01  2.06971914e-01
  -3.19822848e-01 -4.21288460e-01  5.48620448e-02  2.46448308e-01
   2.98584

# Embeddings Generation using CLIP

### Inputs

* **Keyframes (images) **: [insert description or link to transcription]
* **Extraction from Slides using OCR**: [insert description or link to extracted content]
* **Prompt**: [insert description or link to 
interpreted information]

In [42]:
import torch
import clip
from PIL import Image
import requests
from io import BytesIO

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

In [43]:
import os
from PIL import Image
from src.clip.image_dataset import image_urls

#from src.clip.load_images import load_images_from_path

base_dir = os.path.dirname(os.path.abspath("."))

relative_image_path_1 = os.path.join(base_dir, 'afm-vlm', 'data', 'raw', 'biology_chapter_3_3', 'extracted_keyframes',
                                     'biology_chapter_3_3-Scene-039-01.jpg')
relative_image_path_2 = os.path.join(base_dir, 'afm-vlm', 'data', 'raw', 'biology_chapter_3_3', 'extracted_keyframes',
                                     'biology_chapter_3_3-Scene-099-01.jpg')
relative_image_path_3 = os.path.join(base_dir, 'afm-vlm', 'data', 'raw', 'biology_chapter_3_3', 'extracted_keyframes',
                                     'biology_chapter_3_3-Scene-016-01.jpg')

image_paths = [relative_image_path_1, relative_image_path_2, relative_image_path_3]


def load_images_from_path(image_paths):
    images = []
    for path in image_paths:
        images.append(Image.open(path))
    return images


image_dataset = load_images_from_path(image_paths)

image_dataset

[<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=966x720>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=966x720>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=966x720>]

Generate OCR Captions

In [44]:
ocr_extracted_text = []
for path in image_paths:
    extract_text_from_image(path)
    ocr_extracted_text.append(extract_text_from_image(path))
    logger.info(f"OCR_results: {ocr_extracted_text}")
#logger.info(f"OCR_results: {ocr_extracted_text}")

[32m2024-06-10 11:51:46.400[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mOCR_results: ['Nutrients: Macronutrients\n\n¢ Fats: source of stored energy\n— Cushion and protect vital organs\n— Insulate the body in cold weather\n'][0m
[32m2024-06-10 11:51:47.097[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mOCR_results: ['Nutrients: Macronutrients\n\n¢ Fats: source of stored energy\n— Cushion and protect vital organs\n— Insulate the body in cold weather\n', 'Which Describes Active Transport?\n\nA. K+ will move from high concentration\n\nto low concentration; ATP is used. ‘Retive transport\nK\n\nB. K+ will move from low Low concentration\nconcentration to high\nconcentration; ATP is used.\n\nC. K+ will move from high concentration\nto low concentration; ATP is not\nused.\n\nb\nD. K+ will move from low concentration\nto high concentration; ATP is not\nused.\n'][0m
[32m2024-06-10 11:51:47.474[0m | [1mINFO    [0m | [36m__m

In [50]:
dataset_images_preprocessed = torch.cat([preprocess(image).unsqueeze(0) for image in image_dataset], dim=0).to(device)

In [51]:
dataset_images_preprocessed

tensor([[[[1.9303, 1.9303, 1.9303,  ..., 1.9303, 1.9303, 1.9303],
          [1.9303, 1.9303, 1.9303,  ..., 1.9303, 1.9303, 1.9303],
          [1.9303, 1.9303, 1.9303,  ..., 1.9303, 1.9303, 1.9303],
          ...,
          [1.9303, 1.9303, 1.9303,  ..., 1.9303, 1.9303, 1.9303],
          [1.9303, 1.9303, 1.9303,  ..., 1.9303, 1.9303, 1.9303],
          [1.9303, 1.9303, 1.9303,  ..., 1.9303, 1.9303, 1.9303]],

         [[2.0749, 2.0749, 2.0749,  ..., 2.0749, 2.0749, 2.0749],
          [2.0749, 2.0749, 2.0749,  ..., 2.0749, 2.0749, 2.0749],
          [2.0749, 2.0749, 2.0749,  ..., 2.0749, 2.0749, 2.0749],
          ...,
          [2.0749, 2.0749, 2.0749,  ..., 2.0749, 2.0749, 2.0749],
          [2.0749, 2.0749, 2.0749,  ..., 2.0749, 2.0749, 2.0749],
          [2.0749, 2.0749, 2.0749,  ..., 2.0749, 2.0749, 2.0749]],

         [[2.1459, 2.1459, 2.1459,  ..., 2.1459, 2.1459, 2.1459],
          [2.1459, 2.1459, 2.1459,  ..., 2.1459, 2.1459, 2.1459],
          [2.1459, 2.1459, 2.1459,  ..., 2

In [52]:
# generate embeddings
with torch.no_grad():
    dataset_image_embeddings  = model.encode_image(dataset_images_preprocessed)

In [55]:
# load test keyframe image
test_image_path = os.path.join(base_dir, 'afm-vlm', 'data', 'raw', 'biology_chapter_3_3', 'extracted_keyframes',
                                     'biology_chapter_3_3-Scene-099-01.jpg')
test_image = Image.open(test_image_path)
test_image_preprocessed = preprocess(test_image).unsqueeze(0).to(device)

# generate embedding for the test keyframe image
test_image_embedding = model.encode_image(test_image_preprocessed)

In [56]:
# Compute the cosine similarity between the test image embedding and each dataset image embedding
cosine_similarity = torch.nn.functional.cosine_similarity(test_image_embedding, dataset_image_embeddings)

# Get the index of the image with the highest similarity
max_similarity_index = cosine_similarity.argmax().item()
max_similarity_index


1