In [1]:
import os
from src.video_preprocessing.download_videos.youtube_download import preprocess_video
from src.video_preprocessing.scene_detection.scene_detect import detect_scenes
from src.video_preprocessing.scene_detection.ocr import extract_text_from_slide
from src.video_preprocessing.download_videos.download_utils import (
    transcribe_audio_files,
    extract_and_store_audio,
    transcription_to_text,
)
from src.ocr.pytesseract_image_to_text import extract_text_from_image
from src.llm.ollama_implementation.ollama_experiment import (
    prompt_llm_summary,
    generate_caption_using_llava,
)
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from PIL import Image

import torch
import pandas as pd
import time
from loguru import logger

[32m2024-06-11 07:40:10.865[0m | [1mINFO    [0m | [36msrc.ocr.pytesseract_image_to_text[0m:[36m<module>[0m:[36m17[0m - [1mExtracted text: Lecture overview

1 R programming basics

1. Get 2 Data wrangling
3 Tidy data
2. Look 4 Low dimensional visualization
5 High dimensi i
3. Conclude 7 Empirical Statistical Assessment

8 Analytical Statistical Assessment
9 Statistical Assessment for Big Data
Case Study
10 Linear regression
11 Classification
12 Supervised Learning

Julien Gagneur Graphically supported hypotheses 3/70

[0m


# Download Pipeline
Download a video from a specific URL on YouTube, then run:
- Scene detection
- Keyframe detection

The resulting data will be stored under `/data/raw/<NAME>`

In [9]:
# Define options and input for downloading a video from youtube

# INSERT video name here
name = "biology_chapter_3_3"
# INSERT video URL here
url = "https://youtu.be/DZSEErNZ1d4?si=f6YxKQ9rP6iqgTfk"
# INSERT chunk length in seconds 30s --> 30, no splitting: None
chunks = None

opts_aud = {"format": "mp3/bestaudio/best", "keep-video": True}
opts_vid = {"format": "mp4/bestvideo/best"}

In [10]:
# Downloads the video creates the relevant datafolders and transcribes the video
data_path = preprocess_video(
    download=True,
    uploaded_vid="ignore",  # path to local file
    url=url,
    name=name,
    aud_opts=opts_aud,
    vid_opts=opts_vid,  # Video download settings
    audio_file=name + ".mp3",
    input_file=name + ".mp4",
    output="output.mp4",
    split_length=chunks,
)

[32m2024-06-10 12:59:53.782[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m49[0m - [1mStarting AutoCaptioning...[0m
[32m2024-06-10 12:59:53.783[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m50[0m - [1mResults will be stored in data/raw/biology_chapter_3_3[0m
[32m2024-06-10 12:59:53.785[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m58[0m - [1mCreated chunks folders[0m


[youtube] Extracting URL: https://youtu.be/DZSEErNZ1d4?si=f6YxKQ9rP6iqgTfk
[youtube] DZSEErNZ1d4: Downloading webpage
[youtube] DZSEErNZ1d4: Downloading ios player API JSON
[youtube] DZSEErNZ1d4: Downloading m3u8 information
[info] DZSEErNZ1d4: Downloading 1 format(s): 22
[download] C:\Users\baatout\PycharmProjects\afm-vlm\data\raw\biology_chapter_3_3\biology_chapter_3_3.mp4 has already been downloaded
[download] 100% of  126.17MiB


[32m2024-06-10 12:59:55.663[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m91[0m - [1mVideo is not splitted:[0m
[32m2024-06-10 12:59:55.664[0m | [1mINFO    [0m | [36msrc.video_preprocessing.download_videos.youtube_download[0m:[36mpreprocess_video[0m:[36m95[0m - [1mVideo downloaded successfully![0m


In [None]:
# Â Now that we have downloaded the video we want to perform scene_Detection:
detect_scenes(data_path)

[32m2024-06-10 09:49:42.322[0m | [1mINFO    [0m | [36msrc.video_preprocessing.scene_detection.scene_detect[0m:[36mdetect_scenes[0m:[36m29[0m - [1mFound file[0m
[32m2024-06-10 09:49:42.322[0m | [1mINFO    [0m | [36msrc.video_preprocessing.scene_detection.scene_detect[0m:[36mdetect_scenes[0m:[36m33[0m - [1mName:biology_chapter_3_3.mp4,dirname:C:\Users\baatout\PycharmProjects\afm-vlm\data/raw\biology_chapter_3_3\biology_chapter_3_3.mp4[0m
[32m2024-06-10 09:49:42.322[0m | [1mINFO    [0m | [36msrc.video_preprocessing.scene_detection.scene_detect[0m:[36mdetect_scenes[0m:[36m35[0m - [1mRunning scene_detection:[0m


In [None]:
# Extract the audio per detected scene
extract_and_store_audio(
    os.path.join(data_path, "scene_snippets"),
    os.path.join(data_path, "audio_chunks"),
)

# Audio Transcription using Whisper

For Faster Inference Please Use Tiny!

In [2]:
# Transcribe the different snippets snippets:
audio_dir = os.path.join(data_path, "audio_chunks")
transcriptions_dir = os.path.join(data_path, "transcriptions")

model_type = "tiny"  # change to 'large' if you want more accurate results,
# change to 'medium.en' or 'large.en' for all english language tasks,
# and change to 'small' or 'base' for faster inference
lang = "en"

# Run whisper on all .wav files in audio_dir
transcribe_audio_files(audio_dir, transcriptions_dir, model_type=model_type, lang=lang)

NameError: name 'data_path' is not defined

# Starting the Analysis of the Information Contained in the Video

### Inputs

* **Transcriptions**: [insert description or link to transcription]
* **Extraction from Slides using OCR**: [insert description or link to extracted content]
* **Textual Interpretation of Visual Information using LLAVA**: [insert description or link to 
interpreted information]

In [3]:
transcription_file_path = "/Users/magic-rabbit/Documents/AFM/afm-vlm/data/raw/biology_chapter_3_3/transcriptions/biology_chapter_3_3-Scene-055.csv"
image_path = "/Users/magic-rabbit/Documents/AFM/afm-vlm/data/raw/biology_chapter_3_3/extracted_keyframes/biology_chapter_3_3-Scene-055-01.jpg"

start_time = time.time()
# Transform transcription file
transcription = transcription_to_text(transcription_file_path)
logger.info(f"Transcription_text: {transcription}")

# Extract text using OCR:
ocr_extracted_text = extract_text_from_image(image_path)
logger.info(f"OCR_results: {ocr_extracted_text}")

# Extract textual understanding of Visual features using LLAVA:

llava_results = generate_caption_using_llava(image_path)
logger.info(f"LLava_results: {llava_results}")

response = prompt_llm_summary(
    slide_content=ocr_extracted_text,
    transcription=transcription,
    llava_output=llava_results,
)
# Calculate the elapsed time
elapsed_time = time.time() - start_time

# Print the elapsed time
logger.info(f"Elapsed Time: {elapsed_time} seconds")

# print the resposne of the Slide:
logger.info(f"LLM_Summary: {response}")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/magic-rabbit/Documents/AFM/afm-vlm/data/raw/biology_chapter_3_3/transcriptions/biology_chapter_3_3-Scene-055.csv'

In [4]:
def get_model_info(model_ID, device):
    # Save the model to device
    model = CLIPModel.from_pretrained(model_ID).to(device)
    # Get the processor
    processor = CLIPProcessor.from_pretrained(model_ID)
    # Get the tokenizer
    tokenizer = CLIPTokenizer.from_pretrained(model_ID)
    # Return model, processor & tokenizer
    return model, processor, tokenizer


# Set the device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Define the model ID
model_ID = "openai/clip-vit-base-patch32"
# Get model, processor & tokenizer
model, processor, tokenizer = get_model_info(model_ID, device)


def get_image(image_path):
    image = Image.open(image_path)
    # Convert the image to RGB
    rgb_image = image.convert("RGB")
    return rgb_image


def get_single_image_embedding(text, my_image, processor, model, device):
    image = processor(text=text, images=my_image, return_tensors="pt")[
        "pixel_values"
    ].to(device)
    embedding = model.get_image_features(image)
    # convert the embeddings to numpy array
    return embedding.cpu().detach().numpy()


one_image = get_image(
    image_path="/Users/magic-rabbit/Documents/AFM/afm-vlm/data/raw/biology_chapter_3_3/extracted_keyframes/biology_chapter_3_3-Scene-055-01.jpg"
)

one_vector = get_single_image_embedding(
    response, one_image, processor, model, device
)  # Simple test

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\magic-rabbit\\Documents\\AFM\\afm-vlm\\data\\raw\\biology_chapter_3_3\\extracted_keyframes\\biology_chapter_3_3-Scene-055-01.jpg'

In [None]:
# Generated one embedding?

print(one_vector)