# THE PLAN
1. A function that will accept a MinIO URL
2. Download the video from minio to a temp directory
3. Extract Audio
4. Load OpenAI's Whisper model
5. Convert audio to text
6. Append .srt artifacts, i.e., every sentence is on a new line and has timestamps (maybe?)
7. Upload the .srt file to minio
8. Use the text to generate summary, and chapters for the video
9. The summary can be stored in the DB as video.description (so two things: 1. pass the video ID, and 2. move the DB to PG or something similar)
10. Not sure yet how to store the chapters; maybe in the DB

In [1]:
%pip install minio openai-whisper ffmpeg-python ollama SQLAlchemy -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os

# Connection settings for the MinIO object store and the Ollama server.
# Every value can be overridden with an environment variable of the same name;
# the literals are kept only as fallbacks so the notebook still runs unchanged.
MINIO_IP = os.environ.get("MINIO_IP", "192.168.0.118")
MINIO_PORT = os.environ.get("MINIO_PORT", "9010")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "oloom_key")
# SECURITY(review): this secret is committed in plain text. Rotate it and drop
# the fallback once MINIO_SECRET_KEY is provided through the environment.
MINIO_SECRET_KEY = os.environ.get(
    "MINIO_SECRET_KEY", "KhGjEmfDW07wMN34SYKJkv539o7Rfq8yoGK0efeS"
)
OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://ollama.homelab.subhranshu.com")

In [None]:
from tempfile import mkdtemp
import shutil
from minio import Minio
import os
import whisper
import ffmpeg
from tqdm import tqdm
import time
from sqlalchemy import create_engine, text
from sqlalchemy.orm import Session, DeclarativeBase
from pathlib import Path
from ollama import Client

class Base(DeclarativeBase):
    """Declarative base for SQLAlchemy ORM models.

    No mapped models are declared in this notebook; the class only supplies
    the ``metadata`` object that ``create_all`` is called on.
    """

class VideoProcessor:
    """Transcribe and summarize one video stored in MinIO.

    Constructing an instance runs the whole pipeline synchronously:

    1. download the video object into a fresh temporary directory,
    2. extract a WAV audio track with ffmpeg,
    3. transcribe it with Whisper and write a WebVTT file,
    4. summarize the transcript through Ollama,
    5. store the summary in ``web_video.description``,
    6. upload the WebVTT file next to the video object,
    7. delete the temporary directory (also when a step fails).
    """

    # Pipeline settings kept in one place so a subclass can override them.
    BUCKET_NAME = "oloom"
    WHISPER_MODEL = "turbo"
    SUMMARY_MODEL = "llama3.2:3b"
    AUDIO_FILE_NAME = "output.wav"
    VTT_FILE_NAME = "transcript.vtt"

    def __init__(self, video_id, file_name):
        """Set up the clients and run the pipeline for one video.

        Args:
            video_id: Value matched against ``web_video.id`` when the summary
                is written to the database.
            file_name: Object name of the video inside the bucket. It may
                contain ``/`` separators.

        Raises:
            RuntimeError: If the bucket does not exist.
            Exception: Any error raised by a pipeline step is re-raised after
                being printed.
        """
        # The SQLite file is resolved relative to the parent of the current
        # working directory: <parent>/web/db.sqlite.
        notebook_dir = Path().resolve()
        db_path = os.path.join(notebook_dir.parent, "web", "db.sqlite")
        engine = create_engine(f"sqlite:///{db_path}")
        Base.metadata.create_all(engine)

        self.video_id = video_id
        self.file_name = file_name
        self.db = engine
        self.minio_client = Minio(
            f"{MINIO_IP}:{MINIO_PORT}",
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=False,
        )
        self.ollama_client = Client(host=OLLAMA_URL)

        # Fail fast: check the bucket before loading the model or creating
        # any temporary files.
        if not self.minio_client.bucket_exists(self.BUCKET_NAME):
            raise RuntimeError("Minio bucket does not exist")

        self.model = whisper.load_model(self.WHISPER_MODEL)

        temp_dir = mkdtemp(prefix="oloom_")
        self.temp_dir = temp_dir
        # file_name may contain "/" so this path can be nested inside temp_dir.
        self.video_path = f"{temp_dir}/{file_name}"

        print(f"Temp dir: {temp_dir}")
        print(f"Temp video path: {self.video_path}")

        try:
            self.download_video()
            self.extract_audio()
            combined_text = self.transcribe()
            summary_text = self.summarize(combined_text)
            self.update_db(summary_text)
            self.upload_vtt()
        finally:
            # Remove the temporary directory even when a step fails.
            self.cleanup()

    @staticmethod
    def _format_timestamp(seconds, always_include_hours=False, fractional_separator="."):
        """Format a number of seconds as ``[HH:]MM:SS<sep>mmm``.

        The hours field is emitted only when it is non-zero or when
        ``always_include_hours`` is true.

        Raises:
            ValueError: If ``seconds`` is negative.
        """
        if seconds < 0:
            raise ValueError("non-negative timestamp expected")

        remaining_ms = round(seconds * 1000.0)
        hours, remaining_ms = divmod(remaining_ms, 3_600_000)
        minutes, remaining_ms = divmod(remaining_ms, 60_000)
        whole_seconds, millis = divmod(remaining_ms, 1_000)

        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
        return f"{hours_marker}{minutes:02d}:{whole_seconds:02d}{fractional_separator}{millis:03d}"

    def _vtt_object_name(self):
        """Return the object name for the transcript, beside the video object.

        The last path component of ``file_name`` is replaced by the VTT file
        name; a ``file_name`` without ``/`` yields just the VTT file name.
        """
        prefix, separator, _ = self.file_name.rpartition("/")
        return f"{prefix}{separator}{self.VTT_FILE_NAME}"

    def download_video(self):
        """Download the video object to ``self.video_path``. Returns True."""
        try:
            self.minio_client.fget_object(
                bucket_name=self.BUCKET_NAME,
                object_name=self.file_name,
                file_path=self.video_path,
            )
            return True
        except Exception as e:
            print(f"Error downloading video: {e}")
            raise

    def extract_audio(self):
        """Extract the audio track to ``<temp_dir>/output.wav``. Returns True.

        The output is requested as 16-bit PCM, one channel, 16 kHz sample rate.
        """
        audio_path = f"{self.temp_dir}/{self.AUDIO_FILE_NAME}"
        try:
            stream = ffmpeg.input(self.video_path)
            stream = ffmpeg.output(
                stream,
                audio_path,
                acodec="pcm_s16le",
                ac=1,
                ar="16k",
                loglevel="quiet",
            )
            ffmpeg.run(stream, overwrite_output=True)
            print(f"Audio path: {audio_path}")
            return True
        except Exception as e:
            print(f"Error extracting audio: {e}")
            raise

    def transcribe(self):
        """Transcribe the extracted audio and write ``<temp_dir>/transcript.vtt``.

        Returns:
            The full transcript text reported by the model.
        """
        try:
            audio_path = f"{self.temp_dir}/{self.AUDIO_FILE_NAME}"
            start_time = time.time()
            result = self.model.transcribe(audio_path, verbose=True)
            print(f"Time taken for transcription: {time.time() - start_time:.2f} seconds")
            segments = result["segments"]
            combined_text = result["text"]
            vtt_path = f"{self.temp_dir}/{self.VTT_FILE_NAME}"

            with open(vtt_path, "w", encoding="utf-8") as f:
                f.write("WEBVTT\n\n")
                for segment in tqdm(segments, total=len(segments), desc="Writing VTT"):
                    # "-->" separates cue timings, so it must not appear in cue text.
                    cue_text = segment["text"].strip().replace("-->", "->")
                    start = self._format_timestamp(segment["start"])
                    end = self._format_timestamp(segment["end"])
                    f.write(f"{start} --> {end}\n{cue_text}\n\n")

            print(f"VTT file: {vtt_path}")
            return combined_text

        except Exception as e:
            print(f"Error transcribing video: {e}")
            raise

    def summarize(self, text):
        """Ask the Ollama chat model for a concise summary of ``text``.

        Returns:
            The content of the model's reply message.
        """
        try:
            messages = [
                {
                    "role": "user",
                    "content": f"You are a helpful assistant. Summarize the following text in a concise manner. Avoid including intro text. \n\n{text}",
                }
            ]
            response = self.ollama_client.chat(model=self.SUMMARY_MODEL, messages=messages)
            return response["message"]["content"]

        except Exception as e:
            print(f"Error summarizing text: {e}")
            raise

    def update_db(self, summary_text):
        """Store ``summary_text`` as the description of this video. Returns True."""
        # Imported under a local alias so the statement builder cannot be
        # confused with the "text" names used elsewhere in this class.
        from sqlalchemy import text as sql_text

        try:
            with Session(self.db) as session:
                # Bound parameters keep the summary from being interpolated into the SQL.
                sql_query = "UPDATE web_video SET description=:summary WHERE id=:id"
                session.execute(
                    sql_text(sql_query),
                    {"summary": summary_text, "id": self.video_id},
                )
                session.commit()
                return True
        except Exception as e:
            print(f"Error updating database: {e}")
            raise

    def upload_vtt(self):
        """Upload the WebVTT transcript next to the video object. Returns True."""
        try:
            vtt_path = f"{self.temp_dir}/{self.VTT_FILE_NAME}"
            self.minio_client.fput_object(
                bucket_name=self.BUCKET_NAME,
                object_name=self._vtt_object_name(),
                file_path=vtt_path,
                content_type="text/vtt",
            )
            return True
        except Exception as e:
            print(f"Error uploading VTT: {e}")
            raise

    def cleanup(self):
        """Delete the temporary working directory and everything in it."""
        print(f"Cleaning up: {self.temp_dir}")
        # ignore_errors: cleanup runs from a ``finally`` block and must not
        # replace the exception that caused the pipeline to fail.
        shutil.rmtree(self.temp_dir, ignore_errors=True)


In [4]:
# Run the whole pipeline for one video and report the wall-clock time.
# The instance gets its own name: assigning it to "VideoProcessor" would replace
# the class with the instance, so re-running this cell would fail.
start_t = time.time()
processor = VideoProcessor(
  "7377ff6f-aa18-4e7c-bb24-4636bb04b511",
  "5fbb04c0-45bb-457e-8470-03eb6a8e314e/adf94035-ccca-460f-a7f6-6e172b631ef4/video.mp4"
)
end_t = time.time()
print(f"Time taken: {end_t - start_t}")

Temp dir: /var/folders/b9/x0sm9z492qd6qs1dyjpcyydw0000gn/T/oloom_3_35lvpx
Temp video path: /var/folders/b9/x0sm9z492qd6qs1dyjpcyydw0000gn/T/oloom_3_35lvpx/5fbb04c0-45bb-457e-8470-03eb6a8e314e/adf94035-ccca-460f-a7f6-6e172b631ef4/video.mp4
Audio path: /var/folders/b9/x0sm9z492qd6qs1dyjpcyydw0000gn/T/oloom_3_35lvpx/output.wav
Detecting language using up to the first 30 seconds. Use `--language` to specify the language




Detected language: English
[00:00.000 --> 00:07.740]  Yesterday, after achieving a, quote, ludicrous rate of progress, Elon Musk released his AI chatbot in large language model, Grok4,
[00:07.860 --> 00:12.320]  and claims it's the smartest AI in the world, along with the Trust Me Bro benchmarks to back it up.
[00:12.380 --> 00:18.020]  It can achieve perfect SAT scores every time, and outperforms almost every grad student in every discipline.
[00:18.300 --> 00:23.780]  Vibe coders have been dropping all kinds of crazy demos with it, like this 3D first-person shooter built in four hours,
[00:24.040 --> 00:29.300]  and Elon himself claims it's even better than Cursor. All you have to do is copy and paste your entire codebase into it.
[00:29.300 --> 00:33.500]  In addition, Super Grok4 Heavy can run in parallel to solve complex problems,
[00:33.640 --> 00:37.280]  while your obsolete monkey brain looks in awe at this beautiful futuristic UI.
[00:37.740 --> 00:42.080]  It all sounds amazi

Writing VTT: 100%|██████████| 47/47 [00:00<00:00, 435170.61it/s]

VTT file: /var/folders/b9/x0sm9z492qd6qs1dyjpcyydw0000gn/T/oloom_3_35lvpx/transcript.vtt





Cleaning up: /var/folders/b9/x0sm9z492qd6qs1dyjpcyydw0000gn/T/oloom_3_35lvpx
Time taken: 170.44099497795105
