In [None]:
print("Importing libraries ...")
import os
import sys
import torch
import ffmpeg
import numpy as np
from pydub import AudioSegment
print("Importing Hugging Face libraries ...")
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration

In [None]:
def split_audio(audio, chunk_length_ms=30000):
    """Cut ``audio`` into consecutive slices of at most ``chunk_length_ms`` units.

    ``audio`` only has to support ``len()`` and slice indexing. The caller in
    this file passes a pydub ``AudioSegment``, for which both are expressed in
    milliseconds. The last slice holds whatever remainder is left, so it may
    be shorter than the others.

    Returns a list of slices in their original order (empty for empty input).
    """
    pieces = []
    for start in range(0, len(audio), chunk_length_ms):
        stop = start + chunk_length_ms
        pieces.append(audio[start:stop])
    return pieces


# --- Speech-to-text: transcribe the media file with Whisper, chunk by chunk ---
audio_path = 'test_video2.mp4'
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
processor = WhisperProcessor.from_pretrained("openai/whisper-medium")

# The processor below is called with sampling_rate=16000, so resample to
# 16 kHz mono. The sample width is pinned to 2 bytes because the
# normalisation in the loop divides by 32768.0, which only maps samples to
# [-1.0, 1.0) for 16-bit PCM; a 24/32-bit source would otherwise come out
# wildly out of range.
audio = (
    AudioSegment.from_file(audio_path)
    .set_frame_rate(16000)
    .set_channels(1)
    .set_sample_width(2)
)

# Default chunk length (30 000 ms) is used so each piece is transcribed on
# its own and the texts are stitched together afterwards.
chunks = split_audio(audio)
transcriptions = []
for chunk in chunks:
    # int16 samples -> float32 waveform in [-1.0, 1.0)
    audio_samples = np.array(chunk.get_array_of_samples()).astype(np.float32) / 32768.0
    input_features = processor(
        audio_samples, sampling_rate=16000, return_tensors="pt"
    ).input_features
    print("Performing transcription ...")
    predicted_ids = model.generate(input_features, max_length=448)
    # One waveform in -> one decoded string out, hence the [0].
    transcriptions.append(
        processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    )
full_transcription = " ".join(transcriptions)
print("STEP 3: Text Summarization")

print("Loading summarization pipeline ...")
facebook_bart = pipeline("summarization", model="facebook/bart-large-cnn")
# NOTE(review): these two pipelines previously pointed at
# "facebook/bart-large-mnli" (an NLI classification checkpoint) and
# "microsoft/deberta-v3-base" (an encoder-only model). Neither is a
# sequence-to-sequence summarizer, so they cannot back a "summarization"
# pipeline. They are replaced with real summarization checkpoints; the
# variable names are kept so any later cell that refers to them still works.
# Confirm the replacement models match what the comparison was meant to show.
facebook_roberta = pipeline("summarization", model="facebook/bart-large-xsum")
microsoft_deberta = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

print("Summarizing text ...")

# The transcript built above is `full_transcription`. truncation=True keeps
# inputs longer than the model's maximum input length from failing; text past
# that limit is dropped, so very long transcripts are only partially
# summarized.
summary1 = facebook_bart(
    full_transcription, max_length=150, min_length=30, do_sample=False, truncation=True
)
summary2 = facebook_roberta(
    full_transcription, max_length=150, min_length=30, do_sample=False, truncation=True
)
summary3 = microsoft_deberta(
    full_transcription, max_length=150, min_length=30, do_sample=False, truncation=True
)

print("Parse summary into bullet points")
# Bullets are built from the first summary (facebook/bart-large-cnn).
# Splitting on '.' is a rough sentence split (it also splits decimals and
# abbreviations); whitespace-only fragments are skipped so no empty "- "
# bullets are produced.
bullet_points = "\n".join(
    f"- {point.strip()}"
    for point in summary1[0]['summary_text'].split('.')
    if point.strip()
)


