In [1]:
import os
import ffmpeg
from pydub import AudioSegment
from transformers import pipeline
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_audio(video_path, output_audio_path="extracted_audio.wav"):
    """
    Extracts audio from a video file using FFmpeg.
    """
    ffmpeg.input(video_path).output(output_audio_path, format="wav").run(overwrite_output=True)
    return output_audio_path

In [3]:
def split_audio(audio_path, output_folder="audio_chunks", chunk_length_ms=30000):
    """
    Splits the extracted audio into 30-second chunks and saves them.
    """
    os.makedirs(output_folder, exist_ok=True)

    audio = AudioSegment.from_wav(audio_path)
    chunks = [audio[i : i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]

    chunk_paths = []
    for idx, chunk in enumerate(chunks):
        chunk_filename = os.path.join(output_folder, f"chunk_{idx}.wav")
        chunk.export(chunk_filename, format="wav")
        chunk_paths.append(chunk_filename)
    
    return chunk_paths

In [4]:
def transcribe_audio(audio_chunks, output_folder="audio_chunks"):
    """
    Transcribes speech from multiple short audio chunks and saves text files.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    # Load Whisper model
    asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=0 if device == "cuda" else -1)

    transcriptions = []
    
    for idx, chunk in enumerate(audio_chunks):
        result = asr_pipeline(chunk)
        text = result["text"]

        # Save each transcription next to its corresponding audio file
        text_filename = os.path.join(output_folder, f"chunk_{idx}.txt")
        with open(text_filename, "w", encoding="utf-8") as f:
            f.write(text)

        transcriptions.append((chunk, text, text_filename))
    
    return transcriptions

In [5]:
def process_video(video_path, output_folder="output"):
    """
    Full pipeline: Extracts audio, splits it into chunks, and transcribes speech.
    """
    os.makedirs(output_folder, exist_ok=True)

    print(f"Processing video: {video_path}")

    # Step 1: Extract audio
    audio_path = extract_audio(video_path, os.path.join(output_folder, "full_audio.wav"))
    print(f"Audio extracted: {audio_path}")

    # Step 2: Split into 30-second chunks
    audio_chunks = split_audio(audio_path, output_folder)
    print(f"Audio split into {len(audio_chunks)} chunks.")

    # Step 3: Transcribe each chunk and save next to audio
    transcript_info = transcribe_audio(audio_chunks, output_folder)

    print("\n✅ Process Completed!")
    for chunk, text, text_file in transcript_info:
        print(f"🔹 Audio: {chunk} -> 🔹 Text: {text_file}")

    return audio_chunks, transcript_info

In [6]:
# Example Usage
if __name__ == "__main__":
    video_file = "Sample_1.mp4"  # Replace with your video file path
    process_video(video_file, output_folder="transcriptions")


Processing video: Sample_1.mp4


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

Audio extracted: transcriptions/full_audio.wav
Audio split into 6 chunks.


Device set to use cuda:0
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.



✅ Process Completed!
🔹 Audio: transcriptions/chunk_0.wav -> 🔹 Text: transcriptions/chunk_0.txt
🔹 Audio: transcriptions/chunk_1.wav -> 🔹 Text: transcriptions/chunk_1.txt
🔹 Audio: transcriptions/chunk_2.wav -> 🔹 Text: transcriptions/chunk_2.txt
🔹 Audio: transcriptions/chunk_3.wav -> 🔹 Text: transcriptions/chunk_3.txt
🔹 Audio: transcriptions/chunk_4.wav -> 🔹 Text: transcriptions/chunk_4.txt
🔹 Audio: transcriptions/chunk_5.wav -> 🔹 Text: transcriptions/chunk_5.txt


In [27]:
import os
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

In [28]:
# Set Paths
data_dir = "/media/arupreza/Assets/LLM Projects/AccentFlow/transcriptions"
output_dir = os.path.join(data_dir, "corrected_texts")  # Save corrected texts here
os.makedirs(output_dir, exist_ok=True)

In [29]:
# Load Grammarly CoEdit-Large Model (T5-based)
model_name = "grammarly/coedit-large"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)

In [30]:
# Wrap Hugging Face model into LangChain
llm = HuggingFacePipeline.from_model_id(
    model_id=model_name, 
    task="text2text-generation",
    model_kwargs={"trust_remote_code": True}
)

Device has 1 GPUs available. Provide device={deviceId} to `from_model_id` to use availableGPUs for execution. deviceId is -1 (default) for CPU and can be a positive integer associated with CUDA device id.
Device set to use cpu


In [None]:
# Define LangChain Prompt
grammar_prompt = PromptTemplate(
    input_variables=["text"],
    template="Fix grammar: {text}"
)

In [None]:
# LangChain Chain for Grammar Correction
grammar_chain = LLMChain(llm=llm, prompt=grammar_prompt)

In [None]:
def correct_grammar_langchain(text):
    """
    Uses LangChain with Grammarly CoEdit-Large to correct grammar.
    """
    response = grammar_chain.run({"text": text})
    return response.strip()

In [None]:
def process_text_files():
    """
    Processes all .txt files in the input directory using LangChain, corrects grammar, and saves output.
    """
    for filename in os.listdir(data_dir):
        if filename.endswith(".txt"):
            file_path = os.path.join(data_dir, filename)
            output_path = os.path.join(output_dir, filename.replace(".txt", "_corrected.txt"))

            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read().strip()

            corrected_text = correct_grammar_langchain(text)

            with open(output_path, "w", encoding="utf-8") as f:
                f.write(corrected_text)

            print(f"✅ Processed: {filename} → {output_path}")

In [None]:
# Run the LangChain-based processing function
process_text_files()