In [1]:
import whisper
import subprocess
from pathlib import Path
from googletrans import Translator
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load Whisper model
model = whisper.load_model("medium")
translator = Translator()

# Load T5 model for grammar correction (text improvement)
t5_model = T5ForConditionalGeneration.from_pretrained("t5-base")
t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")

def extract_audio(video_path: str, output_audio: str = "extracted_audio.wav") -> str:
    """Extracts audio from a video file using FFmpeg."""
    command = [
        "ffmpeg", "-i", video_path, "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", output_audio
    ]
    try:
        subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error extracting audio: {e.stderr.decode()}")
        return None
    
    return output_audio

def transcribe_audio(audio_path: str) -> str:
    """Transcribes speech from an audio file using Whisper AI."""
    result = model.transcribe(audio_path)
    return result["text"]

def translate_text(text: str, target_language: str = 'ta') -> str:
    """Translates the transcribed text from English to the target language."""
    try:
        translation = translator.translate(text, src='en', dest=target_language)
        return translation.text
    except Exception as e:
        print(f"Error translating text: {str(e)}")
        return None

def correct_grammar(text: str) -> str:
    """Corrects grammatical issues using T5 (Text-to-Text Transfer Transformer)."""
    # Preprocess the input text for grammar correction
    input_text = f"grammar correction: {text}"
    input_ids = t5_tokenizer.encode(input_text, return_tensors="pt", truncation=True, padding="longest")
    
    # Generate the corrected output using T5
    output_ids = t5_model.generate(input_ids, max_length=512, num_beams=5, early_stopping=True)
    
    # Decode the corrected text
    corrected_text = t5_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return corrected_text

def process_video(video_path: str, target_language: str):
    """Processes the video file: extracts audio, transcribes it, translates the text, and checks for grammar."""
    print(f"Processing: {video_path}")
    
    # Extract audio
    audio_path = extract_audio(video_path, "temp_audio1.wav")
    if not audio_path:
        return
    
    # Transcribe
    transcribed_text = transcribe_audio(audio_path)
    
    # Translate
    translated_text = translate_text(transcribed_text, target_language)
    
    # Grammar check and correction on the translated text
    corrected_text = correct_grammar(translated_text)
    
    # Cleanup
    Path(audio_path).unlink(missing_ok=True)
    
    # Print the output
    print("\n--- Transcription ---\n", transcribed_text)
    print("\n--- Translated Text ---\n", translated_text)
    print("\n--- Corrected Text ---\n", corrected_text)


# Example usage in Jupyter Notebook
video_file_path = "/home/aravinds/directory_env/video.mp4"  # Change to your video file path
target_lang = "ta"  # Hindi (Change this for different languages)

process_video(video_file_path, target_lang)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Processing: /home/aravinds/directory_env/video.mp4


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



--- Transcription ---
  Make your public speaking instantly better than everybody else. Now imagine if I started like this, hey, today I'm going to talk about my subject and I feel very excited about it. You're already bored. You're not paying attention. You're just going to take your phone out and start scrolling. So one thing you want to do is look at the audience. In this case, I'm looking at the camera. You want to have a more powerful voice and most importantly, you want to show enthusiasm for the topic you're talking about. Hey everyone, today we're going to be talking about public speaking. I want to teach you all the different ways where I've been able to develop my public speaking skills.

--- Translated Text ---
 உங்கள் பொது பேசுவதை எல்லோரையும் விட உடனடியாக சிறப்பாக ஆக்குங்கள்.இப்போது நான் இப்படி தொடங்கினால் கற்பனை செய்து பாருங்கள், ஏய், இன்று நான் எனது விஷயத்தைப் பற்றி பேசப் போகிறேன், அதைப் பற்றி நான் மிகவும் உற்சாகமாக உணர்கிறேன்.நீங்கள் ஏற்கனவே சலித்துவிட்டீர்கள்.நீங்கள் க