In [1]:

!pip install moviepy speechrecognition google-cloud-speech vosk



In [None]:
# The pip command in the previous cell installs several Python libraries for audio and video processing:

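# moviepy: Used for video editing and extracting audio from videos. The `moviepy.editor` module imported below exists only in moviepy 1.x (it was removed in moviepy 2.0); if the import fails with ModuleNotFoundError, install `moviepy<2`.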

# speechrecognition: A wrapper library for transcribing audio into text; it is installed but not imported by the code in this cell.

# google-cloud-speech: Google’s cloud API for speech-to-text, though it doesn't appear to be used in the provided snippet.

# vosk: A speech recognition library for real-time transcription, which is actually used in the code.

import os
import moviepy.editor as mp
from vosk import Model, KaldiRecognizer
import wave
import json

# Function to extract audio from video and save as a WAV file
def extract_audio_from_video(video_path, audio_output_path):
    """Extract the audio track of a video into a Vosk-compatible WAV file.

    The audio is written as 16 kHz, 16-bit, mono PCM, which is the format
    ``transcribe_audio`` checks for before it runs recognition. Writing with
    moviepy's defaults instead would produce a file that check rejects.

    Args:
        video_path: Path of the video file to read.
        audio_output_path: Path of the WAV file to create.

    Raises:
        ValueError: If the video contains no audio track.
    """
    video = mp.VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio track found in '{video_path}'.")
        video.audio.write_audiofile(
            audio_output_path,
            fps=16000,  # sample rate expected by transcribe_audio
            nbytes=2,  # 16-bit samples
            codec="pcm_s16le",
            ffmpeg_params=["-ac", "1"],  # downmix to a single (mono) channel
        )
    finally:
        # Release the reader resources held by the clip, even when the
        # export fails part-way through.
        video.close()

# Function to transcribe audio using Vosk
def _report_word_matches(result, search_word):
    """Print a timestamp line for each recognized word containing ``search_word``.

    Args:
        result: A decoded Vosk result dict; its optional ``"result"`` list
            holds per-word entries with ``"word"``, ``"start"`` and ``"end"``.
        search_word: Lower-cased text to look for inside each word.

    Returns:
        True if at least one word matched, otherwise False.
    """
    matched = False
    for word in result.get("result", []):
        if search_word in word["word"].lower():
            matched = True
            print(f"Found '{search_word}' at timestamp {word['start']}s - {word['end']}s")
    return matched


def transcribe_audio(audio_path, search_word, model_path="vosk-model-small-en-us-0.15"):
    """Transcribe a WAV file with Vosk and report where ``search_word`` occurs.

    Every recognized word that contains ``search_word`` (case-insensitive
    substring match) is printed together with its start/end timestamps. If no
    single word matches but the text appears in the joined transcript (for
    example a multi-word phrase), a message without timestamps is printed.

    Args:
        audio_path: Path of a mono, 16-bit PCM, 16 kHz WAV file.
        search_word: Word (or phrase) to look for in the recognized speech.
        model_path: Directory containing the unpacked Vosk model.

    Returns:
        The full transcription as a single space-separated string, or None
        when the model directory is missing or the audio format is not
        supported (a message is printed in both cases).
    """
    if not os.path.exists(model_path):
        print(f"Please download the model from https://alphacephei.com/vosk/models and unpack it to '{model_path}' directory.")
        return None

    model = Model(model_path)
    search_word = search_word.lower()
    segments = []
    word_found = False

    # The context manager closes the file on every exit path, including the
    # early return for an unsupported format.
    with wave.open(audio_path, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getframerate() != 16000:
            print("Audio file must be WAV format mono 16-bit PCM at 16000 Hz.")
            return None

        recognizer = KaldiRecognizer(model, wf.getframerate())
        recognizer.SetWords(True)  # Enable per-word timestamps

        # Feed the audio to the recognizer in chunks.
        while True:
            data = wf.readframes(4000)
            if not data:
                break
            if recognizer.AcceptWaveform(data):
                result = json.loads(recognizer.Result())
                segments.append(result.get("text", ""))
                if _report_word_matches(result, search_word):
                    word_found = True

        # The final result holds whatever was still buffered; scan it for
        # timestamps as well so matches in the last utterance are not lost.
        final_result = json.loads(recognizer.FinalResult())
        segments.append(final_result.get("text", ""))
        if _report_word_matches(final_result, search_word):
            word_found = True

    # Join with spaces so words from adjacent segments do not fuse together.
    transcription = " ".join(segment for segment in segments if segment)

    # Fallback for searches that span several words (no timestamps available).
    if not word_found and search_word in transcription.lower():
        print(f"'{search_word}' found in the audio.")

    return transcription

# Main workflow: extract the audio track from a video, transcribe it, and
# report where the search word is spoken.
video_path = "test_video.mp4"
audio_output_path = "extracted_audio.wav"
search_word = "example"

# Step 1: Extract audio from the video
extract_audio_from_video(video_path, audio_output_path)

# Step 2: Transcribe and search for the word in the audio
transcription = transcribe_audio(audio_output_path, search_word)

# Print the full transcription. transcribe_audio returns None after printing
# its own message when it could not run, so skip the output in that case
# instead of printing the literal "None".
if transcription is not None:
    print("\nFull transcription:\n", transcription)


ModuleNotFoundError: No module named 'moviepy.editor'