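# Demo of the Pipeline

This notebook walks through the pipeline end to end on a short demo video: load the video, transcribe its audio, translate the transcript from English to Spanish, and synthesize the translated text as speech in a voice cloned from the original audio.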


## Load Demo Video

In [1]:
import os
from moviepy.editor import VideoFileClip

# Path to the demo clip, relative to the notebook's working directory.
video_path = "demo_video.mov"

# Size on disk, converted from bytes to mebibytes.
file_size_mb = os.path.getsize(video_path) / (1024 * 1024)

# Open the clip only long enough to read its metadata. Using the clip as a
# context manager closes it on exit, so no reader is left open in the kernel.
with VideoFileClip(video_path) as clip:
    duration = clip.duration
    width, height = clip.size
    fps = clip.fps

print(f"File Name: {os.path.basename(video_path)}")
print(f"File Size: {file_size_mb:.2f} MB")
print(f"Video Duration: {duration:.2f} seconds")
print(f"Resolution: {width}x{height}")
print(f"FPS (if available): {fps}")


File Name: demo_video.mov
File Size: 13.83 MB
Video Duration: 10.80 seconds
Resolution: 1280x720
FPS (if available): 30.0


## Transcribe Video

In [2]:
from moviepy.editor import VideoFileClip
from transcribe import Transcriber

# 1) Extract the audio track from the video (MoviePy is enough for a quick demo).
video_path = "demo_video.mov"
output_audio_path = "demo_audio.wav"

# Using the clip as a context manager closes it as soon as the audio has been
# written, instead of leaving it open for the life of the kernel.
with VideoFileClip(video_path) as clip:
    # Fail with a clear message rather than an AttributeError on None
    # when the video carries no audio track.
    if clip.audio is None:
        raise ValueError(f"{video_path} has no audio track to transcribe")
    # codec='pcm_s16le' requests 16-bit PCM for the .wav file;
    # logger=None suppresses MoviePy's progress output.
    clip.audio.write_audiofile(output_audio_path, codec='pcm_s16le', logger=None)

# 2) Create a Transcriber instance
transcriber = Transcriber(model_name="base", device="cpu")

# 3) Transcribe the extracted audio
result = transcriber.transcribe_audio(output_audio_path)

# 4) Display the results
transcriber.display_full_text(result)

Detected language: English


100%|██████████| 1080/1080 [00:00<00:00, 2764.81frames/s]


--- Transcribed Text ---
 Hello, my name is Ezra and I'm trying to demo this cool technology. The quick brown fox jumps over the lazy dog.





## Translate Text

In [3]:
from translate import Translator

# Take the transcript from the transcription result; fall back to an empty
# string when the result has no "text" entry.
transcribed_text = result.get("text", "")

# English -> Spanish translator.
translator = Translator(source_lang="en", target_lang="es")
translated_text = translator.translate_text(transcribed_text)

# Show the source text and its translation, one per line.
print(
    f"Original Text: {transcribed_text}\n"
    f"Translated Text: {translated_text}"
)

Original Text:  Hello, my name is Ezra and I'm trying to demo this cool technology. The quick brown fox jumps over the lazy dog.
Translated Text: Hola, me llamo Ezra y estoy tratando de demoler esta tecnología genial. El zorro marrón salta sobre el perrito perezoso.


## Speak the Text

In [6]:
# In demo.ipynb

from moviepy.editor import VideoFileClip
from speech import Speaker


# Extract the audio track from demo_video.mov. The resulting WAV file is what
# the speech-generation cell passes as `speaker_wav`.
video_path = "demo_video.mov"
extracted_audio_path = "demo_audio.wav"

# Using the clip as a context manager closes it once the audio is written,
# instead of leaving it open for the life of the kernel.
with VideoFileClip(video_path) as clip:
    # Fail with a clear message rather than an AttributeError on None
    # when the video carries no audio track.
    if clip.audio is None:
        raise ValueError(f"{video_path} has no audio track to extract")
    # codec="pcm_s16le" requests 16-bit PCM for the .wav file.
    clip.audio.write_audiofile(extracted_audio_path, codec="pcm_s16le")

# Initialize the Speaker with Spanish output
speaker = Speaker(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    language_code="es"
)

MoviePy - Writing audio in demo_audio.wav


                                                        

MoviePy - Done.
Loading TTS model: tts_models/multilingual/multi-dataset/xtts_v2
 > You must confirm the following:
 | > "I have purchased a commercial license from Coqui: licensing@coqui.ai"
 | > "Otherwise, I agree to the terms of the non-commercial CPML: https://coqui.ai/cpml" - [y/n]
 > Downloading model to /Users/ezraapple/Library/Application Support/tts/tts_models--multilingual--multi-dataset--xtts_v2


100%|█████████▉| 1.87G/1.87G [00:44<00:00, 41.6MiB/s]
100%|██████████| 1.87G/1.87G [00:44<00:00, 41.8MiB/s]
100%|██████████| 4.37k/4.37k [00:00<00:00, 20.9kiB/s]

100%|██████████| 361k/361k [00:00<00:00, 1.19MiB/s]
100%|██████████| 32.0/32.0 [00:00<00:00, 101iB/s]
 91%|█████████ | 7.04M/7.75M [00:00<00:00, 37.2MiB/s]

 > Model's license - CPML
 > Check https://coqui.ai/cpml.txt for more info.
 > Using model: xtts


GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


TTS model loaded successfully.


In [7]:
# Synthesize the translated text, using the extracted audio as the voice
# reference, and keep the path that generate_speech returns.
output_speech_path = speaker.generate_speech(
    output_path="demo_tts.wav",
    speaker_wav=extracted_audio_path,
    text=translated_text,
)

print(f"Cloned speech saved to: {output_speech_path}")

Generating speech with voice cloned from: demo_audio.wav
 > Text splitted to sentences.
['Hola, me llamo Ezra y estoy tratando de demoler esta tecnología genial.', 'El zorro marrón salta sobre el perrito perezoso.']


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 > Processing time: 12.710139036178589
 > Real-time factor: 1.2908003212405024
Cloned speech saved to: demo_tts.wav
