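# Demo of the Video Dubbing Pipeline

This notebook walks through the pipeline stage by stage: load a demo video, transcribe its audio, translate the transcript, synthesize speech for the translation, and lip-sync the video to the new audio.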


In [1]:
import os

# Show the kernel's working directory; the relative file paths used by the
# cells below are resolved against it.
working_dir = os.getcwd()
print(working_dir)

/Users/ezraapple/Projects/dubbing_demo


## 1. Load Demo Video

In [2]:
import os
from moviepy.editor import VideoFileClip

# Path of the demo video, relative to the notebook's working directory.
video_path = "demo_video.mov"

# Size on disk, converted from bytes to mebibytes.
file_size_mb = os.path.getsize(video_path) / (1024 * 1024)

# Open the clip only long enough to read its metadata. The context manager
# closes the clip (and the reader behind it) even if an attribute access
# fails, so this cell does not leave the video file open.
with VideoFileClip(video_path) as clip:
    duration = clip.duration
    width, height = clip.size
    fps = clip.fps

print(f"File Name: {os.path.basename(video_path)}")
print(f"File Size: {file_size_mb:.2f} MB")
print(f"Video Duration: {duration:.2f} seconds")
print(f"Resolution: {width}x{height}")
print(f"FPS (if available): {fps}")


File Name: demo_video.mov
File Size: 13.83 MB
Video Duration: 10.80 seconds
Resolution: 1280x720
FPS (if available): 30.0


## 2. Transcribe Video

In [3]:
from moviepy.editor import VideoFileClip
from transcribe import Transcriber

# 1) Extract the audio track from the video (using MoviePy for a quick demo).
video_path = "demo_video.mov"
output_audio_path = "demo_audio.wav"

# The context manager closes the clip once the audio has been written, so the
# video file is not left open for the rest of the notebook session.
with VideoFileClip(video_path) as clip:
    # A video without an audio track has clip.audio set to None; fail with a
    # clear message instead of an AttributeError on the next line.
    if clip.audio is None:
        raise ValueError(f"{video_path} has no audio track to transcribe")
    # The codec is requested explicitly (pcm_s16le, i.e. 16-bit PCM);
    # logger=None keeps MoviePy's progress output out of the notebook.
    clip.audio.write_audiofile(output_audio_path, codec='pcm_s16le', logger=None)

# 2) Create a Transcriber instance ("base" model, running on the CPU).
transcriber = Transcriber(model_name="base", device="cpu")

# 3) Transcribe the extracted audio.
result = transcriber.transcribe_audio(output_audio_path)

# 4) Display the results.
transcriber.display_full_text(result)

Detected language: English


100%|██████████| 1080/1080 [00:00<00:00, 2765.07frames/s]


--- Transcribed Text ---
 Hello, my name is Ezra and I'm trying to demo this cool technology. The quick brown fox jumps over the lazy dog.





## 3. Translate Text

In [4]:
from translate import Translator

# Take the transcript from the previous cell's result, falling back to an
# empty string when the result has no "text" entry.
transcribed_text = result.get("text", "")

# English -> Spanish translator, applied to the whole transcript in one call.
translator = Translator(source_lang="en", target_lang="es")
translated_text = translator.translate_text(transcribed_text)

# Show source and translation one after the other for comparison.
print(f"Original Text: {transcribed_text!s}")
print(f"Translated Text: {translated_text!s}")

Original Text:  Hello, my name is Ezra and I'm trying to demo this cool technology. The quick brown fox jumps over the lazy dog.
Translated Text: Hola, me llamo Ezra y estoy tratando de demoler esta tecnología genial. El zorro marrón salta sobre el perrito perezoso.


## 4. Speak the Text

In [5]:
# NOTE(review): every line of this cell is commented out, so running it does
# nothing. It is kept as a reference for how the Speaker is set up; uncomment
# it (together with the next cell) to regenerate the Spanish audio.
#
# In demo.ipynb
# 
# from moviepy.editor import VideoFileClip
# from speech import Speaker
# 
# 
# # Extract audio from demo_video.mov if you haven't done so yet:
# video_path = "demo_video.mov"
# extracted_audio_path = "demo_audio.wav"
# 
# clip = VideoFileClip(video_path)
# clip.audio.write_audiofile(extracted_audio_path, codec="pcm_s16le")
# 
# # Initialize the Speaker with Spanish output
# speaker = Speaker(
#     model_name="tts_models/multilingual/multi-dataset/xtts_v2",
#     language_code="es"
# )

In [6]:
# NOTE(review): this cell is commented out and does nothing when run. The
# call below would write "demo_tts.wav", which is the file the lip-sync step
# reads as its new audio; with this cell disabled, that file must already
# exist on disk. The call also needs `speaker` and `extracted_audio_path`
# from the (likewise commented-out) previous cell.
#
# Generate speech using the speaker's voice from the extracted audio
# output_speech_path = speaker.generate_speech(
#     text=translated_text,
#     speaker_wav=extracted_audio_path,
#     output_path="demo_tts.wav"
# )
# 
# print(f"Cloned speech saved to: {output_speech_path}")

## 5. Match the Lips

In [24]:
# demo.ipynb

import os

from lipmatch import LipMatcher

# Paths to your files
# NOTE(review): the earlier cells read "demo_video.mov", while this step uses
# an .mp4 with the same stem -- confirm that both files are meant to exist.
original_video_path = "demo_video.mp4"  # The original video with the speaker
new_audio_path = "demo_tts.wav"        # The newly generated Spanish audio
output_video_path = "final_demo_lipsynced.mp4"

# Fail fast when an input is missing. No active cell in this notebook creates
# these two files, so on a fresh checkout they may not be there; checking here
# gives a clear error before the Wav2Lip run is started.
missing_inputs = [
    path for path in (original_video_path, new_audio_path)
    if not os.path.isfile(path)
]
if missing_inputs:
    raise FileNotFoundError(
        f"Missing input file(s) for lip sync: {', '.join(missing_inputs)}"
    )

# 1) Create a LipMatcher instance, pointing it at the local Wav2Lip checkout
#    and the model checkpoint inside it.
matcher = LipMatcher(
    wav2lip_repo_path="Wav2Lip", 
    checkpoint_path="Wav2Lip/checkpoints/wav2lip.pth"
)

In [25]:

# 2) Run the lip synchronization.
# The arguments are collected first so the call itself stays short; the paths
# come from the setup cell above.
lipsync_kwargs = dict(
    original_video_path=original_video_path,
    new_audio_path=new_audio_path,
    output_video_path=output_video_path,
    pads=[0, 20, 0, 0],
    no_smoothing=True,
)
synced_video = matcher.match_lips(**lipsync_kwargs)

# Report whatever match_lips returned as the final video.
print(f"Final lip-synced video: {synced_video}")

Running Wav2Lip command:
python Wav2Lip/inference.py --checkpoint_path Wav2Lip/checkpoints/wav2lip.pth --face demo_video.mp4 --audio demo_tts.wav --outfile final_demo_lipsynced.mp4 --pads 0 20 0 0 --nosmooth
Using mps for inference.
Reading video frames...
Number of frames available for inference: 322
(80, 839)
Length of mel chunks: 311


  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/20 [00:00<?, ?it/s][A
  5%|▌         | 1/20 [00:03<01:10,  3.70s/it][A
 10%|█         | 2/20 [00:06<00:58,  3.24s/it][A
 15%|█▌        | 3/20 [00:09<00:55,  3.26s/it][A
 20%|██        | 4/20 [00:13<00:52,  3.31s/it][A
 25%|██▌       | 5/20 [00:16<00:49,  3.31s/it][A
 30%|███       | 6/20 [00:19<00:45,  3.28s/it][A
 35%|███▌      | 7/20 [00:23<00:43,  3.31s/it][A
 40%|████      | 8/20 [00:26<00:39,  3.30s/it][A
 45%|████▌     | 9/20 [00:29<00:34,  3.16s/it][A
 50%|█████     | 10/20 [00:32<00:31,  3.15s/it][A
 55%|█████▌    | 11/20 [00:35<00:28,  3.16s/it][A
 60%|██████    | 12/20 [00:38<00:25,  3.18s/it][A
 65%|██████▌   | 13/20 [00:41<00:22,  3.15s/it][A
 70%|███████   | 14/20 [00:44<00:18,  3.03s/it][A
 75%|███████▌  | 15/20 [00:47<00:15,  3.08s/it][A
 80%|████████  | 16/20 [00:50<00:12,  3.08s/it][A
 85%|████████▌ | 17/20 [00:54<00:09,  3.17s/it][A
 90%|█████████ | 18/20 [00:57<00:06,  3.19s/it][A
 95%|██████

Load checkpoint from: Wav2Lip/checkpoints/wav2lip.pth
Model loaded
Finished inference - Now to ffmpeg command
Lip-synced video saved to: final_demo_lipsynced.mp4
Final lip-synced video: final_demo_lipsynced.mp4
